In [51]:
import spacy
import sys
import os
import glob
from spacy.tokens import Doc
from spacy.language import Language
import pickle
from unidecode import unidecode
import sddk
import pandas as pd
import re
import cupy
import importlib
import json

In [35]:
# Locate the `latin-preprocessing` directory two levels above the notebook's
# working directory and make it importable.
current_working_directory = os.getcwd()
module_path = os.path.abspath(
    os.path.join(current_working_directory, '../../latin-preprocessing/')
)
# Prepend so this directory is searched first; the membership test keeps
# repeated runs of the cell from adding duplicate entries.
if module_path not in sys.path:
    sys.path.insert(0, module_path)

In [36]:
# Now try importing the module
import tomela

In [37]:
# in case there is an update
# importlib.reload(tomela)

## Extract sentence data

In [87]:
# Directory holding the raw input texts (files named "<number>.txt", see the listing below).
source_path = "/srv/data/corpus-corporum/cc_rawtexts/"
# os.listdir returns the entries in arbitrary order, so the order is not guaranteed to be stable.
filenames_list = os.listdir(source_path)
# Preview the first ten file names as a sanity check.
filenames_list[:10]

['11862.txt',
 '15393.txt',
 '8089.txt',
 '20102.txt',
 '7275.txt',
 '10052.txt',
 '10765.txt',
 '7562.txt',
 '11896.txt',
 '9411.txt']

In [88]:
# Output directory for the per-document sentence JSON files.
target_path = "/srv/data/corpus-corporum/cc_sents_jsons/"
# exist_ok=True makes re-running the cell harmless when the directory is
# already there, while genuine failures (e.g. missing permissions) are still
# raised instead of being silently swallowed.
os.makedirs(target_path, exist_ok=True)

In [89]:
# Entries already present in the output directory; the conversion loop consults
# this list to decide which inputs to skip. Note that the outputs are written
# as "<id>.json", whereas the inputs are named "<id>.txt".
files_ready = os.listdir(target_path)
# Number of outputs found so far.
len(files_ready)

5

In [None]:
%%time
# Convert every raw text that has no JSON output yet into a list of sentence
# records and write it to `target_path` as "<id>.json".
#
# Each record is (document id, sentence index, sentence text, tokens), where
# tokens is a list of (text, lemma, POS, (start, end)) and the offsets are
# character positions relative to the start of the sentence.

# `files_ready` holds "<id>.json" names while the inputs are "<id>.txt", so the
# "already done?" comparison has to be made on the bare id.
ready_ids = {os.path.splitext(name)[0] for name in files_ready}

for n, filename in enumerate(filenames_list):
    if n % 50 == 0:  # print out the progress each 50 files...
        print(n)
    text_id, ext = os.path.splitext(filename)
    if ext != ".txt" or text_id in ready_ids:
        # Not a raw text file, or its JSON output already exists.
        continue
    with open(source_path + filename, "r", encoding="utf-8") as f:
        rawtext = f.read()
    doc = tomela.from_rawtext_to_doc(rawtext, lowertext=False)
    sent_data_updated = []
    # A separate counter name keeps the file index `n` intact for the
    # progress report above.
    for sent_n, sent in enumerate(doc.sents):
        sent_start = sent[0].idx  # document offset of the sentence's first token
        tokens = [
            (t.text, t.lemma_, t.pos_, (t.idx - sent_start, t.idx - sent_start + len(t)))
            for t in sent
        ]
        sent_data_updated.append((text_id, sent_n, sent.text, tokens))
    with open(target_path + text_id + ".json", "w", encoding="utf-8") as f:
        json.dump(sent_data_updated, f)

In [58]:
# test reading the data back with one file...
fn = os.listdir(target_path)[0]
# Read inside a context manager so the file handle is closed as soon as the
# JSON has been parsed, rather than being left open for the kernel's lifetime.
with open(target_path + fn, "r", encoding="utf-8") as f:
    sents_data = json.load(f)

In [62]:
# Preview the first five sentence records. Each record is
# [document id, sentence index, sentence text, tokens], where tokens is a list
# of [wordform, lemma, POS tag, [start, end]] with offsets relative to the sentence.
sents_data[:5]

[['8089',
  0,
  'Benedictiones #benedictio in Annuntiatione sanctae Mariae.@# Dominus Jesus Christus, qui in saeculorum fine processit ex Uirgine, cor uestrum uirginitatis incorruptae corona clarificet.',
  [['Benedictiones', 'benedictio', 'NOUN', [0, 13]],
   ['#', '#', 'PUNCT', [14, 15]],
   ['benedictio', 'benedictio', 'NOUN', [15, 25]],
   ['in', 'in', 'ADP', [26, 28]],
   ['Annuntiatione', 'annuntiatio', 'NOUN', [29, 42]],
   ['sanctae', 'sanctus', 'ADJ', [43, 50]],
   ['Mariae.@', 'Mariae.@', 'ADJ', [51, 59]],
   ['#', '#', 'PUNCT', [59, 60]],
   ['Dominus', 'Dominus', 'NOUN', [61, 68]],
   ['Jesus', 'Jesus', 'PROPN', [69, 74]],
   ['Christus', 'Christus', 'PROPN', [75, 83]],
   [',', ',', 'PUNCT', [83, 84]],
   ['qui', 'qui', 'PRON', [85, 88]],
   ['in', 'in', 'ADP', [89, 91]],
   ['saeculorum', 'saeculum', 'NOUN', [92, 102]],
   ['fine', 'finis', 'NOUN', [103, 107]],
   ['processit', 'procedo', 'VERB', [108, 117]],
   ['ex', 'ex', 'ADP', [118, 120]],
   ['Uirgine', 'uirgo', 'P

## Extract lemmatized & filtered sentences

In [71]:
# Input for this stage: the sentence JSON files produced by the previous
# section. The path is spelled out (rather than copied from `target_path`)
# because `target_path` is reassigned on the next line; copying it would make a
# second run of this cell point the input at the output directory.
source_path = "/srv/data/corpus-corporum/cc_sents_jsons/"
# Output for this stage: one text file of lemmatized sentences per document.
target_path = "/srv/data/corpus-corporum/cc_lemmatized_sents/"
# exist_ok=True tolerates an existing directory but still raises on real
# failures (e.g. missing permissions) instead of hiding them.
os.makedirs(target_path, exist_ok=True)

In [72]:
# Sanity check: the input directory for this stage should be the sentence-JSON directory.
source_path

'/srv/data/corpus-corporum/cc_sents_jsons/'

In [73]:
# Names of the entries in the input directory (order as returned by os.listdir).
fns = os.listdir(source_path)
# Preview the first five names.
fns[:5]

['8089.json', '11862.json', '15393.json', '20102.json', '7275.json']

In [81]:
# For every sentence-JSON file, keep only the lemmas of content words and write
# them out as one space-separated line per sentence to "<id>.txt" in
# `target_path`. A sentence without any content word yields an empty line, so
# line numbers stay aligned with sentence indices.

# POS tags treated as content words; tokens with any other tag are dropped.
CONTENT_POS = {"NOUN", "PROPN", "ADJ", "VERB"}

for fn in fns:
    stem, ext = os.path.splitext(fn)
    if ext != ".json":
        # Skip anything that is not a sentence-JSON file (e.g. stray
        # directories or checkpoint folders) instead of crashing on it.
        continue
    with open(os.path.join(source_path, fn), "r", encoding="utf-8") as file:
        sents_data = json.load(file)
    # Each record is (doc id, sentence index, sentence text, tokens) and each
    # token is (wordform, lemma, POS tag, position); only lemma and tag are used.
    lemmatized_sents = [
        " ".join(
            lemma
            for _wordform, lemma, tag, _position in sent_data
            if tag in CONTENT_POS
        ) + "\n"
        for _doc_id, _sent_id, _sent_text, sent_data in sents_data
    ]
    # Build the output name from the stem so only the extension changes.
    with open(os.path.join(target_path, stem + ".txt"), "w", encoding="utf-8") as f:
        f.writelines(lemmatized_sents)

In [82]:
# Name of the first entry in the output directory (whichever os.listdir returns first).
os.listdir(target_path)[0]

'11862.txt'

In [84]:
# Read one lemmatized file back in (the first entry the directory listing
# returns) to check the written format; every element keeps its trailing newline.
first_output = os.listdir(target_path)[0]
with open(os.path.join(target_path, first_output), "r", encoding="utf-8") as f:
    lemmatized_sents = list(f)

In [85]:
# Preview the first ten lines. Each line is one sentence reduced to its kept
# lemmas; a bare "\n" is a sentence in which no token had one of the kept POS
# tags (as written by the lemmatization loop).
lemmatized_sents[:10]

['Decem categoria Capas\n',
 'scientia disciplina ars diuersus oratio tracto filum quiuis genus polleo inuenio oratio uolo origo tracto mirandus Aristoteles philosopeh diligentia disserendo cupidus coepio examen scio praetermitto necessarius\n',
 '\n',
 'doceo grammaticus pars oratio uoco appello oratio pars indico uocabulum signo\n',
 'oratio pars auctor Aristoteles nomen uerbum debeo accipio caeterus facio compago oratio pars debeo nomino\n',
 'nomen persona demonstro uerbum facio patior\n',
 'Capitulum\n',
 'doceo debeo aduerto compendium oratio coarto gradus uocabulum capio concluseo\n',
 'diuersus innumerabilis mortalis nuncupatio comprehendo possum nomen latus diuersitas uocabulum homo dico nosco\n',
 '\n']