In [21]:
import spacy
import os
import glob
from spacy.tokens import Doc
from spacy.language import Language
import pickle
from unidecode import unidecode
import sddk
import pandas as pd
import re
import sys
import importlib
import json

For preprocessing the latin texts, we will use a module located outside of the current repository, specifically at the same level one level up.

The module can be clonned from here: https://github.com/CCS-ZCU/latin-preprocessing and imported to python following the steps below:

In [2]:
# for preprocessing the latin texts, we will use a module located outside of the current repository, specifically at the same level 
current_working_directory = os.getcwd()
relative_path = '../../latin-preprocessing/' # change according to your location...
module_path = os.path.abspath(os.path.join(current_working_directory, relative_path))
if module_path not in sys.path:
    sys.path.insert(0, module_path)
# Now import the module
import tomela

CuPy is able to use the GPU.
GPU is available for SpaCy.


Tomela contains tuned latin preprocessing pipeline relying on spaCy and latinCy. You can check the pipeline as here:

In [3]:
tomela.nlp.pipeline

[('senter', <spacy.pipeline.senter.SentenceRecognizer at 0x7c2e4ff563e0>),
 ('normer', <function la_core_web_lg.functions.normer(doc)>),
 ('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x7c2e4ff56560>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x7c2e4ff56620>),
 ('morphologizer',
  <spacy.pipeline.morphologizer.Morphologizer at 0x7c2e4ff54b20>),
 ('trainable_lemmatizer',
  <spacy.pipeline.edit_tree_lemmatizer.EditTreeLemmatizer at 0x7c2e4ff54d00>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x7c2e5ef4b7d0>),
 ('lookup_lemmatizer',
  <function la_core_web_lg.functions.make_lookup_lemmatizer_function(doc)>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x7c2e45fe8200>)]

In [4]:
doc = tomela.nlp("Veritas, vt vlla dicit, semper est universalis et a principiis fundamentalis oritur (lib. 3, cap. VI)")
for token in doc:
    print((token.text, token.lemma_, token.pos_))

('Veritas', 'ueritas', 'NOUN')
(',', ',', 'PUNCT')
('vt', 'vt', 'ADV')
('vlla', 'vllus', 'NOUN')
('dicit', 'dico', 'VERB')
(',', ',', 'PUNCT')
('semper', 'semper', 'ADV')
('est', 'sum', 'AUX')
('universalis', 'uniuersalis', 'ADJ')
('et', 'et', 'CCONJ')
('a', 'ab', 'ADP')
('principiis', 'principium', 'NOUN')
('fundamentalis', 'fundamentalis', 'ADJ')
('oritur', 'orior', 'VERB')
('(', '(', 'PUNCT')
('lib', 'liber', 'NOUN')
('.', '.', 'PUNCT')
('3', '3', 'NUM')
(',', ',', 'PUNCT')
('cap', 'capitulum', 'NOUN')
('.', '.', 'PUNCT')
('VI', 'uis', 'NUM')
(')', ')', 'PUNCT')


In [5]:
# list files in the right folder
path = "/srv/data/tome/tome-corpus/tome_2-0/tome 2.0"
filedirs = os.listdir(path)
filedirs[:5]

['100005_Pantheus1530_Voarchadumia_ONB',
 '100048_Pseudo-Aquinas1579_Secreta_alchemiae_magnalia_ONB',
 '100034_Penotus1594_Tractatus_varii_MDZ_MBS',
 '100029_Paracelsus1560_Libri_quatuor_de_vita_longa_MDZ_MBS',
 '100039_Pseudo-Democritus1572_Ars_magna_MDZ_MBS']

In [42]:
fns = [(dirpath.partition("_")[0], fn) for dirpath in filedirs for fn in glob.glob(os.path.join(path, dirpath, '*.txt'))]
fns[:5]

[('100005',
  '/srv/data/tome/tome-corpus/tome_2-0/tome 2.0/100005_Pantheus1530_Voarchadumia_ONB/100005_Pantheus1530_Voarchadumia_ONB.txt'),
 ('100048',
  '/srv/data/tome/tome-corpus/tome_2-0/tome 2.0/100048_Pseudo-Aquinas1579_Secreta_alchemiae_magnalia_ONB/100048_Pseudo-Aquinas1579_Secreta_alchemiae_magnalia_ONB.txt'),
 ('100034',
  '/srv/data/tome/tome-corpus/tome_2-0/tome 2.0/100034_Penotus1594_Tractatus_varii_MDZ_MBS/100034_Penotus1594_Tractatus_varii_MDZ_MBS.txt'),
 ('100029',
  '/srv/data/tome/tome-corpus/tome_2-0/tome 2.0/100029_Paracelsus1560_Libri_quatuor_de_vita_longa_MDZ_MBS/100029_Paracelsus1560_Libri_quatuor_de_vita_longa_MDZ_MBS.txt'),
 ('100039',
  '/srv/data/tome/tome-corpus/tome_2-0/tome 2.0/100039_Pseudo-Democritus1572_Ars_magna_MDZ_MBS/100039_Pseudo-Democritus1572_Ars_magna_MDZ_MBS.txt')]

In [43]:
# test loading one specific text
fn = fns[0][1]
with open(fn, "r", encoding="utf-8") as f:
    rawtext = f.read()
rawtext[:1000]

"MARIA\n\n\nAVLA PVRIFICATIONIS AVRI\nNON SINE SALE & NON SINE ARGILLA\nVOARCH ¬\nADVMIA\ncontra Alchi'miam : Ars distincta ab\nArchimi'a, & Sophia: cum Additio¬\nnibus & Proportionibus: Numeris: &\nFiguris oportubit Ioannis Augustini\nPanthei Veneti sacerdotis.\nVenetiis. Diebus. Aprilis.\nM. D. XXX.\n\n\nMORIENVS\n\n\nCONCESSIO IMPRESSIONIS.\nConcessio Reuerendissimi. D. Legati apostolici.\nALTOBELLVS AVEROL¬\nDVS Dei & apostolicae sedis gratia\nEpiscopus Polen. S. D. N. Papae Re¬\nferen. & per totum Venetorum domi¬\nnium, cum potestate Legati Cardina¬\nIlis de latere, Legatus apostolicus. Dile¬\ncto nobis in CHRISTO Ioanni Augustino Pantheo\nVeneto sacerdoti, salutem in domino sempiternam: & caetera.\nMandamus igitur & praecipimus authoritate apostolica,\nqua ex munere legationis nostrae huiusmodi fungimur in\nhac parte: ne quis legationi nostrae subiectus: id ipsum\nopusculum siue Latina, siue Vernacula lingua perscriptum:\nin locis legationis nostrae huiusmodi imprimere, aut im¬\n

In [14]:
# in case you locally tune the module, ensure that you have loaded the latest version!
importlib.reload(tomela)

CuPy is able to use the GPU.
GPU is available for SpaCy.


<module 'tomela' from '/home/jupyter-vojta/notebooks/latin-preprocessing/tomela/__init__.py'>

In [15]:
tomela.text_cleaner(rawtext)[:5000]

"Maria Aula Purificationis Auri Non Sine Sale & Non Sine Argilla Uoarch Adumia contra Alchi'miam : Ars distincta ab Archimi'a, & Sophia: cum Additionibus & Proportionibus: Numeris: & Figuris oportubit Ioannis Augustini Panthei Ueneti sacerdotis. Uenetiis. Diebus. Aprilis. M. D. Xxx. Morienus Concessio Impressionis. Concessio Reuerendissimi. D. Legati apostolici. Altobellus Aueroldus Dei & apostolicae sedis gratia Episcopus Polen. S. D. N. Papae Referen, & per totum Uenetorum dominium, cum potestate Legati Cardinailis de latere, Legatus apostolicus. Dilecto nobis in Christo Ioanni Augustino Pantheo Ueneto sacerdoti, salutem in domino sempiternam: & caetera. Mandamus igitur & praecipimus authoritate apostolica, qua ex munere legationis nostrae huiusmodi fungimur in hac parte: ne quis legationi nostrae subiectus: id ipsum opusculum siue Latina, siue Uernacula lingua perscriptum: in locis legationis nostrae huiusmodi imprimere, aut impressum uenundare, uendendumue tradere ullis in locis al

In [16]:
doc = tomela.from_rawtext_to_doc(rawtext, lowertext=False)

In [17]:
target_path = "/srv/data/tome/tome-corpus/sents_data_jsons_v2-0/"
try:
    os.mkdir(target_path)
except:
    pass

In [44]:
%%time
for id, fn in fns:
    with open(fn, "r", encoding="utf-8") as f:
        rawtext = f.read()
    doc = tomela.from_rawtext_to_doc(rawtext)
    doc_sentdata = [(sent.text, [(t.text, t.lemma_, t.pos_, (t.idx - sent[0].idx, t.idx - sent[0].idx + len(t))) for t in sent]) for sent in doc.sents]
    sent_data_updated = []
    for n_sent, sent_data in enumerate(doc_sentdata):
        sent_data_updated.append((id, n_sent, sent_data[0], sent_data[1]))
    with open(target_path + id + ".json", "w") as f:
        json.dump(sent_data_updated, f)

CPU times: user 2min 47s, sys: 7.81 s, total: 2min 55s
Wall time: 2min 54s


In [34]:
# Extract the lemmatized sentences

In [45]:
fns_jsons = os.listdir(target_path)
fns_jsons[:10]

['100044.json',
 '100034.json',
 '100014.json',
 '100060.json',
 '100010.json',
 '100043.json',
 '100041.json',
 '100012.json',
 '100025.json',
 '100019.json']

In [46]:
sents_data = json.load(open(target_path + fns_jsons[4], "r"))
sents_data[100:103]

[['100010',
  100,
  'Et eodem libro, cap. 3, scripsimus, eum, qui non habuerit ingenium naturale, & animum, ingeniose & subtiliter perscrutantem principia naturalia, & naturae fundamenta, & artificia, quae naturam assequi possint, in suae actionis proprietatibus, non inuenturum huius preciosissimae artis & magisterii ueram radicem.',
  [['Et', 'et', 'CCONJ', [0, 2]],
   ['eodem', 'idem', 'DET', [3, 8]],
   ['libro', 'liber', 'NOUN', [9, 14]],
   [',', ',', 'PUNCT', [14, 15]],
   ['cap', 'capitulum', 'NOUN', [16, 19]],
   ['.', '.', 'PUNCT', [19, 20]],
   ['3', '3', 'NUM', [21, 22]],
   [',', ',', 'PUNCT', [22, 23]],
   ['scripsimus', 'scribo', 'VERB', [24, 34]],
   [',', ',', 'PUNCT', [34, 35]],
   ['eum', 'is', 'PRON', [36, 39]],
   [',', ',', 'PUNCT', [39, 40]],
   ['qui', 'qui', 'PRON', [41, 44]],
   ['non', 'non', 'PART', [45, 48]],
   ['habuerit', 'habeo', 'VERB', [49, 57]],
   ['ingenium', 'ingenium', 'NOUN', [58, 66]],
   ['naturale', 'naturalis', 'ADJ', [67, 75]],
   [',', ','

In [47]:
lemmatized_sents_path = "/srv/data/tome/tome-corpus/lemmatized_sents_v2-0/"
try:
    os.mkdir(lemmatized_sents_path)
except:
    pass

In [49]:

for fn in fns_jsons:
    lemmatized_sents = []
    sents_data = json.load(open(target_path + fn, "rb"))
    for (doc_id, sent_id, sent_text, sent_data) in sents_data:
        lemmasent = []
        for wordform, lemma, tag, position in sent_data:
            if tag in ["NOUN", "PROPN", "ADJ", "VERB"]:
                lemmasent.append(lemma)
        lemmatized_sents.append(" ".join(lemmasent) + "\n")
    with open(lemmatized_sents_path + fn.replace(".json", ".txt"), "w", encoding="utf-8") as f:
        f.writelines(lemmatized_sents)