In [35]:
import spacy
import os
import glob
from spacy.tokens import Doc
from spacy.language import Language
import pickle
from unidecode import unidecode
import sddk
import pandas as pd
import re

In [2]:
# for the following to work the model has to be previously installed:
# !pip install https://huggingface.co/latincy/la_core_web_lg/resolve/main/la_core_web_lg-any-py3-none-any.whl

nlp = spacy.load('la_core_web_lg') 

In [9]:
# list files in the right folder
path = "../data/large_data/Curated transcriptions/"
filedirs = os.listdir(path)
filedirs[:5]

['100037_DuChesne1575_Ad_Iacobi_Auberti_MDZ_Augsburg',
 '100035_Dorn1570_Lapis_metaphysicus_MDZ_MBS_pdf',
 '100027_Morienus1559_Morieni_Romani_IA_pdf',
 '100012_Gessner1552_Thesaurus_Euonymi_Philiatri_ER_ZZ',
 '100029_Paracelsus1560_Libri_quatuor_de_vita_longa_MDZ_MBS']

In [19]:
fns = [(dirpath.partition("_")[0], fn) for dirpath in filedirs for fn in glob.glob(os.path.join(path, dirpath, '*.txt'))]
fns[:5]

[('100037',
  '../data/large_data/Curated transcriptions/100037_DuChesne1575_Ad_Iacobi_Auberti_MDZ_Augsburg/DuChesne1575_Ad_Iacobi_Auberti_MDZ_Augsburg.txt'),
 ('100035',
  '../data/large_data/Curated transcriptions/100035_Dorn1570_Lapis_metaphysicus_MDZ_MBS_pdf/Dorn1570_Lapis_metaphysicus_MDZ_MBS_pdf.txt'),
 ('100027',
  '../data/large_data/Curated transcriptions/100027_Morienus1559_Morieni_Romani_IA_pdf/Morienus1559_Morieni_Romani_IA_pdf.txt'),
 ('100012',
  '../data/large_data/Curated transcriptions/100012_Gessner1552_Thesaurus_Euonymi_Philiatri_ER_ZZ/Gessner1552_Thesaurus_Euonymi_Philiatri_ER_ZZ.txt'),
 ('100029',
  '../data/large_data/Curated transcriptions/100029_Paracelsus1560_Libri_quatuor_de_vita_longa_MDZ_MBS/Paracelsus1560_Libri_quatuor_de_vita_longa_MDZ_MBS.txt')]

In [29]:
# test loading one specific text
fn = fns[0][1]
with open(fn, "r") as f:
    text = f.read()
text[:5000]


"AD IACOBI\nAVBERTI VINDO¬\nNIS DE ORTV ET CAVSIS\nMETALLORVM CONTRA\nChymicos Explicationem\nIOSEPHI QVERCETANI ARME¬\nniaci, D. Medici breuis Responsio.\nEIVSDEM DE EXQVISITA\nMineralium, Animalium, & Vegetabilium me¬\ndicamentorum Spagyrica praeparatione &\nvsu, perspicua Tractatio.\nLVGDVNI,\nApud loannem Lertotium.\nM. D. LXXV.\n\n\nQVADRIN.\n[F] En nostre estat au vostre tout contraire,\nSi nous soufflons, vous humez d'autrepart:\nOr sus enfans, de ces deux poincts de l'art,\nIugez lequel est plus seant de faire. [/F]\nVIRTVTIS COMES\nINVIDIA.\n\n\nGENEROSIS.\nSIMO AC SPLENDI¬\ndissimo viro Jacobo de la\nFin, Regii ordinis Equiti\nAurato, eiusque Nobili\ncubiculario, D. de la Fin la¬\nNocle, Pluuiers, Baroni\nd Aubusson, &c. Josephus\nQuercetanus. S.\nMemoriae proditum est,\nPythagoram, hominum\nvitam dixisse consimi¬\nlem sibi videri eius Pa¬\nnegyris ac mercatus Graeciae no¬\nbilissimi, quò nonnulli certandi,\nalij emendi & vendendi, alij ve¬\nrò spectandi tantùm causa se con¬\

In [42]:
def text_cleaner(rawtext):
    cleantext = rawtext.replace("¬\n", "").replace("\n", " ").replace("ß", "ss").replace("ij","ii")
    cleantext = " ".join([t[0] + t[1:].lower() for t in cleantext.split()])
    cleantext = re.sub("\s\s+", " ", cleantext)
    cleantext = unidecode(cleantext)
    cleantext = cleantext.replace("v", "u").replace("V", "U")
    return cleantext

# lets encapsulate the cleaning and spacy pipeline application into one function
def from_rawtext_to_doc(rawtext):
    cleantext = text_cleaner(rawtext)
    segment_len = 800000
    if len(cleantext) > segment_len:
        segment_docs = []
        parts = cleantext[:segment_len].rpartition(". ")
        current_segment = parts[0] + parts[1]
        segment_doc = nlp(current_segment)
        segment_docs.append(segment_doc)
        next_segment_beginning = parts[2]
        for n in range(segment_len, len(cleantext), segment_len):
            segment = cleantext[n:n+segment_len]
            if len(segment) == segment_len:
                parts = cleantext[n:n+segment_len].rpartition(". ")
                current_segment = parts[0] + parts[1]
                segment_doc = nlp(next_segment_beginning + current_segment)
                next_segment_beginning = parts[2]
            else:
                segment_doc = nlp(segment)
            segment_docs.append(segment_doc)
        doc = Doc.from_docs(segment_docs)
    else:
        doc = nlp(cleantext)
    return doc

In [53]:
sent = "This is a sentence. AND HERE starts a second one"
text_cleaner(sent)

'This is a sentence. And Here starts a second one'

In [55]:
rawtext[:5000]

'LAVREN¬\nTII VENTVRAE VE¬\nNETI, ARTIVM ET ME¬\nDICINAE DOCTRINA DE RATIONE\nconficiendi Lapidis philosophi¬\nci, liber Vnus.\nAD OTHONEM HENRICVM\nPrincipem Palatinum.\nHuic accesserunt eiusdem Argumenti loannis\nGarlandij Angli liber Vnus. Et ex Spe¬\nculo magno Vincentij li¬\nbri Duo.\nCVM PRIVILEGIO CAESAR. MAIEST.\nBASILEAE\nM. D. LXXI.\n\n\nSERENISSIMO AT¬\nQVE ILLVSTRISSIMO\nPrincipi D. D. OTTHONI HEN¬\nRICO COMITI Palatino Rhe¬\nni, Et vtriusque Bauariae Duci, ac sacri\nRomani Imperij Electori Inuictiss. Lau¬\nrentius Ventura ciuis Venetus,\nartiumque Doctor, Foe¬\nlicitatem.\nMVLTA, Princeps o¬\nptime, homines ad bo¬\nnarum artium studia\ncompellunt, sed qua¬\ntuor, vt caetera omit¬\ntam sunt potissima, quae vltrò mihi sese\nofferunt, ac in ijs perquirendis probos\nviros alliciunt, illisque in hoc glorioso palè¬\nstrae certamine, maximam laborum sum¬\nmam ferè de medio tollunt, ac victoriae\npraemium firmiter pollicentur. Est, in¬\nquam, primo sciendi naturale illud o¬\n):(\n

In [54]:
text_cleaner(rawtext)[:5000]

'Laurentii Uenturae Ueneti, Artium Et Medicinae Doctrina De Ratione conficiendi Lapidis philosophici, liber Unus. Ad Othonem Henricum Principem Palatinum. Huic accesserunt eiusdem Argumenti loannis Garlandii Angli liber Unus. Et ex Speculo magno Uincentii libri Duo. Cum Priuilegio Caesar. Maiest. Basileae M. D. Lxxi. Serenissimo Atque Illustrissimo Principi D. D. Otthoni Henrico Comiti Palatino Rheni, Et utriusque Bauariae Duci, ac sacri Romani Imperii Electori Inuictiss. Laurentius Uentura ciuis Uenetus, artiumque Doctor, Foelicitatem. Multa, Princeps optime, homines ad bonarum artium studia compellunt, sed quatuor, ut caetera omittam sunt potissima, quae ultro mihi sese offerunt, ac in iis perquirendis probos uiros alliciunt, illisque in hoc glorioso palestrae certamine, maximam laborum summam fere de medio tollunt, ac uictoriae praemium firmiter pollicentur. Est, inquam, primo sciendi naturale illud o):( Epistola. mnibus a Deo opt. max. datum desiderium, laudis postremo, & honoris e

In [None]:
"liber_N", "liber_A"

In [56]:
doc = from_rawtext_to_doc(rawtext)

In [44]:
# testing spacy...
fn = fns[0][1]
with open(fn, "r") as f:
    rawtext = f.read()
doc = from_rawtext_to_doc(rawtext)

In [31]:
try:
    os.mkdir("../data/sents_data")
except:
    pass

In [45]:
%%time
for id, fn in fns:
    with open(fn, "r") as f:
        rawtext = f.read()
    doc = from_rawtext_to_doc(rawtext)
    doc_sentdata = [(sent.text, [(t.text, t.lemma_.lower(), t.pos_, (t.idx - sent[0].idx, t.idx - sent[0].idx + len(t))) for t in sent]) for sent in doc.sents]
    with open("../data/sents_data/" + id + ".pickle", "wb") as f:
        pickle.dump(doc_sentdata, f)

CPU times: user 7min 8s, sys: 14.4 s, total: 7min 22s
Wall time: 7min 25s


In [34]:
# Extract the lemmatized sentences

In [46]:
fns = os.listdir("../data/sents_data/")
fns[:5]

['100006.pickle',
 '100018.pickle',
 '100014.pickle',
 '100022.pickle',
 '100030.pickle']

In [64]:
sents_data = pickle.load(open("../data/sents_data/" + fns[0], "rb"))
sents_data

[('Excellentissimi Medici Michaelis Sauonarole libellus singularis de arte conficiendi Aquam uitae simplicem &',
  [('Excellentissimi', 'excellentissimi', 'PROPN', (0, 15)),
   ('Medici', 'medicus', 'NOUN', (16, 22)),
   ('Michaelis', 'michael', 'PROPN', (23, 32)),
   ('Sauonarole', 'sauonarola', 'PROPN', (33, 43)),
   ('libellus', 'libellus', 'NOUN', (44, 52)),
   ('singularis', 'singularis', 'ADJ', (53, 63)),
   ('de', 'de', 'ADP', (64, 66)),
   ('arte', 'ars', 'NOUN', (67, 71)),
   ('conficiendi', 'conficio', 'VERB', (72, 83)),
   ('Aquam', 'aqua', 'NOUN', (84, 89)),
   ('uitae', 'uita', 'NOUN', (90, 95)),
   ('simplicem', 'simplex', 'ADJ', (96, 105)),
   ('&', '&', 'PUNCT', (106, 107))]),
 ('compositam.',
  [('compositam', 'compono', 'VERB', (0, 10)), ('.', '.', 'PUNCT', (10, 11))]),
 ('Et de eiusdem admirabili uirtute ad conseruam dam sanitatem, & ad diuersas Humani corporis aegritudines curandas.',
  [('Et', 'et', 'CCONJ', (0, 2)),
   ('de', 'de', 'ADP', (3, 5)),
   ('eiusdem', '

In [47]:
try:
    os.mkdir("../data/lemmatized_sents/")
except:
    pass
for fn in fns:
    lemmatized_sents = []
    sents_data = pickle.load(open("../data/sents_data/" + fn, "rb"))
    for (sent_text, sent_data) in sents_data:
        lemmasent = []
        for wordform, lemma, tag, position in sent_data:
            if tag in ["NOUN", "PROPN", "ADJ", "VERB"]:
                lemmasent.append(lemma)
        lemmatized_sents.append(" ".join(lemmasent) + "\n")
    with open("../data/lemmatized_sents/" + fn.replace(".pickle", ".txt"), "w") as f:
        f.writelines(lemmatized_sents)