In [28]:
import spacy
import os
import glob
from spacy.tokens import Doc
from spacy.language import Language
import pickle
import sddk
import pandas as pd
import re

In [2]:
# The LatinCy model has to be installed in the kernel's environment beforehand, e.g.:
# !pip install https://huggingface.co/latincy/la_core_web_lg/resolve/main/la_core_web_lg-any-py3-none-any.whl

# shared spaCy pipeline used by every cell below
nlp = spacy.load(name="la_core_web_lg")

In [9]:
# List the entries of the curated-transcriptions folder. Judging by the preview
# below, each entry is a per-text subfolder named "<numeric id>_<description>".
# NOTE: os.listdir returns entries in arbitrary order, so the preview (and which
# text ends up first) may differ between machines.
path = "../data/large_data/Curated transcriptions/"
filedirs = os.listdir(path)
filedirs[:5]

['100037_DuChesne1575_Ad_Iacobi_Auberti_MDZ_Augsburg',
 '100035_Dorn1570_Lapis_metaphysicus_MDZ_MBS_pdf',
 '100027_Morienus1559_Morieni_Romani_IA_pdf',
 '100012_Gessner1552_Thesaurus_Euonymi_Philiatri_ER_ZZ',
 '100029_Paracelsus1560_Libri_quatuor_de_vita_longa_MDZ_MBS']

In [19]:
# Pair every *.txt transcription with the ID of its folder, i.e. the part of the
# folder name before the first underscore -> list of (id, path) tuples.
fns = [
    (folder.split("_", 1)[0], txt_path)
    for folder in filedirs
    for txt_path in glob.glob(os.path.join(path, folder, "*.txt"))
]
fns[:5]

[('100037',
  '../data/large_data/Curated transcriptions/100037_DuChesne1575_Ad_Iacobi_Auberti_MDZ_Augsburg/DuChesne1575_Ad_Iacobi_Auberti_MDZ_Augsburg.txt'),
 ('100035',
  '../data/large_data/Curated transcriptions/100035_Dorn1570_Lapis_metaphysicus_MDZ_MBS_pdf/Dorn1570_Lapis_metaphysicus_MDZ_MBS_pdf.txt'),
 ('100027',
  '../data/large_data/Curated transcriptions/100027_Morienus1559_Morieni_Romani_IA_pdf/Morienus1559_Morieni_Romani_IA_pdf.txt'),
 ('100012',
  '../data/large_data/Curated transcriptions/100012_Gessner1552_Thesaurus_Euonymi_Philiatri_ER_ZZ/Gessner1552_Thesaurus_Euonymi_Philiatri_ER_ZZ.txt'),
 ('100029',
  '../data/large_data/Curated transcriptions/100029_Paracelsus1560_Libri_quatuor_de_vita_longa_MDZ_MBS/Paracelsus1560_Libri_quatuor_de_vita_longa_MDZ_MBS.txt')]

In [29]:
# test loading one specific text
fn = fns[0][1]
# The transcriptions contain non-ASCII characters (e.g. "¬", "ò"); decode them
# explicitly instead of relying on the platform's default encoding.
# NOTE(review): assumes the files are UTF-8 encoded - confirm.
with open(fn, "r", encoding="utf-8") as f:
    text = f.read()
text[:5000]


"AD IACOBI\nAVBERTI VINDO¬\nNIS DE ORTV ET CAVSIS\nMETALLORVM CONTRA\nChymicos Explicationem\nIOSEPHI QVERCETANI ARME¬\nniaci, D. Medici breuis Responsio.\nEIVSDEM DE EXQVISITA\nMineralium, Animalium, & Vegetabilium me¬\ndicamentorum Spagyrica praeparatione &\nvsu, perspicua Tractatio.\nLVGDVNI,\nApud loannem Lertotium.\nM. D. LXXV.\n\n\nQVADRIN.\n[F] En nostre estat au vostre tout contraire,\nSi nous soufflons, vous humez d'autrepart:\nOr sus enfans, de ces deux poincts de l'art,\nIugez lequel est plus seant de faire. [/F]\nVIRTVTIS COMES\nINVIDIA.\n\n\nGENEROSIS.\nSIMO AC SPLENDI¬\ndissimo viro Jacobo de la\nFin, Regii ordinis Equiti\nAurato, eiusque Nobili\ncubiculario, D. de la Fin la¬\nNocle, Pluuiers, Baroni\nd Aubusson, &c. Josephus\nQuercetanus. S.\nMemoriae proditum est,\nPythagoram, hominum\nvitam dixisse consimi¬\nlem sibi videri eius Pa¬\nnegyris ac mercatus Graeciae no¬\nbilissimi, quò nonnulli certandi,\nalij emendi & vendendi, alij ve¬\nrò spectandi tantùm causa se con¬\

In [26]:
def text_cleaner(rawtext):
    """Normalize a raw transcription into one single-spaced line of text.

    Steps:
      1. re-join words hyphenated across line breaks ("¬" followed by newline);
      2. turn the remaining newlines into spaces;
      3. lowercase every word except for its first character, so all-caps
         words become capitalized ("IACOBI" -> "Iacobi");
      4. normalize spelling: "ß" -> "ss" and "ij" -> "ii".

    The spelling normalization runs AFTER the case normalization, so that
    all-caps words are covered as well ("ALIJ" -> "Alii").

    Parameters
    ----------
    rawtext : str
        Text as read from a transcription file.

    Returns
    -------
    str
        Cleaned text with words separated by exactly one space.
    """
    flat = rawtext.replace("¬\n", "").replace("\n", " ")
    # str.split() without arguments splits on any whitespace run and drops empty
    # strings, so the join below already collapses repeated whitespace (and every
    # word is guaranteed to be non-empty, making word[0] safe).
    words = [word[0] + word[1:].lower() for word in flat.split()]
    cleantext = " ".join(words)
    cleantext = cleantext.replace("ß", "ss").replace("ij", "ii")
    return cleantext

# let's encapsulate the cleaning and the spaCy pipeline application into one function
def _split_at_sentence_ends(text, segment_len):
    """Cut ``text`` into consecutive segments whose concatenation equals ``text``.

    The text is scanned in windows of ``segment_len`` characters. Every window
    except the last one is cut after its last ". "; whatever follows that cut
    is carried over and prepended to the next segment. The last segment takes
    the carried-over remainder plus the whole final window, so no text is lost.
    Empty segments are dropped.
    """
    segments = []
    carry = ""
    for start in range(0, len(text), segment_len):
        window = text[start:start + segment_len]
        if start + segment_len >= len(text):
            # final window: nothing follows, so emit everything that is left
            segments.append(carry + window)
        else:
            head, sep, tail = window.rpartition(". ")
            # if the window contains no ". ", head and sep are empty and the
            # whole window is carried over to the next segment
            segments.append(carry + head + sep)
            carry = tail
    return [segment for segment in segments if segment]


def from_rawtext_to_doc(rawtext, segment_len=800000):
    """Clean a raw transcription and run the spaCy pipeline over it.

    Texts longer than ``segment_len`` characters are split at sentence ends
    (". "), each segment is parsed separately, and the partial results are
    merged with ``Doc.from_docs``.

    Parameters
    ----------
    rawtext : str
        Text as read from a transcription file; it is passed through
        ``text_cleaner`` first.
    segment_len : int, optional
        Window size in characters used for splitting long texts.
        NOTE(review): the default of 800000 presumably keeps segments below
        spaCy's default ``nlp.max_length`` of 1,000,000 characters - confirm.

    Returns
    -------
    spacy.tokens.Doc
        The parsed document covering the whole cleaned text.
    """
    cleantext = text_cleaner(rawtext)
    if len(cleantext) <= segment_len:
        return nlp(cleantext)
    segment_docs = [nlp(segment) for segment in _split_at_sentence_ends(cleantext, segment_len)]
    return Doc.from_docs(segment_docs)

In [30]:
# preview the cleaned version of the test text loaded above (`text`);
# `rawtext` is only defined in a later cell, so using it here fails on a fresh kernel
text_cleaner(text)[:5000]

"Ad Iacobi Avberti Vindonis De Ortv Et Cavsis Metallorvm Contra Chymicos Explicationem Iosephi Qvercetani Armeniaci, D. Medici breuis Responsio. Eivsdem De Exqvisita Mineralium, Animalium, & Vegetabilium medicamentorum Spagyrica praeparatione & vsu, perspicua Tractatio. Lvgdvni, Apud loannem Lertotium. M. D. Lxxv. Qvadrin. [f] En nostre estat au vostre tout contraire, Si nous soufflons, vous humez d'autrepart: Or sus enfans, de ces deux poincts de l'art, Iugez lequel est plus seant de faire. [/f] Virtvtis Comes Invidia. Generosis. Simo Ac Splendidissimo viro Jacobo de la Fin, Regii ordinis Equiti Aurato, eiusque Nobili cubiculario, D. de la Fin lanocle, Pluuiers, Baroni d Aubusson, &c. Josephus Quercetanus. S. Memoriae proditum est, Pythagoram, hominum vitam dixisse consimilem sibi videri eius Panegyris ac mercatus Graeciae nobilissimi, quò nonnulli certandi, alii emendi & vendendi, alii verò spectandi tantùm causa se conferebant: Philosophos autem eos esse A. ii. Epistola. qui neque p

In [27]:
# testing spaCy on the first text: read it, then clean + parse it in one go
fn = fns[0][1]
# decode explicitly instead of relying on the platform's default encoding
# NOTE(review): assumes the transcription files are UTF-8 encoded - confirm.
with open(fn, "r", encoding="utf-8") as f:
    rawtext = f.read()
doc = from_rawtext_to_doc(rawtext)

In [31]:
# Create the output folder for the per-text sentence data. An already existing
# folder is fine; any other problem (e.g. missing permissions) is raised here
# instead of being silently swallowed and surfacing later as a failed write.
os.makedirs("../data/sents_data", exist_ok=True)

In [33]:
%%time
for id, fn in fns:
    with open(fn, "r") as f:
        rawtext = f.read()
    doc = from_rawtext_to_doc(rawtext)
    doc_sentdata = [(sent.text, [(t.text, t.lemma_.lower(), t.pos_, (t.idx - sent[0].idx, t.idx - sent[0].idx + len(t))) for t in sent]) for sent in doc.sents]
    with open("../data/sents_data/" + id + ".pickle", "wb") as f:
        pickle.dump(doc_sentdata, f)

CPU times: user 7min 13s, sys: 15.8 s, total: 7min 28s
Wall time: 7min 35s


In [34]:
# Extract the lemmatized sentences

In [None]:
# NOTE: this rebinds `fns`. Earlier in the notebook it held (id, path) tuples of
# the raw transcriptions; from here on it holds the names of the entries found
# in ../data/sents_data/ (os.listdir returns them in arbitrary order).
fns = os.listdir("../data/sents_data/")
fns[:5]

In [None]:
# For every pickled text, write one line per sentence containing only the
# lemmata of its content words (nouns, proper nouns, adjectives, verbs).
sents_dir = "../data/sents_data/"
lemmatized_dir = "../data/lemmatized_sents/"
content_pos = {"NOUN", "PROPN", "ADJ", "VERB"}

# an existing folder is fine; any other error is raised instead of swallowed
os.makedirs(lemmatized_dir, exist_ok=True)

# The folder is listed here (sorted, for a reproducible order) rather than read
# from the global `fns`, whose content depends on which cell was run last.
for fn in sorted(os.listdir(sents_dir)):
    if not fn.endswith(".pickle"):
        # skip stray entries such as .DS_Store or checkpoint folders
        continue
    # pickle can execute arbitrary code on load: only read files written by this notebook
    with open(sents_dir + fn, "rb") as f:
        sents_data = pickle.load(f)
    # each item is (sentence text, [(wordform, lemma, POS tag, position), ...])
    lemmatized_sents = [
        " ".join(lemma for wordform, lemma, tag, position in sent_data if tag in content_pos) + "\n"
        for sent_text, sent_data in sents_data
    ]
    out_name = os.path.splitext(fn)[0] + ".txt"
    # lemmata may contain non-ASCII characters, so fix the output encoding
    with open(lemmatized_dir + out_name, "w", encoding="utf-8") as f:
        f.writelines(lemmatized_sents)