# Spacy

In [None]:
###  pip install https://huggingface.co/latincy/la_core_web_lg/resolve/main/la_core_web_lg-any-py3-none-any.whl

In [1]:
import spacy
from spacy.tokens import Doc
from spacy.language import Language
import sddk
import pandas as pd
import re

In [2]:
spacy.prefer_gpu()

True

In [3]:
nlp = spacy.load('la_core_web_lg')

In [4]:
nlp.max_length

1000000

In [5]:
# Load the table that maps each text id to its list of source filenames.
# The CSV still carries two leftover index columns ("Unnamed: 0.1", "Unnamed: 0").
ids_filenames_df = pd.read_csv("../data/ids_filenames_df.csv")
ids_filenames_df.head(5)

Unnamed: 0.1,Unnamed: 0,id,filenames_list
0,0,1031760,"['Bacon,_Francis_-_Instauratio_magna__London_1..."
1,1,1085290,"['Linden,_Johannes_Antonides_van_der_-_Lindeni..."
2,2,1285853,"['de_Conde,_Ioannes_Baptista_-_Aphorismi_seu_a..."
3,3,1285854,"['van_Poort,_Henricus_-_Hippocratis_Aphorismi_..."
4,4,1285855,"['Hippocrates_&_Denisot,_Gérard_-_Hippocratis_..."


In [6]:
# Look up the filenames recorded for a single text id (one .loc call instead of chained indexing)
ids_filenames_df.loc[ids_filenames_df["id"] == 1085290, "filenames_list"]

1    ['Linden,_Johannes_Antonides_van_der_-_Lindeni...
Name: filenames_list, dtype: object

In [7]:
# Index the table by text id. Re-assigning the result (instead of inplace=True)
# and checking the current index first keeps this cell safe to re-run: a second
# run would otherwise raise KeyError because "id" is no longer a column.
if ids_filenames_df.index.name != "id":
    ids_filenames_df = ids_filenames_df.set_index("id")
ids_filenames_df.head(5)

Unnamed: 0_level_0,Unnamed: 0,filenames_list
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1031760,0,"['Bacon,_Francis_-_Instauratio_magna__London_1..."
1085290,1,"['Linden,_Johannes_Antonides_van_der_-_Lindeni..."
1285853,2,"['de_Conde,_Ioannes_Baptista_-_Aphorismi_seu_a..."
1285854,3,"['van_Poort,_Henricus_-_Hippocratis_Aphorismi_..."
1285855,4,"['Hippocrates_&_Denisot,_Gérard_-_Hippocratis_..."


### Spacy test

In [8]:
vitruvius = "Architecti est scientia pluribus disciplinis et variis eruditionibus ornata, quae ab ceteris artibus perficiuntur. Opera ea nascitur et fabrica et ratiocinatione."

In [9]:
doc = nlp(vitruvius)

In [10]:
for token in doc:
    print(token.text, token.pos_, token.lemma_)

Architecti PROPN Architectus
est AUX sum
scientia NOUN scientia
pluribus ADJ multus
disciplinis NOUN disciplina
et CCONJ et
variis ADJ uarius
eruditionibus NOUN eruditio
ornata VERB orno
, PUNCT ,
quae PRON qui
ab ADP ab
ceteris DET ceterus
artibus NOUN ars
perficiuntur VERB perficio
. PUNCT .
Opera NOUN Opus
ea PRON is
nascitur VERB nascor
et CCONJ et
fabrica NOUN fabrica
et CCONJ et
ratiocinatione NOUN ratiocinatio
. PUNCT .


In [11]:
# One list of lemmata per sentence, keeping only nouns, verbs and adjectives
all_sents_lemmata = [
    [token.lemma_ for token in sent if token.pos_ in ["NOUN", "VERB", "ADJ"]]
    for sent in doc.sents
]

In [12]:
all_sents_lemmata

[['scientia',
  'multus',
  'disciplina',
  'uarius',
  'eruditio',
  'orno',
  'ars',
  'perficio'],
 ['Opus', 'nascor', 'fabrica', 'ratiocinatio']]

# Apply the spaCy model to Noscemus

In [13]:
# Open a session on the sciencedata.dk shared folder TOME/DATA/NOSCEMUS;
# all later s.list_filenames / s.read_file / s.write_file calls go through it.
s = sddk.cloudSession(provider="sciencedata.dk", shared_folder_name="TOME/DATA/NOSCEMUS", owner="kase@zcu.cz")

connection with shared folder established with you as its ordinary user
endpoint variable has been configured to: https://sciencedata.dk/sharingout/kase%40zcu.cz/TOME/DATA/NOSCEMUS/


In [14]:
# load metadata

In [15]:
# list the raw .txt files in "noscemus_raw"; each filename is "<id>.txt"
filenames_list = s.list_filenames("noscemus_raw", "txt")

In [16]:
filenames_list[:10]

['1031760.txt',
 '1085290.txt',
 '1285853.txt',
 '1285854.txt',
 '1285855.txt',
 '1285856.txt',
 '1365811.txt',
 '1370560.txt',
 '1378359.txt',
 '1424044.txt']

In [17]:
[fn for fn in filenames_list if "_" in fn]

[]

In [18]:
# text ids = filenames with everything from the first "." onwards removed
ids = [name.split(".", 1)[0] for name in filenames_list]

In [19]:
jsonfiles_list = s.list_filenames("noscemus_spacy_jsons", "json")

In [20]:
jsonfiles_list

['1031760.json']

In [21]:
filenames_list[1]

'1085290.txt'

# Text cleaning

In [22]:
filename = filenames_list[1]
rawtext = s.read_file("noscemus_raw/" + filename, "str")

In [23]:
#@Language.component("text_cleaner")
#def text_cleaner(rawtext):
#    for token in doc:
#        token.norm_ = token.norm_.replace("¬\n", "").replace("\n", " ").replace("ß", "ss").replace("ij","ii")
#    return doc

In [24]:
#nlp.add_pipe("text_cleaner", after="normer") 
#nlp.pipeline                                                          

In [25]:
def text_cleaner(rawtext):
    """Normalise a raw text before it is handed to the spaCy pipeline.

    Steps, in order:

    1. Re-join words hyphenated across a line break with "¬" and turn the
       remaining line breaks into spaces.
    2. Split on whitespace and re-join with single spaces, which also strips
       leading/trailing whitespace and collapses whitespace runs.
    3. Lower-case every token except its first character, so that words set
       in capitals ("MEDICIS") become "Medicis".
    4. Normalise spelling: "ß" -> "ss" and "ij" -> "ii". This runs *after*
       the case normalisation so that "IJ" inside capitalised words
       ("FILIJ" -> "Filij" -> "Filii") is covered as well.

    Parameters
    ----------
    rawtext : str
        The text as read from the source file.

    Returns
    -------
    str
        The cleaned text; an empty string if the input has no tokens.
    """
    joined = rawtext.replace("¬\n", "").replace("\n", " ")
    # str.split() never yields empty tokens, so t[0] is always safe here
    tokens = [t[0] + t[1:].lower() for t in joined.split()]
    cleantext = " ".join(tokens)
    return cleantext.replace("ß", "ss").replace("ij", "ii")

In [26]:
cleantext = text_cleaner(rawtext)

In [27]:
cleantext[:10000]

'Georg Abraham Mercklini Lindenius Renovatus De Scriptis Medicis Lindenius Renqvatus, Ve S1 Johannis Antonidae van der Linden De Scria Iaumedicis Libridvo. Uorum Prior, Omnium, Tam C Veterum, quàm Recentiorum, Latino idiomate, typis unquam expressorum Scriptorum Medicorum, consummatissimum Catalogum continet; quo indicatur, quid singuli Authores scripserint: nec non ubi, quâ formâ, & quo tempore, omnes eorum Scriptorum Editiones excusae prostent: Posterior verò Cynosuram Medicam, sive, Rerum & Materiarum Indicem, omnium Titulorum vel Thematum Medicorum potiorum Communia Alphabetico hâcque novâ demum Editione primùm adornato ordine suis Lglicita comprehendentem exhibet, ut inquirenti, quicquid desideraverit, velut digito, in multiplicem usum, clarissimè monstretur: Noviter Praeter Haec Addita Plurimorum Authorum, quotquot nempe habere licuit, Vitae Curriculorum succinctâ Descriptione: Adscita undique ab exteris Medicis subsidiariâ ope, propriâque ultra decennium adhibitâ singulari operâ

In [28]:
doc = nlp(cleantext[:10000])
doc

Georg Abraham Mercklini Lindenius Renovatus De Scriptis Medicis Lindenius Renqvatus, Ve S1 Johannis Antonidae van der Linden De Scria Iaumedicis Libridvo. Uorum Prior, Omnium, Tam C Veterum, quàm Recentiorum, Latino idiomate, typis unquam expressorum Scriptorum Medicorum, consummatissimum Catalogum continet; quo indicatur, quid singuli Authores scripserint: nec non ubi, quâ formâ, & quo tempore, omnes eorum Scriptorum Editiones excusae prostent: Posterior verò Cynosuram Medicam, sive, Rerum & Materiarum Indicem, omnium Titulorum vel Thematum Medicorum potiorum Communia Alphabetico hâcque novâ demum Editione primùm adornato ordine suis Lglicita comprehendentem exhibet, ut inquirenti, quicquid desideraverit, velut digito, in multiplicem usum, clarissimè monstretur: Noviter Praeter Haec Addita Plurimorum Authorum, quotquot nempe habere licuit, Vitae Curriculorum succinctâ Descriptione: Adscita undique ab exteris Medicis subsidiariâ ope, propriâque ultra decennium adhibitâ singulari operâ 

# working with large files - development

In [30]:
cleantext = cleantext[:380000]

In [31]:
# Process the text in segments of roughly `segment_len` characters.
# Every segment except the last is cut after its last ". " so that no sentence
# is split; the remainder is carried over and prepended to the next segment.
segment_docs = []
segment_len = 100000
if len(cleantext) > segment_len:
    next_segment_beginning = ""  # text after the last ". " of the previous segment
    for n in range(0, len(cleantext), segment_len):
        if n > 0:
            print(n)  # progress: offset of the segment being processed
        segment = cleantext[n:n+segment_len]
        if n + segment_len < len(cleantext):
            parts = segment.rpartition(". ")
            current_segment = next_segment_beginning + parts[0] + parts[1]
            next_segment_beginning = parts[2]
        else:
            # Last segment: it must include the carried-over text, otherwise
            # the tail of the previous segment is silently dropped.
            current_segment = next_segment_beginning + segment
            next_segment_beginning = ""
        if current_segment:
            segment_docs.append(nlp(current_segment))
    doc = Doc.from_docs(segment_docs)
else:
    doc = nlp(cleantext)

100000
200000
300000


In [32]:
# Merge the per-segment Docs into one Doc. This repeats the merge done at the
# end of the segmentation cell and expects `segment_docs` to be non-empty.
doc = Doc.from_docs(segment_docs)

In [33]:
cleantext[199900:200100]

'nica Roberti Boylei, de Vi Aeris elastico, & ejusdem effectibus; quibus Observata illius rationibus Philosophicis, omni Vacuum, ipsumque elaterem Aeris Pecquetianum arcentibus, illustrantur. Gröningae'

In [34]:
doc.text[199900:200100]

'nica Roberti Boylei, de Vi Aeris elastico, & ejusdem effectibus; quibus Observata illius rationibus Philosophicis, omni Vacuum, ipsumque elaterem Aeris Pecquetianum arcentibus, illustrantur. Gröningae'

In [36]:
# let's encapsulate the cleaning and spacy pipeline application into one function
def from_rawtext_to_doc(rawtext, segment_len=800000):
    """Clean a raw text and return it as a single spaCy ``Doc``.

    The text is normalised with ``text_cleaner``. If the result is longer than
    ``segment_len`` characters it is processed in segments, which keeps each
    pipeline call below ``nlp.max_length`` (1,000,000 for the loaded model).
    Every segment except the last is cut after its last ". " so that no
    sentence is split; the remainder is prepended to the next segment. The
    per-segment docs are merged with ``Doc.from_docs``.

    Parameters
    ----------
    rawtext : str
        The text as read from the source file.
    segment_len : int, optional
        Number of characters read from the text per segment (default 800000).

    Returns
    -------
    spacy.tokens.Doc
        The parsed document covering the whole cleaned text.
    """
    cleantext = text_cleaner(rawtext)
    if len(cleantext) <= segment_len:
        return nlp(cleantext)

    segment_docs = []
    carry_over = ""  # text after the last ". " of the previous segment
    for n in range(0, len(cleantext), segment_len):
        if n > 0:
            print(n)  # progress: offset of the segment being processed
        segment = cleantext[n:n+segment_len]
        if n + segment_len < len(cleantext):
            head, sep, tail = segment.rpartition(". ")
            current_segment = carry_over + head + sep
            carry_over = tail
        else:
            # Last segment: include the carried-over text. Parsing the bare
            # segment here would silently drop the tail of the previous one.
            current_segment = carry_over + segment
            carry_over = ""
        if current_segment:
            segment_docs.append(nlp(current_segment))
    return Doc.from_docs(segment_docs)

# Applying the function

In [37]:
# input text files
filenames_list = s.list_filenames("noscemus_raw", "txt")

In [38]:
# output jsonfiles
target_folder_name = "noscemus_spacyjsons_v1"
jsonfiles_list = s.list_filenames(target_folder_name, "json")
jsonfiles_list[:10]

[]

In [39]:
len(jsonfiles_list)

0

In [None]:
%%time
%%capture
for n, filename in enumerate(filenames_list):
    if n in range(0, len(filenames_list), 50):
        print(n)
    try:
        new_filename = filename.partition(".")[0] + ".json"
        if new_filename not in jsonfiles_list:
            rawtext = s.read_file("noscemus_raw/" + filename, "str")
            doc = from_rawtext_to_doc(rawtext)
            doc_json = doc.to_json()
            s.write_file(target_folder_name + "/" + new_filename, doc_json)
    except:
        pass

In [51]:
jsonfiles_list

['1031760.json']

In [92]:
# One list per sentence holding (surface form, lemma) pairs for every token
all_sents_lemmata = [
    [(token.text, token.lemma_) for token in sent]
    for sent in doc.sents
]

In [93]:
all_sents_lemmata[100:120]

[[('Adeò', 'Adeò'),
  ('vt', 'vt'),
  ('Tempus', 'Tempus'),
  (',', ','),
  ('tanquàm', 'tanquàm'),
  ('fluuius', 'fluuius'),
  (',', ','),
  ('leuia', 'leuis'),
  ('&', '&'),
  ('inflata', 'inflo'),
  ('ad', 'ad'),
  ('nos', 'nos'),
  ('deuexerit', 'deuixio'),
  (',', ','),
  ('grauia', 'grauis'),
  ('&', '&'),
  ('solida', 'solidus'),
  ('demerserit', 'demerserit'),
  ('.', '.')],
 [('Quin', 'quin'),
  ('&', '&'),
  ('illi', 'ille'),
  ('ipsi', 'ipse'),
  ('authores', 'author'),
  (',', ','),
  ('qui', 'qui'),
  ('dictaturam', 'dictatura'),
  ('quandam', 'quidam'),
  ('in', 'in'),
  ('Scientiis', 'Scientia'),
  ('inuaserunt', 'inuasero'),
  (',', ','),
  ('&', '&'),
  ('tantà', 'tantà'),
  ('confidentia', 'confidentia'),
  ('de', 'de'),
  ('rebus', 'res'),
  ('pronuntiant', 'pronuntio'),
  (';', ';'),
  ('cum', 'cum'),
  ('tamen', 'tamen'),
  ('per', 'per'),
  ('interualla', 'interuallum'),
  ('ad', 'ad'),
  ('se', 'se'),
  ('redeunt', 'redeo'),
  (',', ','),
  ('ad', 'ad'),
  ('quer

In [None]:
# Forms the model lemmatises incorrectly, mapped to the lemma(ta) we expect.
# Several acceptable spellings of one lemma are listed together (u/v variants).
expected_lemmata = {
    "ingeni": ("ingenium",),
    "deuexerit": ("deueho", "deveho"),
    "demerserit": ("demergo",),
}
expected_lemmata

In [96]:
nlp("devexerit").to_json()

{'text': 'devexerit',
 'ents': [],
 'sents': [{'start': 0, 'end': 9}],
 'tokens': [{'id': 0,
   'start': 0,
   'end': 9,
   'tag': 'verb',
   'pos': 'VERB',
   'morph': 'Mood=Ind|Number=Sing|Person=3|Tense=Fut|Verbform=Fin|Voice=Act',
   'lemma': 'devicio',
   'dep': 'ROOT',
   'head': 0}]}