# Spacy

In [None]:
###  pip install https://huggingface.co/latincy/la_core_web_lg/resolve/main/la_core_web_lg-any-py3-none-any.whl

In [1]:
import spacy
import sddk
import pandas as pd

In [2]:
nlp = spacy.load('la_core_web_lg')

In [3]:
nlp.max_length = 10000000

### Spacy test

In [4]:
vitruvius = "Architecti est scientia pluribus disciplinis et variis eruditionibus ornata, quae ab ceteris artibus perficiuntur. Opera ea nascitur et fabrica et ratiocinatione."

In [5]:
doc = nlp(vitruvius)

In [6]:
for token in doc:
    print(token.text, token.pos_, token.lemma_)

Architecti PROPN Architectus
est AUX sum
scientia NOUN scientia
pluribus ADJ multus
disciplinis NOUN disciplina
et CCONJ et
variis ADJ uarius
eruditionibus NOUN eruditio
ornata VERB orno
, PUNCT ,
quae PRON qui
ab ADP ab
ceteris DET ceterus
artibus NOUN ars
perficiuntur VERB perficio
. PUNCT .
Opera NOUN Opus
ea PRON is
nascitur VERB nascor
et CCONJ et
fabrica NOUN fabrica
et CCONJ et
ratiocinatione NOUN ratiocinatio
. PUNCT .


In [7]:
all_sents_lemmata = []
for sent in doc.sents:
    sent_lemmata = []
    for token in sent:
        if token.pos_ in ["NOUN", "VERB", "ADJ"]:
            sent_lemmata.append(token.lemma_)
    all_sents_lemmata.append(sent_lemmata)

In [8]:
all_sents_lemmata

[['scientia',
  'multus',
  'disciplina',
  'uarius',
  'eruditio',
  'orno',
  'ars',
  'perficio'],
 ['Opus', 'nascor', 'fabrica', 'ratiocinatio']]

# Apply spacy model on nocsemus

In [9]:
s = sddk.cloudSession(provider="sciencedata.dk", shared_folder_name="TOME/DATA/NOSCEMUS", owner="kase@zcu.cz")

connection with shared folder established with you as its ordinary user
endpoint variable has been configured to: https://sciencedata.dk/sharingout/kase%40zcu.cz/TOME/DATA/NOSCEMUS/


In [10]:
nlp.max_length

10000000

In [None]:
# load metadata

In [12]:
# extract a list of ids for iteration
filenames_list = s.list_filenames("noscemus_raw", "txt")

In [13]:
filenames_list[:10]

['1031760.txt',
 '1085290.txt',
 '1285853.txt',
 '1285854.txt',
 '1285855.txt',
 '1285856.txt',
 '1365811.txt',
 '1370560.txt',
 '1378359.txt',
 '1424044.txt']

In [20]:
[fn for fn in filenames_list if "_" in fn]

[]

In [14]:
ids = [fn.partition(".")[0] for fn in filenames_list]

In [23]:
%%time
# test with 10 documents
docs_json = []
for filename in filenames_list[:10]:
    rawtext = s.read_file("noscemus_raw/" + filename, "str")
    doc = nlp(rawtext)
    doc_json = doc.to_json()
    docs_json.append(doc_json)

CPU times: user 5min 11s, sys: 37.7 s, total: 5min 48s
Wall time: 9min 1s


In [26]:
doc = nlp("")
doc = doc.from_json(docs_json[0])

In [27]:
all_sents_lemmata = []
for sent in doc.sents:
    sent_lemmata = []
    for token in sent:
        if token.pos_ in ["NOUN", "VERB", "ADJ"]:
            sent_lemmata.append(token.lemma_)
    all_sents_lemmata.append(sent_lemmata)

In [29]:
all_sents_lemmata[30:50]

[['\n\n\no', '\n\n'],
 ['PRINCIPI',
  '\n',
  'ILOBO',
  'GRATIA',
  'MAGNAE',
  'BRITANNIAE',
  'Regi',
  'FIDEI',
  'defensori',
  '\n',
  'OTERIT',
  'Maiestas',
  'Tua',
  'furtius',
  '\n\n\n\n',
  'incuso',
  'quòd',
  '\n\n',
  'f',
  'Temporis',
  '\n\n\n\n\n',
  'sufficio'],
 ['S', '\n\n\n', 'Tuus', 'suffurao'],
 ['\n\n\no', 'habeo', 'dico'],
 ['Tem¬',
  'f',
  'pos',
  'facio',
  'Resti¬s',
  '\n\n',
  '\n',
  'ME',
  '\n',
  'tutio',
  'fortè',
  'detraco',
  'Tempus',
  'Res',
  'Tuus',
  'Nomen',
  'Tui',
  'Ho¬',
  'nor',
  '\n\n\no',
  'nor',
  'Saeculum',
  'Tuus',
  'repono',
  'possum',
  'modò',
  'alicu¬',
  'ius',
  'pretij'],
 ['certè', 'nouus'],
 ['to¬',
  'to',
  'genus',
  'describo',
  'uetus',
  'emplo',
  'Mundus',
  'scilicèt',
  'naturâ',
  'Rerum',
  '\n',
  'Mens'],
 ['certè',
  'ingenuè',
  'fatear',
  'soleo',
  'aestimo',
  'opus',
  'partus',
  'Tempus',
  'quàm'],
 ['solummodò', 'mi¬', 'rabilis'],
 ['Reus', 'tantus', '\n', 'runt', 'mens', 'uenio', '

In [None]:
%%time
# test with savings
for filename in filenames_list[:30]:
    rawtext = s.read_file("noscemus_raw/" + filename, "str")
    doc = nlp(rawtext)
    doc_json = doc.to_json()
    s.write_file("noscemus_spacy_jsons/" + filename.partition(".")[0] + ".json", doc_json)

Your <class 'dict'> object has been succesfully written as "https://sciencedata.dk/sharingout/kase%40zcu.cz/TOME/DATA/NOSCEMUS/noscemus_spacy_jsons/1031760.txt.json"


In [16]:
jsonfiles_list = s.list_filenames("noscemus_spacy_jsons", "json")

In [17]:
jsonfiles_list

['1031760.json']

1085290.json
1285853.json
1285854.json
1285855.json
1285856.json
1365811.json
1370560.json
1378359.json
1424044.json
1461594.json
1479057.json
1509197.json
1509290.json
1526071.json
1528734.json
1567826.json
597675.json
597737.json
597799.json
598104.json
598116.json
598518.json
599651.json
599653.json
599722.json
599723.json
599724.json
599725.json
599726.json


In [None]:
%%time
%%capture
for filename in filenames_list[:30]:
    try:
        new_filename = filename.partition(".")[0] + ".json"
        if new_filename not in jsonfiles_list:
            rawtext = s.read_file("noscemus_raw/" + filename, "str")
            doc = nlp(rawtext)
            doc_json = doc.to_json()
            s.write_file("noscemus_spacy_jsons/" + new_filename, doc_json)
    except:
        pass