In [3]:
import os
import spacy
from spacy.tokens import Doc
import sddk
import pandas as pd
import re
import matplotlib.pyplot as plt
import json
import shutil

import gspread
from gspread_dataframe import get_as_dataframe, set_with_dataframe
from google.oauth2 import service_account # based on google-auth library

In [4]:
# load the language model (see the previous script)
nlp = spacy.load('la_core_web_lg')

In [5]:
# setup communication with gsheet "noscemus_overview
# to make this work, you need your ServiceAccountKey.json file located somewhere and to point out to it
# I have it in the data folder and in the .gitignore file, what makes it invisible to others; you can make the same
file_data = json.load(open("../data/ServiceAccountsKey.json"))
# (2) transform the content into crendentials object
credentials = service_account.Credentials.from_service_account_info(file_data)
# (3) specify your usage of the credentials
scoped_credentials = credentials.with_scopes(['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive'])
# (4) use the constrained credentials for authentication of gspread package
gc = gspread.Client(auth=scoped_credentials)

noscemus_gs = gc.open_by_url("https://docs.google.com/spreadsheets/d/1ekf9RBfd4yqy0p0IWJ4SDk1kUT70hnoIVX1H6KPdIts/edit?usp=sharing")

# Load & Explore Spacy Docs

In [6]:
s = sddk.cloudSession(provider="sciencedata.dk", shared_folder_name="TOME/DATA/NOSCEMUS", owner="kase@zcu.cz")

connection with shared folder established with you as its ordinary user
endpoint variable has been configured to: https://sciencedata.dk/sharingout/kase%40zcu.cz/TOME/DATA/NOSCEMUS/


In [7]:
target_folder_name = "noscemus_spacyjsons_v1"
jsonfiles_list = s.list_filenames(target_folder_name, "json")
jsonfiles_list[:10]

['1031760.json',
 '1085290.json',
 '1285853.json',
 '1285854.json',
 '1285855.json',
 '1285856.json',
 '1365811.json',
 '1370560.json',
 '1378359.json',
 '1424044.json']

In [8]:
len(jsonfiles_list)

1007

In [9]:
def filename_to_doc(filename, dir=target_folder_name):
    doc_json = s.read_file(dir + "/" + filename, "dict")
    doc = Doc(nlp.vocab).from_json(doc_json)
    return doc

In [None]:
doc = filename_to_doc(jsonfiles_list[1])

In [40]:
doc_data = {}
doc_data["characters_n"] = len(doc.text)
doc_data["tokens_n"] = len([t for t in doc])

In [41]:
doc_data

{'characters_n': 2930782, 'tokens_n': 572944}

In [42]:
for ent in doc.ents[:20]:
    print(ent.text)

Georg Abraham
Lindenius
Renovatus
Medicis Lindenius
Renqvatus
Johannis
Antonidae
Linden
Catalogum
Authores
Cynosuram
Rerum
Thematum
Vitae
Adscita
Medicis
Anne
Lxii
Continuati
Norimberg


In [43]:
all_lemmata = []
for token in doc:
    all_lemmata.append((token.text, token.lemma_, token.pos_))

In [44]:
all_lemmata[200:400]

[('Ord.', 'Oard.s', 'VERB'),
 ('&', '&', 'PUNCT'),
 ('Academic', 'academic', 'VERB'),
 ('.', '.', 'PUNCT'),
 ('Curios', 'Curii', 'NOUN'),
 ('.', '.', 'PUNCT'),
 ('dict', 'dict', 'VERB'),
 ('.', '.', 'PUNCT'),
 ('Chiron', 'Chiron', 'PROPN'),
 ('.', '.', 'PUNCT'),
 ('Cum', 'cum', 'SCONJ'),
 ('Gratia', 'Gratia', 'NOUN'),
 ('&', '&', 'PUNCT'),
 ('Privilegio', 'Privilegium', 'NOUN'),
 ('S.', 'S.', ''),
 ('C.', 'C.arcus', 'PROPN'),
 ('Majest', 'Majest', 'PROPN'),
 ('.', '.', 'PUNCT'),
 ('Ooabvv', 'Ooabvv', 'NUM'),
 ('-', '-', 'PUNCT'),
 ('ii', 'is', 'NUM'),
 ('Norimbergae', 'Norimbergaa', 'PROPN'),
 (',', ',', 'PUNCT'),
 ('Impensis', 'impensis', 'NOUN'),
 ('Johannis', 'Johann', 'PROPN'),
 ('Georgii', 'Georgius', 'PROPN'),
 ('Endteri', 'Endteri', 'PROPN'),
 ('.', '.', 'PUNCT'),
 ('Anno', 'Annus', 'NOUN'),
 ('Christi', 'Christus', 'PROPN'),
 ('M.', 'M.', 'PROPN'),
 ('Dc', 'Dc', 'ADV'),
 ('.', '.', 'PUNCT'),
 ('Lxxxvi', 'Lxxxvi', 'PROPN'),
 ('.', '.', 'PUNCT'),
 ('Reverendissimo', 'Reverendissi

In [45]:
doc_lemmata = [" ".join([t.lemma_.lower() for t in sent if not t.is_punct]) for sent in doc.sents]
doc_postags = [" ".join([t.pos_ for t in sent if not t.is_punct]) for sent in doc.sents]

In [46]:
all_sents_lemmata = []
for sent in doc.sents:
    sent_lemmata = []
    for token in sent:
        #if token.pos_ in ["NOUN", "VERB", "ADJ"]:
        sent_lemmata.append((token.text, token.lemma_, token.tag_))
    all_sents_lemmata.append(sent_lemmata)

In [99]:
try:
    !mkdir ../data/large_data/sents_lemmata
    !mkdir ../data/large_data/sents_pos
except:
    pass

mkdir: ../data/large_data/sents_lemmata: File exists
mkdir: ../data/large_data/sents_pos: File exists


In [47]:
jsonfiles_list[0]

'1031760.json'

In [48]:
fn = jsonfiles_list[0]
doc = filename_to_doc(fn)

In [49]:
%%time
f_lemmata = open("../data/large_data/sents_lemmata/" + fn.replace(".json", ".txt"), "w", encoding="utf-8")
doc_lemmata = [" ".join([t.lemma_.lower() for t in sent if not t.is_punct]) for sent in doc.sents]
f_lemmata.writelines("\n".join(doc_lemmata))

f_postags = open("../data/large_data/sents_pos/" + fn.replace(".json", ".txt"), "w", encoding="utf-8")
doc_postags = [" ".join([t.pos_ for t in sent if not t.is_punct]) for sent in doc.sents]
f_postags.writelines("\n".join(doc_postags))

CPU times: user 69.3 ms, sys: 2.32 ms, total: 71.6 ms
Wall time: 72.7 ms


In [50]:
files_ready = os.listdir("../data/large_data/sents_lemmata")

In [51]:
for fn in jsonfiles_list:
    if fn.replace(".json", ".txt") not in files_ready:
        doc = filename_to_doc(fn)
        f_lemmata = open("../data/large_data/sents_lemmata/" + fn.replace(".json", ".txt"), "w", encoding="utf-8")
        doc_lemmata = [" ".join([t.lemma_.lower() for t in sent if not t.is_punct]) for sent in doc.sents]
        f_lemmata.writelines("\n".join(doc_lemmata))
        f_postags = open("../data/large_data/sents_pos/" + fn.replace(".json", ".txt"), "w", encoding="utf-8")
        doc_postags = [" ".join([t.pos_ for t in sent if not t.is_punct]) for sent in doc.sents]
        f_postags.writelines("\n".join(doc_postags))

In [56]:
# make zip archives of the text files with lemmatazed sentences and pos tags
shutil.make_archive("../data/large_data/sents_lemmata", 'zip', "../data/large_data/sents_lemmata") #
shutil.make_archive("../data/large_data/sents_pos", 'zip', "../data/large_data/sents_pos") #

'/Users/vojtechkase/Projects/noscemus_ETF/data/large_data/sents_pos.zip'

In [65]:
# upload the zip archived data to sciencedata.dk 
s.s.put("https://sciencedata.dk/sharingout/kase%40zcu.cz/TOME/DATA/NOSCEMUS/sents_lemmata.zip", data=open("../data/large_data/sents_lemmata.zip", "rb"))
s.s.put("https://sciencedata.dk/sharingout/kase%40zcu.cz/TOME/DATA/NOSCEMUS/sents_pos.zip", data=open("../data/large_data/sents_pos.zip", "rb"))

<Response [201]>

# Preliminary explorations of Latin wordnet

In [138]:
latinwn = pd.read_csv("https://raw.githubusercontent.com/CIRCSE/latinWordnet-revision/master/LiLa_LatinWordnet.csv")

In [139]:
latinwn.head(5)

Unnamed: 0,id,lemma,type,lila_uri,id_synset,definition
0,90942,"a, aa",LEMMA,http://lila-erc.eu/data/id/lemma/90942,http://wordnet-rdf.princeton.edu/wn30/06831177-n,the 1st letter of the Roman alphabet
1,86826,abactio,LEMMA,http://lila-erc.eu/data/id/lemma/86826,http://wordnet-rdf.princeton.edu/wn30/00391599-n,the act of removing
2,86828,abactor,LEMMA,http://lila-erc.eu/data/id/lemma/86828,http://wordnet-rdf.princeton.edu/wn30/10544480-n,someone who steals livestock (especially cattle)
3,91165,abactus,LEMMA,http://lila-erc.eu/data/id/lemma/91165,http://wordnet-rdf.princeton.edu/wn30/00780889-n,the act of taking something from someone unlaw...
4,86833,abaculus,LEMMA,http://lila-erc.eu/data/id/lemma/86833,http://wordnet-rdf.princeton.edu/wn30/04435180-n,a flat thin rectangular slab (as of fired clay...
