In [9]:
import os
import spacy
from spacy.tokens import Doc
import sddk
import pandas as pd
import re
import matplotlib.pyplot as plt
import json
import shutil
import pickle

import gspread
from gspread_dataframe import get_as_dataframe, set_with_dataframe
from google.oauth2 import service_account # based on google-auth library

In [10]:
# Load the Latin spaCy pipeline (see the previous script).
# The 'la_core_web_lg' package has to be available in the current environment.
nlp = spacy.load('la_core_web_lg')

In [11]:
# Set up communication with the Google Sheet "noscemus_overview".
# To make this work, you need your service account key JSON file and have to point to it below.
# The path used here is "~/ServiceAccountsKey.json" (home directory); keep the key out of
# version control (e.g. via .gitignore) so that it stays invisible to others.
# (1) read the key file; the context manager closes the file handle as soon as it is parsed
with open(os.path.expanduser("~/ServiceAccountsKey.json")) as key_file:
    file_data = json.load(key_file)
# (2) transform the content into a credentials object
credentials = service_account.Credentials.from_service_account_info(file_data)
# (3) specify your usage of the credentials
scoped_credentials = credentials.with_scopes(['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive'])
# (4) use the constrained credentials for authentication of the gspread package
gc = gspread.Client(auth=scoped_credentials)

# open the spreadsheet by its URL
noscemus_gs = gc.open_by_url("https://docs.google.com/spreadsheets/d/1ekf9RBfd4yqy0p0IWJ4SDk1kUT70hnoIVX1H6KPdIts/edit?usp=sharing")

# Load & Explore spaCy Docs

In [13]:
# Folder that holds the "<id>.pickle" files with the sentence data.
source_path = "/srv/data/tome/noscemus/sents_data/"
# os.listdir() returns the names in arbitrary order, so a given position in
# this list is not guaranteed to point to the same file on another machine.
filenames_list = os.listdir(path=source_path)
# preview the first ten file names
filenames_list[0:10]

['725075.pickle',
 '928138.pickle',
 '985903.pickle',
 '733505.pickle',
 '739101.pickle',
 '702145.pickle',
 '906214.pickle',
 '902259.pickle',
 '901017.pickle',
 '904418.pickle']

In [15]:
# Load an individual file (the one at position 20 of the directory listing).
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only use it on files from a trusted source.
# The context manager makes sure the file handle is closed after loading.
with open(os.path.join(source_path, filenames_list[20]), "rb") as pickle_file:
    f_sents_data = pickle.load(pickle_file)
# Peek at a few entries. Judging from the recorded output, each entry is a pair of
# (sentence text, list of (token, lemma, POS tag, (start, end)) tuples).
f_sents_data[110:115]

[('neque tamen despondet animus, quin (unis post alios eadem philosophandi methodo insistentibus, tandem cum bonis auspiciis ueritas (quatenus humana feret tenuitas) reseratis indies nouis naturae mysteriis & discussis errorum tenebris, integra apparuerit, taedaeque nuptiales accendantur.',
  [('neque', 'neque', 'CCONJ', (0, 5)),
   ('tamen', 'tamen', 'ADV', (6, 11)),
   ('despondet', 'despondeo', 'VERB', (12, 21)),
   ('animus', 'animus', 'NOUN', (22, 28)),
   (',', ',', 'PUNCT', (28, 29)),
   ('quin', 'quin', 'SCONJ', (30, 34)),
   ('(', '(', 'PUNCT', (35, 36)),
   ('unis', 'unus', 'NOUN', (36, 40)),
   ('post', 'post', 'ADP', (41, 45)),
   ('alios', 'alius', 'DET', (46, 51)),
   ('eadem', 'idem', 'DET', (52, 57)),
   ('philosophandi', 'philosopho', 'VERB', (58, 71)),
   ('methodo', 'methodus', 'NOUN', (72, 79)),
   ('insistentibus', 'insisto', 'VERB', (80, 93)),
   (',', ',', 'PUNCT', (93, 94)),
   ('tandem', 'tandem', 'ADV', (95, 101)),
   ('cum', 'cum', 'SCONJ', (102, 105)),
   ('

In [16]:
# number of entries found in source_path
len(filenames_list)

1007

In [7]:
# NOTE: disabled helper, kept for reference only. It would read a JSON dump and rebuild
# a spaCy Doc from it. `target_folder_name` and `s` are not defined in the visible part
# of this notebook (presumably `s` is an sddk session - TODO confirm before re-enabling).
#def filename_to_doc(filename, dir=target_folder_name):
#    doc_json = s.read_file(dir + "/" + filename, "dict")
#    doc = Doc(nlp.vocab).from_json(doc_json)
#    return doc

In [20]:
# NOTE: disabled timing run of the (also disabled) filename_to_doc helper;
# `jsonfiles_list` is not defined in the visible part of this notebook.
#%%time
#doc = filename_to_doc(jsonfiles_list[1])

In [19]:
# number of entries found in source_path (same check as in an earlier cell)
len(filenames_list)

1007

# Preliminary explorations of Latin WordNet

In [138]:
# Latin WordNet table, read as CSV straight from the CIRCSE "latinWordnet-revision" GitHub repository
latinwn = pd.read_csv(
    filepath_or_buffer="https://raw.githubusercontent.com/CIRCSE/latinWordnet-revision/master/LiLa_LatinWordnet.csv"
)

In [139]:
# preview the first five rows of the Latin WordNet table
latinwn.head(5)

Unnamed: 0,id,lemma,type,lila_uri,id_synset,definition
0,90942,"a, aa",LEMMA,http://lila-erc.eu/data/id/lemma/90942,http://wordnet-rdf.princeton.edu/wn30/06831177-n,the 1st letter of the Roman alphabet
1,86826,abactio,LEMMA,http://lila-erc.eu/data/id/lemma/86826,http://wordnet-rdf.princeton.edu/wn30/00391599-n,the act of removing
2,86828,abactor,LEMMA,http://lila-erc.eu/data/id/lemma/86828,http://wordnet-rdf.princeton.edu/wn30/10544480-n,someone who steals livestock (especially cattle)
3,91165,abactus,LEMMA,http://lila-erc.eu/data/id/lemma/91165,http://wordnet-rdf.princeton.edu/wn30/00780889-n,the act of taking something from someone unlaw...
4,86833,abaculus,LEMMA,http://lila-erc.eu/data/id/lemma/86833,http://wordnet-rdf.princeton.edu/wn30/04435180-n,a flat thin rectangular slab (as of fired clay...
