In [1]:
import os
import spacy
from spacy.tokens import Doc
import sddk
import pandas as pd
import re
import matplotlib.pyplot as plt
import json
import shutil
import pickle

import gspread
from gspread_dataframe import get_as_dataframe, set_with_dataframe
from google.oauth2 import service_account # based on google-auth library

In [2]:
# load the Latin spaCy language model (see the previous script);
# its vocab is reused below when Doc objects are rebuilt from JSON
nlp = spacy.load('la_core_web_lg')

In [3]:
# set up communication with the Google Sheet "noscemus_overview"
# to make this work, you need a ServiceAccountsKey.json file stored locally and the path below pointing to it
# (here it sits in the ../data folder and is listed in .gitignore, which keeps it invisible to others; you can do the same)
# (1) read the key file; the context manager closes the file handle as soon as the JSON is parsed
with open("../data/ServiceAccountsKey.json") as key_file:
    file_data = json.load(key_file)
# (2) transform the content into a credentials object
credentials = service_account.Credentials.from_service_account_info(file_data)
# (3) specify your usage of the credentials
scoped_credentials = credentials.with_scopes(['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive'])
# (4) use the constrained credentials for authentication of the gspread package
gc = gspread.Client(auth=scoped_credentials)

noscemus_gs = gc.open_by_url("https://docs.google.com/spreadsheets/d/1ekf9RBfd4yqy0p0IWJ4SDk1kUT70hnoIVX1H6KPdIts/edit?usp=sharing")

# Load & Explore spaCy Docs

In [4]:
# open a session on the shared NOSCEMUS folder at sciencedata.dk; `s` is used below to list and read remote files
s = sddk.cloudSession(provider="sciencedata.dk", shared_folder_name="TOME/DATA/NOSCEMUS", owner="kase@zcu.cz")

connection with shared folder established with you as its owner
endpoint variable has been configured to: https://sciencedata.dk/files/TOME/DATA/NOSCEMUS/


In [5]:
# remote folder holding the JSON files (presumably one serialized spaCy Doc per file - see filename_to_doc below)
target_folder_name = "noscemus_spacyjsons_v1"
# names of the files in that folder selected by the "json" filter
jsonfiles_list = s.list_filenames(target_folder_name, "json")
# peek at the first ten filenames
jsonfiles_list[:10]

['1031760.json',
 '1085290.json',
 '1285853.json',
 '1285854.json',
 '1285855.json',
 '1285856.json',
 '1365811.json',
 '1370560.json',
 '1378359.json',
 '1424044.json']

In [6]:
# how many JSON files were found in the target folder
len(jsonfiles_list)

1007

In [7]:
def filename_to_doc(filename, dir=target_folder_name):
    """Read one JSON file through the cloud session and rebuild a spaCy Doc from it.

    Parameters
    ----------
    filename : str
        Name of the JSON file inside `dir`, e.g. "1031760.json".
    dir : str
        Folder prefix joined to `filename` with "/" before calling `s.read_file`.
        Defaults to the value `target_folder_name` had when this function was defined.

    Returns
    -------
    spacy.tokens.Doc
        Document restored via `Doc.from_json`, built on the vocab of the loaded `nlp` model.
    """
    remote_path = "/".join([dir, filename])
    payload = s.read_file(remote_path, "dict")
    return Doc(nlp.vocab).from_json(payload)

In [38]:
%%time
# load a single document (the second file in the list) to gauge how long one round trip takes
doc = filename_to_doc(jsonfiles_list[1])

CPU times: user 48 s, sys: 32.5 s, total: 1min 20s
Wall time: 1min 27s


In [52]:
# create the local folder for the per-document sentence data;
# os.makedirs also creates missing parent folders and does nothing when the folder already exists
os.makedirs("../data/large_data/sents_data", exist_ok=True)

In [59]:
# One record per sentence: (sentence text, list of token tuples).
# Each token tuple is (text, lowercased lemma, POS tag, (start, end)), where start/end are
# character offsets measured from the first token of the sentence.
doc_sentdata = [
    (
        sentence.text,
        [
            (
                token.text,
                token.lemma_.lower(),
                token.pos_,
                (token.idx - sentence[0].idx, token.idx - sentence[0].idx + len(token)),
            )
            for token in sentence
        ],
    )
    for sentence in doc.sents
]

In [60]:
# inspect the records of the first three sentences
doc_sentdata[:3]

[('Franciscj De Verulamio / Summi Angliae Cancellarij / Instauratio magna.',
  [('Franciscj', 'francisci', 'PROPN', (0, 9)),
   ('De', 'de', 'ADP', (10, 12)),
   ('Verulamio', 'verulamio', 'PROPN', (13, 22)),
   ('/', '/', 'PUNCT', (23, 24)),
   ('Summi', 'summi', 'ADJ', (25, 30)),
   ('Angliae', 'angliae', 'PROPN', (31, 38)),
   ('Cancellarij', 'cancellarij', 'PROPN', (39, 50)),
   ('/', '/', 'PUNCT', (51, 52)),
   ('Instauratio', 'instauratio', 'NOUN', (53, 64)),
   ('magna', 'magnus', 'ADJ', (65, 70)),
   ('.', '.', 'PUNCT', (70, 71))]),
 ('Multi pertransibunt & augebitur scientia.',
  [('Multi', 'multus', 'ADJ', (0, 5)),
   ('pertransibunt', 'pertransibo', 'VERB', (6, 19)),
   ('&', '&', 'PUNCT', (20, 21)),
   ('augebitur', 'augebio', 'VERB', (22, 31)),
   ('scientia', 'scientia', 'NOUN', (32, 40)),
   ('.', '.', 'PUNCT', (40, 41))]),
 ('Anno Londini Apud Joannem Billium Typographum Regium.',
  [('Anno', 'annus', 'NOUN', (0, 4)),
   ('Londini', 'londini', 'PROPN', (5, 12)),
   ('Ap

In [12]:
# names of the files already present in the local sents_data folder;
# the conversion loop checks against this list to skip documents that are already done
sents_data_ready = os.listdir("../data/large_data/sents_data")
len(sents_data_ready)

1006

In [11]:
# Convert every remaining JSON document into a pickled list of sentence records.
# Documents whose pickle name is already in `sents_data_ready` are skipped, so the cell can be re-run to resume.
failed = []  # filenames for which loading, conversion or writing raised an error
for fn in jsonfiles_list:
    pickle_name = fn.replace(".json", ".pickle")
    if pickle_name in sents_data_ready:
        continue
    print("working on " + fn)
    try:
        doc = filename_to_doc(fn) # load the doc
        # per sentence: (sentence text, [(token text, lowercased lemma, POS tag, (start, end)), ...]);
        # start/end are character offsets relative to the first token of the sentence
        doc_sentdata = [(sent.text, [(t.text, t.lemma_.lower(), t.pos_, (t.idx - sent[0].idx, t.idx - sent[0].idx + len(t))) for t in sent]) for sent in doc.sents]
        with open("../data/large_data/sents_data/" + pickle_name, "wb") as f:
            pickle.dump(doc_sentdata, f)
    except Exception as err:
        # Catch Exception instead of using a bare `except:` so that KeyboardInterrupt and SystemExit
        # still stop this long-running loop, and report the reason rather than failing silently.
        print("failed on " + fn + ": " + repr(err))
        failed.append(fn)

working on 699869.json


In [56]:
# make a zip archive of the folder with the pickled sentence data (lemmatized sentences and POS tags)
shutil.make_archive("../data/large_data/sents_data", 'zip', "../data/large_data/sents_data")
# upload the zip archive to sciencedata.dk; the context manager closes the local file after the call returns
# NOTE(review): `s.s` looks like an HTTP session object exposing `put` - confirm, and consider checking the response status
with open("../data/large_data/sents_data.zip", "rb") as zip_file:
    upload_response = s.s.put("https://sciencedata.dk/sharingout/kase%40zcu.cz/TOME/DATA/NOSCEMUS/sents_data.zip", data=zip_file)
# keep the response as the cell's displayed value, as in the original one-liner
upload_response

'/Users/vojtechkase/Projects/noscemus_ETF/data/large_data/sents_pos.zip'

# Preliminary explorations of Latin wordnet

In [138]:
# read the LiLa Latin WordNet CSV straight from the CIRCSE latinWordnet-revision repository on GitHub
latinwn = pd.read_csv("https://raw.githubusercontent.com/CIRCSE/latinWordnet-revision/master/LiLa_LatinWordnet.csv")

In [139]:
# preview the first five rows of the WordNet table
latinwn.head(5)

Unnamed: 0,id,lemma,type,lila_uri,id_synset,definition
0,90942,"a, aa",LEMMA,http://lila-erc.eu/data/id/lemma/90942,http://wordnet-rdf.princeton.edu/wn30/06831177-n,the 1st letter of the Roman alphabet
1,86826,abactio,LEMMA,http://lila-erc.eu/data/id/lemma/86826,http://wordnet-rdf.princeton.edu/wn30/00391599-n,the act of removing
2,86828,abactor,LEMMA,http://lila-erc.eu/data/id/lemma/86828,http://wordnet-rdf.princeton.edu/wn30/10544480-n,someone who steals livestock (especially cattle)
3,91165,abactus,LEMMA,http://lila-erc.eu/data/id/lemma/91165,http://wordnet-rdf.princeton.edu/wn30/00780889-n,the act of taking something from someone unlaw...
4,86833,abaculus,LEMMA,http://lila-erc.eu/data/id/lemma/86833,http://wordnet-rdf.princeton.edu/wn30/04435180-n,a flat thin rectangular slab (as of fired clay...
