In [1]:
import os
import spacy
from spacy.tokens import Doc
import sddk
import pandas as pd
import re
import matplotlib.pyplot as plt
import json
import shutil
import pickle

import gspread
from gspread_dataframe import get_as_dataframe, set_with_dataframe
from google.oauth2 import service_account # based on google-auth library

In [2]:
# Load the spaCy language model 'la_core_web_lg' (see the previous script).
# Its vocab (nlp.vocab) is what filename_to_doc() uses to rebuild Doc objects from JSON.
nlp = spacy.load('la_core_web_lg')

In [3]:
# Set up communication with the Google Sheet "noscemus_overview".
# To make this work, you need your ServiceAccountsKey.json file stored locally and the path below pointing to it.
# I keep it in the data folder and list it in .gitignore, which keeps it out of the repository; you can do the same.
# (1) read the service-account key; the `with` block closes the file handle as soon as the JSON is parsed
with open("../data/ServiceAccountsKey.json") as key_file:
    file_data = json.load(key_file)
# (2) transform the content into a credentials object
credentials = service_account.Credentials.from_service_account_info(file_data)
# (3) specify your usage of the credentials
scoped_credentials = credentials.with_scopes(['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive'])
# (4) use the constrained credentials for authentication of the gspread package
gc = gspread.Client(auth=scoped_credentials)

# open the spreadsheet by its URL
noscemus_gs = gc.open_by_url("https://docs.google.com/spreadsheets/d/1ekf9RBfd4yqy0p0IWJ4SDk1kUT70hnoIVX1H6KPdIts/edit?usp=sharing")

# Load & Explore spaCy Docs

In [4]:
# Open an sddk session on the shared sciencedata.dk folder TOME/DATA/NOSCEMUS.
# `s` is used below to list and read the remote files and to upload the archives.
s = sddk.cloudSession(provider="sciencedata.dk", shared_folder_name="TOME/DATA/NOSCEMUS", owner="kase@zcu.cz")

connection with shared folder established with you as its owner
endpoint variable has been configured to: https://sciencedata.dk/files/TOME/DATA/NOSCEMUS/


In [5]:
# Remote folder holding the JSON-serialized spaCy docs, and the list of the JSON file names in it.
target_folder_name = "noscemus_spacyjsons_v1"
jsonfiles_list = s.list_filenames(target_folder_name, "json")
# preview the first ten file names
jsonfiles_list[:10]

['1031760.json',
 '1085290.json',
 '1285853.json',
 '1285854.json',
 '1285855.json',
 '1285856.json',
 '1365811.json',
 '1370560.json',
 '1378359.json',
 '1424044.json']

In [6]:
# number of JSON files listed in the remote folder
len(jsonfiles_list)

1007

In [7]:
def filename_to_doc(filename, dir=target_folder_name):
    """Read one JSON file from the cloud session and rebuild it as a spaCy Doc.

    Parameters
    ----------
    filename : str
        Name of the JSON file inside the folder `dir`.
    dir : str
        Folder (relative to the session endpoint) that contains the file;
        defaults to the value `target_folder_name` had when this cell was run.

    Returns
    -------
    Doc
        A Doc built on `nlp.vocab` and populated from the JSON content.
    """
    remote_path = "/".join((dir, filename))
    # "dict" asks the session for the parsed content rather than raw text
    # NOTE(review): presumably returns a plain dict in spaCy's Doc.to_json layout - confirm against sddk
    payload = s.read_file(remote_path, "dict")
    return Doc(nlp.vocab).from_json(payload)

In [8]:
# rebuild the spaCy Doc of the second listed file (index 1) as a working example
doc = filename_to_doc(jsonfiles_list[1])

In [9]:
# file name of the second listed file (index 1)
fn = jsonfiles_list[1]

In [10]:
# Basic size statistics of the currently loaded document.
doc_data = {
    # number of characters in the full document text
    "characters_n": len(doc.text),
    # len(doc) is the token count; no need to build a throwaway list of every token just to count them
    "tokens_n": len(doc),
}

In [11]:
# display the collected size statistics
doc_data

{'characters_n': 2930782, 'tokens_n': 572944}

In [12]:
# print the text of the first 20 entities stored in doc.ents
for ent in doc.ents[:20]:
    print(ent.text)

Georg Abraham
Lindenius
Renovatus
Medicis Lindenius
Renqvatus
Johannis
Antonidae
Linden
Catalogum
Authores
Cynosuram
Rerum
Thematum
Vitae
Adscita
Medicis
Anne
Lxii
Continuati
Norimberg


In [13]:
# One (text, lemma, POS tag, idx offset) tuple per token of `doc`, in document order.
all_lemmata = [(tok.text, tok.lemma_, tok.pos_, tok.idx) for tok in doc]

In [14]:
# inspect the token tuples at positions 200-399
all_lemmata[200:400]

[('Ord.', 'Oard.s', 'VERB', 1250),
 ('&', '&', 'PUNCT', 1255),
 ('Academic', 'academic', 'VERB', 1257),
 ('.', '.', 'PUNCT', 1265),
 ('Curios', 'Curii', 'NOUN', 1267),
 ('.', '.', 'PUNCT', 1273),
 ('dict', 'dict', 'VERB', 1275),
 ('.', '.', 'PUNCT', 1279),
 ('Chiron', 'Chiron', 'PROPN', 1281),
 ('.', '.', 'PUNCT', 1287),
 ('Cum', 'cum', 'SCONJ', 1289),
 ('Gratia', 'Gratia', 'NOUN', 1293),
 ('&', '&', 'PUNCT', 1300),
 ('Privilegio', 'Privilegium', 'NOUN', 1302),
 ('S.', 'S.', '', 1313),
 ('C.', 'C.arcus', 'PROPN', 1316),
 ('Majest', 'Majest', 'PROPN', 1319),
 ('.', '.', 'PUNCT', 1325),
 ('Ooabvv', 'Ooabvv', 'NUM', 1327),
 ('-', '-', 'PUNCT', 1333),
 ('ii', 'is', 'NUM', 1334),
 ('Norimbergae', 'Norimbergaa', 'PROPN', 1337),
 (',', ',', 'PUNCT', 1348),
 ('Impensis', 'impensis', 'NOUN', 1350),
 ('Johannis', 'Johann', 'PROPN', 1359),
 ('Georgii', 'Georgius', 'PROPN', 1368),
 ('Endteri', 'Endteri', 'PROPN', 1376),
 ('.', '.', 'PUNCT', 1383),
 ('Anno', 'Annus', 'NOUN', 1385),
 ('Christi', 'Ch

In [15]:
# Three parallel per-sentence lists for `doc`.
# Punctuation tokens are left out of the lemma and POS strings; the raw text keeps them.
doc_lemmata = [
    " ".join(tok.lemma_.lower() for tok in sentence if not tok.is_punct)
    for sentence in doc.sents
]
doc_postags = [
    " ".join(tok.pos_ for tok in sentence if not tok.is_punct)
    for sentence in doc.sents
]
doc_sents = [sentence.text for sentence in doc.sents]

In [16]:
# For every sentence of `doc`, a list of (text, lemma, tag_) triples, one per token.
# (A filter keeping only token.pos_ in ["NOUN", "VERB", "ADJ"] was tried here and is currently disabled.)
all_sents_lemmata = [
    [(tok.text, tok.lemma_, tok.tag_) for tok in sentence]
    for sentence in doc.sents
]

In [25]:
# Create the local output folders for the per-document sentence files.
# os.makedirs(..., exist_ok=True) replaces the former `!mkdir` shell calls: a failing shell
# command does not raise a Python exception, so the try/except around them never caught anything.
# This form is also portable, silent when the folder already exists, and creates the
# parent folder if it is missing.
for subfolder in ("sents_lemmata", "sents_pos", "sents_text", "sents_positions"):
    os.makedirs("../data/large_data/" + subfolder, exist_ok=True)

In [17]:
# the first listed file name
jsonfiles_list[0]

'1031760.json'

In [18]:
# Switch to the first listed file and rebuild its Doc; this overwrites the earlier `fn` and `doc`.
fn = jsonfiles_list[0]
doc = filename_to_doc(fn)

In [19]:
# Four parallel per-sentence lists for `doc` (this time punctuation tokens are kept everywhere).
# lower-cased lemmata of each sentence, joined by single spaces
doc_lemmata = [" ".join([t.lemma_.lower() for t in sent]) for sent in doc.sents]
# (start, end) pair per token: start is t.idx minus the idx of the sentence's first token,
# end is start + len(t), i.e. offsets are relative to the beginning of the sentence
# NOTE(review): assumes len(t) is the token's character length, so pairs slice into sent.text - confirm
doc_positions = [[(t.idx - sent[0].idx, t.idx - sent[0].idx + len(t)) for t in sent] for sent in doc.sents]
# POS tags of each sentence, joined by single spaces
doc_postags = [" ".join([t.pos_ for t in sent]) for sent in doc.sents]
# raw text of each sentence
doc_sents = [sent.text for sent in doc.sents]

In [20]:
# preview the token positions of the first ten sentences
doc_positions[:10]

[[(0, 9),
  (10, 12),
  (13, 22),
  (23, 24),
  (25, 30),
  (31, 38),
  (39, 50),
  (51, 52),
  (53, 64),
  (65, 70),
  (70, 71)],
 [(0, 5), (6, 19), (20, 21), (22, 31), (32, 40), (40, 41)],
 [(0, 4), (5, 12), (13, 17), (18, 25), (26, 33), (34, 45), (46, 52), (52, 53)],
 [(0, 4), (4, 5)],
 [(0, 11),
  (12, 14),
  (15, 24),
  (24, 25),
  (26, 38),
  (38, 39),
  (40, 45),
  (45, 48),
  (49, 53),
  (54, 56),
  (57, 65),
  (66, 75),
  (75, 76),
  (77, 81),
  (82, 92),
  (93, 94),
  (95, 103),
  (104, 109),
  (110, 115),
  (115, 116),
  (117, 124),
  (125, 134),
  (135, 142),
  (142, 143)],
 [(0, 2),
  (3, 7),
  (8, 11),
  (12, 20),
  (21, 26),
  (26, 27),
  (28, 39),
  (40, 47),
  (48, 52),
  (53, 57),
  (58, 66),
  (67, 76),
  (76, 77),
  (78, 83),
  (84, 92),
  (93, 98),
  (99, 100),
  (100, 104),
  (105, 107),
  (108, 115),
  (116, 125),
  (126, 130),
  (130, 131),
  (132, 135),
  (136, 142),
  (143, 144),
  (145, 152),
  (152, 153)],
 [(0, 4),
  (5, 14),
  (15, 20),
  (21, 30),
  (30, 

In [21]:
# Print the lengths of the four per-sentence lists; they are built from the same doc.sents and should match.
print(*map(len, (doc_lemmata, doc_sents, doc_positions, doc_postags)))

5453 5453 5453 5453


In [27]:
# Save the token positions of the current document as a pickle named after its JSON file
# (e.g. "<id>.json" -> "<id>.pickle") in the sents_positions folder.
with open("../data/large_data/sents_positions/" + fn.replace(".json", ".pickle"), "wb") as f:
    pickle.dump(doc_positions, f)

In [30]:
# to read it back: 
# pickle.load(open("../data/large_data/sents_positions/" + fn.replace(".json", ".pickle"), "rb"))

In [None]:
#f_lemmata = open("../data/large_data/sents_lemmata/" + fn.replace(".json", ".txt"), "w", encoding="utf-8")
#doc_lemmata = [" ".join([t.lemma_.lower() for t in sent if not t.is_punct]) for sent in doc.sents]
#f_lemmata.writelines("\n".join(doc_lemmata))

#f_postags = open("../data/large_data/sents_pos/" + fn.replace(".json", ".txt"), "w", encoding="utf-8")
#doc_postags = [" ".join([t.pos_ for t in sent if not t.is_punct]) for sent in doc.sents]
#f_postags.writelines("\n".join(doc_postags))

#f_sents = open("../data/large_data/sents_text/" + fn.replace(".json", ".txt"), "w", encoding="utf-8")
# doc_sents = [sent.text for sent in doc.sents]
# f_sents.writelines("\n".join(doc_sents))

In [31]:
# Names of the lemmata files already present locally; the export loop that follows skips these documents.
files_ready = os.listdir("../data/large_data/sents_lemmata")

In [51]:
# Export lemmata, POS tags and raw sentence text for every document that has no lemmata file yet.
# Each output is a UTF-8 text file named after the JSON file, with the sentences joined by "\n".
# NOTE(review): if a sentence text itself contains a newline, lines in sents_text will not align
# one-to-one with the lemmata / POS files - confirm whether that matters downstream.
already_exported = set(files_ready)  # set lookup instead of scanning the list for every file name
for fn in jsonfiles_list:
    txt_name = fn.replace(".json", ".txt")
    if txt_name in already_exported:
        continue
    doc = filename_to_doc(fn)
    sents = list(doc.sents)  # materialize the sentences once; reused for all three exports
    doc_lemmata = [" ".join([t.lemma_.lower() for t in sent]) for sent in sents]
    doc_postags = [" ".join([t.pos_ for t in sent]) for sent in sents]
    doc_sents = [sent.text for sent in sents]
    # `with` guarantees every file is flushed and closed before the next document is processed;
    # the previous version left three open handles per document
    with open("../data/large_data/sents_lemmata/" + txt_name, "w", encoding="utf-8") as f_lemmata:
        f_lemmata.write("\n".join(doc_lemmata))
    with open("../data/large_data/sents_pos/" + txt_name, "w", encoding="utf-8") as f_postags:
        f_postags.write("\n".join(doc_postags))
    with open("../data/large_data/sents_text/" + txt_name, "w", encoding="utf-8") as f_sents:
        f_sents.write("\n".join(doc_sents))

In [22]:
# names of the sentence-text files already present locally, and how many there are
f_sents_ready = os.listdir("../data/large_data/sents_text")
len(f_sents_ready)

1001

In [25]:
# names of the position pickles already present locally, and how many there are
f_positions_ready = os.listdir("../data/large_data/sents_positions")
len(f_positions_ready)

967

In [24]:
# Back-fill the token-position pickles and the sentence-text files for every document
# that is missing at least one of them, writing only the output that is missing.
# Fixes over the previous version:
#   * the outer test used `and`, so documents lacking only one of the two outputs were skipped;
#   * the inner test compared the ".json" name against a list of ".txt" names and was always true;
#   * existence is now checked on disk, so the cell no longer depends on directory listings
#     captured earlier (which may be stale when cells are run out of order).
for fn in jsonfiles_list:
    txt_path = "../data/large_data/sents_text/" + fn.replace(".json", ".txt")
    pickle_path = "../data/large_data/sents_positions/" + fn.replace(".json", ".pickle")
    need_text = not os.path.exists(txt_path)
    need_positions = not os.path.exists(pickle_path)
    if not (need_text or need_positions):
        continue  # both outputs already exist; skip the expensive download and parse
    doc = filename_to_doc(fn)
    sents = list(doc.sents)  # materialize once; shared by both outputs
    if need_positions:
        # (start, end) per token, relative to the idx of the sentence's first token
        doc_positions = [[(t.idx - sent[0].idx, t.idx - sent[0].idx + len(t)) for t in sent] for sent in sents]
        with open(pickle_path, "wb") as f:
            pickle.dump(doc_positions, f)
    if need_text:
        doc_sents = [sent.text for sent in sents]
        with open(txt_path, "w", encoding="utf-8") as f_sents:
            f_sents.write("\n".join(doc_sents))

In [26]:
# Write the token-position pickle for every document whose pickle is not listed in f_positions_ready.
for fn in jsonfiles_list:
    pickle_name = fn.replace(".json", ".pickle")
    if pickle_name in f_positions_ready:
        continue
    doc = filename_to_doc(fn)
    # one list per sentence of (start, end) pairs, relative to the idx of the sentence's first token
    doc_positions = []
    for sent in doc.sents:
        doc_positions.append([(t.idx - sent[0].idx, t.idx - sent[0].idx + len(t)) for t in sent])
    with open("../data/large_data/sents_positions/" + pickle_name, "wb") as f:
        pickle.dump(doc_positions, f)

In [56]:
# make zip archives of the text files with lemmatazed sentences and pos tags
shutil.make_archive("../data/large_data/sents_lemmata", 'zip', "../data/large_data/sents_lemmata") #
shutil.make_archive("../data/large_data/sents_pos", 'zip', "../data/large_data/sents_pos") #

'/Users/vojtechkase/Projects/noscemus_ETF/data/large_data/sents_pos.zip'

In [65]:
# upload the zip archived data to sciencedata.dk 
s.s.put("https://sciencedata.dk/sharingout/kase%40zcu.cz/TOME/DATA/NOSCEMUS/sents_lemmata.zip", data=open("../data/large_data/sents_lemmata.zip", "rb"))
s.s.put("https://sciencedata.dk/sharingout/kase%40zcu.cz/TOME/DATA/NOSCEMUS/sents_pos.zip", data=open("../data/large_data/sents_pos.zip", "rb"))

<Response [201]>

In [None]:
# make a zip archive of all sentence text data and upload them to sciencedata
shutil.make_archive("../data/large_data/sents_text", 'zip', "../data/large_data/sents_text") #
s.s.put("https://sciencedata.dk/sharingout/kase%40zcu.cz/TOME/DATA/NOSCEMUS/sents_text.zip", data=open("../data/large_data/sents_text.zip", "rb"))

# Preliminary exploration of the Latin WordNet

In [138]:
# Load the LiLa Latin WordNet table straight from the CIRCSE latinWordnet-revision repository on GitHub.
latinwn = pd.read_csv("https://raw.githubusercontent.com/CIRCSE/latinWordnet-revision/master/LiLa_LatinWordnet.csv")

In [139]:
# preview the first five rows
latinwn.head(5)

Unnamed: 0,id,lemma,type,lila_uri,id_synset,definition
0,90942,"a, aa",LEMMA,http://lila-erc.eu/data/id/lemma/90942,http://wordnet-rdf.princeton.edu/wn30/06831177-n,the 1st letter of the Roman alphabet
1,86826,abactio,LEMMA,http://lila-erc.eu/data/id/lemma/86826,http://wordnet-rdf.princeton.edu/wn30/00391599-n,the act of removing
2,86828,abactor,LEMMA,http://lila-erc.eu/data/id/lemma/86828,http://wordnet-rdf.princeton.edu/wn30/10544480-n,someone who steals livestock (especially cattle)
3,91165,abactus,LEMMA,http://lila-erc.eu/data/id/lemma/91165,http://wordnet-rdf.princeton.edu/wn30/00780889-n,the act of taking something from someone unlaw...
4,86833,abaculus,LEMMA,http://lila-erc.eu/data/id/lemma/86833,http://wordnet-rdf.princeton.edu/wn30/04435180-n,a flat thin rectangular slab (as of fired clay...
