In [1]:
import gzip
import json
import pandas as pd
from my_functions_improved import *

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


‎𐤀 CLTK version '1.3.0'. When using the CLTK in research, please cite: https://aclanthology.org/2021.acl-demo.3/

Pipeline for language 'Ancient Greek' (ISO: 'grc'): `GreekNormalizeProcess`, `GreekSpacyProcess`, `GreekEmbeddingsProcess`, `StopsProcess`.

⸖ ``GreekSpacyProcess`` using OdyCy model by Center for Humanities Computing Aarhus from https://huggingface.co/chcaa . Please cite: https://aclanthology.org/2023.latechclfl-1.14
⸖ ``LatinEmbeddingsProcess`` using word2vec model by University of Oslo from http://vectors.nlpl.eu/ . Please cite: https://aclanthology.org/W17-0237/

⸎ To suppress these messages, instantiate ``NLP()`` with ``suppress_banner=True``.


In [2]:
with gzip.open('noun_dict.json.gz', 'rt', encoding='utf-8') as gzip_file:
    noun_dict = json.load(gzip_file)

In [3]:
with gzip.open('verb_dict.json.gz', 'rt', encoding='utf-8') as gzip_file:
    verb_dict = json.load(gzip_file)

In [61]:
noun_df = pd.DataFrame(noun_dict)

In [5]:
verb_df = pd.DataFrame(verb_dict)

In [None]:
noun_df

In [None]:
verb_df

In [28]:
# elimino le righe che contengono celle vuote nel df dei verbi
good_verb_df = verb_df[(verb_df["tense"]!= " ") & (verb_df["mode"]!=" ") & (verb_df["act/mid/p"]!= " ") & (verb_df["number"]!= " ")]
good_verb_df

Unnamed: 0,lemma,form,tense,mode,act/mid/p,gender,case,person,number,dialects
0,ἅλλομαι,ἅλεται,aor,subj,mid,,,3rd,sg,
1,ἅλλομαι,ἅληται,aor,subj,mid,,,3rd,sg,
2,ἅλλομαι,ἅλῃ,aor,subj,mid,,,2nd,sg,
4,ἅλλομαι,ἅλλεσθε,imperf,ind,mid,,,2nd,pl,"[(doric, aeolic]"
5,ἅλλομαι,ἅλλεσθε,pres,imperat,mid,,,2nd,pl,
...,...,...,...,...,...,...,...,...,...,...
859326,ζῳοτροφέω,ζῳοτροφεῖ,pres,ind,act,,,3rd,sg,"[doric, ionic]"
859328,ζῳοτροφέω,ζῳοτροφούντων,pres,part,act,masc/neut,gen,,pl,[doric]
859329,ζῳοτροφέω,ζῳοτροφούντων,pres,imperat,act,,,3rd,pl,[doric]
859330,ζῳοτροφέω,ζῳοτροφοῦσιν,pres,part,act,masc/neut,dat,,pl,[doric]


In [30]:
# controllo i valori delle colonne del df dei verbi
unique_tense = good_verb_df['tense'].unique()
unique_mode = good_verb_df['mode'].unique()
unique_act = good_verb_df['act/mid/p'].unique()
unique_gender = good_verb_df['gender'].unique()
unique_case = good_verb_df['case'].unique()
unique_person = good_verb_df['person'].unique()
unique_number = good_verb_df['number'].unique()
unique_tense, unique_mode, unique_gender, unique_case, unique_person, unique_number

(array(['aor', 'imperf', 'pres', 'fut', 'perf', 'futperf', 'plup'],
       dtype=object),
 array(['subj', 'ind', 'imperat', 'opt', 'part'], dtype=object),
 array([' ', 'masc', 'neut', 'fem', 'masc/neut', 'masc/fem'], dtype=object),
 array([' ', 'nom', 'nom/voc/acc', 'nom/voc', 'acc', 'gen', 'dat', 'voc',
        'gen/dat', 'nom/acc'], dtype=object),
 array(['3rd', '2nd', '1st', ' '], dtype=object),
 array(['sg', 'pl', 'dual'], dtype=object))

In [32]:
# controllo i valori delle colonne del df dei nomi
unique_gender = noun_df['gender'].unique()
unique_case = noun_df['case'].unique()
unique_number = noun_df['number'].unique()
unique_gender, unique_case, unique_number

(array(['fem', 'masc', 'neut', 'masc/neut', 'masc/fem', 'masc/fem/neut',
        ' '], dtype=object),
 array(['nom/voc/acc', 'nom/voc', 'dat', 'acc', 'gen', 'nom', 'voc',
        'nom/acc', 'gen/dat'], dtype=object),
 array(['dual', 'sg', 'pl'], dtype=object))

In [62]:
# elimino le righe che contengono celle vuote nel df dei nomi
good_noun_df = noun_df[(noun_df["gender"]!= " ") & (noun_df["case"]!=" ") & (noun_df["number"]!= " ")]

In [63]:
# inserisco nei df la POS e elimino la colonna dei dialetti
good_noun_df.insert(0, "POS", ["n" for x in good_noun_df.iterrows()])
good_noun_df = good_noun_df.drop("dialects", axis = 1)

good_verb_df.insert(0, "POS", ["v" if row["mode"] != "part" else "p" for id, row in good_verb_df.iterrows()])
good_verb_df = good_verb_df.drop("dialects", axis = 1)


In [65]:
# creo per entrambi i df nuove righe in cui separo i nomi e i verbi che hanno la stessa forma ma caratteristiche morfologiche diverse

new_rows = []

for id, row in good_noun_df.iterrows():
    if  "/" in row["case"]:
        cases = str(row["case"]).split("/")
        for case in cases:
            new_rows.append([row["POS"], row["lemma"], row["form"], row["gender"], case, row["number"]])
    
    if "/" in row["gender"]:
        genders = str(row["gender"]).split("/")
        for gender in genders:
            new_rows.append([row["POS"], row["lemma"], row["form"], gender, row["case"], row["number"]])

new_rows_df = new_rows_df = pd.DataFrame(new_rows, columns=["POS", "lemma", "form", "gender", "case", "number"])

good_noun_df = pd.concat([good_noun_df, new_rows_df], ignore_index=True)

new_rows = []

for id, row in good_verb_df.iterrows():
    if  "/" in row["case"]:
        cases = str(row["case"]).split("/")
        for case in cases:
            new_rows.append([row["POS"], row["lemma"], row["form"], row["tense"], row["mode"], row["act/mid/p"], row["gender"], case, row["person"], row["number"]])
    
    if "/" in row["gender"]:
        genders = str(row["gender"]).split("/")
        for gender in genders:
            new_rows.append([row["POS"], row["lemma"], row["form"],row["tense"], row["mode"], row["act/mid/p"], gender, row["case"], row["person"], row["number"]])

new_rows_df = new_rows_df = pd.DataFrame(new_rows, columns=["POS", "lemma", "form", "tense", "mode", "act/mid/p", "gender", "case", "person", "number"])

good_verb_df = pd.concat([good_verb_df, new_rows_df], ignore_index=True)

In [68]:
# rimuovo le righe in cui "gender" o "case" contengono "/" (cioè dove si ha la stessa forma ma caratteristiche morfologiche diverse) 
good_noun_df = good_noun_df[~good_noun_df["case"].str.contains("/") & ~good_noun_df["gender"].str.contains("/")]
good_verb_df = good_verb_df[~good_verb_df["case"].str.contains("/") & ~good_verb_df["gender"].str.contains("/")]

# resetto gli indici
good_noun_df = good_noun_df.reset_index(drop=True)
good_verb_df = good_verb_df.reset_index(drop=True)

In [147]:
# creo il dataset finale che unisce verbi e nomi
morph_df = pd.concat([good_verb_df, good_noun_df])
morph_df

Unnamed: 0,POS,lemma,form,tense,mode,act/mid/p,gender,case,person,number
0,v,ἅλλομαι,ἅλεται,aor,subj,mid,,,3rd,sg
1,v,ἅλλομαι,ἅληται,aor,subj,mid,,,3rd,sg
2,v,ἅλλομαι,ἅλῃ,aor,subj,mid,,,2nd,sg
3,v,ἅλλομαι,ἅλλεσθε,imperf,ind,mid,,,2nd,pl
4,v,ἅλλομαι,ἅλλεσθε,pres,imperat,mid,,,2nd,pl
...,...,...,...,...,...,...,...,...,...,...
329771,n,ζῳώδης,ζῳῶδες,,,,masc,voc,,sg
329772,n,ζῳώδης,ζῳῶδες,,,,fem,voc,,sg
329773,n,ζῳώδης,ζῳῶδες,,,,neut,nom,,sg
329774,n,ζῳώδης,ζῳῶδες,,,,neut,voc,,sg


In [148]:
morph_df

Unnamed: 0,POS,lemma,form,tense,mode,act/mid/p,gender,case,person,number
0,v,ἅλλομαι,ἅλεται,aor,subj,mid,,,3rd,sg
1,v,ἅλλομαι,ἅληται,aor,subj,mid,,,3rd,sg
2,v,ἅλλομαι,ἅλῃ,aor,subj,mid,,,2nd,sg
3,v,ἅλλομαι,ἅλλεσθε,imperf,ind,mid,,,2nd,pl
4,v,ἅλλομαι,ἅλλεσθε,pres,imperat,mid,,,2nd,pl
...,...,...,...,...,...,...,...,...,...,...
329771,n,ζῳώδης,ζῳῶδες,,,,masc,voc,,sg
329772,n,ζῳώδης,ζῳῶδες,,,,fem,voc,,sg
329773,n,ζῳώδης,ζῳῶδες,,,,neut,nom,,sg
329774,n,ζῳώδης,ζῳῶδες,,,,neut,voc,,sg


In [149]:
from sklearn.preprocessing import LabelEncoder

In [150]:
# trasformo in indici le caratteristiche morfologiche per poi creare gli embedding
features = ["POS", "tense", "mode", "act/mid/p", "gender", "case", "person", "number"]

label_encoders = {feature: LabelEncoder() for feature in features}

for feature in features:
    morph_df[feature] = label_encoders[feature].fit_transform(morph_df[feature].fillna('None'))

morph_df


Unnamed: 0,POS,lemma,form,tense,mode,act/mid/p,gender,case,person,number
0,2,ἅλλομαι,ἅλεται,1,5,2,0,0,3,2
1,2,ἅλλομαι,ἅληται,1,5,2,0,0,3,2
2,2,ἅλλομαι,ἅλῃ,1,5,2,0,0,2,2
3,2,ἅλλομαι,ἅλλεσθε,4,2,2,0,0,2,1
4,2,ἅλλομαι,ἅλλεσθε,7,1,2,0,0,2,1
...,...,...,...,...,...,...,...,...,...,...
329771,0,ζῳώδης,ζῳῶδες,0,0,0,2,5,4,2
329772,0,ζῳώδης,ζῳῶδες,0,0,0,1,5,4,2
329773,0,ζῳώδης,ζῳῶδες,0,0,0,3,4,4,2
329774,0,ζῳώδης,ζῳῶδες,0,0,0,3,5,4,2


In [151]:
# trasformo tutti i lemmata e form in lowercase

morph_df.loc[:, 'lemma'] = morph_df['lemma'].str.lower()
morph_df.loc[:, 'form'] = morph_df['form'].str.lower()

In [152]:
# elimino i simboli che sono stati erroneamente inseriti nei lemmi e nelle forme inflesse che non aggiungono significato

chars_to_del = ["%", "(", ")", "-", "1", "2", "3", "="]

for char in chars_to_del:

    morph_df.loc[:, "form"] = morph_df["form"].str.replace(char, '', regex=False)
    morph_df.loc[:, "lemma"] = morph_df["lemma"].str.replace(char, '', regex=False)

In [153]:
# elimino le righe del df con i simboli che sono stati erroneamente inseriti nei lemmi e nelle forme inflesse che non aggiungono significato

rows_to_del = [',', '/', '†', '́']

pattern = '[' + ''.join(rows_to_del) + ']'

morph_df = morph_df[~morph_df["form"].str.contains(pattern, regex = True) & ~morph_df["lemma"].str.contains(pattern, regex = True) ].copy()

In [154]:
morph_df.reset_index()

Unnamed: 0,index,POS,lemma,form,tense,mode,act/mid/p,gender,case,person,number
0,0,2,ἅλλομαι,ἅλεται,1,5,2,0,0,3,2
1,1,2,ἅλλομαι,ἅληται,1,5,2,0,0,3,2
2,2,2,ἅλλομαι,ἅλῃ,1,5,2,0,0,2,2
3,3,2,ἅλλομαι,ἅλλεσθε,4,2,2,0,0,2,1
4,4,2,ἅλλομαι,ἅλλεσθε,7,1,2,0,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...
1290539,329771,0,ζῳώδης,ζῳῶδες,0,0,0,2,5,4,2
1290540,329772,0,ζῳώδης,ζῳῶδες,0,0,0,1,5,4,2
1290541,329773,0,ζῳώδης,ζῳῶδες,0,0,0,3,4,4,2
1290542,329774,0,ζῳώδης,ζῳῶδες,0,0,0,3,5,4,2


In [155]:
# creo un vocabolario dei caratteri
all_characters = set()

def collect_unique_chars(word):
    for char in word:
        all_characters.add(char)

morph_df["lemma"].apply(collect_unique_chars)
morph_df["form"].apply(collect_unique_chars)

char_vocab = (sorted(list(all_characters)))
char_vocab

['ά',
 'έ',
 'ή',
 'ί',
 'α',
 'β',
 'γ',
 'δ',
 'ε',
 'ζ',
 'η',
 'θ',
 'ι',
 'κ',
 'λ',
 'μ',
 'ν',
 'ξ',
 'ο',
 'π',
 'ρ',
 'ς',
 'σ',
 'τ',
 'υ',
 'φ',
 'χ',
 'ψ',
 'ω',
 'ϊ',
 'ϋ',
 'ό',
 'ύ',
 'ώ',
 'ἀ',
 'ἁ',
 'ἄ',
 'ἅ',
 'ἆ',
 'ἇ',
 'ἐ',
 'ἑ',
 'ἔ',
 'ἕ',
 'ἠ',
 'ἡ',
 'ἤ',
 'ἥ',
 'ἦ',
 'ἧ',
 'ἰ',
 'ἱ',
 'ἴ',
 'ἵ',
 'ἶ',
 'ἷ',
 'ὀ',
 'ὁ',
 'ὄ',
 'ὅ',
 'ὐ',
 'ὑ',
 'ὔ',
 'ὕ',
 'ὖ',
 'ὗ',
 'ὠ',
 'ὡ',
 'ὤ',
 'ὥ',
 'ὦ',
 'ὧ',
 'ὶ',
 'ᾀ',
 'ᾄ',
 'ᾅ',
 'ᾆ',
 'ᾇ',
 'ᾐ',
 'ᾑ',
 'ᾔ',
 'ᾕ',
 'ᾖ',
 'ᾗ',
 'ᾠ',
 'ᾡ',
 'ᾤ',
 'ᾦ',
 'ᾧ',
 'ᾰ',
 'ᾱ',
 'ᾳ',
 'ᾴ',
 'ᾶ',
 'ᾷ',
 'ῃ',
 'ῄ',
 'ῆ',
 'ῇ',
 'ῐ',
 'ΐ',
 'ῖ',
 'ῠ',
 'ῥ',
 'ῦ',
 'ῳ',
 'ῴ',
 'ῶ',
 'ῷ',
 '’']

In [156]:
# creo un dizionario che assegna ad ogni carattere un indice
char_to_idx = {char: idx for idx, char in enumerate(char_vocab)}

vocab_size = len(char_to_idx)

In [157]:
# creo altre due colonne nel dataset in cui inserisco la mappatura in indice dei caratteri delle colonne "form" e "lemma"
morph_df.loc[:, "lemma_tok"] = morph_df["lemma"].apply(lambda x: [char_to_idx.get(char) for char in x])
morph_df.loc[:, "form_tok"] = morph_df["form"].apply(lambda x: [char_to_idx.get(char) for char in x]) 

In [158]:
morph_df

Unnamed: 0,POS,lemma,form,tense,mode,act/mid/p,gender,case,person,number,lemma_tok,form_tok
0,2,ἅλλομαι,ἅλεται,1,5,2,0,0,3,2,"[37, 14, 14, 18, 15, 4, 12]","[37, 14, 8, 23, 4, 12]"
1,2,ἅλλομαι,ἅληται,1,5,2,0,0,3,2,"[37, 14, 14, 18, 15, 4, 12]","[37, 14, 10, 23, 4, 12]"
2,2,ἅλλομαι,ἅλῃ,1,5,2,0,0,2,2,"[37, 14, 14, 18, 15, 4, 12]","[37, 14, 95]"
3,2,ἅλλομαι,ἅλλεσθε,4,2,2,0,0,2,1,"[37, 14, 14, 18, 15, 4, 12]","[37, 14, 14, 8, 22, 11, 8]"
4,2,ἅλλομαι,ἅλλεσθε,7,1,2,0,0,2,1,"[37, 14, 14, 18, 15, 4, 12]","[37, 14, 14, 8, 22, 11, 8]"
...,...,...,...,...,...,...,...,...,...,...,...,...
329771,0,ζῳώδης,ζῳῶδες,0,0,0,2,5,4,2,"[9, 105, 33, 7, 10, 21]","[9, 105, 107, 7, 8, 21]"
329772,0,ζῳώδης,ζῳῶδες,0,0,0,1,5,4,2,"[9, 105, 33, 7, 10, 21]","[9, 105, 107, 7, 8, 21]"
329773,0,ζῳώδης,ζῳῶδες,0,0,0,3,4,4,2,"[9, 105, 33, 7, 10, 21]","[9, 105, 107, 7, 8, 21]"
329774,0,ζῳώδης,ζῳῶδες,0,0,0,3,5,4,2,"[9, 105, 33, 7, 10, 21]","[9, 105, 107, 7, 8, 21]"


In [159]:
import pickle

In [160]:
morph_df.to_pickle("morph_df.pkl")