In [7]:
import gzip
import json
import pandas as pd
from my_functions_improved import *

  backends.update(_get_backends("networkx.backends"))
  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


‎𐤀 CLTK version '1.2.3'. When using the CLTK in research, please cite: https://aclanthology.org/2021.acl-demo.3/

Pipeline for language 'Ancient Greek' (ISO: 'grc'): `GreekNormalizeProcess`, `GreekStanzaProcess`, `GreekEmbeddingsProcess`, `StopsProcess`.

⸖ ``LatinSpacyProcess`` using Stanza model by Stanford University from https://stanfordnlp.github.io/stanza/ . Please cite: https://arxiv.org/abs/2003.07082
⸖ ``LatinEmbeddingsProcess`` using word2vec model by University of Oslo from http://vectors.nlpl.eu/ . Please cite: https://aclanthology.org/W17-0237/

⸎ To suppress these messages, instantiate ``NLP()`` with ``suppress_banner=True``.


In [8]:
# Read the gzip-compressed noun dictionary as UTF-8 text and parse the JSON.
with gzip.open('noun_dict.json.gz', mode='rt', encoding='utf-8') as gzip_file:
    noun_dict = json.loads(gzip_file.read())

In [45]:
# Read the gzip-compressed verb dictionary as UTF-8 text and parse the JSON.
with gzip.open('verb_dict.json.gz', mode='rt', encoding='utf-8') as gzip_file:
    verb_dict = json.loads(gzip_file.read())

In [10]:
# Build the noun lookup table from the parsed JSON dictionary.
noun_df = pd.DataFrame(noun_dict)

In [11]:
# Display the "form" column of the noun table.
noun_df["form"]

0            ἅβρα
1            ἅβρα
2           ἅβραι
3           ἅβραι
4          ἅβραις
           ...   
210675     ζῳῶδες
210676     ζῳῶδες
210677    ζῳωδίας
210678    ζῳωδίας
210679     ζῳωδίᾳ
Name: form, Length: 210680, dtype: object

In [12]:
# Show every row of the noun table stored for the surface form "ἅβρα".
is_requested_form = noun_df["form"].eq("ἅβρα")
l = noun_df.loc[is_requested_form]
l

Unnamed: 0,lemma,form,gender,case,number,dialects
0,ἅβρα,ἅβρα,fem,nom/voc/acc,dual,
1,ἅβρα,ἅβρα,fem,nom/voc,sg,"[(attic, doric, aeolic]"


In [47]:
# Build the verb lookup table from the parsed JSON dictionary.
verb_df = pd.DataFrame(verb_dict)

In [48]:
# Display the verb table (pandas truncates the rendering of large frames).
verb_df

Unnamed: 0,lemma,form,tense,mode,act/mid/p,gender,case,person,number,dialects
0,ἅλλομαι,ἅλεται,aor,subj,mid,,,3rd,sg,
1,ἅλλομαι,ἅληται,aor,subj,mid,,,3rd,sg,
2,ἅλλομαι,ἅλῃ,aor,subj,mid,,,2nd,sg,
3,ἅλλομαι,ἅλλεσθαι,pres,inf,mid,,,,,
4,ἅλλομαι,ἅλλεσθε,imperf,ind,mid,,,2nd,pl,"[(doric, aeolic]"
...,...,...,...,...,...,...,...,...,...,...
859327,ζῳοτροφέω,ζῳοτροφεῖν,pres,inf,act,,,,,[doric]
859328,ζῳοτροφέω,ζῳοτροφούντων,pres,part,act,masc/neut,gen,,pl,[doric]
859329,ζῳοτροφέω,ζῳοτροφούντων,pres,imperat,act,,,3rd,pl,[doric]
859330,ζῳοτροφέω,ζῳοτροφοῦσιν,pres,part,act,masc/neut,dat,,pl,[doric]


In [15]:
# Sample Ancient Greek verses, one list entry per line.
strings = [
    "ἂν δὲ καὶ αὐτοὶ βάντες ἐπὶ κληῖσι καθῖζον",
    "ἑξῆς δ᾽ ἑζόμενοι πολιὴν ἅλα τύπτον ἐρετμοῖς",
    "ἔνθεν δὲ προτέρω πλέομεν ἀκαχήμενοι ἦτορ",
    "ἄσμενοι ἐκ θανάτοιο, φίλους ὀλέσαντες ἑταίρους",
    "νῆα μὲν οἵ γε μέλαιναν ἁλὸς βένθοσδε ἔρυσσαν",
    "τοὶ δὲ πρυμνήσι᾽ ἔλυσαν",
]

In [16]:
def get_noun_analysis(noun, df):
    """Collect every morphological reading stored in ``df`` for a noun form.

    Parameters
    ----------
    noun : str
        Inflected surface form to look up in the ``"form"`` column.
    df : pandas.DataFrame
        Table with at least the columns ``form``, ``gender``, ``case``,
        ``number`` and ``dialects``.

    Returns
    -------
    list[list]
        One ``[gender, case, number, dialects]`` list per matching row, in
        table order; an empty list when the form does not occur.
    """
    feature_columns = ("gender", "case", "number", "dialects")
    matching_rows = df[df["form"] == noun]
    return [
        [record[column] for column in feature_columns]
        for _, record in matching_rows.iterrows()
    ]

In [17]:
def get_verb_analysis(verb, df):
    """Collect every morphological reading stored in ``df`` for a verb form.

    Parameters
    ----------
    verb : str
        Inflected surface form to look up in the ``"form"`` column.
    df : pandas.DataFrame
        Table with at least the columns ``form``, ``tense``, ``mode``,
        ``act/mid/p``, ``gender``, ``case``, ``person``, ``number`` and
        ``dialects``.

    Returns
    -------
    list[list]
        One ``[tense, mode, act/mid/p, gender, case, person, number,
        dialects]`` list per matching row, in table order; an empty list when
        the form does not occur.
    """
    feature_columns = (
        "tense", "mode", "act/mid/p", "gender",
        "case", "person", "number", "dialects",
    )
    matching_rows = df[df["form"] == verb]
    return [
        [record[column] for column in feature_columns]
        for _, record in matching_rows.iterrows()
    ]

In [18]:
import pickle

In [19]:
# Load the precomputed similarity dictionary.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle
# files that come from a trusted source.
with open("sents_sim_dict.pickle", mode="rb") as file:
    sents_sim = pickle.loads(file.read())

In [20]:
def get_lemma_syn(word, sim_dic):
    """Return the second-ranked similar lemma recorded for ``word``.

    The word is lemmatized with ``lemmatize_cltk`` (first returned item), and
    the per-sentence entries of ``sim_dic`` are searched in iteration order
    for that lemma.

    Parameters
    ----------
    word : str
        Word whose synonym is wanted.
    sim_dic : mapping
        ``{sentence: {lemma: [(candidate, score), ...]}}``.

    Returns
    -------
    The candidate with the second-highest score from the first sentence entry
    that lists the lemma with at least two candidates, or ``None`` when no
    such entry exists.

    Notes
    -----
    Entries with fewer than two candidates are skipped; indexing position 1
    on them previously raised ``IndexError``.
    """
    lemma_w = lemmatize_cltk(word)[0]

    for candidates_by_lemma in sim_dic.values():
        if lemma_w not in candidates_by_lemma:
            continue
        ranked = sorted(
            candidates_by_lemma[lemma_w],
            key=lambda pair: pair[1],
            reverse=True,
        )
        # NOTE(review): rank 0 is deliberately skipped, as in the original
        # code; presumably it is the lemma itself. Confirm against the code
        # that builds the similarity dictionary.
        if len(ranked) < 2:
            continue
        syn, _score = ranked[1]
        return syn
    return None

In [36]:
def get_infl_s_noun(noun, analysis, df=None):
    """Return the forms of lemma ``noun`` that carry the given analysis.

    Parameters
    ----------
    noun : str
        Lemma to inflect.
    analysis : sequence
        ``[gender, case, number, dialects]``; extra trailing items are
        ignored.
    df : pandas.DataFrame, optional
        Table to search (columns ``lemma``, ``form``, ``gender``, ``case``,
        ``number``, ``dialects``). Defaults to the notebook-level ``noun_df``.

    Returns
    -------
    list
        Matching forms in table order; empty when nothing matches or when
        ``analysis`` has fewer than four items.

    Notes
    -----
    The ``dialects`` value may be a list. Comparing a whole column against a
    list with ``==`` makes pandas treat the list as an array and raise
    "Lengths must match to compare", so features are compared cell by cell
    instead, after narrowing the table to the requested lemma.
    """
    table = noun_df if df is None else df
    feature_columns = ("gender", "case", "number", "dialects")
    if len(analysis) < len(feature_columns):
        return []
    wanted = list(analysis[:len(feature_columns)])

    # Vectorised filter on the lemma first: only a handful of rows remain for
    # the per-cell comparison below.
    candidates = table.loc[table["lemma"] == noun]
    rows = zip(candidates["form"], *(candidates[column] for column in feature_columns))
    return [
        form
        for form, *features in rows
        if all(bool(cell == target) for cell, target in zip(features, wanted))
    ]

In [37]:
def get_infl_s_verb(verb, analysis, df=None):
    """Return the forms of lemma ``verb`` that carry the given analysis.

    Parameters
    ----------
    verb : str
        Lemma to inflect.
    analysis : sequence
        ``[tense, mode, act/mid/p, gender, case, person, number, dialects]``.
    df : pandas.DataFrame, optional
        Table to search (column ``lemma``, ``form`` and the eight feature
        columns above). Defaults to the notebook-level ``verb_df``.

    Returns
    -------
    list
        Matching forms in table order; empty when nothing matches or when
        ``analysis`` has fewer than eight items (e.g. a four-item noun
        analysis, which previously raised ``IndexError``).

    Notes
    -----
    The ``dialects`` value may be a list. Comparing a whole column against a
    list with ``==`` makes pandas treat the list as an array and raise
    "Lengths must match to compare", so features are compared cell by cell
    instead, after narrowing the table to the requested lemma.
    """
    table = verb_df if df is None else df
    feature_columns = (
        "tense", "mode", "act/mid/p", "gender",
        "case", "person", "number", "dialects",
    )
    if len(analysis) < len(feature_columns):
        return []
    wanted = list(analysis[:len(feature_columns)])

    # Vectorised filter on the lemma first: only a handful of rows remain for
    # the per-cell comparison below.
    candidates = table.loc[table["lemma"] == verb]
    rows = zip(candidates["form"], *(candidates[column] for column in feature_columns))
    return [
        form
        for form, *features in rows
        if all(bool(cell == target) for cell, target in zip(features, wanted))
    ]

In [43]:
# Inspect every verb-table row whose lemma is "θαάσσω".
is_requested_lemma = verb_df["lemma"].eq("θαάσσω")
t_df = verb_df.loc[is_requested_lemma]
t_df

Unnamed: 0,lemma,form,tense,mode,act/mid/p,gender,case,person,number,dialects
691620,θαάσσω,θάασσε,pres,imperat,imperat,,,2nd,sg,
691621,θαάσσω,θάασσε,imperf,ind,imperf,,,3rd,sg,"[(homeric, ionic]"
691622,θαάσσω,θάασσεν,imperf,ind,imperf,,,3rd,sg,"[(homeric, ionic]"
691623,θαάσσω,θάασσον,imperf,ind,imperf,,,3rd,pl,"[(homeric, ionic]"
691624,θαάσσω,θάασσον,imperf,ind,imperf,,,1st,sg,"[(homeric, ionic]"
691625,θαάσσω,θαάσσει,pres,ind,mp,,,2nd,sg,
691626,θαάσσω,θαάσσει,pres,ind,act,,,3rd,sg,
691627,θαάσσω,θαάσσειν,pres,inf,act,,,,,
691628,θαάσσω,θαάσσεις,pres,ind,act,,,2nd,sg,
691629,θαάσσω,θαάσσω,pres,subj,act,,,1st,sg,


In [23]:
def get_infl_syn(lemma, analysis):
    """Inflect a synonym lemma according to ``analysis``.

    The lemma is POS-tagged with ``pos_tag_cltk`` (first returned item) and
    dispatched on that tag: ``"a"`` and ``"n"`` go to ``get_infl_s_noun``,
    ``"v"`` goes to ``get_infl_s_verb``.

    Parameters
    ----------
    lemma : str
        Lemma of the synonym to inflect.
    analysis : sequence
        Morphological analysis passed through unchanged to the inflection
        helper.

    Returns
    -------
    list
        Whatever the selected helper returns, or an empty list for any other
        tag (previously ``forms`` was left unassigned in that case and the
        function raised ``UnboundLocalError``).
    """
    pos = pos_tag_cltk(lemma)[0]
    if pos in ("a", "n"):
        return get_infl_s_noun(lemma, analysis)
    if pos == "v":
        return get_infl_s_verb(lemma, analysis)
    return []


In [41]:
# Look up the synonym recorded for "θαάσσω" in the similarity dictionary and
# display it (nothing is rendered when the lookup returns None).
s = get_lemma_syn("θαάσσω", sents_sim)
s

In [40]:
# Generate variants of one sentence by replacing a single word with an
# identically inflected synonym.
s = "ἂν δὲ καὶ αὐτοὶ βάντες ἐπὶ κληῖσι καθῖζον"
sent_analysis = {}
sent_split = s.split(" ")
for idx, word in enumerate(sent_split):
    # Only adjectives/nouns ("a"/"n") and verbs ("v") have a lookup table.
    pos = pos_tag_grecy(word)[0]
    if pos in ("a", "n"):
        word_analysis = get_noun_analysis(word, noun_df)
    elif pos == "v":
        word_analysis = get_verb_analysis(word, verb_df)
    else:
        word_analysis = []
    if not word_analysis:
        continue

    # The synonym depends only on the word, so look it up once instead of
    # once per analysis.
    lemma_synonym = get_lemma_syn(word, sents_sim)
    for analysis in word_analysis:
        print(analysis)
        if lemma_synonym is None:
            continue
        print(lemma_synonym)
        inflected_syns = get_infl_syn(lemma_synonym, analysis)
        for syn in inflected_syns:
            # Substitute into a copy: writing into sent_split itself would
            # carry this replacement over into every later variant.
            variant = sent_split.copy()
            variant[idx] = syn
            new_sent = " ".join(variant)
            print(new_sent)

            

['aor', 'part', 'act', 'masc', 'nom/voc', ' ', 'pl', ' ']
lol
διαβαίνω
ἂν δὲ καὶ αὐτοὶ διαβάντες ἐπὶ κληῖσι καθῖζον
['fem', 'dat', 'pl', ['(epic', 'ionic']]
['imperf', 'ind', 'imperf', ' ', ' ', '3rd', 'pl', ' ']
lol
θαάσσω
['imperf', 'ind', 'imperf', ' ', ' ', '1st', 'sg', ' ']
lol
θαάσσω
