In [2]:
import gzip
import json
import pandas as pd
from my_functions_improved import *

  backends.update(_get_backends("networkx.backends"))
  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


‎𐤀 CLTK version '1.2.3'. When using the CLTK in research, please cite: https://aclanthology.org/2021.acl-demo.3/

Pipeline for language 'Ancient Greek' (ISO: 'grc'): `GreekNormalizeProcess`, `GreekStanzaProcess`, `GreekEmbeddingsProcess`, `StopsProcess`.

⸖ ``LatinSpacyProcess`` using Stanza model by Stanford University from https://stanfordnlp.github.io/stanza/ . Please cite: https://arxiv.org/abs/2003.07082
⸖ ``LatinEmbeddingsProcess`` using word2vec model by University of Oslo from http://vectors.nlpl.eu/ . Please cite: https://aclanthology.org/W17-0237/

⸎ To suppress these messages, instantiate ``NLP()`` with ``suppress_banner=True``.


In [3]:
# Read the gzip-compressed JSON file of noun data in text mode (UTF-8)
# and parse it into a Python object.
with gzip.open('noun_dict.json.gz', mode='rt', encoding='utf-8') as noun_fh:
    noun_dict = json.load(noun_fh)

In [4]:
# Read the gzip-compressed JSON file of verb data in text mode (UTF-8)
# and parse it into a Python object.
with gzip.open('verb_dict.json.gz', mode='rt', encoding='utf-8') as verb_fh:
    verb_dict = json.load(verb_fh)

In [5]:
# Build the noun table from the parsed JSON object.
noun_df = pd.DataFrame(noun_dict)

In [6]:
# Display the "form" column of the noun table.
noun_df["form"]

0            ἅβρα
1            ἅβρα
2           ἅβραι
3           ἅβραι
4          ἅβραις
           ...   
210675     ζῳῶδες
210676     ζῳῶδες
210677    ζῳωδίας
210678    ζῳωδίας
210679     ζῳωδίᾳ
Name: form, Length: 210680, dtype: object

In [23]:
# Select the noun rows whose "form" is exactly "ἐρετμοῖς" and display them.
# NOTE(review): the recorded output for this cell shows only the column
# header, i.e. no row matched this form.
l = noun_df.loc[noun_df["form"]=="ἐρετμοῖς"]
l

Unnamed: 0,lemma,form,gender,case,number,dialects


In [8]:
# Build the verb table from the parsed JSON object.
verb_df = pd.DataFrame(verb_dict)

In [9]:
# Display the verb table (pandas truncates the middle rows in the output).
verb_df

Unnamed: 0,lemma,form,tense,mode,act/mid/p,gender,case,person,number,dialects
0,ἅλλομαι,ἅλεται,aor,subj,mid,,,3rd,sg,
1,ἅλλομαι,ἅληται,aor,subj,mid,,,3rd,sg,
2,ἅλλομαι,ἅλῃ,aor,subj,mid,,,2nd,sg,
3,ἅλλομαι,ἅλλεσθαι,pres,inf,mid,,,,,
4,ἅλλομαι,ἅλλεσθε,imperf,ind,mid,,,2nd,pl,"[(doric, aeolic]"
...,...,...,...,...,...,...,...,...,...,...
859327,ζῳοτροφέω,ζῳοτροφεῖν,pres,inf,act,,,,,[doric]
859328,ζῳοτροφέω,ζῳοτροφούντων,pres,part,act,masc/neut,gen,,pl,[doric]
859329,ζῳοτροφέω,ζῳοτροφούντων,pres,imperat,act,,,3rd,pl,[doric]
859330,ζῳοτροφέω,ζῳοτροφοῦσιν,pres,part,act,masc/neut,dat,,pl,[doric]


In [10]:
# Input sentences for the paraphrase loop, one per line.
strings = [
    "ἂν δὲ καὶ αὐτοὶ βάντες ἐπὶ κληῖσι καθῖζον",
    "ἑξῆς δ᾽ ἑζόμενοι πολιὴν ἅλα τύπτον ἐρετμοῖς",
    "ἔνθεν δὲ προτέρω πλέομεν ἀκαχήμενοι ἦτορ",
    "ἄσμενοι ἐκ θανάτοιο, φίλους ὀλέσαντες ἑταίρους",
    "νῆα μὲν οἵ γε μέλαιναν ἁλὸς βένθοσδε ἔρυσσαν",
    "τοὶ δὲ πρυμνήσι᾽ ἔλυσαν",
]

In [11]:
def get_noun_analysis(noun, df):
    """Collect every morphological reading stored for an inflected noun form.

    Parameters
    ----------
    noun
        The inflected form to look up in the ``"form"`` column.
    df
        DataFrame providing at least the columns ``form``, ``gender``,
        ``case``, ``number`` and ``dialects``.

    Returns
    -------
    list
        One ``[gender, case, number, dialects]`` list per matching row, in
        row order; an empty list when no row has that form.
    """
    fields = ("gender", "case", "number", "dialects")
    matches = df[df["form"] == noun]
    return [[entry[field] for field in fields] for _, entry in matches.iterrows()]

In [12]:
def get_verb_analysis(verb, df):
    """Collect every morphological reading stored for an inflected verb form.

    Parameters
    ----------
    verb
        The inflected form to look up in the ``"form"`` column.
    df
        DataFrame providing at least the columns ``form``, ``tense``,
        ``mode``, ``act/mid/p``, ``gender``, ``case``, ``person``,
        ``number`` and ``dialects``.

    Returns
    -------
    list
        One ``[tense, mode, act/mid/p, gender, case, person, number,
        dialects]`` list per matching row, in row order; an empty list when
        no row has that form.
    """
    fields = (
        "tense", "mode", "act/mid/p", "gender",
        "case", "person", "number", "dialects",
    )
    matches = df[df["form"] == verb]
    return [[entry[field] for field in fields] for _, entry in matches.iterrows()]

In [13]:
import pickle

In [14]:
# Load the pickled similarity data used by the synonym lookup.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only open this file if it comes from a trusted source.
with open("sents_sim_dict.pickle", mode="rb") as pickle_fh:
    sents_sim = pickle.load(pickle_fh)

In [15]:
def get_lemma_syn(word, sim_dicts):
    """Return the highest-scoring synonym lemma of ``word`` that has inflection data.

    The word is lemmatized with ``lemmatize_cltk`` (its first result is
    used). Each mapping in ``sim_dicts`` is then consulted in order: the
    candidates stored under that lemma are ranked by score, highest first,
    and the first candidate present in the ``lemma`` column of ``verb_df``
    or ``noun_df`` is returned.

    Parameters
    ----------
    word
        The word form whose lemma is looked up.
    sim_dicts
        Iterable of mappings from a lemma to a list of ``(synonym, score)``
        pairs.

    Returns
    -------
    The selected synonym lemma, or ``None`` when no mapping offers a
    candidate that appears in either table.
    """
    lemma_w = lemmatize_cltk(word)[0]

    for sim_dict in sim_dicts:
        # Direct key lookup; the previous version scanned every key of the
        # mapping and compared it to the lemma.
        if lemma_w not in sim_dict:
            continue
        candidates = sim_dict[lemma_w]
        if candidates != []:
            ranked = sorted(candidates, key=lambda pair: pair[1], reverse=True)
            for syn, _score in ranked:
                # Only synonyms with inflection data in one of the tables are usable.
                if syn in verb_df["lemma"].values or syn in noun_df["lemma"].values:
                    return syn
    # No usable synonym was found in any mapping.
    return None

In [16]:
def get_infl_s_noun(noun, analysis):
    """List the forms of lemma ``noun`` in ``noun_df`` that match ``analysis``.

    Parameters
    ----------
    noun
        Lemma to look up in the ``lemma`` column of ``noun_df``.
    analysis
        Sequence read as ``[gender, case, number, dialects]``.

    Returns
    -------
    list
        Matching values of the ``form`` column. When the dialects entry is
        the single-space string ``" "``, rows are filtered on lemma, gender,
        case and number only. Otherwise the dialects entry is iterated and,
        for each item, rows whose ``dialects`` value equals that item are
        added as well-filtered matches (so forms may repeat).
    """
    gender, case, number, dialects = (analysis[i] for i in range(4))
    base_mask = (
        (noun_df["lemma"] == noun)
        & (noun_df["gender"] == gender)
        & (noun_df["case"] == case)
        & (noun_df["number"] == number)
    )
    if dialects != " ":
        # One additional equality filter per item of the dialects entry.
        selections = [base_mask & (noun_df["dialects"] == dialect) for dialect in dialects]
    else:
        selections = [base_mask]

    forms = []
    for selection in selections:
        forms.extend(noun_df.loc[selection, "form"].tolist())
    return forms

In [17]:
def get_infl_s_verb(verb, analysis):
    """List the forms of lemma ``verb`` in ``verb_df`` that match ``analysis``.

    Parameters
    ----------
    verb
        Lemma to look up in the ``lemma`` column of ``verb_df``.
    analysis
        Sequence read as ``[tense, mode, act/mid/p, gender, case, person,
        number, dialects]``.

    Returns
    -------
    list
        Matching values of the ``form`` column. When the dialects entry is
        the single-space string ``" "``, rows are filtered on the lemma and
        the first seven analysis fields only. Otherwise the dialects entry
        is iterated and, for each item, rows whose ``dialects`` value equals
        that item are added as well-filtered matches (so forms may repeat).
    """
    tense, mode, voice, gender, case, person, number, dialects = (
        analysis[i] for i in range(8)
    )
    base_mask = (
        (verb_df["lemma"] == verb)
        & (verb_df["tense"] == tense)
        & (verb_df["mode"] == mode)
        & (verb_df["act/mid/p"] == voice)
        & (verb_df["gender"] == gender)
        & (verb_df["case"] == case)
        & (verb_df["person"] == person)
        & (verb_df["number"] == number)
    )
    if dialects != " ":
        # One additional equality filter per item of the dialects entry.
        selections = [base_mask & (verb_df["dialects"] == dialect) for dialect in dialects]
    else:
        selections = [base_mask]

    forms = []
    for selection in selections:
        forms.extend(verb_df.loc[selection, "form"].tolist())
    return forms

In [18]:
def get_infl_syn(lemma, analysis):
    """Inflect a synonym lemma according to a morphological analysis.

    The lemma is tagged with ``pos_tag_cltk`` (its first result is used) and
    the lookup is dispatched on that tag: ``"a"`` / ``"n"`` go to
    ``get_infl_s_noun``, ``"v"`` goes to ``get_infl_s_verb``.

    Parameters
    ----------
    lemma
        The synonym lemma to inflect.
    analysis
        An analysis list as produced by ``get_noun_analysis`` (4 fields) or
        ``get_verb_analysis`` (8 fields).

    Returns
    -------
    list
        The matching inflected forms. An empty list is returned when the tag
        is none of ``"a"``, ``"n"``, ``"v"`` (previously an
        ``UnboundLocalError``), or when the lemma is tagged as a verb but the
        analysis has fewer than the 8 fields ``get_infl_s_verb`` reads
        (previously an ``IndexError``).
    """
    pos = pos_tag_cltk(lemma)[0]
    if pos in ("a", "n"):
        return get_infl_s_noun(lemma, analysis)
    if pos == "v":
        # A noun-shaped analysis cannot describe a verb form; there is
        # nothing to look up rather than something to crash on.
        if len(analysis) < 8:
            return []
        return get_infl_s_verb(lemma, analysis)
    # Any other part of speech has no inflection table in this notebook.
    return []


In [19]:
# Try the synonym lookup on a single word; the bare `s` on the last line
# displays the result (nothing is shown when the result is None).
s = get_lemma_syn("θαάσσω", sents_sim)
s

In [20]:
# Print the synonym report for every similarity mapping in sents_sim.
for sim_dict in sents_sim:
    report_similarities(sim_dict)

The best synonyms for βαίνω are:
	διαβαίνω with a score of [[0.88808715]]
	ἕρπω with a score of [[0.87936157]]
	στείχω with a score of [[0.8118147]]
	πατέω with a score of [[0.]]
The best synonyms for καθίζω are:
	ἵζω with a score of [[0.94065994]]
	καταδύω with a score of [[0.94029063]]
	κατατίθημι with a score of [[0.9275684]]
	ἰάλλω with a score of [[0.91808397]]
	εὐνάω with a score of [[0.89377445]]
	τίθημι with a score of [[0.89339024]]
	ἐφίστημι with a score of [[0.86465937]]
	ὑφίστημι with a score of [[0.81488335]]
	πέμπω with a score of [[0.7911981]]
	ἐπιτίθημι with a score of [[0.7376522]]
	λέγω with a score of [[0.6852222]]
	συγκαλέω with a score of [[0.4999244]]
	καθίστημι with a score of [[0.3065406]]
	συνάγω with a score of [[0.]]
	προτίθημι with a score of [[0.]]
	ἰάπτω with a score of [[0.]]
	ὑπομιμνήσκω with a score of [[0.]]
	ἐπικλίνω with a score of [[0.]]
	ὑποστόρνυμι with a score of [[0.]]
	στηρίζω with a score of [[0.]]
The best synonyms for ἕζομαι are:
	ἵζω with a

In [21]:
# Spot-check the tagger: print the first element returned for "ἦτορ".
print(pos_tag_grecy("ἦτορ")[0])

n


In [22]:
# For every sentence, build paraphrases that differ from the original in one
# word: each noun/adjective/verb with a known analysis is replaced by the
# matching inflected forms of its best synonym lemma. One list of distinct
# paraphrases is printed per sentence.
for s in strings:
    paraphrasis = []
    # Tokens are split on single spaces, so punctuation stays attached to words.
    sent_split = s.split(" ")
    # `position` rather than `id`: a top-level loop variable named `id`
    # would shadow the builtin for the rest of the kernel session.
    for position, word in enumerate(sent_split):
        pos = pos_tag_grecy(word)[0]
        if pos in ("a", "n"):
            word_analysis = get_noun_analysis(word, noun_df)
        elif pos == "v":
            word_analysis = get_verb_analysis(word, verb_df)
        else:
            word_analysis = []
        if not word_analysis:
            continue

        # The synonym lemma depends only on the word, not on the individual
        # analysis, so it is looked up once per word instead of once per analysis.
        lemma_synonym = get_lemma_syn(word, sents_sim)
        if lemma_synonym is None:
            continue

        new_sent_split = sent_split.copy()
        for analysis in word_analysis:
            for syn in get_infl_syn(lemma_synonym, analysis):
                new_sent_split[position] = syn
                new_sent = " ".join(new_sent_split)
                # Keep first-seen order while dropping duplicate sentences.
                if new_sent not in paraphrasis:
                    paraphrasis.append(new_sent)
    print(paraphrasis)

            

['ἂν δὲ καὶ αὐτοὶ διαβάντες ἐπὶ κληῖσι καθῖζον', 'ἂν δὲ καὶ αὐτοὶ διαβάσαντες ἐπὶ κληῖσι καθῖζον', 'ἂν δὲ καὶ αὐτοὶ βάντες ἐπὶ κληῖσι ἵζον', 'ἂν δὲ καὶ αὐτοὶ βάντες ἐπὶ κληῖσι ἷζον']
['ἑξῆς δ᾽ ἱζόμενοι πολιὴν ἅλα τύπτον ἐρετμοῖς', 'ἑξῆς δ᾽ ἑζόμενοι πολιὴν ἅλα κόπτον ἐρετμοῖς']
[]
['γηθόσυνοι ἐκ θανάτοιο, φίλους ὀλέσαντες ἑταίρους', 'ἄσμενοι ἐκ θανάτοιο, φίλους ὁδεύσαντες ἑταίρους']
['νῆα μὲν οἵ γε πορφυρᾶν ἁλὸς βένθοσδε ἔρυσσαν', 'νῆα μὲν οἵ γε πορφυρέαν ἁλὸς βένθοσδε ἔρυσσαν', 'νῆα μὲν οἵ γε πορφυρέην ἁλὸς βένθοσδε ἔρυσσαν', 'νῆα μὲν οἵ γε πορφυρῆν ἁλὸς βένθοσδε ἔρυσσαν', 'νῆα μὲν οἵ γε μέλαιναν ἁλὸς βένθοσδε ἔνισπον', 'νῆα μὲν οἵ γε μέλαιναν ἁλὸς βένθοσδε ἤνισπον']
['τοὶ δὲ πρυμνήσι᾽ ἔκλυσαν', 'τοὶ δὲ πρυμνήσι᾽ ἐξέλυσαν']


In [28]:
# Number of entries stored under the "lemma" key of the raw noun data.
len(noun_dict["lemma"])

210680

In [26]:
# Number of rows in the verb table.
len(verb_df)

859332