In [1]:
import pandas as pd
import requests
from gensim.models import KeyedVectors
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity

# Simple Latin dictionary

In [2]:
# Probe the Latin-is-Simple vocabulary search API with a single lemma and
# display the raw JSON payload.
target = "equus"
base_url = "https://www.latin-is-simple.com/api/vocabulary/search/?query={}&forms_only=true"
# Explicit timeout: without one, a stalled connection blocks the kernel forever.
resp_json = requests.get(base_url.format(target), timeout=10).json()
resp_json

[{'id': 63,
  'intern_type': 'noun',
  'short_name': 'equus',
  'full_name': 'equus, equi [m.] O',
  'type': {'name': 'noun', 'label': 'Noun'},
  'translations_unstructured': {'en': 'horse', 'de': 'Pferd'},
  'url': 'https://www.latin-is-simple.com/en/vocabulary/noun/63/'}]

In [3]:
# English gloss of the first entry in the API response.
first_hit = resp_json[0]
first_hit["translations_unstructured"]["en"]

'horse'

In [8]:
def simple_translation(target, timeout=10):
    """Return the English gloss of ``target`` from the Latin-is-Simple API.

    The lookup is best-effort: any network, HTTP, decoding or response-shape
    problem results in an empty string rather than an exception, so the
    function can be mapped over a whole column of words.

    Parameters
    ----------
    target : str
        Word to look up.
    timeout : float, default 10
        Seconds to wait for the server before giving up.

    Returns
    -------
    str
        The ``"en"`` entry of the first search hit, or ``""`` when nothing
        usable was found or the entry is the placeholder
        ``"still in translation"``.
    """
    base_url = "https://www.latin-is-simple.com/api/vocabulary/search/"
    try:
        # Let requests build and encode the query string instead of splicing
        # the word into the URL by hand.
        resp = requests.get(
            base_url,
            params={"query": target, "forms_only": "true"},
            timeout=timeout,
        )
        resp.raise_for_status()
        transl = resp.json()[0]["translations_unstructured"]["en"]
    except (requests.RequestException, ValueError, LookupError, TypeError):
        # RequestException: connection / timeout / HTTP error
        # ValueError: body is not valid JSON
        # LookupError / TypeError: no hits or unexpected response shape
        return ""
    if transl == "still in translation":
        return ""
    return transl

In [9]:
# Read the filtered vocabulary table from disk and preview its first 20 rows.
vocab_path = "../data/filtered_vocab_df.json"
filtered_vocab_df = pd.read_json(vocab_path)
filtered_vocab_df.iloc[:20]

Unnamed: 0,word,1501-1550,1551-1600,1601-1650,1651-1700,mean,in_lila_embeddings,in_lasla,in_operamaiora
5018,dico,42831,120825,41832,72221,69427.25,True,True,True
6155,omnis,33152,108530,46009,66387,63519.5,True,True,True
2515,facio,33323,109486,40134,53953,59224.0,True,True,True
4040,pars,32591,90667,43593,51249,54525.0,True,True,True
6769,habeo,30884,87225,38386,57791,53571.5,True,True,True
364,possum,23869,79572,36898,72892,53307.75,True,True,True
897,res,27869,68790,27473,49014,43286.5,True,True,True
4810,suus,23716,58975,33267,52441,42099.75,True,True,True
2392,uideo,21389,60143,28355,54288,41043.75,True,True,True
1790,magnus,23446,61213,29708,39211,38394.5,True,True,True


In [10]:
%%time
filtered_vocab_df["word"].head(20).apply(simple_translation)

CPU times: user 582 ms, sys: 33.9 ms, total: 616 ms
Wall time: 7.1 s


5018                                      say, call, tell
6155                                                     
2515                                     do, make, handle
4040                                                 part
6769                 have, hold, possess, consider, think
364                                          be able, can
897                                              suddenly
4810                 his own, her own, its own, their own
2392                                                     
1790                                           big, great
2217                                      place, location
2377                                         body, corpus
3110                                                 many
2555                    first, foremost, chief, principal
2853                                          give, offer
4389                                                     
1648    reckoning, account, reason, judgement, conside...
1830          

In [11]:
# Select the period columns by their "YYYY-YYYY" label instead of by position,
# so adding or reordering columns in the source table cannot silently shift
# the selection.
period_mask = filtered_vocab_df.columns.str.match(r"\d{4}-\d{4}$")
periods_str = filtered_vocab_df.columns[period_mask]
periods_str

Index(['1501-1550', '1551-1600', '1601-1650', '1651-1700'], dtype='object')

In [None]:
# Six vector models are used in total: 2 from LiLa and 4 trained by us on noscemus.
# This cell loads the two LiLa models.

lasla_path = "../data/large_data/allLASLA-lemmi-fast-100-SKIP-win5-min5.vec"
operamaiora_path = "../data/large_data/opera-maiora-lemmas_skip_100.vec"

lasla = KeyedVectors.load_word2vec_format(lasla_path)
operamaiora = KeyedVectors.load_word2vec_format(operamaiora_path)

In [6]:
# One saved KeyedVectors model per period label, keyed by that label.
vectors_dict = {
    per_str: KeyedVectors.load("../data/large_data/vectors_{}.wv".format(per_str))
    for per_str in periods_str
}

In [21]:
# Nearest neighbours of one target word in every model, side by side.
target = "equus"
topn = 10

# label -> model, in the column order wanted for the table
models_in_order = {"lasla": lasla, "operamaiora": operamaiora}
models_in_order.update((per_str, vectors_dict[per_str]) for per_str in periods_str)

# most_similar yields (word, similarity) pairs; keep only the words
most_similar_by_per = {
    label: [word for word, _sim in model.most_similar(target, topn=topn)]
    for label, model in models_in_order.items()
}
pd.DataFrame(most_similar_by_per)

Unnamed: 0,lasla,operamaiora,1501-1550,1551-1600,1601-1650,1651-1700
0,currus,asinus,camelus,asinus,bos,asinus
1,equa,eques,asini,mulus,canis,currus
2,equito,bos,currus,bos,mulus,bos
3,calcar,bipes,asinus,currus,currus,mulus
4,ceruus,domus,canis,iumentum,asinus,eques
5,freno,trituro,bos,camelus,camelus,canus
6,habena,mercatio,pecus,eques,grex,ungula
7,essedum,equester,ungula,canis,lupus,canis
8,frenum,iumentum,mulus,certamen,curru,ceruus
9,frendo,animalis,iumentum,lupus,ferox,curru


In [17]:
# Start a fresh neighbour table for "seminarium", beginning with the LASLA model.
# NOTE(review): the opera maiora lookup is disabled below; the reason is not
# recorded here, so confirm before re-enabling.
most_similar_by_per = {
    "lasla": [word for word, _sim in lasla.most_similar("seminarium", topn=10)],
}
#most_similar_by_per["operamaiora"] = [tup[0] for tup in operamaiora.most_similar("seminarium", topn=20)]

In [18]:
# Add one column of "seminarium" neighbours per period model.
most_similar_by_per.update({
    per_str: [word for word, _sim in vectors_dict[per_str].most_similar("seminarium", topn=10)]
    for per_str in periods_str
})

In [19]:
# Tabulate the neighbour lists: one column per model, one row per rank.
pd.DataFrame(data=most_similar_by_per)

Unnamed: 0,lasla,1501-1550,1551-1600,1601-1650,1651-1700
0,carnarium,sacrarium,ordinarius,mortarium,caldarium
1,sudarium,ordinarius,saecularis,caldarium,impono
2,uinarius,mortarium,sacrarium,palatium,mortarium
3,torcularium,macero,iubeo,aedes,aerarium
4,armarium,caldarium,tusculanus,sacrarium,ordinarius
5,milliarium,aerarium,donatio,conficio,aenarius
6,mortarium,oleum,victor,aerarium,scalptura
7,asinarius,mercenarius,seruus,soleo,paro
8,pomarium,iubeo,ius,mensa,preciosus
9,olearius,fiscus,praecipio,farina,statua


In [None]:
def _unit_scale(values):
    """Min-max scale a 1-D array to [0, 1]; a zero-spread array maps to all zeros."""
    low = values.min()
    span = values.max() - low
    if span == 0:
        # Avoid 0/0 -> NaN when every point shares the same coordinate.
        return values - low
    return (values - low) / span


def get_tsne_coors(svd_matrix, perplexity=18):
    """Project the rows of ``svd_matrix`` into 2-D with t-SNE, scaled to [0, 1].

    Parameters
    ----------
    svd_matrix : pandas.DataFrame
        One row per item to embed; its ``index`` is returned as the labels.
        (Presumably rows are SVD-reduced word vectors; confirm with the caller.)
    perplexity : float, default 18
        Passed straight to ``TSNE``.

    Returns
    -------
    xs, ys : numpy.ndarray
        The two t-SNE coordinates, each min-max normalised independently.
    words : pandas.Index
        ``svd_matrix.index``, in the same order as ``xs`` / ``ys``.
    """
    words = svd_matrix.index
    # Cosine distance is computed by TSNE itself from the raw rows; the fixed
    # random_state keeps the layout reproducible between runs.
    # NOTE(review): newer scikit-learn releases rename `n_iter` to `max_iter`;
    # confirm against the installed version.
    tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity, metric='cosine', n_iter=1000)
    pos = tsne.fit_transform(svd_matrix)  # shape (n_rows, 2)
    xs, ys = pos[:, 0], pos[:, 1]
    return _unit_scale(xs), _unit_scale(ys), words