In [34]:
from gensim.models import KeyedVectors
import numpy as np
import pandas as pd
import random as rd
import itertools as it
import matplotlib.pyplot as plt

# 1-Loading Things
Loading the models, sentences and analogies used.
Models: https://fasttext.cc/docs/en/crawl-vectors.html
Sentences: https://github.com/alexa/massive

In [31]:
def load_files(model_path, sentences_path, limit=None):
    """Load the word vectors and the sentences of one language.

    Args:
        model_path: path of the vectors file, read with
            ``KeyedVectors.load_word2vec_format`` (undecodable bytes are replaced).
        sentences_path: path of a JSON-lines file whose records carry an ``'utt'`` field.
        limit: forwarded as ``limit`` to the vector loader; ``None`` keeps the loader's default.

    Returns:
        A ``(model, sentences)`` tuple: the loaded vectors and the ``'utt'`` column.
    """
    word_vectors = KeyedVectors.load_word2vec_format(
        model_path,
        unicode_errors='replace',
        limit=limit,
    )
    records = pd.read_json(sentences_path, lines=True)
    return word_vectors, records['utt']

In [32]:
# Per-language resources; each entry is [model path, sentences path].
paths = {
    'pt': ['datasets/cc.pt.300.vec', 'datasets/1.0/data/pt-PT.jsonl'],
    'en': ['datasets/cc.en.300.vec', 'datasets/1.0/data/en-US.jsonl'],
    'es': ['datasets/cc.es.300.vec', 'datasets/1.0/data/es-ES.jsonl']
}
languages = paths.keys()

# Cap passed as `limit` to the vector loader for every language.
VOCAB_LIMIT = 500_000

# Load the vectors and the sentences of every language.
models, sentences = {}, {}
for lang, (model_path, sentences_path) in paths.items():
    models[lang], sentences[lang] = load_files(model_path, sentences_path, limit=VOCAB_LIMIT)

# 2-Preparing Data

In [33]:
# Build aligned sentence embeddings: samples[lang][i] is the sum of the word
# vectors of the i-th kept sentence, and index i refers to the same sentence
# in every language.

# Derive the number of sentences from the loaded data instead of hard-coding
# it; only positions present in every language can form an aligned sample.
n_sentences = min(len(utterances) for utterances in sentences.values())

samples = {lang: [] for lang in languages}
for idx in range(n_sentences):
    sentence_vectors = {}
    try:
        for lang, utterances in sentences.items():
            # .iloc is positional, so the KeyError handler below only ever
            # reacts to vocabulary misses, never to a missing index label.
            tokens = utterances.iloc[idx].split(' ')
            sentence_vectors[lang] = sum(models[lang][token] for token in tokens)
    except KeyError:
        # A token is missing from at least one model: drop the sentence in
        # every language so the samples stay aligned.
        continue

    for lang, vector in sentence_vectors.items():
        samples[lang].append(vector)
f'{ len(sentences["en"]) } -> { len(samples["en"]) } ({ len(samples["en"]) / len(sentences["en"]) *100:.2f}%)'

'16521 -> 12839 (77.71%)'

# 3-Making the translation

In [35]:
# Fit one orthogonal translator matrix per ordered (origin, target) pair from
# the aligned sentence vectors; apply it as `translator @ origin_vector`.
# NOTE: the reverse direction could instead be taken as the transpose of the
# forward matrix (iterating combinations rather than permutations), which
# would halve the SVD work for similar results.
translations = {src: dict.fromkeys(dst for dst in languages if dst != src) for src in languages}

# Stack each language's sample list once instead of converting it on every pair.
sample_matrices = {lang: np.asarray(vectors) for lang, vectors in samples.items()}

for origin, target in it.permutations(languages, 2):
    cross = sample_matrices[origin].T @ sample_matrices[target]
    U, Sig, Vt = np.linalg.svd(cross)
    translations[origin][target] = Vt.T @ U.T

In [42]:
# Map the Portuguese vector of 'sapato' into the Spanish space, then list its nearest Spanish words.
sapato_pt = models['pt']['sapato']
sapato_in_es_space = translations['pt']['es'] @ sapato_pt
models['es'].most_similar(sapato_in_es_space)

[('zapato', 0.6466608643531799),
 ('zapatos', 0.5412116050720215),
 ('vestido', 0.5243082642555237),
 ('calzado', 0.512926459312439),
 ('bolso', 0.5040846467018127),
 ('tacón', 0.5010794401168823),
 ('tacones', 0.49396854639053345),
 ('abriguito', 0.4819582998752594),
 ('zapatito', 0.4813190698623657),
 ('vestidito', 0.4792330861091614)]

# 4-Evaluate *future work
Two ways to evaluate a translation path between two languages that goes through an intermediate (pivot) language.

In [43]:
# Portuguese probe word; the cells below look it up in models['pt'] and translate it.
word = 'sapato'

In [51]:
# Strategy 1: snap to the most similar real word in every language the path passes through.
# Most expensive option (one most_similar query per hop), and the word is approximated at each hop.
word_in_en_space = translations['pt']['en'] @ models['pt'][word]
english_word = models['en'].most_similar(word_in_en_space)[0][0]
print(english_word)
pivot_in_es_space = translations['en']['es'] @ models['en'][english_word]
models['es'].most_similar(pivot_in_es_space)[0]

shoes


('zapatos', 0.6335018873214722)

In [63]:
# Strategy 2: carry the transformed vector through each intermediate space.
# most_similar runs only once, so the word is approximated a single time, at the end.
english_vector = translations['pt']['en'] @ models['pt'][word]
print(type(english_vector), len(english_vector))
spanish_vector = translations['en']['es'] @ english_vector
models['es'].most_similar(spanish_vector)[0]

<class 'numpy.ndarray'> 300


('zapato', 0.5794187188148499)