# Word Embedding Translator

## Imports

In [48]:
import numpy as np
import pandas as pd
import itertools as it

from gensim.models import KeyedVectors

## 1 - Loading data
Loading the models and sentences used.
- Models: https://fasttext.cc/docs/en/crawl-vectors.html
- Sentences: https://github.com/alexa/massive

In [49]:
def load_files(model_path, sentences_path, limit = None):
    """Load one language's embedding model together with its sentences.

    Parameters
    ----------
    model_path
        Path handed to ``KeyedVectors.load_word2vec_format``.
    sentences_path
        Path of a JSON-lines file; only its ``'utt'`` column is kept.
    limit
        Forwarded unchanged as ``limit`` to the embedding loader
        (``None`` by default).

    Returns
    -------
    tuple
        ``(model, sentences)``: the loaded ``KeyedVectors`` object and the
        ``'utt'`` column of the sentences file.
    """
    embeddings = KeyedVectors.load_word2vec_format(
        model_path,
        unicode_errors = 'replace',
        limit = limit,
    )
    corpus = pd.read_json(sentences_path, lines = True)

    return embeddings, corpus['utt']

In [50]:
# Every entry of `paths` is ordered as: [model path, sentences path]
fasttext_path = 'datasets/'
massive_path = 'Datasets/Amazon_Massive/'

# Language code -> locale used in the sentences file name
paths = {
    code: [ f'{fasttext_path}cc.{code}.300.vec', f'{massive_path}{locale}.jsonl' ]
    for code, locale in (('pt', 'pt-PT'), ('en', 'en-US'), ('es', 'es-ES'))
}

languages = paths.keys()

In [51]:
# One embedding model and one sentence collection per language code
models, sentences = {}, {}

for lang_code, (model_file, sentences_file) in paths.items():
    # Only the first 100000 vectors of each model are loaded
    models[lang_code], sentences[lang_code] = load_files(model_file, sentences_file, limit = 100000)

## 2 - Preparing data

In [75]:
# samples[lang] is a list of [sentence text, sum of the sentence's word vectors]
samples = { lang: [] for lang in languages }

for idx in range(len(sentences['pt'])):
    try:
        # Look up every word of sentence `idx` in every language.
        word_vectors = {
            lang: [ models[lang][word] for word in sentences[lang][idx].split(' ') ]
            for lang in sentences
        }
    except KeyError:
        # A failed lookup in any language drops the sentence for all languages,
        # so position i refers to the same sentence in every samples[lang].
        # NOTE(review): presumably raised for out-of-vocabulary words - confirm.
        continue

    for lang, vectors in word_vectors.items():
        samples[lang].append([sentences[lang][idx], sum(vectors)])

AttributeError: 'KeyedVectors' object has no attribute 'add'

In [53]:
# Report, per language, how many sentences were kept as samples.
# size_samples ends up holding the sample count of the last language visited.
size_samples = 0
for key in sentences.keys():
    size_samples = len(samples[key])
    summary = f'Total sentences: { len(sentences[key]) } -> Model { key } samples: { len(samples[key]) } ({ len(samples[key]) / len(sentences[key]) * 100:.2f}%)'
    print(summary)

Total sentences: 16521 -> Model pt samples: 9685 (58.62%)
Total sentences: 16521 -> Model en samples: 9685 (58.62%)
Total sentences: 16521 -> Model es samples: 9685 (58.62%)


In [54]:
# 70/30 split: the first 70% of the samples go to training, the remainder to testing.
# The same cut index is used for every language.
split = int(size_samples*0.7)

train_set, test_set = {}, {}
for key in languages:
    train_set[key] = samples[key][:split]
    test_set[key] = samples[key][split:]

## 3 - Making the translation

In [58]:
# translations[origin][target] holds the matrix that maps origin vectors to target vectors
translations = {
    source: dict.fromkeys(lang for lang in languages if lang != source)
    for source in languages
}

# Every ordered pair is solved independently. Using combinations and transposing
# the resulting matrix for the reverse direction would be faster and should give
# similar results.
for origin, target in it.permutations(languages, 2):
    origin_matrix = np.array([ vector for _, vector in train_set[origin] ])
    target_matrix = np.array([ vector for _, vector in train_set[target] ])

    # SVD of the cross-covariance of the two sets of sentence vectors;
    # the singular values themselves are not needed.
    U, _singular_values, Vt = np.linalg.svd(origin_matrix.T @ target_matrix)
    translations[origin][target] = Vt.T @ U.T

In [60]:
# Portuguese words used to eyeball the quality of the translations
pt_word_list = ['sapato', 'flor', 'aniversário', 'saudades']

In [61]:
# Map each Portuguese word vector with the pt -> es matrix and
# print the Spanish model's most similar entries for the result.
pt_to_es = translations['pt']['es']

for pt_word in pt_word_list:
    translated_vector = pt_to_es @ models['pt'][pt_word]
    print(models['es'].most_similar(translated_vector))
    print("===")

[('zapato', 0.6491979956626892), ('zapatos', 0.5523584485054016), ('bolso', 0.5190415978431702), ('calzado', 0.5010228157043457), ('calcetín', 0.4991340935230255), ('vestido', 0.4952298104763031), ('sandalias', 0.4918721914291382), ('tacones', 0.4841281771659851), ('tacón', 0.4792153835296631), ('abrigo', 0.47542041540145874)]
===
[('flor', 0.5486701130867004), ('flores', 0.4406846761703491), ('rosa', 0.4260789453983307), ('amapola', 0.41856929659843445), ('preciosa', 0.39803820848464966), ('maceta', 0.3977915346622467), ('guirnalda', 0.3962261378765106), ('amapolas', 0.39263394474983215), ('lavanda', 0.3908427357673645), ('hoja', 0.3826061189174652)]
===
[('cumpleaños', 0.7168344259262085), ('aniversario', 0.568382978439331), ('Cumpleaños', 0.5552074313163757), ('aniversarios', 0.4919818341732025), ('celebrar', 0.49020811915397644), ('festejar', 0.4850721061229706), ('celebracion', 0.4834299385547638), ('celebración', 0.48271429538726807), ('fiesta', 0.48007288575172424), ('festejo', 

## 4 - Evaluate
Uses the Amazon MASSIVE dataset to evaluate the results.

In [76]:
def cossine_similarity(v1, v2):
    """Return the cosine similarity of two vectors.

    Computed as the dot product of ``v1`` and ``v2`` divided by the product
    of their norms (``np.linalg.norm``).
    """
    norms_product = np.linalg.norm(v1) * np.linalg.norm(v2)
    return np.dot(v1, v2) / norms_product

In [138]:
# Idea: "translating" the vector that represents a sentence into another language
# should give a vector similar to the one of the equivalent sentence in that language.
explanation_idx = 0

pt_sentence, pt_vector = test_set['pt'][explanation_idx]
en_sentence, vector_target = test_set['en'][explanation_idx]
print(pt_sentence, '->', en_sentence)

vector_translated = translations['pt']['en'] @ pt_vector

# The result lies in the -1 to 1 interval
similarity = cossine_similarity(vector_translated, vector_target)
print("Cossine similarity:", similarity)

adere a isto -> join this
Cossine similarity: 0.5027318


In [134]:
def avaliate_path(path):
    """Mean cosine similarity obtained by translating the test set along ``path``.

    The translation matrices of each consecutive language pair in ``path`` are
    composed into one matrix, which is applied to every test vector of the
    first language. Each translated vector is compared with the test vector
    at the same position for the last language.

    Parameters
    ----------
    path
        Sequence of language codes, e.g. ``['pt', 'en', 'es']``. Each
        consecutive pair is looked up in the global ``translations`` and the
        first/last codes in the global ``test_set``. A single-language path
        applies no translation and compares the test vectors with themselves.

    Returns
    -------
    The mean of the cosine similarities over the test set.

    Raises
    ------
    KeyError
        If a language (pair) of ``path`` is missing from ``translations`` or
        ``test_set``.
    ZeroDivisionError
        If the test set of the first language is empty.
    """
    # zip(path, path[1:]) yields the same pairs as itertools.pairwise,
    # which only exists on Python 3.10+.
    hops = list(zip(path, path[1:]))

    translation_matrix = None
    if hops:
        first_origin, first_target = hops[0]
        # Size the identity from the first translation matrix instead of
        # hard-coding 300, so any embedding dimension works. Starting from a
        # float64 identity keeps the accumulation in float64.
        dimension = np.shape(translations[first_origin][first_target])[1]
        translation_matrix = np.identity(dimension)

        for origin, target in hops:  # accumulate matrix
            translation_matrix = translations[origin][target] @ translation_matrix

    vectors = [ v for _, v in test_set[path[0]] ]
    if translation_matrix is not None:
        vectors = [ translation_matrix @ v for v in vectors ]
    vectors_target = [ v for _, v in test_set[path[-1]] ]

    return sum( [ cossine_similarity(v1, v2) for v1, v2 in zip(vectors, vectors_target) ] ) / len(vectors)

In [135]:
# Bulk test: mean cosine similarity over the test set when translating pt -> es directly
avaliate_path(['pt', 'es'])

0.7732642441362156

In [136]:
# Bulk test: mean cosine similarity over the test set when translating pt -> en -> es
avaliate_path(['pt', 'en', 'es'])

0.7448056987423631

## 5 - Other ways to evaluate (future work)
Two ways to evaluate a path between two languages using single words instead of whole sentences.

In [29]:
# Word pivoting: take the most similar word in each language the path passes through.
# Most expensive variant (most_similar is called several times); the vector is
# approximated by a vocabulary word at every hop.
pt_to_en = translations['pt']['en']
en_to_es = translations['en']['es']

for pt_word in pt_word_list:
    english_candidates = models['en'].most_similar(pt_to_en @ models['pt'][pt_word])
    english_word = english_candidates[0][0]
    print(english_word)

    spanish_query = en_to_es @ models['en'][english_word]
    print(models['es'].most_similar(spanish_query))
    print("===")

shoes
[('zapatos', 0.6464644074440002), ('zapatillas', 0.6166307926177979), ('sandalias', 0.5769971013069153), ('botas', 0.5539029240608215), ('calzado', 0.551113486289978), ('zapato', 0.5218526721000671), ('chanclas', 0.5212720036506653), ('calzarán', 0.5174567103385925), ('chancletas', 0.5101444721221924), ('calcetines', 0.5067110657691956)]
===
flower
[('flor', 0.5974904894828796), ('peonía', 0.5804789066314697), ('flores', 0.5449814796447754), ('peonías', 0.5210880637168884), ('floral', 0.49466392397880554), ('anturio', 0.4922579824924469), ('crisantemo', 0.4872678816318512), ('camelia', 0.47671687602996826), ('flores.La', 0.47307777404785156), ('gerbera', 0.4728681743144989)]
===
birthday
[('cumpleaños', 0.7605622410774231), ('cumpleaño', 0.6504004597663879), ('cumpleños', 0.5819917321205139), ('cumpleaños.El', 0.5789167284965515), ('Cumpleaños', 0.5748640894889832), ('cumpeaños', 0.572516918182373), ('cumpleañero', 0.5690513849258423), ('cumpleaños.Y', 0.5607236623764038), ('cump

In [30]:
# Vector pivoting: carry the transformed vector through each subspace.
# most_similar is called only once, so a word is approximated a single time, at the end.
for pt_word in pt_word_list:
    english_vector = translations['pt']['en'] @ models['pt'][pt_word]
    spanish_vector = translations['en']['es'] @ english_vector

    nearest_spanish = models['es'].most_similar(spanish_vector)
    print(nearest_spanish)
    print("===")

[('zapato', 0.5569401383399963), ('zapatos', 0.5283559560775757), ('tacones', 0.49460408091545105), ('vestido', 0.4847036302089691), ('sapato', 0.4791000187397003), ('sapatos', 0.46534422039985657), ('pantalón', 0.46260935068130493), ('collarcito', 0.4543265402317047), ('tacón', 0.4519156813621521), ('pantalones', 0.45182451605796814)]
===
[('flor', 0.5727273225784302), ('peonia', 0.498028963804245), ('peonía', 0.4919746220111847), ('florecita', 0.48463183641433716), ('rosa', 0.4773949682712555), ('florecilla', 0.4752405881881714), ('flor.Y', 0.4716757535934448), ('rosaY', 0.4682154655456543), ('rosaa', 0.46463143825531006), ('azalea', 0.4611826539039612)]
===
[('cumpleaños', 0.6476301550865173), ('cumpleaño', 0.6001196503639221), ('cumpleños', 0.5466558933258057), ('cumpleaños.El', 0.5311540961265564), ('cumpleaños.En', 0.5147355198860168), ('cumpeaños', 0.4955201745033264), ('aniversario', 0.4934757947921753), ('cumpleanos', 0.48804065585136414), ('cumpleañero', 0.4873219132423401), 