# Word Embedding Translator

## Imports

In [1]:
import numpy as np
import pandas as pd
import itertools as it

from gensim.models import KeyedVectors

## 1 - Loading data
Loading the models and sentences used.
- Models: https://fasttext.cc/docs/en/crawl-vectors.html
- Sentences: https://github.com/alexa/massive

In [2]:
def load_files(model_path, sentences_path, limit = None):
    """Load the word vectors and the utterances of one language.

    Parameters
    ----------
    model_path
        Path handed to ``KeyedVectors.load_word2vec_format``.
    sentences_path
        Path of a JSON-lines file; only its ``'utt'`` column is kept.
    limit
        Forwarded unchanged to the vector loader (``None`` by default).

    Returns
    -------
    tuple
        ``(model, sentences)``: the loaded ``KeyedVectors`` and the ``'utt'``
        column of the sentences file.
    """
    word_vectors = KeyedVectors.load_word2vec_format(
        model_path,
        limit = limit,
        unicode_errors = 'replace',
    )

    frame = pd.read_json(sentences_path, lines = True)
    utterances = frame['utt']

    return word_vectors, utterances

In [3]:
# Every entry of `paths` is a two-item list: [model path, sentences path]
fasttext_path = 'Datasets/FastText/'
massive_path = 'Datasets/Amazon_Massive/'

paths = dict(
    pt = [ fasttext_path + 'cc.pt.300.vec', massive_path + 'pt-PT.jsonl' ],
    en = [ fasttext_path + 'cc.en.300.vec', massive_path + 'en-US.jsonl' ],
    es = [ fasttext_path + 'cc.es.300.vec', massive_path + 'es-ES.jsonl' ],
)

# Language codes, in the insertion order of `paths`
languages = paths.keys()

In [4]:
# Both dictionaries are keyed by language code
models = {}
sentences = {}

for lang, (model_path, sentences_path) in paths.items():
    models[lang], sentences[lang] = load_files(model_path, sentences_path)

## 2 - Preparing data

In [5]:
# samples[lang] collects one vector per kept sentence: the sum of its word vectors.
# The same row is appended for every language, so the lists stay aligned by position.
samples = { lang: [] for lang in languages }

for row in range(len(sentences['pt'])):
    word_vectors = { lang: [] for lang in languages }

    try:
        for lang, utterances in sentences.items():
            word_vectors[lang].extend(
                models[lang][token] for token in utterances[row].split(' ')
            )
    except KeyError:
        # A lookup failed in at least one language: drop this row for all of them
        continue

    for lang, vectors in word_vectors.items():
        samples[lang].append(sum(vectors))

In [6]:
# Per language: how many sentences were loaded and how many produced a sample
for key, utterances in sentences.items():
    total = len(utterances)
    kept = len(samples[key])
    print(f'Total sentences: { total } -> Model { key } samples: { kept } ({ kept / total * 100:.2f}%)')

Total sentences: 16521 -> Model pt samples: 15055 (91.13%)
Total sentences: 16521 -> Model en samples: 15055 (91.13%)
Total sentences: 16521 -> Model es samples: 15055 (91.13%)


## 3 - Making the translation

In [7]:
# translations[origin][target] is a matrix that carries a vector of the `origin`
# model into the space of the `target` model: translations[origin][target] @ vector
translations = { key: { lang: None for lang in languages if lang != key } for key in languages }

# Stack each language's sentence vectors into an array once, instead of letting
# NumPy rebuild it from the Python list of vectors on every permutation below.
sample_matrices = { lang: np.asarray(vectors) for lang, vectors in samples.items() }

# Orthogonal Procrustes solution: with U, Sig, Vt = svd(X.T @ Y), the orthogonal
# matrix V @ U.T is the one that best aligns the rows of X with the rows of Y.
# Only U and Vt are needed; the singular values in Sig are not used.
#
# The target -> origin matrix is the transpose of the origin -> target one, so
# iterating over it.combinations and storing the transpose for the reverse
# direction would halve the number of SVDs and give similar results.
for origin, target in it.permutations(languages, 2):
    U, Sig, Vt = np.linalg.svd(sample_matrices[origin].T @ sample_matrices[target])
    translator = Vt.T @ U.T
    translations[origin][target] = translator

In [25]:
# Portuguese words used to probe the learned translations
pt_word_list = ['sapato', 'flor', 'aniversário', 'saudades']

In [26]:
# Direct pt -> es: move each Portuguese vector into the Spanish space and
# print its nearest Spanish neighbours
pt_to_es = translations['pt']['es']

for pt_word in pt_word_list:
    es_vector = pt_to_es @ models['pt'][pt_word]
    print(models['es'].most_similar(es_vector))
    print("===")

[('zapato', 0.662111222743988), ('zapatos', 0.5563797950744629), ('vestido', 0.5291659235954285), ('calzado', 0.5260925889015198), ('tacón', 0.5103809237480164), ('sapato', 0.5011822581291199), ('bolso', 0.5003554224967957), ('tacones', 0.4980669617652893), ('abriguito', 0.4970734119415283), ('tacon', 0.4929378926753998)]
===
[('flor', 0.578707754611969), ('florecilla', 0.5153224468231201), ('peonia', 0.5134485960006714), ('camelia', 0.49486222863197327), ('flores.La', 0.4931686818599701), ('gardenia', 0.4834303855895996), ('plantita', 0.4797089993953705), ('florcita', 0.47890302538871765), ('gerbera', 0.4722459316253662), ('peonía', 0.4695531129837036)]
===
[('cumpleaños', 0.7326678037643433), ('cumpleaño', 0.6536815762519836), ('cumpleños', 0.6065086126327515), ('aniversario', 0.5872988104820251), ('cumpleaños.El', 0.5673967003822327), ('cumpeaños', 0.5645080208778381), ('Cumpleaños', 0.5528781414031982), ('cumpleanos', 0.5322667956352234), ('cumpleñaos', 0.5267534255981445), ('cumpl

## 4 - Evaluation (future work)
Two ways to evaluate a translation path that goes through an intermediate language (here pt → en → es).

In [29]:
# Pivot through English by snapping to the most similar word at every hop.
# This is the more expensive option (one most_similar call per language on the
# path), and each hop approximates the vector by an actual word.
for pt_word in pt_word_list:
    en_vector = translations['pt']['en'] @ models['pt'][pt_word]
    english_word, _similarity = models['en'].most_similar(en_vector)[0]
    print(english_word)

    es_vector = translations['en']['es'] @ models['en'][english_word]
    print(models['es'].most_similar(es_vector))
    print("===")

shoes
[('zapatos', 0.6464644074440002), ('zapatillas', 0.6166307926177979), ('sandalias', 0.5769971013069153), ('botas', 0.5539029240608215), ('calzado', 0.551113486289978), ('zapato', 0.5218526721000671), ('chanclas', 0.5212720036506653), ('calzarán', 0.5174567103385925), ('chancletas', 0.5101444721221924), ('calcetines', 0.5067110657691956)]
===
flower
[('flor', 0.5974904894828796), ('peonía', 0.5804789066314697), ('flores', 0.5449814796447754), ('peonías', 0.5210880637168884), ('floral', 0.49466392397880554), ('anturio', 0.4922579824924469), ('crisantemo', 0.4872678816318512), ('camelia', 0.47671687602996826), ('flores.La', 0.47307777404785156), ('gerbera', 0.4728681743144989)]
===
birthday
[('cumpleaños', 0.7605622410774231), ('cumpleaño', 0.6504004597663879), ('cumpleños', 0.5819917321205139), ('cumpleaños.El', 0.5789167284965515), ('Cumpleaños', 0.5748640894889832), ('cumpeaños', 0.572516918182373), ('cumpleañero', 0.5690513849258423), ('cumpleaños.Y', 0.5607236623764038), ('cump

In [30]:
# Using the vector transformed to each subspace
# Uses most_similar and try to approximate the word just one time 
for pt_word in pt_word_list:
    english_vector = translations['pt']['en'] @models['pt'][pt_word]
    print(models['es'].most_similar(translations['en']['es'] @ english_vector))
    print("===")

[('zapato', 0.5569401383399963), ('zapatos', 0.5283559560775757), ('tacones', 0.49460408091545105), ('vestido', 0.4847036302089691), ('sapato', 0.4791000187397003), ('sapatos', 0.46534422039985657), ('pantalón', 0.46260935068130493), ('collarcito', 0.4543265402317047), ('tacón', 0.4519156813621521), ('pantalones', 0.45182451605796814)]
===
[('flor', 0.5727273225784302), ('peonia', 0.498028963804245), ('peonía', 0.4919746220111847), ('florecita', 0.48463183641433716), ('rosa', 0.4773949682712555), ('florecilla', 0.4752405881881714), ('flor.Y', 0.4716757535934448), ('rosaY', 0.4682154655456543), ('rosaa', 0.46463143825531006), ('azalea', 0.4611826539039612)]
===
[('cumpleaños', 0.6476301550865173), ('cumpleaño', 0.6001196503639221), ('cumpleños', 0.5466558933258057), ('cumpleaños.El', 0.5311540961265564), ('cumpleaños.En', 0.5147355198860168), ('cumpeaños', 0.4955201745033264), ('aniversario', 0.4934757947921753), ('cumpleanos', 0.48804065585136414), ('cumpleañero', 0.4873219132423401), 