# Word Embedding Translator

## Libraries

In [5]:
import numpy as np
import pandas as pd
import itertools as it

from gensim.models import KeyedVectors

## 1 - Loading data
Loading the models and sentences used.
- Models: https://fasttext.cc/docs/en/crawl-vectors.html
- Sentences: https://github.com/alexa/massive

In [6]:
def load_files(model_path, sentences_path, limit = None):
    '''
    Load one language's word vectors together with its sentence file.

    Params:
    - model_path: path to a word-vector file in word2vec text format, e.g., a FastText 'cc.en.300.vec' file
    - sentences_path: path to a JSON Lines file that has an 'utt' field, e.g., an Amazon Massive 'en-US.jsonl' file
    - limit: forwarded to KeyedVectors.load_word2vec_format to cap how many vectors are read
      (useful on machines with little memory, e.g., 5000); None applies no cap

    Return:
    Tuple (model, sentences): the loaded KeyedVectors and the 'utt' column of the sentence file
    '''

    # Undecodable bytes in the vector file are replaced instead of aborting the load.
    word_vectors = KeyedVectors.load_word2vec_format(
        model_path,
        unicode_errors = 'replace',
        limit = limit,
    )

    # One JSON object per line; only the utterance text is kept.
    massive_frame = pd.read_json(sentences_path, lines = True)

    return word_vectors, massive_frame['utt']

In [7]:
# Root folders of the downloaded datasets, given as relative paths.
# The trailing slash matters: file names are appended by plain string concatenation.
FASTTEXT_PATH = 'Datasets/FastText/'
MASSIVE_PATH = 'Datasets/Amazon_Massive/'

In [8]:
# Per-language input files, as [word-vector file (.vec), sentence file (.jsonl)].
PATHS = {
    'en': [ FASTTEXT_PATH + 'cc.en.300.vec', MASSIVE_PATH + 'en-US.jsonl' ],
    'pt': [ FASTTEXT_PATH + 'cc.pt.300.vec', MASSIVE_PATH + 'pt-PT.jsonl' ],
    'es': [ FASTTEXT_PATH + 'cc.es.300.vec', MASSIVE_PATH + 'es-ES.jsonl' ]
}

# Language codes taken from PATHS. This is a dict view (not a list copy), so it can be iterated repeatedly.
LANGUAGES = PATHS.keys()

**Note**: the cell below takes approximately 5 to 6 minutes per model.

In [9]:
# Language code -> loaded word vectors / loaded sentences.
MODELS, SENTENCES = {}, {}

# Each PATHS entry is a [vector file, sentence file] pair; load both and store them per language.
for language, (vectors_file, utterances_file) in PATHS.items():
    print("Loading", vectors_file)
    loaded_model, loaded_sentences = load_files(vectors_file, utterances_file)
    MODELS[language] = loaded_model
    SENTENCES[language] = loaded_sentences
    print("Finished loading", vectors_file)

print("All models and sentences are now loaded!")

Finished loading  Datasets/FastText/cc.en.300.vec
Finished loading  Datasets/FastText/cc.pt.300.vec
Finished loading  Datasets/FastText/cc.es.300.vec


## 2 - Preparing data

In [10]:
# One list per language; filled later with one summed sentence vector per kept sentence.
SAMPLES = { key: [] for key in LANGUAGES }

In [16]:
# Build one sample per sentence and language: the sum of the sentence's word vectors.
# The loop walks every language with the same index, which assumes the sentence files are
# index-aligned and equally long; the assertion makes the equal-length part explicit.
assert len({ len(sent) for sent in SENTENCES.values() }) == 1, 'Sentence files must all have the same length'

# Reset here so that re-running this cell does not append duplicate samples.
SAMPLES = { key: [] for key in LANGUAGES }

for idx in range(len(SENTENCES['pt'])):

    actual_sentence = { key: [] for key in LANGUAGES }

    try:
        for lang, sent in SENTENCES.items():
            for word in sent[idx].split(' '):
                actual_sentence[lang].append(MODELS[lang][word])

    except KeyError:
        # A word is missing from one of the models: drop this sentence for every
        # language so that SAMPLES stays aligned index by index.
        continue

    for key, value in actual_sentence.items():
        SAMPLES[key].append(sum(value))

In [18]:
# Coverage report: per language, how many sentences exist and how many became samples.
for key, utterances in SENTENCES.items():
    total = len(utterances)
    kept = len(SAMPLES[key])
    print(f'Total sentences in { key } file: { total } -> Model { key } samples: { kept } ({ kept / total * 100:.2f}%)')

Total sentences in en file: 16521 -> Model en samples: 15055 (91.13%)
Total sentences in pt file: 16521 -> Model pt samples: 15055 (91.13%)
Total sentences in es file: 16521 -> Model es samples: 15055 (91.13%)


## 3 - Translating words

In [19]:
# TRANSLATIONS[origin][target] holds the matrix that maps origin-language vectors into the
# target space; every ordered pair of distinct languages starts as None until it is computed.
TRANSLATIONS = { key: { lang: None for lang in LANGUAGES if lang != key } for key in LANGUAGES }

In [20]:
# For every ordered language pair, derive an orthogonal map from the paired samples:
# with X = origin samples and Y = target samples (one row per sentence), take the SVD
# X^T Y = U S V^T and keep V U^T, which is applied to origin vectors as `matrix @ vector`.
for origin, target in it.permutations(LANGUAGES, 2):
    cross_covariance = np.transpose(SAMPLES[origin]) @ SAMPLES[target]
    left, _, right_t = np.linalg.svd(cross_covariance)
    TRANSLATIONS[origin][target] = right_t.T @ left.T

### List of example words
**Note**: only single words are supported, i.e., multi-word expressions like "washing machine" will result in an error because they are not single entries in the model vocabulary.

- English

In [35]:
# English sample words for the translation examples (single words only).
EN_WORD_LIST = [
    'specification',
    'book',
    'duckling',
    'machine',
    'headphones'
]

- Portuguese

In [22]:
# Portuguese sample words for the translation examples (single words only).
PT_WORD_LIST = [
    'sapato',
    'flor',
    'aniversário',
    'saudades',
]

- Spanish

In [23]:
# Spanish sample words for the translation examples (single words only).
ES_WORD_LIST = [
    'hola',
    'sí',
    'computadora',
    'país'
]

In [30]:
def translate(word_list, target_lang, origin_lang, topn = 10):
    '''
    Print the closest target-language words for every word in a list.

    Each word vector is read from MODELS[origin_lang], mapped into the target space with
    TRANSLATIONS[origin_lang][target_lang], and the mapped vector is handed to
    MODELS[target_lang].most_similar. Results are printed; nothing is returned.

    Params:
    - word_list: list of example words (single words only).
    - target_lang: language you wish to know the translation
    - origin_lang: language in which the words in word_list are written
    - topn: how many similar words to show for each word (default: 10)

    Raises:
    KeyError if a word is not in the origin model, or if there is no translation
    matrix for the (origin_lang, target_lang) pair (e.g., both are the same language).

    Example of usage:
    translate(PT_WORD_LIST, 'es', 'pt')
    '''

    for word in word_list:
        print("Original word:", word)
        print(f"Top {topn} most similar words in", target_lang)

        # Map the origin-language vector into the target embedding space, then search there.
        projected_vector = TRANSLATIONS[origin_lang][target_lang] @ MODELS[origin_lang][word]
        print(MODELS[target_lang].most_similar(projected_vector, topn = topn))
        print("\n")

### Examples

- Portuguese -> Spanish

In [31]:
# Portuguese words looked up directly in the Spanish space.
translate(PT_WORD_LIST, target_lang = 'es', origin_lang = 'pt')

Original word: sapato
Top 10 most similar words in es
[('zapato', 0.662111222743988), ('zapatos', 0.5563797950744629), ('vestido', 0.5291659235954285), ('calzado', 0.5260925889015198), ('tacón', 0.5103809237480164), ('sapato', 0.5011822581291199), ('bolso', 0.5003554224967957), ('tacones', 0.4980669617652893), ('abriguito', 0.4970734119415283), ('tacon', 0.4929378926753998)]


Original word: flor
Top 10 most similar words in es
[('flor', 0.578707754611969), ('florecilla', 0.5153224468231201), ('peonia', 0.5134485960006714), ('camelia', 0.49486222863197327), ('flores.La', 0.4931686818599701), ('gardenia', 0.4834303855895996), ('plantita', 0.4797089993953705), ('florcita', 0.47890302538871765), ('gerbera', 0.4722459316253662), ('peonía', 0.4695531129837036)]


Original word: aniversário
Top 10 most similar words in es
[('cumpleaños', 0.7326678037643433), ('cumpleaño', 0.6536815762519836), ('cumpleños', 0.6065086126327515), ('aniversario', 0.5872988104820251), ('cumpleaños.El', 0.56739670

- Portuguese -> English

In [32]:
# Portuguese words looked up directly in the English space.
translate(PT_WORD_LIST, target_lang = 'en', origin_lang = 'pt')

Original word: sapato
Top 10 most similar words in en
[('shoes', 0.49248427152633667), ('shoe', 0.4787915349006653), ('handbag', 0.46190956234931946), ('shoes.', 0.45884427428245544), ('high-heels', 0.4253663122653961), ('shoes.It', 0.42041054368019104), ('wear', 0.41922497749328613), ('dress', 0.41850635409355164), ('shoes.I', 0.41741669178009033), ('stilettos', 0.41115647554397583)]


Original word: flor
Top 10 most similar words in en
[('flower', 0.4892441928386688), ('flower.', 0.45763397216796875), ('flowers', 0.4486697316169739), ('floweret', 0.4485749304294586), ('frangipani', 0.4426537752151489), ('flower.I', 0.44215601682662964), ('rose-bud', 0.4417296051979065), ('flowers.I', 0.4355604350566864), ('floweres', 0.43370744585990906), ('peony', 0.4296590983867645)]


Original word: aniversário
Top 10 most similar words in en
[('birthday', 0.6762741208076477), ('brithday', 0.6188441514968872), ('anniversary', 0.5894871354103088), ('b-day', 0.5865834951400757), ('B-day', 0.56888413

- Spanish -> English

In [33]:
# Spanish words looked up directly in the English space.
translate(ES_WORD_LIST, target_lang = 'en', origin_lang = 'es')

Original word: hola
Top 10 most similar words in en
[('hi', 0.7843162417411804), ('hello', 0.6993371844291687), ('Hello', 0.5977930426597595), ('Hey', 0.5961156487464905), ('Hi', 0.5825239419937134), ('hello.', 0.5765830874443054), ('hellow', 0.5692029595375061), ('hey', 0.5662575960159302), ('hi.', 0.5605804920196533), ('hiya', 0.5553944110870361)]


Original word: sí
Top 10 most similar words in en
[('if', 0.5527223348617554), ('it', 0.5410739779472351), ('is', 0.5105495452880859), ('so', 0.502880334854126), ('ok', 0.48606130480766296), ('he', 0.47977301478385925), ('So', 0.473000168800354), ('but', 0.47253870964050293), ('It', 0.4574020206928253), ('.But', 0.4567376375198364)]


Original word: computadora
Top 10 most similar words in en
[('computer', 0.5687118768692017), ('computers', 0.5186777710914612), ('compuer', 0.4945278763771057), ('comupter', 0.473619282245636), ('lap-top', 0.4680299460887909), ('comptuer', 0.46599137783050537), ('computer.', 0.460129976272583), ('computer.B

- English -> Portuguese

In [36]:
# English words looked up directly in the Portuguese space.
translate(EN_WORD_LIST, target_lang = 'pt', origin_lang = 'en')

Original word: specification
Top 10 most similar words in pt
[('especificação', 0.46337905526161194), ('e-ping', 0.3616640865802765), ('SEPIADES', 0.3566552996635437), ('modelo', 0.3557088375091553), ('especifcado', 0.34288156032562256), ('ocódigo', 0.33144697546958923), ('documento-base', 0.3298552334308624), ('defnição', 0.32953616976737976), ('FURPS', 0.32794156670570374), ('modelo-base', 0.32483965158462524)]


Original word: book
Top 10 most similar words in pt
[('livro', 0.6918875575065613), ('livrinho', 0.5807176232337952), ('livroO', 0.552798330783844), ('livro.O', 0.5448362231254578), ('olivro', 0.5433851480484009), ('umlivro', 0.5348812341690063), ('romance', 0.5321617722511292), ('livro.E', 0.5300799608230591), ('livroA', 0.5215703845024109), ('audiolivro', 0.5129063725471497)]


Original word: duckling
Top 10 most similar words in pt
[('patinho', 0.4825701415538788), ('galinha', 0.4435596764087677), ('pintinho', 0.4388863146305084), ('ovinho', 0.4304274618625641), ('pato', 

## 4 - Translating words using intermediate languages

### Getting the most similar word in each language it passes through.
Most expensive approach (uses most_similar multiple times): it approximates the vector to an actual word at every step.

In [41]:
def intermediate_most_similar_word(word_list, target_lang, origin_lang, intermediate_lang, topn = 10):
    '''
    Print translations of each word obtained by passing through an intermediate language,
    snapping to a real word at the intermediate step.

    The origin vector is mapped into the intermediate space and replaced by the most
    similar intermediate-language word; that word's own vector is then mapped into the
    target space and its nearest neighbours are printed. Nothing is returned.

    Params:
    - word_list: list of example words (single words only).
    - target_lang: language you wish to know the translation
    - origin_lang: language in which the words in word_list are written
    - intermediate_lang: intermediate language which translation between origin_lang and target_lang passes by
    - topn: how many similar target-language words to show for each word (default: 10)

    Raises:
    KeyError if a word is not in the origin model, or if a required translation matrix
    does not exist (e.g., two of the three languages are the same).

    Example of usage:
    intermediate_most_similar_word(PT_WORD_LIST, 'es', 'pt', 'en')
    '''
    for word in word_list:
        print("Original word:", word)

        # First hop: only the single best intermediate word is used, so request one neighbour.
        pivot_vector = TRANSLATIONS[origin_lang][intermediate_lang] @ MODELS[origin_lang][word]
        intermediate_word = MODELS[intermediate_lang].most_similar(pivot_vector, topn = 1)[0][0]
        print("Most similar word according to intermediate language:", intermediate_word)

        # Second hop: restart from the intermediate word's own vector, not the projected one.
        target_vector = TRANSLATIONS[intermediate_lang][target_lang] @ MODELS[intermediate_lang][intermediate_word]
        neighbours = MODELS[target_lang].most_similar(target_vector, topn = topn)
        print(f"Top {topn} most similar words in target language passing by the intermediate language:")
        print(neighbours)

        print("\n")

- Portuguese -> English -> Spanish

In [42]:
# Portuguese -> English (snapped to a word) -> Spanish.
intermediate_most_similar_word(PT_WORD_LIST, target_lang = 'es', origin_lang = 'pt', intermediate_lang = 'en')

Original word: sapato
Most similar word according to intermediate language: shoes
Top 10 most similar words in target language passing by the intermediate language:
[('zapatos', 0.6464644074440002), ('zapatillas', 0.6166307926177979), ('sandalias', 0.5769971013069153), ('botas', 0.5539029240608215), ('calzado', 0.551113486289978), ('zapato', 0.5218526721000671), ('chanclas', 0.5212720036506653), ('calzarán', 0.5174567103385925), ('chancletas', 0.5101444721221924), ('calcetines', 0.5067110657691956)]


Original word: flor
Most similar word according to intermediate language: flower
Top 10 most similar words in target language passing by the intermediate language:
[('flor', 0.5974904894828796), ('peonía', 0.5804789066314697), ('flores', 0.5449814796447754), ('peonías', 0.5210880637168884), ('floral', 0.49466392397880554), ('anturio', 0.4922579824924469), ('crisantemo', 0.4872678816318512), ('camelia', 0.47671687602996826), ('flores.La', 0.47307777404785156), ('gerbera', 0.472868174314498

- Spanish -> Portuguese -> English

In [44]:
# Spanish -> Portuguese (snapped to a word) -> English.
intermediate_most_similar_word(ES_WORD_LIST, target_lang = 'en', origin_lang = 'es', intermediate_lang = 'pt')

Original word: hola
Most similar word according to intermediate language: olá
Top 10 most similar words in target language passing by the intermediate language:
[('hi', 0.814129650592804), ('hello', 0.7390920519828796), ('Hi', 0.618787944316864), ('hellow', 0.6097189784049988), ('Hey', 0.6047798991203308), ('Hello', 0.6047042608261108), ('hello.', 0.6040636301040649), ('helllo', 0.5991599559783936), ('hi.', 0.5966516733169556), ('hey', 0.5912631154060364)]


Original word: sí
Most similar word according to intermediate language: só
Top 10 most similar words in target language passing by the intermediate language:
[('if', 0.6420409679412842), ('it', 0.5627325773239136), ('So', 0.5312941670417786), ('so', 0.5246899724006653), ('is', 0.5088378190994263), ('me', 0.5050639510154724), ('If', 0.5030770301818848), ('I', 0.5029612183570862), ('But', 0.4980606436729431), ('there', 0.49335208535194397)]


Original word: computadora
Most similar word according to intermediate language: computador


- English -> Spanish -> Portuguese

In [45]:
# English -> Spanish (snapped to a word) -> Portuguese.
intermediate_most_similar_word(EN_WORD_LIST, target_lang = 'pt', origin_lang = 'en', intermediate_lang = 'es')

Original word: specification
Most similar word according to intermediate language: especificación
Top 10 most similar words in target language passing by the intermediate language:
[('especificação', 0.6213092803955078), ('defnição', 0.518690288066864), ('especiﬁcação', 0.5082621574401855), ('definição', 0.5008370280265808), ('descrição', 0.4939766228199005), ('especicação', 0.47761470079421997), ('diferenciabilidade', 0.4761696457862854), ('exaustividade', 0.47191688418388367), ('composicionalidade', 0.45916664600372314), ('especiﬁcações', 0.4575174152851105)]


Original word: book
Most similar word according to intermediate language: libro
Top 10 most similar words in target language passing by the intermediate language:
[('livro', 0.850339949131012), ('livrinho', 0.6726419925689697), ('livro.O', 0.6658133268356323), ('livroO', 0.6547927856445312), ('olivro', 0.646285355091095), ('livro.E', 0.639930248260498), ('livro.É', 0.6361121535301208), ('livro-', 0.6252992153167725), ('umlivro

### Using the vector transformed into each subspace.
Uses most_similar only once: the vector is carried through the intermediate space and approximated to a word just one time, at the end.

In [46]:
def intermediate_most_similar_vector(word_list, target_lang, origin_lang, intermediate_lang, topn = 10):
    '''
    Print translations of each word obtained by passing its vector through an intermediate language.

    The origin vector is mapped into the intermediate space and that mapped vector (not a
    word chosen from it) is mapped again into the target space; most_similar is called only
    once, on the final vector. Nothing is returned.

    Params:
    - word_list: list of example words (single words only).
    - target_lang: language you wish to know the translation
    - origin_lang: language in which the words in word_list are written
    - intermediate_lang: intermediate language which translation between origin_lang and target_lang passes by
    - topn: how many similar target-language words to show for each word (default: 10)

    Raises:
    KeyError if a word is not in the origin model, or if a required translation matrix
    does not exist (e.g., two of the three languages are the same).

    Example of usage:
    intermediate_most_similar_vector(PT_WORD_LIST, 'es', 'pt', 'en')
    '''
    for word in word_list:
        print("Original word:", word)

        # Chain both maps on the raw vector: origin -> intermediate -> target.
        intermediate_vector = TRANSLATIONS[origin_lang][intermediate_lang] @ MODELS[origin_lang][word]
        target_vector = TRANSLATIONS[intermediate_lang][target_lang] @ intermediate_vector
        neighbours = MODELS[target_lang].most_similar(target_vector, topn = topn)
        print(f"Top {topn} most similar words in target language passing by the intermediate language:")
        print(neighbours)

        print("\n")

- Portuguese -> English -> Spanish

In [47]:
# Portuguese -> English (vector only) -> Spanish.
intermediate_most_similar_vector(PT_WORD_LIST, target_lang = 'es', origin_lang = 'pt', intermediate_lang = 'en')

Original word: sapato
Top 10 most similar words in target language passing by the intermediate language:
[('zapato', 0.5569401383399963), ('zapatos', 0.5283559560775757), ('tacones', 0.49460408091545105), ('vestido', 0.4847036302089691), ('sapato', 0.4791000187397003), ('sapatos', 0.46534422039985657), ('pantalón', 0.46260935068130493), ('collarcito', 0.4543265402317047), ('tacón', 0.4519156813621521), ('pantalones', 0.45182451605796814)]


Original word: flor
Top 10 most similar words in target language passing by the intermediate language:
[('flor', 0.5727273225784302), ('peonia', 0.498028963804245), ('peonía', 0.4919746220111847), ('florecita', 0.48463183641433716), ('rosa', 0.4773949682712555), ('florecilla', 0.4752405881881714), ('flor.Y', 0.4716757535934448), ('rosaY', 0.4682154655456543), ('rosaa', 0.46463143825531006), ('azalea', 0.4611826539039612)]


Original word: aniversário
Top 10 most similar words in target language passing by the intermediate language:
[('cumpleaños', 0

- Spanish -> Portuguese -> English

In [48]:
# Spanish -> Portuguese (vector only) -> English.
intermediate_most_similar_vector(ES_WORD_LIST, target_lang = 'en', origin_lang = 'es', intermediate_lang = 'pt')

Original word: hola
Top 10 most similar words in target language passing by the intermediate language:
[('hi', 0.7485652565956116), ('hello', 0.6746777892112732), ('Hey', 0.5734724998474121), ('Hello', 0.5732272863388062), ('hey', 0.571393609046936), ('helllo', 0.5682624578475952), ('hellow', 0.5636361837387085), ('hi.', 0.5567192435264587), ('hello.', 0.554912805557251), ('hiya', 0.5387632846832275)]


Original word: sí
Top 10 most similar words in target language passing by the intermediate language:
[('if', 0.5431066751480103), ('is', 0.497404545545578), ('it', 0.4913477599620819), ('But', 0.47263434529304504), ('ok', 0.4664313495159149), ('so', 0.46200358867645264), ('but', 0.4593670070171356), ('that', 0.45623454451560974), ('So', 0.44092997908592224), ('he', 0.4334471523761749)]


Original word: computadora
Top 10 most similar words in target language passing by the intermediate language:
[('computer', 0.5079346895217896), ('comupter', 0.4528789818286896), ('lap-top', 0.446105718

- English -> Spanish -> Portuguese

In [49]:
# English -> Spanish (vector only) -> Portuguese.
intermediate_most_similar_vector(EN_WORD_LIST, target_lang = 'pt', origin_lang = 'en', intermediate_lang = 'es')

Original word: specification
Top 10 most similar words in target language passing by the intermediate language:
[('especificação', 0.4690665602684021), ('ABNT.', 0.39077940583229065), ('especificaçao', 0.3870176672935486), ('especifcado', 0.3863670527935028), ('certiﬁcação', 0.3830704987049103), ('especiﬁcação', 0.369947224855423), ('ante-projeto', 0.3688240051269531), ('defnição', 0.3673538267612457), ('certiﬁcado', 0.36712342500686646), ('normatizada', 0.3665965497493744)]


Original word: book
Top 10 most similar words in target language passing by the intermediate language:
[('livro', 0.6548709273338318), ('livrinho', 0.5630871057510376), ('livroO', 0.5443949103355408), ('livro.O', 0.5260764956474304), ('olivro', 0.5172587037086487), ('umlivro', 0.5164043307304382), ('livroA', 0.515987753868103), ('livrode', 0.5135417580604553), ('livros', 0.5123415589332581), ('leitura', 0.5048522353172302)]


Original word: duckling
Top 10 most similar words in target language passing by the inte

## 5 - Evaluate