# Word Embedding Translator

## Libraries

In [None]:
import numpy as np
import pandas as pd
import itertools as it

from gensim.models import KeyedVectors

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances, manhattan_distances

## 1 - Loading data
Loading the models and sentences used.
- Models: https://fasttext.cc/docs/en/crawl-vectors.html
- Sentences: https://github.com/alexa/massive

In [None]:
def load_files(model_path, sentences_path, limit = None):
    '''
    Load the word vectors of one language together with its sentences.

    Params:
    - model_path: path to a word2vec-format vectors file, e.g., a FastText 'cc.<lang>.300.vec' file
    - sentences_path: path to a JSON-lines sentences file, e.g., an Amazon Massive '<locale>.jsonl' file
    - limit: forwarded to the vector loader as its limit option; useful if you have low computer power, e.g., 5000

    Return:
    Tuple (model, sentences) with the loaded vectors and the 'utt' column of the sentences file
    '''

    # Bytes that cannot be decoded are replaced instead of aborting the load.
    word_vectors = KeyedVectors.load_word2vec_format(
        model_path,
        unicode_errors = 'replace',
        limit = limit,
    )

    # Each line of the file is one JSON record; only the 'utt' column is kept.
    records = pd.read_json(sentences_path, lines = True)
    utterances = records['utt']

    return word_vectors, utterances

Defining data path.

In [None]:
# Root folders of the two datasets; file names are appended to these prefixes.
MASSIVE_PATH = 'Datasets/Amazon_Massive/'
FASTTEXT_PATH = 'Datasets/FastText/'

In [None]:
# language code -> [word-vectors file, sentences file]
PATHS = {
    'da': [f'{FASTTEXT_PATH}cc.da.300.vec', f'{MASSIVE_PATH}da-DK.jsonl'],
    'de': [f'{FASTTEXT_PATH}cc.de.300.vec', f'{MASSIVE_PATH}de-DE.jsonl'],
    'en': [f'{FASTTEXT_PATH}cc.en.300.vec', f'{MASSIVE_PATH}en-US.jsonl'],
    'es': [f'{FASTTEXT_PATH}cc.es.300.vec', f'{MASSIVE_PATH}es-ES.jsonl'],
    'fr': [f'{FASTTEXT_PATH}cc.fr.300.vec', f'{MASSIVE_PATH}fr-FR.jsonl'],
    'it': [f'{FASTTEXT_PATH}cc.it.300.vec', f'{MASSIVE_PATH}it-IT.jsonl'],
    'nl': [f'{FASTTEXT_PATH}cc.nl.300.vec', f'{MASSIVE_PATH}nl-NL.jsonl'],
    'pt': [f'{FASTTEXT_PATH}cc.pt.300.vec', f'{MASSIVE_PATH}pt-PT.jsonl'],
    'ro': [f'{FASTTEXT_PATH}cc.ro.300.vec', f'{MASSIVE_PATH}ro-RO.jsonl'],
    'sv': [f'{FASTTEXT_PATH}cc.sv.300.vec', f'{MASSIVE_PATH}sv-SE.jsonl'],
}

# Live view over the language codes, in the order declared above.
LANGUAGES = PATHS.keys()

**Note**: the cell below takes approximately 5 to 6 minutes per model.

In [None]:
# language code -> loaded word vectors / loaded sentences
MODELS, SENTENCES = {}, {}

for language, (model_file, sentences_file) in PATHS.items():
    print("Loading", model_file)
    MODELS[language], SENTENCES[language] = load_files(model_file, sentences_file)
    print("Finished loading", model_file)

print("\nAll models and sentences are now loaded!")

## 2 - Preparing data

In [None]:
# language code -> list of [sentence, sentence vector] pairs (filled in the next cell)
SAMPLES = { language: list() for language in LANGUAGES }

In [None]:
# Every sentence file has the same number of rows (the totals are printed in the
# cell below), so any language can drive the index range.
for idx in range(len(SENTENCES['pt'])):

    # Word vectors of sentence `idx`, grouped by language.
    word_vectors = { language: [] for language in LANGUAGES }

    try:
        for lang, utterances in SENTENCES.items():
            word_vectors[lang].extend(MODELS[lang][token] for token in utterances[idx].split(' '))

    except KeyError:
        # A lookup failed for this sentence in some language: drop the sentence
        # for every language so the samples stay aligned by position.
        continue

    # A sentence is represented by the sum of its word vectors.
    for lang, vectors in word_vectors.items():
        SAMPLES[lang].append([SENTENCES[lang][idx], sum(vectors)])

In [None]:
# Report how many sentences of each file were kept as samples.
for lang, utterances in SENTENCES.items():
    total_sentences = len(utterances)
    # Also read by the train/test split cell further down.
    SIZE_SAMPLES = len(SAMPLES[lang])
    print(f'Total sentences in { lang } file: { total_sentences } -> Model { lang } samples: { SIZE_SAMPLES } ({ SIZE_SAMPLES / total_sentences * 100:.2f}%)')

Splitting into train and test.

In [None]:
# Compute the sample count here instead of relying on a variable that is only
# assigned as a side effect of the reporting loop in another cell.
# Samples are appended to every language at once, so the counts should match;
# taking the minimum keeps the split index valid for all languages regardless.
SIZE_SAMPLES = min(len(SAMPLES[key]) for key in LANGUAGES)

# Index of the first test sample: the first 70% of the samples are used for training.
SPLIT_RATE = int(SIZE_SAMPLES * 0.7)

TRAIN_SET = { key: SAMPLES[key][:SPLIT_RATE] for key in LANGUAGES }
TEST_SET = { key: SAMPLES[key][SPLIT_RATE:] for key in LANGUAGES }

## 3 - Translating words

In [None]:
# TRANSLATIONS[origin][target] holds the matrix that maps origin vectors into the
# target space. Every ordered pair of distinct languages starts as None.
TRANSLATIONS = {
    origin: dict.fromkeys(target for target in LANGUAGES if target != origin)
    for origin in LANGUAGES
}

In [None]:
for origin, target in it.permutations(LANGUAGES, 2):

    # One row per training sentence (its summed word vectors).
    origin_matrix = np.array([vector for _, vector in TRAIN_SET[origin]])
    target_matrix = np.array([vector for _, vector in TRAIN_SET[target]])

    # SVD of origin^T @ target; the translator is V @ U^T, i.e., the orthogonal
    # matrix that best aligns the origin vectors with the target vectors.
    left, _singular_values, right_t = np.linalg.svd(origin_matrix.T @ target_matrix)

    TRANSLATIONS[origin][target] = right_t.T @ left.T

### List of example words
**Note**: only single words are supported, i.e., compound words like "washing machine" will result in an error.

- English

In [None]:
# English example words used by the translation demos.
EN_WORD_LIST = ['specification', 'book', 'duckling', 'machine', 'headphones']

- Spanish

In [None]:
# Spanish example words used by the translation demos.
ES_WORD_LIST = ['hola', 'sí', 'computadora', 'país']

- Portuguese

In [None]:
# Portuguese example words used by the translation demos.
PT_WORD_LIST = ['sapato', 'flor', 'aniversário', 'saudades']

In [None]:
def translate(word_list, origin_lang, target_lang):
    '''
    Print, for every word, the target-language words closest to its translation.

    The word vector is taken from the origin model, multiplied by the
    origin -> target translation matrix, and the target model is asked for the
    words most similar to the resulting vector. Nothing is returned.

    Params:
    - word_list: list of example words.
    - origin_lang: language in which the words in word_list are written
    - target_lang: language you wish to know the translation

    Example of usage:
    translate(PT_WORD_LIST, 'pt', 'es')
    '''

    for word in word_list:
        print("Original word:", word)
        print("Top 10 most similar words in", target_lang)

        target_model = MODELS[target_lang]
        to_target = TRANSLATIONS[origin_lang][target_lang]

        # Project the word into the target space, then look up its neighbours there.
        projected = to_target @ MODELS[origin_lang][word]
        print(target_model.most_similar(projected))

        print("\n")

### Examples

- Portuguese -> Spanish

In [None]:
translate(word_list = PT_WORD_LIST, origin_lang = 'pt', target_lang = 'es')

- Portuguese -> English

In [None]:
translate(word_list = PT_WORD_LIST, origin_lang = 'pt', target_lang = 'en')

- Spanish -> English

In [None]:
translate(word_list = ES_WORD_LIST, origin_lang = 'es', target_lang = 'en')

- English -> Portuguese

In [None]:
translate(word_list = EN_WORD_LIST, origin_lang = 'en', target_lang = 'pt')

## 4 - Translating words using intermediate languages

### Getting the most similar word in each language it passes through.
Most expensive (uses most_similar multiple times) and tries to approximate a word each time.

In [None]:
def intermediate_most_similar_word(word_list, origin_lang, intermediate_lang, target_lang):
    '''
    Print the translation of each word when going through an intermediate language.
    The origin vector is translated to the intermediate language and replaced by the
    most similar word found there; that word's own vector is then translated to the
    target language. Nothing is returned.

    Params:
    - word_list: list of example words.
    - origin_lang: language in which the words in word_list are written
    - intermediate_lang: intermediate language which translation between origin_lang and target_lang passes by
    - target_lang: language you wish to know the translation

    Example of usage:
    intermediate_most_similar_word(PT_WORD_LIST, 'pt', 'en', 'es')
    '''

    for word in word_list:
        print("Original word:", word)

        # Step 1: project into the intermediate space and keep only the closest word.
        intermediate_model = MODELS[intermediate_lang]
        to_intermediate = TRANSLATIONS[origin_lang][intermediate_lang]
        candidates = intermediate_model.most_similar(to_intermediate @ MODELS[origin_lang][word])
        intermediate_word = candidates[0][0]
        print("Most similar word according to intermediate language:", intermediate_word)

        # Step 2: translate that word's vector from the intermediate model to the target space.
        target_model = MODELS[target_lang]
        to_target = TRANSLATIONS[intermediate_lang][target_lang]
        translated_language = target_model.most_similar(to_target @ MODELS[intermediate_lang][intermediate_word])
        print("Top 10 most similar words in target language passing by the intermediate language:")
        print(translated_language)

        print("\n")

- Portuguese -> English -> Spanish

In [None]:
intermediate_most_similar_word(word_list = PT_WORD_LIST, origin_lang = 'pt', intermediate_lang = 'en', target_lang = 'es')

- Spanish -> Portuguese -> English

In [None]:
intermediate_most_similar_word(word_list = ES_WORD_LIST, origin_lang = 'es', intermediate_lang = 'pt', target_lang = 'en')

- English -> Spanish -> Portuguese

In [None]:
intermediate_most_similar_word(word_list = EN_WORD_LIST, origin_lang = 'en', intermediate_lang = 'es', target_lang = 'pt')

### Using the vector transformed to each subspace.
Uses most_similar and tries to approximate the word only once.

In [None]:
def intermediate_most_similar_vector(word_list, origin_lang, intermediate_lang, target_lang):
    '''
    Print the translation of each word when going through an intermediate language.
    The origin vector is translated to the intermediate language and that vector
    itself (not the nearest intermediate word) is translated to the target language,
    so the nearest-word lookup happens only once, in the target model.
    Nothing is returned.

    Params:
    - word_list: list of example words.
    - origin_lang: language in which the words in word_list are written
    - intermediate_lang: intermediate language which translation between origin_lang and target_lang passes by
    - target_lang: language you wish to know the translation

    Example of usage:
    intermediate_most_similar_vector(PT_WORD_LIST, 'pt', 'en', 'es')
    '''

    for word in word_list:
        print("Original word:", word)

        # origin -> intermediate, kept as a raw vector
        to_intermediate = TRANSLATIONS[origin_lang][intermediate_lang]
        intermediate_vector = to_intermediate @ MODELS[origin_lang][word]

        # intermediate -> target, followed by the single nearest-word lookup
        target_model = MODELS[target_lang]
        to_target = TRANSLATIONS[intermediate_lang][target_lang]
        translated_vector = target_model.most_similar(to_target @ intermediate_vector)

        print("Top 10 most similar words in target language passing by the intermediate language:")
        print(translated_vector)
        print("\n")

- Portuguese -> English -> Spanish

In [None]:
intermediate_most_similar_vector(word_list = PT_WORD_LIST, origin_lang = 'pt', intermediate_lang = 'en', target_lang = 'es')

- Spanish -> Portuguese -> English

In [None]:
intermediate_most_similar_vector(word_list = ES_WORD_LIST, origin_lang = 'es', intermediate_lang = 'pt', target_lang = 'en')

- English -> Spanish -> Portuguese

In [None]:
intermediate_most_similar_vector(word_list = EN_WORD_LIST, origin_lang = 'en', intermediate_lang = 'es', target_lang = 'pt')

## 5 - Evaluating

Theoretically speaking, translating the vector that represents a sentence into another language should yield a vector close to that of the equivalent sentence in that language. For that purpose, we evaluate our results using the cosine similarity, whose range is from -1 to 1.

In [None]:
def evaluate_single_cosine_similarity(origin_lang, target_lang, n_samples = 5):
    '''
    Print the cosine similarity between translated and reference test sentences.
    Cosine similarity has an interval from -1 to 1, and the closer to 1 the value is, more similar the params are.

    For each of the first n_samples test pairs, the origin sentence vector is
    multiplied by the origin -> target translation matrix and compared with the
    vector of the sentence stored at the same position in the target test set.
    Nothing is returned.

    Params:
    - origin_lang: language of the sentences being translated
    - target_lang: language the sentences are translated to
    - n_samples: how many test sentences to show (default 5); if the test set
      holds fewer, only the available ones are shown

    Example of usage:
    evaluate_single_cosine_similarity('pt', 'en')
    '''

    translator = TRANSLATIONS[origin_lang][target_lang]

    # Slicing (rather than indexing 0..n-1) tolerates test sets shorter than n_samples.
    origin_samples = TEST_SET[origin_lang][:n_samples]
    target_samples = TEST_SET[target_lang][:n_samples]

    for (origin_sentence, origin_vector), (target_sentence, vector_target) in zip(origin_samples, target_samples):
        print(origin_sentence, '->', target_sentence)

        vector_translated = translator @ origin_vector

        # cosine_similarity works on 2D inputs and returns a 1x1 matrix here.
        print("Cossine similarity:", cosine_similarity([vector_translated], [vector_target])[0][0], "\n")

- Portuguese -> English

In [None]:
evaluate_single_cosine_similarity(origin_lang = 'pt', target_lang = 'en')

- Portuguese -> Spanish

In [None]:
evaluate_single_cosine_similarity(origin_lang = 'pt', target_lang = 'es')

- English -> Spanish

In [None]:
evaluate_single_cosine_similarity(origin_lang = 'en', target_lang = 'es')

### Evaluating paths
We use the following metrics for that purpose:
- Cosine similarity
- Euclidean distance
- Manhattan distance

In [None]:
def pairwise(iterable):
    '''
    Yield each element of the iterable paired with the element that follows it.
    An input with n values produces n - 1 overlapping 2-tuples; an input with
    fewer than two values produces nothing.
    pairwise('ABCDEFG') --> AB BC CD DE EF FG

    Source: https://docs.python.org/3/library/itertools.html#itertools.pairwise
    '''
    current, following = it.tee(iterable, 2)
    # Shift the second copy one step ahead so zip lines up element i with i + 1.
    next(following, None)
    return zip(current, following)

In [None]:
def _mean_pairwise_metric(metric, vectors, vectors_target):
    '''
    Average a pairwise metric over corresponding (translated, target) vectors.

    Params:
    - metric: function taking two 2D inputs and returning a matrix, e.g., cosine_similarity
    - vectors: translated vectors
    - vectors_target: reference vectors, aligned by position with vectors

    Return:
    Mean of metric([v1], [v2])[0][0] over the aligned pairs
    '''
    total = sum(metric([v1], [v2])[0][0] for v1, v2 in zip(vectors, vectors_target))
    return total / len(vectors)


def avaliate_path(path):
    '''
    Evaluate a translation path on the test set.

    The translation matrices of every consecutive pair of languages in the path
    are composed into a single matrix, which is applied to the test vectors of
    the first language. The results are compared with the test vectors of the
    last language and the average cosine similarity, Euclidean distance and
    Manhattan distance are printed. Nothing is returned.

    Params:
    - path: list of language codes describing the desired translation path

    Raises:
    - ValueError: if the test set of the first language in the path is empty

    Example of usage:
    avaliate_path(['pt', 'en', 'es'])
    '''

    origin_samples = TEST_SET[path[0]]
    target_samples = TEST_SET[path[-1]]

    if not origin_samples:
        raise ValueError(f"Test set for '{path[0]}' is empty; nothing to evaluate")

    # Take the embedding size from the data instead of assuming 300 dimensions.
    dimension = len(origin_samples[0][1])
    translation_matrix = np.identity(dimension)

    # Later steps are applied after earlier ones, hence the left multiplication.
    for origin, target in pairwise(path):
        translation_matrix = TRANSLATIONS[origin][target] @ translation_matrix

    vectors = [translation_matrix @ v for _, v in origin_samples]
    vectors_target = [v for _, v in target_samples]

    mean_cos_sim = _mean_pairwise_metric(cosine_similarity, vectors, vectors_target)
    print("Average cosine similarity of path", path, "=", round(mean_cos_sim, 5))

    mean_euc_dist = _mean_pairwise_metric(euclidean_distances, vectors, vectors_target)
    print("Average Euclidean distance of path", path, "=", round(mean_euc_dist, 5))

    mean_man_dist = _mean_pairwise_metric(manhattan_distances, vectors, vectors_target)
    print("Average Manhattan distance of path", path, "=", round(mean_man_dist, 5))

## 6 - Experiments

### Experiment #1: English - Portuguese - Spanish

- Portuguese -> Spanish

In [None]:
avaliate_path(path = ['pt', 'es'])

- Portuguese -> English -> Spanish

In [None]:
avaliate_path(path = ['pt', 'en', 'es'])

- Spanish -> English -> Portuguese

In [None]:
avaliate_path(path = ['es', 'en', 'pt'])

- Portuguese -> English -> Portuguese -> English -> Spanish

In [None]:
avaliate_path(path = ['pt', 'en', 'pt', 'en', 'es'])

### Experiment #2: Portuguese - Spanish - French - Italian - Romanian

### Experiment #3: English - German - Swedish - Dutch - Danish