# Word Embedding Translator

## Libraries

In [None]:
import gzip
import os
import shutil
import urllib.request

import itertools as it
import numpy as np
import pandas as pd

from gensim.models import KeyedVectors

from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances, manhattan_distances

## 1 - Loading data
Loading the models and sentences used.
- Models: https://fasttext.cc/docs/en/crawl-vectors.html
- Sentences: https://github.com/alexa/massive

Defining global data paths.

In [None]:
# Relative folders that hold the FastText vector files and the
# Amazon Massive sentence files used throughout the notebook.
FASTTEXT_PATH = 'Datasets/FastText/'
MASSIVE_PATH = 'Datasets/Amazon_Massive/'

If this is your first time running this notebook, the cell below creates a FastText folder inside the Datasets folder to store the FastText models.

In [None]:
# os.makedirs also creates the parent 'Datasets' folder when it is missing,
# a case in which os.mkdir would raise FileNotFoundError.
if not os.path.exists(FASTTEXT_PATH):
    print('Creating FastText folder inside Datasets...')
    os.makedirs(FASTTEXT_PATH, exist_ok = True)
else:
    print('FastText folder already exists.')

Defining FastText models urls.

In [None]:
# Download URLs of the FastText Common Crawl vectors, one per language code.
# The order of the codes fixes the order of the list.
URL_MODELS = [
    f'https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.{code}.300.vec.gz'
    for code in ('da', 'nl', 'en', 'fr', 'de', 'it', 'pt', 'ro', 'es', 'sv')
]

The cell below automatically creates a list with the FastText file names in the following format: cc.LANGUAGE.300.vec

In [None]:
# Local file name of each model: the last URL segment without its '.gz'
# extension (e.g. 'cc.da.300.vec'). Taking the basename avoids depending on
# where the substring 'cc' first appears in the URL.
FILE_MODELS = [
    os.path.splitext(os.path.basename(url))[0]
    for url in URL_MODELS
]

If you do not have the necessary models already downloaded and placed in the FastText folder, the cell below does it for you. It takes approximately 2 min per .vec file.

In [None]:
for url_model, file_model in zip(URL_MODELS, FILE_MODELS):
    target_path = FASTTEXT_PATH + file_model

    if os.path.isfile(target_path):
        print(f'File {file_model} already exists.')
        continue

    print(f'\nDownloading {file_model}...')

    # The URL serves a gzip archive while the local name is the plain .vec
    # file, so the stream is decompressed while it is written instead of
    # being saved as-is. Writing under a temporary name first keeps an
    # interrupted download from being taken for a complete model next run.
    partial_path = target_path + '.part'

    with urllib.request.urlopen(url_model) as response, \
            gzip.GzipFile(fileobj = response, mode = 'rb') as archive, \
            open(partial_path, 'wb') as output:
        shutil.copyfileobj(archive, output)

    os.replace(partial_path, target_path)

    print(f'Finished downloading {file_model}.')

Unfortunately, the cell below must be hard-coded because the Amazon Massive file names do not follow a simple pattern.

In [None]:
# Maps each language code to [FastText vectors file, Amazon Massive sentences file].
# The Massive locale has to be spelled out for every language.
PATHS = {
    code: [FASTTEXT_PATH + f'cc.{code}.300.vec', MASSIVE_PATH + f'{locale}.jsonl']
    for code, locale in (
        ('da', 'da-DK'),
        ('nl', 'nl-NL'),
        ('en', 'en-US'),
        ('fr', 'fr-FR'),
        ('de', 'de-DE'),
        ('it', 'it-IT'),
        ('pt', 'pt-PT'),
        ('ro', 'ro-RO'),
        ('es', 'es-ES'),
        ('sv', 'sv-SE'),
    )
}

# Language codes handled by the notebook (a view over the keys of PATHS).
LANGUAGES = PATHS.keys()

In [None]:
def load_files(model_path, sentences_path, limit = None):
    '''
    Read one FastText model and the matching Amazon Massive sentences.

    Params:
    - model_path: path to the FastText vectors file of one language
    - sentences_path: path to the Amazon Massive JSON-lines file of the same language
    - limit: forwarded as the `limit` argument of the model loader; useful on
      machines with low computer power, e.g., 5000

    Return:
    Tuple containing the language model and the 'utt' column of its sentences file
    '''

    keyed_vectors = KeyedVectors.load_word2vec_format(
        model_path,
        limit = limit,
        unicode_errors = 'replace'
    )

    massive_frame = pd.read_json(sentences_path, lines = True)

    return keyed_vectors, massive_frame['utt']

We are now ready to load our models and sentences into the code.

**Note**: the cell below takes approximately 5 to 6 minutes per model. Given that we are working with 10 models, it should take approximately 1 hour. This is by far the most time-consuming cell of our code.

In [None]:
MODELS = {}
SENTENCES = {}

# Every PATHS entry is [model file, sentences file].
for language, (model, sentences) in PATHS.items():
    print(f'Loading {model}...')

    loaded = load_files(model, sentences)
    MODELS[language], SENTENCES[language] = loaded

    print(f'Finished loading {model}.\n')

print('\nAll models and sentences are now loaded!')

## 2 - Preparing data

In [None]:
# One list per language, later filled with [sentence, sentence vector] entries.
SAMPLES = { key: [] for key in LANGUAGES }

In [None]:
# All sentence files are expected to have the same number of rows, so the
# English one drives the loop (the totals per file are printed further down).
for index in range(len(SENTENCES['en'])):

    word_vectors = {}

    try:
        for lang, utterances in SENTENCES.items():
            word_vectors[lang] = [
                MODELS[lang][word] for word in utterances[index].split(' ')
            ]

    except KeyError:
        # A failed lookup (e.g. a word missing from one model) discards the
        # sentence for every language, keeping the SAMPLES lists aligned.
        continue

    for lang, vectors in word_vectors.items():
        # A sentence is represented by the sum of its word vectors.
        SAMPLES[lang].append([SENTENCES[lang][index], sum(vectors)])

In [None]:
for key in SENTENCES:
    # SIZE_SAMPLES is read again by the train/test split; the SAMPLES lists
    # grow in lockstep, so the value is the same whichever key comes last.
    SIZE_SAMPLES = len(SAMPLES[key])
    total_sentences = len(SENTENCES[key])

    # Adjacent f-strings replace backslash continuations inside one literal,
    # which would embed the source indentation in the printed line.
    print(
        f'Total sentences in { key } file: { total_sentences } '
        f'-> Model { key } samples: { SIZE_SAMPLES } '
        f'({ SIZE_SAMPLES / total_sentences * 100:.2f}%)'
    )

Splitting into train and test.

In [None]:
# Fraction of the samples that goes to the training set; the rest is the test set.
TRAIN_PERCENTAGE = 0.7

In [None]:
# Position of the first test sample: everything before it is training data.
SPLIT_RATE = int(SIZE_SAMPLES * TRAIN_PERCENTAGE)

TRAIN_SET, TEST_SET = {}, {}

for key in LANGUAGES:
    language_samples = SAMPLES[key]
    TRAIN_SET[key] = language_samples[:SPLIT_RATE]
    TEST_SET[key] = language_samples[SPLIT_RATE:]

## 3 - Translating words using SVD

In [None]:
# TRANSLATIONS[origin][target] is the slot for the matrix that maps origin vectors
# to target vectors; every ordered pair of distinct languages starts as None.
TRANSLATIONS = { key: { lang: None for lang in LANGUAGES if lang != key } for key in LANGUAGES }

Calculating SVD.

In [None]:
# Stack the training sentence vectors of each language once, instead of
# rebuilding the same lists for every ordered pair of languages.
train_matrices = {
    lang: np.asarray([vector for _, vector in TRAIN_SET[lang]])
    for lang in LANGUAGES
}

for origin, target in it.permutations(LANGUAGES, 2):

    # Orthogonal Procrustes: with X^T Y = U S V^T (rows of X and Y being the
    # aligned origin and target sentence vectors), the orthogonal matrix
    # V U^T is the least-squares map from an origin vector to its target one.
    U, _singular_values, Vt = np.linalg.svd(
        train_matrices[origin].T @ train_matrices[target]
    )

    TRANSLATOR = Vt.T @ U.T
    TRANSLATIONS[origin][target] = TRANSLATOR

### List of examples words
**Note**: only single words can be used, i.e., compound words like "washing machine" will result in an error.

- English

In [None]:
# Sample English words passed to the translation functions in the examples.
EN_WORD_LIST = [
    'specification',
    'book',
    'duckling',
    'machine',
    'headphones'
]

- Spanish

In [None]:
# Sample Spanish words passed to the translation functions in the examples.
ES_WORD_LIST = [
    'hola',
    'sí',
    'computadora',
    'país',
    'nuevo',
    'amor'
]

- Portuguese

In [None]:
# Sample Portuguese words passed to the translation functions in the examples.
PT_WORD_LIST = [
    'sapato',
    'flor',
    'aniversário',
    'saudades',
    'amigo',
    'faculdade'
]

In [None]:
def translate(word_list, origin_lang, target_lang):
    '''
    Print, for each word, the most similar words found in the target language.

    The vector of each word is mapped with TRANSLATIONS[origin_lang][target_lang]
    and the result is handed to most_similar of the target model.

    Params:
    - word_list: list of example words.
    - origin_lang: language in which the words in word_list are written
    - target_lang: language you wish to know the translation

    Raises:
    - KeyError: when a lookup fails, e.g. a word missing from the origin model
      or a language pair without an entry in TRANSLATIONS

    Example of usage:
    translate(PT_WORD_LIST, 'pt', 'es')
    '''

    for word in word_list:
        translated_vector = TRANSLATIONS[origin_lang][target_lang] @ MODELS[origin_lang][word]
        similar_words = MODELS[target_lang].most_similar(translated_vector)

        # Explicit separators are required here: adjacent f-strings are
        # concatenated as-is, so without them the fields run together.
        print(
            f'Original word: {word}\n'
            f'Top 10 most similar words in {target_lang}: '
            f'{similar_words}\n'
        )

### Examples

- Portuguese $\rightarrow$ Spanish

In [None]:
translate(PT_WORD_LIST, 'pt', 'es')

- Portuguese $\rightarrow$ English

In [None]:
translate(PT_WORD_LIST, 'pt', 'en')

- Spanish $\rightarrow$ English

In [None]:
translate(ES_WORD_LIST, 'es', 'en')

- English $\rightarrow$ Portuguese

In [None]:
translate(EN_WORD_LIST, 'en', 'pt')

## 4 - Translating words using intermediate languages

### Getting the most similar word in each language it passes through.
Most expensive (uses most_similar multiple times) and tries to approximate a word at each step.

In [None]:
def intermediate_most_similar_word(word_list, origin_lang, intermediate_lang, target_lang):
    '''
    Print the translation of each word when it goes through an intermediate language.
    The best match found in the intermediate language is looked up again as a word,
    and the vector of that word is the one translated into the target language.

    Params:
    - word_list: list of example words.
    - origin_lang: language in which the words in word_list are written
    - intermediate_lang: language the translation passes by between origin_lang and target_lang
    - target_lang: language you wish to know the translation

    Example of usage:
    intermediate_most_similar_word(ES_WORD_LIST, 'es', 'pt', 'en')
    '''

    for word in word_list:
        print(f'Original word: {word}')

        # First hop: keep only the best match of the intermediate language.
        intermediate_model = MODELS[intermediate_lang]
        projected = TRANSLATIONS[origin_lang][intermediate_lang] @ MODELS[origin_lang][word]
        intermediate_word = intermediate_model.most_similar(projected)[0][0]
        print(f'Most similar word according to intermediate language: {intermediate_word}')

        # Second hop: start again from the vector of that best match.
        target_model = MODELS[target_lang]
        reprojected = TRANSLATIONS[intermediate_lang][target_lang] @ intermediate_model[intermediate_word]
        translated_language = target_model.most_similar(reprojected)
        print(f'Top 10 most similar words in target language passing by the intermediate language: {translated_language}\n')

- Portuguese $\rightarrow$ English $\rightarrow$ Spanish

In [None]:
intermediate_most_similar_word(PT_WORD_LIST, 'pt', 'en', 'es')

- Spanish $\rightarrow$ Portuguese $\rightarrow$ English

In [None]:
intermediate_most_similar_word(ES_WORD_LIST, 'es', 'pt', 'en')

- English $\rightarrow$ Spanish $\rightarrow$ Portuguese

In [None]:
intermediate_most_similar_word(EN_WORD_LIST, 'en', 'es', 'pt')

### Using the vector transformed to each subspace.
Uses most_similar and tries to approximate the word only once.

In [None]:
def intermediate_most_similar_vector(word_list, origin_lang, intermediate_lang, target_lang):
    '''
    Print the translation of each word when it goes through an intermediate language.
    The vector obtained in the intermediate language is translated again as it is,
    without being replaced by the vector of an actual intermediate word.

    Params:
    - word_list: list of example words.
    - origin_lang: language in which the words in word_list are written
    - intermediate_lang: language the translation passes by between origin_lang and target_lang
    - target_lang: language you wish to know the translation

    Example of usage:
    intermediate_most_similar_vector(ES_WORD_LIST, 'es', 'pt', 'en')
    '''

    for word in word_list:
        print(f'Original word: {word}')

        # First hop: origin vector mapped into the intermediate language.
        first_hop = TRANSLATIONS[origin_lang][intermediate_lang]
        intermediate_vector = first_hop @ MODELS[origin_lang][word]

        # Second hop: the mapped vector goes straight into the target language.
        target_model = MODELS[target_lang]
        second_hop = TRANSLATIONS[intermediate_lang][target_lang]
        translated_vector = target_model.most_similar(second_hop @ intermediate_vector)

        print(f'Top 10 most similar words in target language passing by the intermediate language: {translated_vector}\n')

- Portuguese $\rightarrow$ English $\rightarrow$ Spanish

In [None]:
intermediate_most_similar_vector(PT_WORD_LIST, 'pt', 'en', 'es')

- Spanish $\rightarrow$ Portuguese $\rightarrow$ English

In [None]:
intermediate_most_similar_vector(ES_WORD_LIST, 'es', 'pt', 'en')

- English $\rightarrow$ Spanish $\rightarrow$ Portuguese

In [None]:
intermediate_most_similar_vector(EN_WORD_LIST, 'en', 'es', 'pt')

## 5 - Evaluating

In [None]:
# Unicode rightwards arrow, used when printing sentence pairs and labelling paths.
RIGHT_ARROW = '\u2192'

Theoretically speaking, translating the vector that represents one sentence into another language should result in a similar sentence. For that purpose, we evaluate our results using the cosine similarity, whose range is from -1 to 1.

In [None]:
def evaluate_single_cosine_similarity(origin_lang, target_lang, n_samples = 5):
    '''
    Print the cosine similarity of the first sentences of the test set.
    Each origin sentence vector is translated and compared with the vector of
    the sentence at the same position of the target test set.
    Cosine similarity has an interval from -1 to 1, and the closer to 1 the value is, more similar the params are.

    Params:
    - origin_lang: language of the sentences being translated
    - target_lang: language the sentences are translated into
    - n_samples: how many test sentences to show (non-negative); fewer are
      shown when the test set is smaller than that

    Example of usage:
    evaluate_single_cosine_similarity('pt', 'en')
    '''

    # Slicing, unlike indexing over a fixed range, does not raise IndexError
    # when the test set holds fewer than n_samples sentences.
    origin_samples = TEST_SET[origin_lang][:n_samples]
    target_samples = TEST_SET[target_lang][:n_samples]

    for (origin_sentence, origin_vector), (target_sentence, vector_target) in zip(origin_samples, target_samples):
        print(f'{origin_sentence} {RIGHT_ARROW} {target_sentence}')

        vector_translated = TRANSLATIONS[origin_lang][target_lang] @ origin_vector

        print(f'Cossine similarity:  {cosine_similarity([vector_translated], [vector_target])[0][0]}\n')

- Portuguese -> English

In [None]:
evaluate_single_cosine_similarity('pt', 'en')

- Portuguese -> Spanish

In [None]:
evaluate_single_cosine_similarity('pt', 'es')

- English -> Spanish

In [None]:
evaluate_single_cosine_similarity('en', 'es')

### Evaluating paths
We use the following metrics for that purpose:
- Cosine similarity
- Euclidean distance
- Manhattan distance

In [None]:
def pairwise(iterable):
    '''
    Return an iterator over each element paired with the one that follows it.
    An input with n values gives n - 1 overlapping 2-tuples.
    An input with fewer than two values gives nothing.
    pairwise('ABCDEFG') --> AB BC CD DE EF FG

    Source: https://docs.python.org/3/library/itertools.html#itertools.pairwise
    '''
    current, following = it.tee(iterable)
    # The second copy runs one element ahead of the first.
    return zip(current, it.islice(following, 1, None))

In [None]:
def avaliate_path(path):
    '''
    Score a translation path against the test set.
    The test vectors of the first language are carried through every hop of the
    path and compared, position by position, with those of the last language.

    Params:
    - path: path of desired translation

    Return:
    - Mean cosine similarity, mean euclidean distance and mean manhattan distance, in this order

    Example of usage:
    avaliate_path(['pt', 'en', 'es'])
    '''

    # Compose the hops in order: each new hop is applied after the previous ones.
    composed = np.identity(300)
    for origin, target in pairwise(path):
        composed = TRANSLATIONS[origin][target] @ composed

    translated = [composed @ vector for _, vector in TEST_SET[path[0]]]
    expected = [vector for _, vector in TEST_SET[path[-1]]]

    def mean_score(metric):
        # Each metric call compares one row with one row and gives a 1x1
        # array, so the sum and the mean are 1x1 arrays as well.
        per_sample = [metric([got], [want]) for got, want in zip(translated, expected)]
        return sum(per_sample) / len(translated)

    mean_cos_sim = mean_score(cosine_similarity)
    mean_euc_dist = mean_score(euclidean_distances)
    mean_man_dist = mean_score(manhattan_distances)

    return mean_cos_sim[0][0], mean_euc_dist[0][0], mean_man_dist[0][0]

In [None]:
def avaliate_possible_paths(languages):
    '''
    Score every path that goes from the first to the last language of the list.
    The direct path comes first, followed by the paths through one, two, ...
    intermediate languages. Intermediates are taken from the middle of the list
    and always keep the order they have there.

    Params:
    - languages: list of languages

    Return:
    - Dataframe with the score of each path on cossine similarity, euclidean distance and manhattan distance

    Example of usage:
    avaliate_possible_paths(['pt', 'en', 'es'])
    '''

    start, end = languages[0], languages[-1]
    middle = languages[1 : -1]

    paths = [[start, end]]
    for hops in range(1, len(languages) - 1):
        paths.extend([start, *chosen, end] for chosen in it.combinations(middle, hops))

    table = pd.DataFrame(
        data = [avaliate_path(p) for p in paths],
        columns = ['Cosine Similarity', 'Euclidean Distance', 'Manhattan Distance'],
        index = [RIGHT_ARROW.join(p) for p in paths]
    )

    return table


## 6 - Experiments

In [None]:
# Result tables of the experiments, in the order they are run.
EXPERIMENTS = []

### Experiment #1: Portuguese - English - Spanish
In this experiment, we intend to evaluate how good is a translation between two languages from the Latin group, such as Portuguese and Spanish, and if adding a language from an outer group, such as English from the West Germanic, affects the quality of the translation.

In [None]:
# Score the direct pt -> es path and the one that goes through en.
exp1 = avaliate_possible_paths(['pt', 'en', 'es'])
EXPERIMENTS.append(exp1)
exp1

### Experiment #2: Portuguese - French - Italian - Romanian - Spanish
In this experiment, we intend to evaluate translations between multiple languages from the same Latin group.

In [None]:
# Score every pt -> es path whose intermediate languages are taken, in order, from fr, it, ro.
exp2 = avaliate_possible_paths(['pt', 'fr', 'it', 'ro', 'es'])
EXPERIMENTS.append(exp2)
exp2

> Adding a test with English in the middle

In [None]:
# Score one extra path that places en among the intermediate languages.
path = ['pt', 'fr', 'it', 'en', 'ro', 'es']
exp_df = pd.DataFrame(
    data = [avaliate_path(path)], 
    index = [RIGHT_ARROW.join(path)], 
    columns = exp2.columns
)

# Add the extra path as a new row after the rows of exp2.
exp21 = pd.concat([exp2, exp_df])
EXPERIMENTS.append(exp21)
exp21

### Experiment #3: English - Swedish - Dutch - Danish - German
In this experiment, we intend to evaluate translations between multiple languages from the same Germanic group.

In [None]:
# Score every en -> de path whose intermediate languages are taken, in order, from sv, nl, da.
exp3 = avaliate_possible_paths(['en', 'sv', 'nl', 'da', 'de'])
EXPERIMENTS.append(exp3)
exp3

> Adding a test with Italian in the middle

In [None]:
# Score one extra path that places it among the intermediate languages.
path = ['en', 'sv', 'it', 'nl', 'da', 'de']
exp_df2 = pd.DataFrame(
    data = [avaliate_path(path)], 
    index = [RIGHT_ARROW.join(path)], 
    columns = exp3.columns
)

# Add the extra path as a new row after the rows of exp3.
exp31 = pd.concat([exp3, exp_df2])
EXPERIMENTS.append(exp31)
exp31

### Experiment #4: Portuguese - German - French - English - Spanish
In this experiment, we intend to evaluate translations using languages from different groups.

In [None]:
# Score every pt -> es path whose intermediate languages are taken, in order, from de, fr, en.
exp4 = avaliate_possible_paths(['pt', 'de', 'fr', 'en', 'es'])
EXPERIMENTS.append(exp4)
exp4

### Experiment #5: English - Spanish - Swedish - Italian - German
In this experiment, we intend to evaluate translations using languages from different groups.

In [None]:
# Score every en -> de path whose intermediate languages are taken, in order, from es, sv, it.
exp5 = avaliate_possible_paths(['en', 'es', 'sv', 'it', 'de'])
EXPERIMENTS.append(exp5)
exp5

### Experiment #6: English to Spanish through multiple languages

In [None]:
# Score the direct en -> es path and the one that goes through pt.
exp6 = avaliate_possible_paths(['en', 'pt', 'es'])
EXPERIMENTS.append(exp6)
exp6

In [None]:
# Score the direct en -> es path and the one that goes through de.
exp61 = avaliate_possible_paths(['en', 'de', 'es'])
EXPERIMENTS.append(exp61)
exp61

In [None]:
# Score the direct en -> es path and the one that goes through it.
exp62 = avaliate_possible_paths(['en', 'it', 'es'])
EXPERIMENTS.append(exp62)
exp62

### Saving results to spreadsheets

In [None]:
results_folder = 'Experiment results'

# Make sure the output folder exists before any spreadsheet is written.
os.makedirs(results_folder, exist_ok = True)

# enumerate supplies the file number. Looking it up with
# EXPERIMENTS.index(experiment) compares DataFrames with ==, which raises
# ValueError as soon as the experiment is not the first item of the list.
for number, experiment in enumerate(EXPERIMENTS, start = 1):
    experiment.to_excel(f'{results_folder}/Experiment {number}.xlsx')