# Translation of short phrases using an intermediate language:
## A thesis on leveraging interlingual approaches

## Libraries

In [6]:
import numpy as np
import pandas as pd
import itertools as it

from gensim.models import KeyedVectors

from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances

## 1 - Loading data
Loading the models and sentences used.
- Models: https://fasttext.cc/docs/en/crawl-vectors.html
- Sentences: https://github.com/alexa/massive

In [7]:
def load_files(model_path, sentences_path, limit = None):
    '''
    Load models from FastText folder and sentences from Amazon Massive folder.
    
    Params:
    - model_path: path to the folder containing all models used, i.e., FastText
    - sentences_path: path to the folder containing all sentences used, i.e., Amazon_Massive
    - limit: define a limit in case your have low computer power, e.g., 5000
    
    Return:
    Tuple containing the language model and its corresponding sentences
    '''

    model = KeyedVectors.load_word2vec_format(model_path, unicode_errors = 'replace', limit = limit)
    sentences = pd.read_json(sentences_path, lines = True)['utt']
    
    return model, sentences

Defining data path.

In [8]:
FASTTEXT_PATH = 'Datasets/FastText/'
MASSIVE_PATH = 'Datasets/Amazon_Massive/'

In [9]:
PATHS = {
    'de': [ FASTTEXT_PATH + 'cc.de.300.vec', MASSIVE_PATH + 'de-DE.jsonl' ],
    'en': [ FASTTEXT_PATH + 'cc.en.300.vec', MASSIVE_PATH + 'en-US.jsonl' ],
    'es': [ FASTTEXT_PATH + 'cc.es.300.vec', MASSIVE_PATH + 'es-ES.jsonl' ],
    'it': [ FASTTEXT_PATH + 'cc.it.300.vec', MASSIVE_PATH + 'it-IT.jsonl' ],
    'pt': [ FASTTEXT_PATH + 'cc.pt.300.vec', MASSIVE_PATH + 'pt-PT.jsonl' ],
    'sv': [ FASTTEXT_PATH + 'cc.sv.300.vec', MASSIVE_PATH + 'sv-SE.jsonl' ],
}

LANGUAGES = PATHS.keys()

**Note**: the cell below takes approximately 5 to 7 minutes per model, if no limit is set.

In [10]:
MODELS, SENTENCES = {}, {}

for language, value in PATHS.items():
    model = value[0]
    sentences = value[1]

    print(f'Loading {model}...')
    MODELS[language], SENTENCES[language] = load_files(model, sentences, None)
    print(f'Finished loading {model}\n')

print('\nAll models and sentences are now loaded!')

Loading Datasets/FastText/cc.es.300.vec...
Finished loading Datasets/FastText/cc.es.300.vec

Loading Datasets/FastText/cc.pt.300.vec...
Finished loading Datasets/FastText/cc.pt.300.vec


All models and sentences are now loaded!


## 2 - Preparing data

In [11]:
SAMPLES = { key: [] for key in LANGUAGES }

In [13]:
# Since all sentence files have the same length, we chose one at random for the range function.
# We prove this in the cell below
for idx in range(len(SENTENCES['pt'])):
    
    actual_sentence = { key: [] for key in LANGUAGES }
    
    try:
        for lang, sent in SENTENCES.items():
            for word in sent[idx].split(' '):
                actual_sentence[lang].append(MODELS[lang][word])

    except KeyError:
        continue
    
    for key, value in actual_sentence.items():
        SAMPLES[key].append([SENTENCES[key][idx], sum(value)])

In [14]:
for key in SENTENCES:
    SIZE_SAMPLES = len(SAMPLES[key])
    SIZE_SENTENCES = len(SENTENCES[key])
    print(
        f'Total sentences in { key } file: { SIZE_SENTENCES }'
        f' -> Model { key } samples: { SIZE_SAMPLES } ({ SIZE_SAMPLES / SIZE_SENTENCES * 100:.2f}%)'
    )

Total sentences in es file: 16521 -> Model es samples: 10796 (65.35%)
Total sentences in pt file: 16521 -> Model pt samples: 10796 (65.35%)


Splitting into train and test.

In [15]:
SPLIT_RATE = int(SIZE_SAMPLES * 0.7)

TRAIN_SET = { key: SAMPLES[key][:SPLIT_RATE] for key in LANGUAGES }
TEST_SET = { key: SAMPLES[key][SPLIT_RATE:] for key in LANGUAGES }

## 3 - Evaluating Control Group

Theorically speaking, translating the vector that one sentence represents to another should result in a similar sentence. For that purpose, we evaluate our results using the cosine similarity, which range is from -1 to 1.

In [16]:
TRANSLATIONS = { key: { lang: None for lang in LANGUAGES if lang != key } for key in LANGUAGES }

for origin, target in it.permutations(LANGUAGES, 2): 

    samples_origin = [sample[1] for sample in TRAIN_SET[origin]]
    samples_target = [sample[1] for sample in TRAIN_SET[target]]

    U, Sig, Vt = np.linalg.svd(np.transpose(samples_origin) @ samples_target)
    
    TRANSLATOR = np.transpose(Vt) @ np.transpose(U)
    TRANSLATIONS[origin][target] = TRANSLATOR

In [38]:
def evaluate_single_cosine_similarity(origin_lang, target_lang):
    '''
    Evaluate cosine similarity between single sentences.
    Cosine similarity has an interval from -1 to 1, and the closer to 1 the value is, more similar the params are.

    Params:
    - origin_lang: language in which the words in word_list are written
    - target_lang: language you wish to know the translation

    Example of usage:
    evaluate_single_cosine_similarity('pt', 'en')
    '''
    
    for index in range(9):
        print('Original sentence:', TEST_SET[origin_lang][index][0])
        print('Target sentence:', TEST_SET[target_lang][index][0])

        vector_translated = TRANSLATIONS[origin_lang][target_lang] @ TEST_SET[origin_lang][index][1]
        vector_target = TEST_SET[target_lang][index][1]
        
        similarities = [(s, cosine_similarity([vector_translated], [v])) for s, v in SAMPLES[target_lang]]
        most_similar = max(similarities, key= lambda x: x[1])
        print('Generated sentence:', most_similar[0])
        print('Cossine similarity to generated:', most_similar[1][0][0])

        print("Cossine similarity to target:", cosine_similarity([vector_translated], [vector_target])[0][0], "\n")

### Portuguese -> Spanish

In [39]:
evaluate_single_cosine_similarity('pt', 'es')

Original sentence: toca a próxima
Target sentence: reproduce la siguiente
Generated sentence: toca la canción siguiente
Cossine similarity to generated: 0.87800485
Cossine similarity to target: 0.8331171 

Original sentence: play podcast da minha biblioteca
Target sentence: reproducir un podcast de mi biblioteca
Generated sentence: quita la leche de mi lista de la compra
Cossine similarity to generated: 0.76390374
Cossine similarity to target: 0.6227994 

Original sentence: encontre um debate sobre escolas em portugal
Target sentence: encuentra un debate sobre los colegios en reino unido
Generated sentence: artículos de noticias sobre un tema en particular
Cossine similarity to generated: 0.8785187
Cossine similarity to target: 0.8754334 

Original sentence: salta para a seguinte
Target sentence: salta al siguiente
Generated sentence: cambiar la alarma para que comience a la medianoche
Cossine similarity to generated: 0.85757124
Cossine similarity to target: 0.5795974 

Original senten

### English -> German

In [23]:
evaluate_single_cosine_similarity('en', 'de')

next episode in podcast -> nächste episode im podcast
Cossine similarity: 0.7473393 

move forward to the next episode -> fahre zur nächsten episode fort
Cossine similarity: 0.72413206 

skip ahead to the next podcast -> überspringe zum nächsten podcast
Cossine similarity: 0.7075042 

jump to the next podcast -> springe zum nächsten podcast
Cossine similarity: 0.7180757 

play stuff you should know -> spiele sachen die du wissen solltest
Cossine similarity: 0.8636745 

please play the next episode -> spiel bitte die nächste episode
Cossine similarity: 0.85040724 

play the next episode of this podcast -> spiel die nächste episode von dem podcast
Cossine similarity: 0.86628777 

skip to the next episode -> überspringe zur nächsten episode
Cossine similarity: 0.7179898 

skip this episode -> überspringe diese folge
Cossine similarity: 0.5980465 



### Avaliating path
We use the following metrics for that purpose:
- Cosine similarity
- Euclidean distance

In [24]:
def pairwise(iterable):
    '''
    Return successive overlapping pairs taken from the input iterable.
    The number of 2-tuples in the output iterator will be one fewer than the number of inputs. 
    It will be empty if the input iterable has fewer than two values.
    pairwise('ABCDEFG') --> AB BC CD DE EF FG

    Source: https://docs.python.org/3/library/itertools.html#itertools.pairwise
    '''
    a, b = it.tee(iterable)
    next(b, None)
    return zip(a, b)

In [25]:
def avaliate_path(path):
    '''
    Avaliate the translation path using cosine similarity, euclidean distance and manhattan distance.

    Params:
    - path: path of desired translation
    
    Return:
    - Score of each avaliation method
    
    Example of usage:
    avaliate_path(['pt', 'en', 'es'])
    '''
    
    translation_matrix = np.identity(300)

    for (origin, target) in pairwise(path):
        translation_matrix = TRANSLATIONS[origin][target] @ translation_matrix
        evaluate_single_cosine_similarity(origin, target)
    
    vectors = [translation_matrix @ v for _, v in TEST_SET[path[0]] ]
    vectors_target = [v for _, v in TEST_SET[path[-1]]]
    
    mean_cos_sim = sum([cosine_similarity([v1], [v2]) for v1, v2 in zip(vectors, vectors_target)])/ len(vectors)
    mean_euc_dist = sum([euclidean_distances([v1], [v2]) for v1, v2 in zip(vectors, vectors_target)])/ len(vectors)
    
    return mean_cos_sim[0][0], mean_euc_dist[0][0]

In [26]:
def avaliate_possible_paths(languages):
    '''
    Avaliate possible paths from first to last language on the list, changing the languages in the middle
    
    Params:
    - languages: list of languages
    
    Return:
    - Dataframe with the score of each path on cossine similarity, euclidean distance and manhattan distance
    
    Example of usage:
    avaliate_paths(['pt', 'en', 'es'])
    '''
    
    start = languages[0]
    end = languages[-1]
    
    paths = [[start, end]]
    for i in range(len(languages) - 2):
        for comb in it.combinations(languages[1: -1], i + 1):
            paths.append([start] + list(comb) + [end])
    
    scores = [avaliate_path(p) for p in paths]
    index = [ ' -> '.join(p) for p in paths]

    return pd.DataFrame(data=scores, columns=['Cosine Similarity', 'Euclidean Distance'], index = index)


## 4 - Study Cases

### Study Case #1: Portuguese - Italian - Spanish
In this case, we intend to evaluate how good is a translation between two languages from the Romance language family, such as Portuguese and Spanish, and if adding a language from the same family, such as Italian, affects the quality of the translation.

In [27]:
exp1 = avaliate_possible_paths(['pt', 'it', 'es'])
exp1

próximo episódio no podcast -> siguiente episodio nel podcast
Cossine similarity: 0.49552757 

passa para o próximo episódio -> avanzar al próximo episodio
Cossine similarity: 0.74353737 

salta para o próximo podcast -> pasar al siguiente podcast
Cossine similarity: 0.74962056 

passa para o próximo podcast -> saltar al siguiente podcast
Cossine similarity: 0.7128504 

tocar separados de fresco -> jugar cosa que deberías saber
Cossine similarity: 0.2803227 

por favor põe o próximo episódio -> por favor reproduce el próximo episodio
Cossine similarity: 0.87914884 

põe o próximo episódio deste podcast -> pon el siguiente episodio de este podcast
Cossine similarity: 0.86633617 

passa para o próximo episódio -> pasar al siguiente episodio
Cossine similarity: 0.73972523 

passa este episódio -> saltar este episodio
Cossine similarity: 0.73970485 

próximo episódio no podcast -> prossimo episodio del podcast
Cossine similarity: 0.73532826 

passa para o próximo episódio -> passiamo al pr

Unnamed: 0,Cosine Similarity,Euclidean Distance
pt -> es,0.759271,4.143797
pt -> it -> es,0.747151,4.229964


### Study Case #2: English - Swedish - German
In this case, we intend to evaluate how good is a translation between two languages from the Anglo-Saxon language family, such as English and German, and if adding a language from the same family, such as Swedish, affects the quality of the translation.

In [28]:
exp2 = avaliate_possible_paths(['en', 'sv', 'de'])
exp2

next episode in podcast -> nächste episode im podcast
Cossine similarity: 0.7473393 

move forward to the next episode -> fahre zur nächsten episode fort
Cossine similarity: 0.72413206 

skip ahead to the next podcast -> überspringe zum nächsten podcast
Cossine similarity: 0.7075042 

jump to the next podcast -> springe zum nächsten podcast
Cossine similarity: 0.7180757 

play stuff you should know -> spiele sachen die du wissen solltest
Cossine similarity: 0.8636745 

please play the next episode -> spiel bitte die nächste episode
Cossine similarity: 0.85040724 

play the next episode of this podcast -> spiel die nächste episode von dem podcast
Cossine similarity: 0.86628777 

skip to the next episode -> überspringe zur nächsten episode
Cossine similarity: 0.7179898 

skip this episode -> überspringe diese folge
Cossine similarity: 0.5980465 

next episode in podcast -> nästa avsnitt av podcasten
Cossine similarity: 0.61435306 

move forward to the next episode -> gå vidare till nästa

Unnamed: 0,Cosine Similarity,Euclidean Distance
en -> de,0.749783,4.820215
en -> sv -> de,0.737107,4.912853


### Study Case #3: Portuguese - English - Spanish
In this case, we intend to evaluate how good is a translation between two languages from the Romance language family, such as Portuguese and Spanish, and if adding a language from an outer group, such as English from the Anglo-Saxon, affects the quality of the translation.

In [29]:
exp3 = avaliate_possible_paths(['pt', 'en', 'es'])
exp3

próximo episódio no podcast -> siguiente episodio nel podcast
Cossine similarity: 0.49552757 

passa para o próximo episódio -> avanzar al próximo episodio
Cossine similarity: 0.74353737 

salta para o próximo podcast -> pasar al siguiente podcast
Cossine similarity: 0.74962056 

passa para o próximo podcast -> saltar al siguiente podcast
Cossine similarity: 0.7128504 

tocar separados de fresco -> jugar cosa que deberías saber
Cossine similarity: 0.2803227 

por favor põe o próximo episódio -> por favor reproduce el próximo episodio
Cossine similarity: 0.87914884 

põe o próximo episódio deste podcast -> pon el siguiente episodio de este podcast
Cossine similarity: 0.86633617 

passa para o próximo episódio -> pasar al siguiente episodio
Cossine similarity: 0.73972523 

passa este episódio -> saltar este episodio
Cossine similarity: 0.73970485 

próximo episódio no podcast -> next episode in podcast
Cossine similarity: 0.739766 

passa para o próximo episódio -> move forward to the ne

Unnamed: 0,Cosine Similarity,Euclidean Distance
pt -> es,0.759271,4.143797
pt -> en -> es,0.732372,4.375051


### Study Case #4: English - Portuguese - German
In this case, we intend to evaluate how good is a translation between two languages from the Anglo-Saxon language family, such as English and German, and if adding a language from an outer group, such as Portuguese from the Romance, affects the quality of the translation.

In [30]:
exp4 = avaliate_possible_paths(['pt', 'en', 'es'])
exp4

próximo episódio no podcast -> siguiente episodio nel podcast
Cossine similarity: 0.49552757 

passa para o próximo episódio -> avanzar al próximo episodio
Cossine similarity: 0.74353737 

salta para o próximo podcast -> pasar al siguiente podcast
Cossine similarity: 0.74962056 

passa para o próximo podcast -> saltar al siguiente podcast
Cossine similarity: 0.7128504 

tocar separados de fresco -> jugar cosa que deberías saber
Cossine similarity: 0.2803227 

por favor põe o próximo episódio -> por favor reproduce el próximo episodio
Cossine similarity: 0.87914884 

põe o próximo episódio deste podcast -> pon el siguiente episodio de este podcast
Cossine similarity: 0.86633617 

passa para o próximo episódio -> pasar al siguiente episodio
Cossine similarity: 0.73972523 

passa este episódio -> saltar este episodio
Cossine similarity: 0.73970485 

próximo episódio no podcast -> next episode in podcast
Cossine similarity: 0.739766 

passa para o próximo episódio -> move forward to the ne

Unnamed: 0,Cosine Similarity,Euclidean Distance
pt -> es,0.759271,4.143797
pt -> en -> es,0.732372,4.375051


## Experiments

### Experiment #1: Portuguese - Spanish - German
In this experiment, we intend to evaluate translations using languages from different groups.

In [31]:
exp5 = avaliate_possible_paths(['pt', 'es', 'de'])
exp5

próximo episódio no podcast -> nächste episode im podcast
Cossine similarity: 0.7291949 

passa para o próximo episódio -> fahre zur nächsten episode fort
Cossine similarity: 0.5995621 

salta para o próximo podcast -> überspringe zum nächsten podcast
Cossine similarity: 0.7491197 

passa para o próximo podcast -> springe zum nächsten podcast
Cossine similarity: 0.7427332 

tocar separados de fresco -> spiele sachen die du wissen solltest
Cossine similarity: 0.45921522 

por favor põe o próximo episódio -> spiel bitte die nächste episode
Cossine similarity: 0.77984095 

põe o próximo episódio deste podcast -> spiel die nächste episode von dem podcast
Cossine similarity: 0.77985924 

passa para o próximo episódio -> überspringe zur nächsten episode
Cossine similarity: 0.558504 

passa este episódio -> überspringe diese folge
Cossine similarity: 0.5666073 

próximo episódio no podcast -> siguiente episodio nel podcast
Cossine similarity: 0.49552757 

passa para o próximo episódio -> avan

Unnamed: 0,Cosine Similarity,Euclidean Distance
pt -> de,0.668403,4.698783
pt -> es -> de,0.662016,4.732067


### Experiment #2: Portuguese - Swedish - German 

In [32]:
exp6 = avaliate_possible_paths(['pt', 'sv', 'de'])
exp6

próximo episódio no podcast -> nächste episode im podcast
Cossine similarity: 0.7291949 

passa para o próximo episódio -> fahre zur nächsten episode fort
Cossine similarity: 0.5995621 

salta para o próximo podcast -> überspringe zum nächsten podcast
Cossine similarity: 0.7491197 

passa para o próximo podcast -> springe zum nächsten podcast
Cossine similarity: 0.7427332 

tocar separados de fresco -> spiele sachen die du wissen solltest
Cossine similarity: 0.45921522 

por favor põe o próximo episódio -> spiel bitte die nächste episode
Cossine similarity: 0.77984095 

põe o próximo episódio deste podcast -> spiel die nächste episode von dem podcast
Cossine similarity: 0.77985924 

passa para o próximo episódio -> überspringe zur nächsten episode
Cossine similarity: 0.558504 

passa este episódio -> überspringe diese folge
Cossine similarity: 0.5666073 

próximo episódio no podcast -> nästa avsnitt av podcasten
Cossine similarity: 0.48663807 

passa para o próximo episódio -> gå vidar

Unnamed: 0,Cosine Similarity,Euclidean Distance
pt -> de,0.668403,4.698783
pt -> sv -> de,0.65375,4.785809


### Experiment #3: English - Swedish - Italian

In [33]:
exp7 = avaliate_possible_paths(['en', 'sv', 'it'])
exp7

next episode in podcast -> prossimo episodio del podcast
Cossine similarity: 0.60514194 

move forward to the next episode -> passiamo al prossimo episodio
Cossine similarity: 0.7117564 

skip ahead to the next podcast -> salta al prossimo podcast
Cossine similarity: 0.72894955 

jump to the next podcast -> passa al podcast successivo
Cossine similarity: 0.73135567 

play stuff you should know -> riproduci cose che dovresti sapere
Cossine similarity: 0.7421476 

please play the next episode -> riproduci il prossimo episodio
Cossine similarity: 0.71181726 

play the next episode of this podcast -> riproduci il prossimo episodio di questo podcast
Cossine similarity: 0.81085324 

skip to the next episode -> salta il prossimo episodio
Cossine similarity: 0.74048585 

skip this episode -> salta questo episodio
Cossine similarity: 0.6070248 

next episode in podcast -> nästa avsnitt av podcasten
Cossine similarity: 0.61435306 

move forward to the next episode -> gå vidare till nästa avsnitt

Unnamed: 0,Cosine Similarity,Euclidean Distance
en -> it,0.706648,5.405672
en -> sv -> it,0.690093,5.535204


### Experiment #4: English - Spanish - Italian

In [34]:
exp8 = avaliate_possible_paths(['en', 'es', 'it'])
exp8

next episode in podcast -> prossimo episodio del podcast
Cossine similarity: 0.60514194 

move forward to the next episode -> passiamo al prossimo episodio
Cossine similarity: 0.7117564 

skip ahead to the next podcast -> salta al prossimo podcast
Cossine similarity: 0.72894955 

jump to the next podcast -> passa al podcast successivo
Cossine similarity: 0.73135567 

play stuff you should know -> riproduci cose che dovresti sapere
Cossine similarity: 0.7421476 

please play the next episode -> riproduci il prossimo episodio
Cossine similarity: 0.71181726 

play the next episode of this podcast -> riproduci il prossimo episodio di questo podcast
Cossine similarity: 0.81085324 

skip to the next episode -> salta il prossimo episodio
Cossine similarity: 0.74048585 

skip this episode -> salta questo episodio
Cossine similarity: 0.6070248 

next episode in podcast -> siguiente episodio nel podcast
Cossine similarity: 0.37801433 

move forward to the next episode -> avanzar al próximo episo

Unnamed: 0,Cosine Similarity,Euclidean Distance
en -> it,0.706648,5.405672
en -> es -> it,0.699319,5.459113


## Saving results to spreadsheet

In [35]:
path = 'results.xlsx'
with pd.ExcelWriter(path) as writer:
    exp1.to_excel(writer, sheet_name = 'Study case 1')
    exp2.to_excel(writer, sheet_name = 'Study case 2')
    exp3.to_excel(writer, sheet_name = 'Study case 3')
    exp4.to_excel(writer, sheet_name = 'Study case 4')
    exp5.to_excel(writer, sheet_name = 'Experiment 1')
    exp6.to_excel(writer, sheet_name = 'Experiment 2')
    exp7.to_excel(writer, sheet_name = 'Experiment 3')
    exp8.to_excel(writer, sheet_name = 'Experiment 4')