# Translation of short phrases using an intermediate language:
## A thesis on leveraging interlingual approaches

## Libraries

In [1]:
import numpy as np
import pandas as pd
import itertools as it

from gensim.models import KeyedVectors

from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from nltk.translate.bleu_score import SmoothingFunction, corpus_bleu, sentence_bleu #pip install nltk

In [2]:
ref = ["estou fazendo um teste".split()]
frase = "fazendo um teste".split()
cc = SmoothingFunction() #https://github.com/alvations/nltk/blob/develop/nltk/translate/bleu_score.py#L425
sentence_bleu(ref, frase, weights = (0, 1, 0), smoothing_function = cc.method4)

0.7165313105737893

## 1 - Loading data
Loading the models and sentences used.
- Models: https://fasttext.cc/docs/en/crawl-vectors.html
- Sentences: https://github.com/alexa/massive

In [3]:
def load_files(model_path, sentences_path, limit = None):
    '''
    Load models from FastText folder and sentences from Amazon Massive folder.
    
    Params:
    - model_path: path to the folder containing all models used, i.e., FastText
    - sentences_path: path to the folder containing all sentences used, i.e., Amazon_Massive
    - limit: define a limit in case your have low computer power, e.g., 5000
    
    Return:
    Tuple containing the language model and its corresponding sentences
    '''

    model = KeyedVectors.load_word2vec_format(model_path, unicode_errors = 'replace', limit = limit)
    sentences = pd.read_json(sentences_path, lines = True)['utt']
    
    return model, sentences

Defining data path.

In [4]:
FASTTEXT_PATH = 'Datasets/FastText/'
MASSIVE_PATH = 'Datasets/Amazon_Massive/'

In [5]:
PATHS = {
    'de': [ FASTTEXT_PATH + 'cc.de.300.vec', MASSIVE_PATH + 'de-DE.jsonl' ],
    'en': [ FASTTEXT_PATH + 'cc.en.300.vec', MASSIVE_PATH + 'en-US.jsonl' ],
    'es': [ FASTTEXT_PATH + 'cc.es.300.vec', MASSIVE_PATH + 'es-ES.jsonl' ],
    'it': [ FASTTEXT_PATH + 'cc.it.300.vec', MASSIVE_PATH + 'it-IT.jsonl' ],
    'pt': [ FASTTEXT_PATH + 'cc.pt.300.vec', MASSIVE_PATH + 'pt-PT.jsonl' ],
    'sv': [ FASTTEXT_PATH + 'cc.sv.300.vec', MASSIVE_PATH + 'sv-SE.jsonl' ],
}

LANGUAGES = PATHS.keys()

**Note**: the cell below takes approximately 5 to 7 minutes per model, if no limit is set.

In [6]:
MODELS, SENTENCES = {}, {}

for language, value in PATHS.items():
    model = value[0]
    sentences = value[1]

    print(f'Loading {model}...')
    MODELS[language], SENTENCES[language] = load_files(model, sentences)
    print(f'Finished loading {model}\n')

print('\nAll models and sentences are now loaded!')

Loading Datasets/FastText/cc.de.300.vec...
Finished loading Datasets/FastText/cc.de.300.vec

Loading Datasets/FastText/cc.en.300.vec...
Finished loading Datasets/FastText/cc.en.300.vec

Loading Datasets/FastText/cc.es.300.vec...
Finished loading Datasets/FastText/cc.es.300.vec

Loading Datasets/FastText/cc.it.300.vec...
Finished loading Datasets/FastText/cc.it.300.vec

Loading Datasets/FastText/cc.pt.300.vec...
Finished loading Datasets/FastText/cc.pt.300.vec

Loading Datasets/FastText/cc.sv.300.vec...
Finished loading Datasets/FastText/cc.sv.300.vec


All models and sentences are now loaded!


## 2 - Preparing data

In [7]:
SAMPLES = { key: [] for key in LANGUAGES }

In [8]:
# Since all sentence files have the same length, we chose one at random for the range function.
# We prove this in the cell below
for idx in range(len(SENTENCES['pt'])):
    
    actual_sentence = { key: [] for key in LANGUAGES }
    
    try:
        for lang, sent in SENTENCES.items():
            for word in sent[idx].split(' '):
                actual_sentence[lang].append(MODELS[lang][word])

    except KeyError:
        continue
    
    for key, value in actual_sentence.items():
        SAMPLES[key].append([SENTENCES[key][idx], sum(value)])

In [9]:
for key in SENTENCES:
    SIZE_SAMPLES = len(SAMPLES[key])
    SIZE_SENTENCES = len(SENTENCES[key])
    print(
        f'Total sentences in { key } file: { SIZE_SENTENCES }'
        f' -> Model { key } samples: { SIZE_SAMPLES } ({ SIZE_SAMPLES / SIZE_SENTENCES * 100:.2f}%)'
    )

Total sentences in de file: 16521 -> Model de samples: 10941 (66.22%)
Total sentences in en file: 16521 -> Model en samples: 10941 (66.22%)
Total sentences in es file: 16521 -> Model es samples: 10941 (66.22%)
Total sentences in it file: 16521 -> Model it samples: 10941 (66.22%)
Total sentences in pt file: 16521 -> Model pt samples: 10941 (66.22%)
Total sentences in sv file: 16521 -> Model sv samples: 10941 (66.22%)


Splitting into train and test.

In [10]:
SPLIT_RATE = int(SIZE_SAMPLES * 0.7)

TRAIN_SET = { key: SAMPLES[key][:SPLIT_RATE] for key in LANGUAGES }
TEST_SET = { key: SAMPLES[key][SPLIT_RATE:] for key in LANGUAGES }

## 3 - Evaluating Control Group

Theorically speaking, translating the vector that one sentence represents to another should result in a similar sentence. For that purpose, we evaluate our results using the cosine similarity, which range is from -1 to 1.

In [11]:
TRANSLATIONS = { key: { lang: None for lang in LANGUAGES if lang != key } for key in LANGUAGES }

for origin, target in it.permutations(LANGUAGES, 2): 

    samples_origin = [sample[1] for sample in TRAIN_SET[origin]]
    samples_target = [sample[1] for sample in TRAIN_SET[target]]

    U, Sig, Vt = np.linalg.svd(np.transpose(samples_origin) @ samples_target)
    
    TRANSLATOR = np.transpose(Vt) @ np.transpose(U)
    TRANSLATIONS[origin][target] = TRANSLATOR

In [12]:
def evaluate_single_cosine_similarity(origin_lang, target_lang):
    '''
    Evaluate cosine similarity between single sentences.
    Cosine similarity has an interval from -1 to 1, and the closer to 1 the value is, more similar the params are.

    Params:
    - origin_lang: language in which the words in word_list are written
    - target_lang: language you wish to know the translation

    Example of usage:
    evaluate_single_cosine_similarity('pt', 'en')
    '''
    cc = SmoothingFunction()
    for index in range(10):
        print('Original sentence:', TEST_SET[origin_lang][index][0])
        print('Target sentence:', TEST_SET[target_lang][index][0])

        vector_translated = TRANSLATIONS[origin_lang][target_lang] @ TEST_SET[origin_lang][index][1]
        vector_target = TEST_SET[target_lang][index][1]
        
        vectors = [v for s, v in SAMPLES[target_lang]]
        strings = [s for s, v in SAMPLES[target_lang]]
        most_similar = max(zip(cosine_similarity([vector_translated], vectors)[0], strings))
        print('Generated sentence:', most_similar[1])
        
        BLEU_score = sentence_bleu([TEST_SET[target_lang][index][0].split()], most_similar[1].split(), weights=(1, 0, 0, 0), smoothing_function=cc.method4)
        print('BLEU score:', BLEU_score)
        
        print('Cossine similarity to generated:', most_similar[0])
        print("Cossine similarity to target:", cosine_similarity([vector_translated], [vector_target])[0][0], "\n")

### Portuguese -> Spanish

In [13]:
teste = [v for s, v in SAMPLES['pt']][:3]
padrao = teste[0]
cosine_similarity([padrao], teste)[0]

array([1.       , 0.931034 , 0.6409205], dtype=float32)

In [14]:
evaluate_single_cosine_similarity('pt', 'es')

Original sentence: continua para o próximo podcast
Target sentence: continuar con el siguiente podcast
Generated sentence: iniciar el próximo podcast
BLEU score: 0.38940039153570244
Cossine similarity to generated: 0.88823247
Cossine similarity to target: 0.8825095 

Original sentence: começa o próximo episódio do podcast
Target sentence: empieza el siguiente episodio del podcast
Generated sentence: comienza el siguiente episodio del podcast
BLEU score: 0.8333333333333334
Cossine similarity to generated: 0.9409791
Cossine similarity to target: 0.9405708 

Original sentence: reproduz o meu podcast favorito por favor
Target sentence: pon mi podcast favorito por favor
Generated sentence: por favor modifique mi calendario con este evento
BLEU score: 0.375
Cossine similarity to generated: 0.88078916
Cossine similarity to target: 0.8463628 

Original sentence: inicia o podcast esta semana para o jantar
Target sentence: comienza esta semana para cenar el podcast
Generated sentence: comienza e

### English -> German

In [15]:
evaluate_single_cosine_similarity('en', 'de')

Original sentence: continue to next podcast
Target sentence: fahre mit dem nächsten podcast fort
Generated sentence: beginn die nächste episode zu spielen
BLEU score: 0
Cossine similarity to generated: 0.851653
Cossine similarity to target: 0.6785552 

Original sentence: start next podcast episode
Target sentence: starte die nächste folge des podcasts
Generated sentence: folgende podcast episode starten
BLEU score: 0
Cossine similarity to generated: 0.715421
Cossine similarity to target: 0.6182311 

Original sentence: play my favorite podcast please
Target sentence: spiel bitte meinen lieblings podcast
Generated sentence: spiel bitte meinen lieblings podcast
BLEU score: 1.0
Cossine similarity to generated: 0.8784077
Cossine similarity to target: 0.8784077 

Original sentence: start this week for dinner podcast
Target sentence: starte den this week for dinner podcast
Generated sentence: zeitplan für diese woche
BLEU score: 0
Cossine similarity to generated: 0.805292
Cossine similarity t

### Avaliating path
We use the following metrics for that purpose:
- Cosine similarity
- Euclidean distance

In [16]:
def pairwise(iterable):
    '''
    Return successive overlapping pairs taken from the input iterable.
    The number of 2-tuples in the output iterator will be one fewer than the number of inputs. 
    It will be empty if the input iterable has fewer than two values.
    pairwise('ABCDEFG') --> AB BC CD DE EF FG

    Source: https://docs.python.org/3/library/itertools.html#itertools.pairwise
    '''
    a, b = it.tee(iterable)
    next(b, None)
    return zip(a, b)

In [20]:
def avaliate_path(path, debug = True):
    '''
    Avaliate the translation path using cosine similarity, euclidean distance.

    Params:
    - path: path of desired translation
    
    Return:
    - Score of each avaliation method
    
    Example of usage:
    avaliate_path(['pt', 'en', 'es'])
    '''
    
    translation_matrix = np.identity(300)

    for (origin, target) in pairwise(path):
        translation_matrix = TRANSLATIONS[origin][target] @ translation_matrix
        if debug:
            evaluate_single_cosine_similarity(origin, target)
    
    vectors = [translation_matrix @ v for _, v in TEST_SET[path[0]] ]
    vectors_target = [v for _, v in TEST_SET[path[-1]]]
    
    mean_cos_sim = sum([cosine_similarity([v1], [v2]) for v1, v2 in zip(vectors, vectors_target)])/ len(vectors)
    mean_euc_dist = sum([euclidean_distances([v1], [v2]) for v1, v2 in zip(vectors, vectors_target)])/ len(vectors)
    
    mean_bleu_score = 0
    for i, (vo, vt) in enumerate( zip(vectors, vectors_target) ):
        target_lang = path[-1]
        
        all_vectors = [v for s, v in SAMPLES[target_lang]]
        all_strings = [s for s, v in SAMPLES[target_lang]]
        generated_sentence = max(zip(cosine_similarity([vo], all_vectors)[0], all_strings))[1]
        
        original_sentence = TEST_SET[path[0]][i][0]
        target_sentence = TEST_SET[path[-1]][i][0]
        mean_bleu_score += sentence_bleu([target_sentence.split()], generated_sentence.split(), weights=(1, 0, 0, 0), smoothing_function=cc.method4)
        
        # print(original_sentence, [target_sentence.split()], generated_sentence, bleu_score, sep='|')
    
    return mean_cos_sim[0][0], mean_euc_dist[0][0], mean_bleu_score/len(vectors)
    # return mean_cos_sim[0][0], mean_euc_dist[0][0]

In [21]:
def avaliate_possible_paths(languages):
    '''
    Avaliate possible paths from first to last language on the list, changing the languages in the middle
    
    Params:
    - languages: list of languages
    
    Return:
    - Dataframe with the score of each path on cossine similarity, euclidean distance and manhattan distance
    
    Example of usage:
    avaliate_paths(['pt', 'en', 'es'])
    '''
    
    start = languages[0]
    end = languages[-1]
    
    paths = [[start, end]]
    for i in range(len(languages) - 2):
        for comb in it.combinations(languages[1: -1], i + 1):
            paths.append([start] + list(comb) + [end])
    
    scores = [avaliate_path(p) for p in paths]
    index = [ ' -> '.join(p) for p in paths]

    return pd.DataFrame(data = scores, columns = ['Cosine Similarity', 'Euclidean Distance', 'Bleu Score'], index = index)
    # return pd.DataFrame(data = scores, columns = ['Cosine Similarity', 'Euclidean Distance'], index = index)

## 4 - Study Cases

### Study Case #1: Portuguese - Italian - Spanish
In this case, we intend to evaluate how good is a translation between two languages from the Romance language family, such as Portuguese and Spanish, and if adding a language from the same family, such as Italian, affects the quality of the translation.

In [22]:
exp1 = avaliate_possible_paths(['pt', 'it', 'es'])
exp1

Original sentence: continua para o próximo podcast
Target sentence: continuar con el siguiente podcast
Generated sentence: iniciar el próximo podcast
BLEU score: 0.38940039153570244
Cossine similarity to generated: 0.88823247
Cossine similarity to target: 0.8825095 

Original sentence: começa o próximo episódio do podcast
Target sentence: empieza el siguiente episodio del podcast
Generated sentence: comienza el siguiente episodio del podcast
BLEU score: 0.8333333333333334
Cossine similarity to generated: 0.9409791
Cossine similarity to target: 0.9405708 

Original sentence: reproduz o meu podcast favorito por favor
Target sentence: pon mi podcast favorito por favor
Generated sentence: por favor modifique mi calendario con este evento
BLEU score: 0.375
Cossine similarity to generated: 0.88078916
Cossine similarity to target: 0.8463628 

Original sentence: inicia o podcast esta semana para o jantar
Target sentence: comienza esta semana para cenar el podcast
Generated sentence: comienza e

Unnamed: 0,Cosine Similarity,Euclidean Distance,Bleu Score
pt -> es,0.761036,4.187897,0.412073
pt -> it -> es,0.748819,4.274804,0.392679


### Study Case #2: English - Swedish - German
In this case, we intend to evaluate how good is a translation between two languages from the Anglo-Saxon language family, such as English and German, and if adding a language from the same family, such as Swedish, affects the quality of the translation.

In [23]:
exp2 = avaliate_possible_paths(['en', 'sv', 'de'])
exp2

Original sentence: continue to next podcast
Target sentence: fahre mit dem nächsten podcast fort
Generated sentence: beginn die nächste episode zu spielen
BLEU score: 0
Cossine similarity to generated: 0.851653
Cossine similarity to target: 0.6785552 

Original sentence: start next podcast episode
Target sentence: starte die nächste folge des podcasts
Generated sentence: folgende podcast episode starten
BLEU score: 0
Cossine similarity to generated: 0.715421
Cossine similarity to target: 0.6182311 

Original sentence: play my favorite podcast please
Target sentence: spiel bitte meinen lieblings podcast
Generated sentence: spiel bitte meinen lieblings podcast
BLEU score: 1.0
Cossine similarity to generated: 0.8784077
Cossine similarity to target: 0.8784077 

Original sentence: start this week for dinner podcast
Target sentence: starte den this week for dinner podcast
Generated sentence: zeitplan für diese woche
BLEU score: 0
Cossine similarity to generated: 0.805292
Cossine similarity t

Unnamed: 0,Cosine Similarity,Euclidean Distance,Bleu Score
en -> de,0.751231,4.850476,0.32094
en -> sv -> de,0.737642,4.951251,0.288412


### Study Case #3: Portuguese - English - Spanish
In this case, we intend to evaluate how good is a translation between two languages from the Romance language family, such as Portuguese and Spanish, and if adding a language from an outer group, such as English from the Anglo-Saxon, affects the quality of the translation.

In [24]:
exp3 = avaliate_possible_paths(['pt', 'en', 'es'])
exp3

Original sentence: continua para o próximo podcast
Target sentence: continuar con el siguiente podcast
Generated sentence: iniciar el próximo podcast
BLEU score: 0.38940039153570244
Cossine similarity to generated: 0.88823247
Cossine similarity to target: 0.8825095 

Original sentence: começa o próximo episódio do podcast
Target sentence: empieza el siguiente episodio del podcast
Generated sentence: comienza el siguiente episodio del podcast
BLEU score: 0.8333333333333334
Cossine similarity to generated: 0.9409791
Cossine similarity to target: 0.9405708 

Original sentence: reproduz o meu podcast favorito por favor
Target sentence: pon mi podcast favorito por favor
Generated sentence: por favor modifique mi calendario con este evento
BLEU score: 0.375
Cossine similarity to generated: 0.88078916
Cossine similarity to target: 0.8463628 

Original sentence: inicia o podcast esta semana para o jantar
Target sentence: comienza esta semana para cenar el podcast
Generated sentence: comienza e

Unnamed: 0,Cosine Similarity,Euclidean Distance,Bleu Score
pt -> es,0.761036,4.187897,0.412073
pt -> en -> es,0.735565,4.411218,0.394879


### Study Case #4: English - Portuguese - German
In this case, we intend to evaluate how good is a translation between two languages from the Anglo-Saxon language family, such as English and German, and if adding a language from an outer group, such as Portuguese from the Romance, affects the quality of the translation.

In [25]:
exp4 = avaliate_possible_paths(['pt', 'en', 'es'])
exp4

Original sentence: continua para o próximo podcast
Target sentence: continuar con el siguiente podcast
Generated sentence: iniciar el próximo podcast
BLEU score: 0.38940039153570244
Cossine similarity to generated: 0.88823247
Cossine similarity to target: 0.8825095 

Original sentence: começa o próximo episódio do podcast
Target sentence: empieza el siguiente episodio del podcast
Generated sentence: comienza el siguiente episodio del podcast
BLEU score: 0.8333333333333334
Cossine similarity to generated: 0.9409791
Cossine similarity to target: 0.9405708 

Original sentence: reproduz o meu podcast favorito por favor
Target sentence: pon mi podcast favorito por favor
Generated sentence: por favor modifique mi calendario con este evento
BLEU score: 0.375
Cossine similarity to generated: 0.88078916
Cossine similarity to target: 0.8463628 

Original sentence: inicia o podcast esta semana para o jantar
Target sentence: comienza esta semana para cenar el podcast
Generated sentence: comienza e

Unnamed: 0,Cosine Similarity,Euclidean Distance,Bleu Score
pt -> es,0.761036,4.187897,0.412073
pt -> en -> es,0.735565,4.411218,0.394879


## Experiments

### Experiment #1: Portuguese - Spanish - German
In this experiment, we intend to evaluate translations using languages from different groups.

In [26]:
exp5 = avaliate_possible_paths(['pt', 'es', 'de'])
exp5

Original sentence: continua para o próximo podcast
Target sentence: fahre mit dem nächsten podcast fort
Generated sentence: stell die nächste podcast episode zum abspielen ein
BLEU score: 0.12500000000000003
Cossine similarity to generated: 0.8257882
Cossine similarity to target: 0.7637129 

Original sentence: começa o próximo episódio do podcast
Target sentence: starte die nächste folge des podcasts
Generated sentence: wie ist der status vom derzeitigem wetter
BLEU score: 0
Cossine similarity to generated: 0.809924
Cossine similarity to target: 0.6960387 

Original sentence: reproduz o meu podcast favorito por favor
Target sentence: spiel bitte meinen lieblings podcast
Generated sentence: schalte bitte meinen lieblings radiosender ein
BLEU score: 0.5
Cossine similarity to generated: 0.80715036
Cossine similarity to target: 0.77412176 

Original sentence: inicia o podcast esta semana para o jantar
Target sentence: starte den this week for dinner podcast
Generated sentence: was steht he

Unnamed: 0,Cosine Similarity,Euclidean Distance,Bleu Score
pt -> de,0.672054,4.72753,0.217575
pt -> es -> de,0.665095,4.762326,0.204248


### Experiment #2: Portuguese - Swedish - German 

In [27]:
exp6 = avaliate_possible_paths(['pt', 'sv', 'de'])
exp6

Original sentence: continua para o próximo podcast
Target sentence: fahre mit dem nächsten podcast fort
Generated sentence: stell die nächste podcast episode zum abspielen ein
BLEU score: 0.12500000000000003
Cossine similarity to generated: 0.8257882
Cossine similarity to target: 0.7637129 

Original sentence: começa o próximo episódio do podcast
Target sentence: starte die nächste folge des podcasts
Generated sentence: wie ist der status vom derzeitigem wetter
BLEU score: 0
Cossine similarity to generated: 0.809924
Cossine similarity to target: 0.6960387 

Original sentence: reproduz o meu podcast favorito por favor
Target sentence: spiel bitte meinen lieblings podcast
Generated sentence: schalte bitte meinen lieblings radiosender ein
BLEU score: 0.5
Cossine similarity to generated: 0.80715036
Cossine similarity to target: 0.77412176 

Original sentence: inicia o podcast esta semana para o jantar
Target sentence: starte den this week for dinner podcast
Generated sentence: was steht he

Unnamed: 0,Cosine Similarity,Euclidean Distance,Bleu Score
pt -> de,0.672054,4.72753,0.217575
pt -> sv -> de,0.657271,4.815911,0.197072


### Experiment #3: English - Swedish - Italian

In [28]:
exp7 = avaliate_possible_paths(['en', 'sv', 'it'])
exp7

Original sentence: continue to next podcast
Target sentence: continua col prossimo podcast
Generated sentence: passa al prossimo episodio del podcast per favore
BLEU score: 0.25
Cossine similarity to generated: 0.8070029
Cossine similarity to target: 0.63051754 

Original sentence: start next podcast episode
Target sentence: avvia il prossimo episodio del podcast
Generated sentence: podcast prossimo episodio
BLEU score: 0.36787944117144233
Cossine similarity to generated: 0.71518207
Cossine similarity to target: 0.6761684 

Original sentence: play my favorite podcast please
Target sentence: riproduci il mio podcast preferito per favore
Generated sentence: vorrei ascoltare la playlist del mio matrimonio
BLEU score: 0.14285714285714285
Cossine similarity to generated: 0.86869645
Cossine similarity to target: 0.81960255 

Original sentence: start this week for dinner podcast
Target sentence: avvia il podcast this week for dinner
Generated sentence: per favore avvia musica classica per la 

Unnamed: 0,Cosine Similarity,Euclidean Distance,Bleu Score
en -> it,0.708238,5.442024,0.225563
en -> sv -> it,0.692051,5.569716,0.19016


### Experiment #4: English - Spanish - Italian

In [29]:
exp8 = avaliate_possible_paths(['en', 'es', 'it'])
exp8

Original sentence: continue to next podcast
Target sentence: continua col prossimo podcast
Generated sentence: passa al prossimo episodio del podcast per favore
BLEU score: 0.25
Cossine similarity to generated: 0.8070029
Cossine similarity to target: 0.63051754 

Original sentence: start next podcast episode
Target sentence: avvia il prossimo episodio del podcast
Generated sentence: podcast prossimo episodio
BLEU score: 0.36787944117144233
Cossine similarity to generated: 0.71518207
Cossine similarity to target: 0.6761684 

Original sentence: play my favorite podcast please
Target sentence: riproduci il mio podcast preferito per favore
Generated sentence: vorrei ascoltare la playlist del mio matrimonio
BLEU score: 0.14285714285714285
Cossine similarity to generated: 0.86869645
Cossine similarity to target: 0.81960255 

Original sentence: start this week for dinner podcast
Target sentence: avvia il podcast this week for dinner
Generated sentence: per favore avvia musica classica per la 

Unnamed: 0,Cosine Similarity,Euclidean Distance,Bleu Score
en -> it,0.708238,5.442024,0.225563
en -> es -> it,0.701003,5.49442,0.21146


## Saving results to spreadsheet

In [30]:
path = 'results.xlsx'
with pd.ExcelWriter(path) as writer:
    exp1.to_excel(writer, sheet_name = 'Study case 1')
    exp2.to_excel(writer, sheet_name = 'Study case 2')
    exp3.to_excel(writer, sheet_name = 'Study case 3')
    exp4.to_excel(writer, sheet_name = 'Study case 4')
    exp5.to_excel(writer, sheet_name = 'Experiment 1')
    exp6.to_excel(writer, sheet_name = 'Experiment 2')
    exp7.to_excel(writer, sheet_name = 'Experiment 3')
    exp8.to_excel(writer, sheet_name = 'Experiment 4')

## New results

In [31]:
rom = ['pt', 'it', 'es'] #romance
ang = ['en', 'sv', 'de'] #anglo-saxon

In [34]:
def avaliate_multiple_paths(paths):
    '''
    Avaliate multiple paths
    
    Params:
    - languages: list of paths
    
    Return:
    - Dataframe with the score of each path on cossine similarity, euclidean distance and Bleu score
    
    Example of usage:
    avaliate_paths([('pt', 'es'), ('it', 'en')])
    '''
    
    scores = []
    index = [ ' -> '.join(p) for p in paths]
    
    for p in paths:
        print('->'.join(p), end='...')
        scores.append(avaliate_path(p, debug=False))
        print('ok')
        
    df = pd.DataFrame(data=scores, columns=['Cosine Similarity', 'Euclidean Distance', 'Bleu Score'], index = index)
    # df = pd.DataFrame(data = scores, columns = ['Cosine Similarity', 'Euclidean Distance'], index = index)
    print(f"Cosine similarity mean = {df['Cosine Similarity'].mean()}")
    print(f"Euclidean distance mean = {df['Euclidean Distance'].mean()}")
    print(f"BLEU mean = {df['Bleu Score'].mean()}")
    print(f"Standard deviation: \n{df.std()}\n")

    return df

### Direct translation, same family

In [35]:
#tradução direta mesma família
rom_rom = list(it.permutations(rom, 2))
ang_ang = list(it.permutations(ang, 2))

direct_same = avaliate_multiple_paths(rom_rom + ang_ang)
direct_same

pt->it...ok
pt->es...ok
it->pt...ok
it->es...ok
es->pt...ok
es->it...ok
en->sv...ok
en->de...ok
sv->en...ok
sv->de...ok
de->en...ok
de->sv...ok
Cosine similarity mean = 0.7430349514544252
Euclidean distance mean = 4.509141633167239
BLEU mean = 0.3379800452414116
Standard deviation: 
Cosine Similarity     0.023056
Euclidean Distance    0.251984
Bleu Score            0.061631
dtype: float64



Unnamed: 0,Cosine Similarity,Euclidean Distance,Bleu Score
pt -> it,0.704933,4.668492,0.269054
pt -> es,0.761036,4.187897,0.412073
it -> pt,0.704933,4.668492,0.265319
it -> es,0.735707,4.464251,0.31252
es -> pt,0.761036,4.187897,0.397233
es -> it,0.735707,4.46425,0.315689
en -> sv,0.773172,4.657575,0.409668
en -> de,0.751231,4.850476,0.32094
sv -> en,0.773171,4.657575,0.423461
sv -> de,0.732132,4.22616,0.25261


### Direct translation, different family

In [36]:
#tradução direta familia diferente
rom_ang = list(it.product(rom, ang))
ang_rom = list(it.product(ang, rom))

direct_diff = avaliate_multiple_paths(rom_ang + ang_rom)
direct_diff

pt->en...ok
pt->sv...ok
pt->de...ok
it->en...ok
it->sv...ok
it->de...ok
es->en...ok
es->sv...ok
es->de...ok
en->pt...ok
en->it...ok
en->es...ok
sv->pt...ok
sv->it...ok
sv->es...ok
de->pt...ok
de->it...ok
de->es...ok
Cosine similarity mean = 0.6967593549816353
Euclidean distance mean = 5.01797187808681
BLEU mean = 0.23339732741645436
Standard deviation: 
Cosine Similarity     0.021032
Euclidean Distance    0.241808
Bleu Score            0.060107
dtype: float64



Unnamed: 0,Cosine Similarity,Euclidean Distance,Bleu Score
pt -> en,0.710168,5.357246,0.330341
pt -> sv,0.678615,4.972782,0.230546
pt -> de,0.672054,4.72753,0.217575
it -> en,0.708238,5.442024,0.28346
it -> sv,0.68428,5.065522,0.214822
it -> de,0.683391,4.834031,0.181349
es -> en,0.742659,5.080621,0.375541
es -> sv,0.701439,4.921671,0.261141
es -> de,0.689991,4.76032,0.22246
en -> pt,0.710168,5.357246,0.251207


### Indirect translation, same family

In [37]:
#tradução com idioma intermediario igual
rom_rom_rom = list(it.permutations(rom, 3))
ang_ang_ang = list(it.permutations(ang, 3))

indirect_same = avaliate_multiple_paths(rom_rom_rom +  ang_ang_ang)
indirect_same

pt->it->es...ok
pt->es->it...ok
it->pt->es...ok
it->es->pt...ok
es->pt->it...ok
es->it->pt...ok
en->sv->de...ok
en->de->sv...ok
sv->en->de...ok
sv->de->en...ok
de->en->sv...ok
de->sv->en...ok
Cosine similarity mean = 0.731165156107012
Euclidean distance mean = 4.598723040890902
BLEU mean = 0.31006295799143163
Standard deviation: 
Cosine Similarity     0.020582
Euclidean Distance    0.262778
Bleu Score            0.054278
dtype: float64



Unnamed: 0,Cosine Similarity,Euclidean Distance,Bleu Score
pt -> it -> es,0.748819,4.274804,0.392679
pt -> es -> it,0.696832,4.728963,0.251871
it -> pt -> es,0.723071,4.573865,0.293648
it -> es -> pt,0.696831,4.728964,0.251933
es -> pt -> it,0.723071,4.573865,0.282416
es -> it -> pt,0.748819,4.274803,0.372951
en -> sv -> de,0.737642,4.951251,0.288412
en -> de -> sv,0.757116,4.780757,0.36975
sv -> en -> de,0.72351,4.282699,0.234285
sv -> de -> en,0.757116,4.780756,0.364858


### Indirect translation, start language different family

In [38]:
ang_rom_rom = [(a1, r1, r2) for (r1, r2), a1 in it.product(rom_rom, ang)]
rom_ang_ang = [(r1, a1, a2) for (a1, a2), r1 in it.product(ang_ang, rom)]

indirect_start = avaliate_multiple_paths(ang_rom_rom +  rom_ang_ang)
indirect_start

en->pt->it...ok
sv->pt->it...ok
de->pt->it...ok
en->pt->es...ok
sv->pt->es...ok
de->pt->es...ok
en->it->pt...ok
sv->it->pt...ok
de->it->pt...ok
en->it->es...ok
sv->it->es...ok
de->it->es...ok
en->es->pt...ok
sv->es->pt...ok
de->es->pt...ok
en->es->it...ok
sv->es->it...ok
de->es->it...ok
pt->en->sv...ok
it->en->sv...ok
es->en->sv...ok
pt->en->de...ok
it->en->de...ok
es->en->de...ok
pt->sv->en...ok
it->sv->en...ok
es->sv->en...ok
pt->sv->de...ok
it->sv->de...ok
es->sv->de...ok
pt->de->en...ok
it->de->en...ok
es->de->en...ok
pt->de->sv...ok
it->de->sv...ok
es->de->sv...ok
Cosine similarity mean = 0.6850931850757063
Euclidean distance mean = 5.097010301046057
BLEU mean = 0.21390608745277584
Standard deviation: 
Cosine Similarity     0.018934
Euclidean Distance    0.256419
Bleu Score            0.052895
dtype: float64



Unnamed: 0,Cosine Similarity,Euclidean Distance,Bleu Score
en -> pt -> it,0.695482,5.54244,0.194528
sv -> pt -> it,0.673986,5.132392,0.1449
de -> pt -> it,0.674496,4.887265,0.164321
en -> pt -> es,0.73123,5.173896,0.301534
sv -> pt -> es,0.691123,4.991087,0.192122
de -> pt -> es,0.681299,4.817945,0.195222
en -> it -> pt,0.698156,5.444389,0.227912
sv -> it -> pt,0.667184,5.04001,0.152837
de -> it -> pt,0.658959,4.799254,0.157171
en -> it -> es,0.726628,5.202614,0.285021


### Indirect translation, mid language different family

In [39]:
#tradução com idioma intermediario diferente
rom_ang_rom = [(r1, a1, r2) for (r1, r2), a1 in it.product(rom_rom, ang)]
ang_rom_ang = [(a1, r1, a2) for (a1, a2), r1 in it.product(ang_ang, rom)]

indirect_mid = avaliate_multiple_paths(rom_ang_rom +  ang_rom_ang)
indirect_mid

pt->en->it...ok
pt->sv->it...ok
pt->de->it...ok
pt->en->es...ok
pt->sv->es...ok
pt->de->es...ok
it->en->pt...ok
it->sv->pt...ok
it->de->pt...ok
it->en->es...ok
it->sv->es...ok
it->de->es...ok
es->en->pt...ok
es->sv->pt...ok
es->de->pt...ok
es->en->it...ok
es->sv->it...ok
es->de->it...ok
en->pt->sv...ok
en->it->sv...ok
en->es->sv...ok
en->pt->de...ok
en->it->de...ok
en->es->de...ok
sv->pt->en...ok
sv->it->en...ok
sv->es->en...ok
sv->pt->de...ok
sv->it->de...ok
sv->es->de...ok
de->pt->en...ok
de->it->en...ok
de->es->en...ok
de->pt->sv...ok
de->it->sv...ok
de->es->sv...ok
Cosine similarity mean = 0.7218003548608741
Euclidean distance mean = 4.686341819629901
BLEU mean = 0.29597873164996785
Standard deviation: 
Cosine Similarity     0.024257
Euclidean Distance    0.234353
Bleu Score            0.055582
dtype: float64



Unnamed: 0,Cosine Similarity,Euclidean Distance,Bleu Score
pt -> en -> it,0.683885,4.847572,0.245341
pt -> sv -> it,0.677082,4.906317,0.226846
pt -> de -> it,0.678369,4.886759,0.235931
pt -> en -> es,0.735565,4.411218,0.394879
pt -> sv -> es,0.729514,4.463664,0.384818
pt -> de -> es,0.726058,4.489707,0.368706
it -> en -> pt,0.683885,4.847571,0.22877
it -> sv -> pt,0.677082,4.906317,0.215679
it -> de -> pt,0.678369,4.886758,0.21059
it -> en -> es,0.712411,4.667339,0.267075


### Indirect translation, end language different family

In [40]:
#tradução com idioma
rom_rom_ang = [(r1, r2, a1) for (r1, r2), a1 in it.product(rom_rom, ang)]
ang_ang_rom = [(a1, a2, r1) for (a1, a2), r1 in it.product(ang_ang, rom)]

indirect_end = avaliate_multiple_paths(rom_rom_ang +  ang_ang_rom)
indirect_end

pt->it->en...ok
pt->it->sv...ok
pt->it->de...ok
pt->es->en...ok
pt->es->sv...ok
pt->es->de...ok
it->pt->en...ok
it->pt->sv...ok
it->pt->de...ok
it->es->en...ok
it->es->sv...ok
it->es->de...ok
es->pt->en...ok
es->pt->sv...ok
es->pt->de...ok
es->it->en...ok
es->it->sv...ok
es->it->de...ok
en->sv->pt...ok
en->sv->it...ok
en->sv->es...ok
en->de->pt...ok
en->de->it...ok
en->de->es...ok
sv->en->pt...ok
sv->en->it...ok
sv->en->es...ok
sv->de->pt...ok
sv->de->it...ok
sv->de->es...ok
de->en->pt...ok
de->en->it...ok
de->en->es...ok
de->sv->pt...ok
de->sv->it...ok
de->sv->es...ok
Cosine similarity mean = 0.6850931696817105
Euclidean distance mean = 5.09701031445598
BLEU mean = 0.2133887891892107
Standard deviation: 
Cosine Similarity     0.018934
Euclidean Distance    0.256419
Bleu Score            0.056748
dtype: float64



Unnamed: 0,Cosine Similarity,Euclidean Distance,Bleu Score
pt -> it -> en,0.698156,5.444389,0.312143
pt -> it -> sv,0.667183,5.04001,0.212312
pt -> it -> de,0.658959,4.799254,0.199895
pt -> es -> en,0.703957,5.403017,0.317437
pt -> es -> sv,0.673067,5.001631,0.228127
pt -> es -> de,0.665095,4.762326,0.204248
it -> pt -> en,0.695482,5.54244,0.25728
it -> pt -> sv,0.673986,5.132392,0.19275
it -> pt -> de,0.674496,4.887265,0.179077
it -> es -> en,0.701003,5.49442,0.267862


### Saving results

In [41]:
path = 'new_results.xlsx'
with pd.ExcelWriter(path) as writer:
    direct_same.describe().to_excel(writer, sheet_name = 'Direct same')
    direct_diff.describe().to_excel(writer, sheet_name = 'Direct diff')
    indirect_same.describe().to_excel(writer, sheet_name = 'Indirect same')
    indirect_start.describe().to_excel(writer, sheet_name = 'Indirect start')
    indirect_mid.describe().to_excel(writer, sheet_name = 'Indirect mid')
    indirect_end.describe().to_excel(writer, sheet_name = 'Indirect end')