# Translation of short phrases using an intermediate language:
## A thesis on leveraging interlingual approaches

## Libraries

In [1]:
import numpy as np
import pandas as pd
import itertools as it

from gensim.models import KeyedVectors

from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances

## 1 - Loading data
Loading the models and sentences used.
- Models: https://fasttext.cc/docs/en/crawl-vectors.html
- Sentences: https://github.com/alexa/massive

In [2]:
def load_files(model_path, sentences_path, limit = None):
    '''
    Load models from FastText folder and sentences from Amazon Massive folder.
    
    Params:
    - model_path: path to the folder containing all models used, i.e., FastText
    - sentences_path: path to the folder containing all sentences used, i.e., Amazon_Massive
    - limit: define a limit in case your have low computer power, e.g., 5000
    
    Return:
    Tuple containing the language model and its corresponding sentences
    '''

    model = KeyedVectors.load_word2vec_format(model_path, unicode_errors = 'replace', limit = limit)
    sentences = pd.read_json(sentences_path, lines = True)['utt']
    
    return model, sentences

Defining data path.

In [3]:
FASTTEXT_PATH = 'Datasets/FastText/'
MASSIVE_PATH = 'Datasets/Amazon_Massive/'

In [4]:
PATHS = {
    'de': [ FASTTEXT_PATH + 'cc.de.300.vec', MASSIVE_PATH + 'de-DE.jsonl' ],
    'en': [ FASTTEXT_PATH + 'cc.en.300.vec', MASSIVE_PATH + 'en-US.jsonl' ],
    'es': [ FASTTEXT_PATH + 'cc.es.300.vec', MASSIVE_PATH + 'es-ES.jsonl' ],
    'it': [ FASTTEXT_PATH + 'cc.it.300.vec', MASSIVE_PATH + 'it-IT.jsonl' ],
    'pt': [ FASTTEXT_PATH + 'cc.pt.300.vec', MASSIVE_PATH + 'pt-PT.jsonl' ],
    'sv': [ FASTTEXT_PATH + 'cc.sv.300.vec', MASSIVE_PATH + 'sv-SE.jsonl' ],
}

LANGUAGES = PATHS.keys()

**Note**: the cell below takes approximately 5 to 7 minutes per model, if no limit is set.

In [5]:
MODELS, SENTENCES = {}, {}

for language, value in PATHS.items():
    model = value[0]
    sentences = value[1]

    print(f'Loading {model}...')
    MODELS[language], SENTENCES[language] = load_files(model, sentences, 100000)
    print(f'Finished loading {model}\n')

print('\nAll models and sentences are now loaded!')

Loading Datasets/FastText/cc.de.300.vec...
Finished loading Datasets/FastText/cc.de.300.vec

Loading Datasets/FastText/cc.en.300.vec...
Finished loading Datasets/FastText/cc.en.300.vec

Loading Datasets/FastText/cc.es.300.vec...
Finished loading Datasets/FastText/cc.es.300.vec

Loading Datasets/FastText/cc.it.300.vec...
Finished loading Datasets/FastText/cc.it.300.vec

Loading Datasets/FastText/cc.pt.300.vec...
Finished loading Datasets/FastText/cc.pt.300.vec

Loading Datasets/FastText/cc.sv.300.vec...
Finished loading Datasets/FastText/cc.sv.300.vec


All models and sentences are now loaded!


## 2 - Preparing data

In [6]:
SAMPLES = { key: [] for key in LANGUAGES }

In [7]:
# Since all sentence files have the same length, we chose one at random for the range function.
# We prove this in the cell below
for idx in range(len(SENTENCES['en'])):
    
    actual_sentence = { key: [] for key in LANGUAGES }
    
    try:
        for lang, sent in SENTENCES.items():
            for word in sent[idx].split(' '):
                actual_sentence[lang].append(MODELS[lang][word])

    except KeyError:
        continue
    
    for key, value in actual_sentence.items():
        SAMPLES[key].append([SENTENCES[key][idx], sum(value)])

In [8]:
for key in SENTENCES:
    SIZE_SAMPLES = len(SAMPLES[key])
    SIZE_SENTENCES = len(SENTENCES[key])
    print(
        f'Total sentences in { key } file: { SIZE_SENTENCES }'
        f' -> Model { key } samples: { SIZE_SAMPLES } ({ SIZE_SAMPLES / SIZE_SENTENCES * 100:.2f}%)'
    )

Total sentences in de file: 16521 -> Model de samples: 2930 (17.74%)
Total sentences in en file: 16521 -> Model en samples: 2930 (17.74%)
Total sentences in es file: 16521 -> Model es samples: 2930 (17.74%)
Total sentences in it file: 16521 -> Model it samples: 2930 (17.74%)
Total sentences in pt file: 16521 -> Model pt samples: 2930 (17.74%)
Total sentences in sv file: 16521 -> Model sv samples: 2930 (17.74%)


Splitting into train and test.

In [9]:
SPLIT_RATE = int(SIZE_SAMPLES * 0.7)

TRAIN_SET = { key: SAMPLES[key][:SPLIT_RATE] for key in LANGUAGES }
TEST_SET = { key: SAMPLES[key][SPLIT_RATE:] for key in LANGUAGES }

## 3 - Evaluating Control Group

Theorically speaking, translating the vector that one sentence represents to another should result in a similar sentence. For that purpose, we evaluate our results using the cosine similarity, which range is from -1 to 1.

In [10]:
TRANSLATIONS = { key: { lang: None for lang in LANGUAGES if lang != key } for key in LANGUAGES }

for origin, target in it.permutations(LANGUAGES, 2): 

    samples_origin = [sample[1] for sample in TRAIN_SET[origin]]
    samples_target = [sample[1] for sample in TRAIN_SET[target]]

    U, Sig, Vt = np.linalg.svd(np.transpose(samples_origin) @ samples_target)
    
    TRANSLATOR = np.transpose(Vt) @ np.transpose(U)
    TRANSLATIONS[origin][target] = TRANSLATOR

In [11]:
def evaluate_single_cosine_similarity(origin_lang, target_lang):
    '''
    Evaluate cosine similarity between single sentences.
    Cosine similarity has an interval from -1 to 1, and the closer to 1 the value is, more similar the params are.

    Params:
    - origin_lang: language in which the words in word_list are written
    - target_lang: language you wish to know the translation

    Example of usage:
    evaluate_single_cosine_similarity('pt', 'en')
    '''
    
    for index in range(5):
        print(TEST_SET[origin_lang][index][0], '->', TEST_SET[target_lang][index][0])

        vector_translated = TRANSLATIONS[origin_lang][target_lang] @ TEST_SET[origin_lang][index][1]
        vector_target = TEST_SET[target_lang][index][1]

        print("Cossine similarity:", cosine_similarity([vector_translated], [vector_target])[0][0], "\n")

### Portuguese -> Spanish

In [12]:
evaluate_single_cosine_similarity('pt', 'es')

verificar a minha lista -> comprueba mi lista
Cossine similarity: 0.7896989 

salta para o próximo episódio -> salta al siguiente episodio
Cossine similarity: 0.7312537 

próxima -> siguiente
Cossine similarity: 0.4090119 

o próximo -> siguiente
Cossine similarity: 0.32732537 

pass -> la proxima
Cossine similarity: 0.040762633 



### English -> German

In [13]:
evaluate_single_cosine_similarity('en', 'de')

check my list -> meine liste überprüfen
Cossine similarity: 0.83399355 

skip to next episode -> springe zur nächsten folge
Cossine similarity: 0.6362786 

next -> nächste
Cossine similarity: 0.7237926 

next one -> der nächste
Cossine similarity: 0.61569357 

skip -> überspringen
Cossine similarity: 0.39044032 



### Avaliating path
We use the following metrics for that purpose:
- Cosine similarity
- Euclidean distance

In [14]:
def pairwise(iterable):
    '''
    Return successive overlapping pairs taken from the input iterable.
    The number of 2-tuples in the output iterator will be one fewer than the number of inputs. 
    It will be empty if the input iterable has fewer than two values.
    pairwise('ABCDEFG') --> AB BC CD DE EF FG

    Source: https://docs.python.org/3/library/itertools.html#itertools.pairwise
    '''
    a, b = it.tee(iterable)
    next(b, None)
    return zip(a, b)

In [15]:
def avaliate_path(path):
    '''
    Avaliate the translation path using cosine similarity, euclidean distance and manhattan distance.

    Params:
    - path: path of desired translation
    
    Return:
    - Score of each avaliation method
    
    Example of usage:
    avaliate_path(['pt', 'en', 'es'])
    '''
    
    translation_matrix = np.identity(300)

    for (origin, target) in pairwise(path):
        translation_matrix = TRANSLATIONS[origin][target] @ translation_matrix
    
    vectors = [translation_matrix @ v for _, v in TEST_SET[path[0]] ]
    vectors_target = [v for _, v in TEST_SET[path[-1]]]
    
    mean_cos_sim = sum([cosine_similarity([v1], [v2]) for v1, v2 in zip(vectors, vectors_target)])/ len(vectors)
    mean_euc_dist = sum([euclidean_distances([v1], [v2]) for v1, v2 in zip(vectors, vectors_target)])/ len(vectors)
    
    return mean_cos_sim[0][0], mean_euc_dist[0][0]

In [16]:
def avaliate_possible_paths(languages):
    '''
    Avaliate possible paths from first to last language on the list, changing the languages in the middle
    
    Params:
    - languages: list of languages
    
    Return:
    - Dataframe with the score of each path on cossine similarity, euclidean distance and manhattan distance
    
    Example of usage:
    avaliate_paths(['pt', 'en', 'es'])
    '''
    
    start = languages[0]
    end = languages[-1]
    
    paths = [[start, end]]
    for i in range(len(languages) - 2):
        for comb in it.combinations(languages[1: -1], i + 1):
            paths.append([start] + list(comb) + [end])
    
    scores = [avaliate_path(p) for p in paths]
    index = [ ' -> '.join(p) for p in paths]
    return pd.DataFrame(data=scores, columns=['Cosine Similarity', 'Euclidean Distance'], index = index)


## 4 - Study Cases

### Study Case #1: Portuguese - Italian - Spanish
In this case, we intend to evaluate how good is a translation between two languages from the Romance language family, such as Portuguese and Spanish, and if adding a language from the same family, such as Italian, affects the quality of the translation.

In [17]:
exp1 = avaliate_possible_paths(['pt', 'it', 'es'])
exp1

Unnamed: 0,Cosine Similarity,Euclidean Distance
pt -> es,0.745831,3.608248
pt -> it -> es,0.725382,3.728033


### Study Case #2: English - Swedish - German
In this case, we intend to evaluate how good is a translation between two languages from the Anglo-Saxon language family, such as English and German, and if adding a language from the same family, such as Swedish, affects the quality of the translation.

In [18]:
exp2 = avaliate_possible_paths(['en', 'de', 'sv'])
exp2

Unnamed: 0,Cosine Similarity,Euclidean Distance
en -> sv,0.770322,4.096114
en -> de -> sv,0.7507,4.223161


### Study Case #3: Portuguese - English - Spanish
In this case, we intend to evaluate how good is a translation between two languages from the Romance language family, such as Portuguese and Spanish, and if adding a language from an outer group, such as English from the Anglo-Saxon, affects the quality of the translation.

In [19]:
exp3 = avaliate_possible_paths(['pt', 'en', 'es'])
exp3

Unnamed: 0,Cosine Similarity,Euclidean Distance
pt -> es,0.745831,3.608248
pt -> en -> es,0.718346,3.794853


### Study Case #4: English - Portuguese - German
In this case, we intend to evaluate how good is a translation between two languages from the Anglo-Saxon language family, such as English and German, and if adding a language from an outer group, such as Portuguese from the Romance, affects the quality of the translation.

In [20]:
exp4 = avaliate_possible_paths(['pt', 'en', 'es'])
exp4

Unnamed: 0,Cosine Similarity,Euclidean Distance
pt -> es,0.745831,3.608248
pt -> en -> es,0.718346,3.794853


## Experiments

### Experiment #1: Portuguese - Spanish - German
In this experiment, we intend to evaluate translations using languages from different groups.

In [21]:
exp5 = avaliate_possible_paths(['pt', 'es', 'de'])
exp5

Unnamed: 0,Cosine Similarity,Euclidean Distance
pt -> de,0.64682,4.090184
pt -> es -> de,0.640827,4.115533


### Experiment #2: Portuguese - Swedish - German 

In [22]:
exp6 = avaliate_possible_paths(['pt', 'sv', 'de'])
exp6

Unnamed: 0,Cosine Similarity,Euclidean Distance
pt -> de,0.64682,4.090184
pt -> sv -> de,0.633571,4.161656


### Experiment #3: English - Swedish - Italian

In [23]:
exp7 = avaliate_possible_paths(['en', 'sv', 'it'])
exp7

Unnamed: 0,Cosine Similarity,Euclidean Distance
en -> it,0.700218,4.791466
en -> sv -> it,0.67862,4.928406


### Experiment #4: English - Spanish - Italian

In [24]:
exp8 = avaliate_possible_paths(['en', 'es', 'it'])
exp8

Unnamed: 0,Cosine Similarity,Euclidean Distance
en -> it,0.700218,4.791466
en -> es -> it,0.688069,4.861872


## Saving results to spreadsheet

In [25]:
path = 'results.xlsx'
with pd.ExcelWriter(path) as writer:
    exp1.to_excel(writer, sheet_name = 'Study case 1')
    exp2.to_excel(writer, sheet_name = 'Study case 2')
    exp3.to_excel(writer, sheet_name = 'Study case 3')
    exp4.to_excel(writer, sheet_name = 'Study case 4')
    exp5.to_excel(writer, sheet_name = 'Experiment 1')
    exp6.to_excel(writer, sheet_name = 'Experiment 2')
    exp7.to_excel(writer, sheet_name = 'Experiment 3')
    exp8.to_excel(writer, sheet_name = 'Experiment 4')