In [3]:
from gensim.models import KeyedVectors
import numpy as np
import pandas as pd
import random as rd
import matplotlib.pyplot as plt

# 1-Loading Things
Load the word-embedding models, sentences and analogies used in this notebook.

- Models (fastText Common Crawl vectors): https://fasttext.cc/docs/en/crawl-vectors.html
- Sentences (MASSIVE dataset): https://github.com/alexa/massive

In [4]:
# Locations of the Portuguese and English vector files
model_path_pt, model_path_en = 'datasets/cc.pt.300.vec', 'datasets/cc.en.300.vec'

# Read each embedding table; unicode_errors='replace' keeps loading going
# when a token contains bytes that cannot be decoded.
model_pt = KeyedVectors.load_word2vec_format(
    model_path_pt,
    unicode_errors='replace',
)
model_en = KeyedVectors.load_word2vec_format(
    model_path_en,
    unicode_errors='replace',
)

In [10]:
# Locations of the Portuguese and English JSON Lines files
sentences_path_pt, sentences_path_en = 'datasets/1.0/data/pt-PT.jsonl', 'datasets/1.0/data/en-US.jsonl'

# Parse each file line by line and keep only the 'utt' column
sentences_pt = pd.read_json(
    sentences_path_pt,
    lines=True,
)['utt']
sentences_en = pd.read_json(
    sentences_path_en,
    lines=True,
)['utt']

# 2-Preparing Data

In [11]:
# Build one summed word-vector per sentence for each language, keeping the two
# lists row-aligned. A sentence pair is dropped entirely as soon as any of its
# space-separated tokens is missing from the corresponding model.
matrix_pt, matrix_en = [], []
for utt_pt, utt_en in zip(sentences_pt, sentences_en):
    try:
        vecs_pt = [model_pt[token] for token in utt_pt.split(' ')]
        vecs_en = [model_en[token] for token in utt_en.split(' ')]
    except KeyError:
        # at least one out-of-vocabulary token: skip this pair
        continue
    matrix_pt.append(sum(vecs_pt))
    matrix_en.append(sum(vecs_en))

# Report how many sentence pairs survived the vocabulary filter
f'{len(sentences_pt)} -> {len(matrix_pt)} ({len(matrix_pt)/len(sentences_pt)*100:.2f}%)'

'16521 -> 15378 (93.08%)'

# 3-Making the translation

In [12]:
#compute the translator matrix
A = matrix_pt
B = matrix_en

U, Sig, Vt = np.linalg.svd(np.transpose(A)@B)
translator = np.transpose(Vt) @ np.transpose(U)

In [13]:
# Map the Portuguese vector for 'sapato' with the translator, then list the
# closest entries of the English model.
sapato_en = translator @ model_pt['sapato']
model_en.most_similar(sapato_en)

[('shoes', 0.5022848844528198),
 ('shoe', 0.4925771951675415),
 ('shoes.', 0.469224750995636),
 ('handbag', 0.4686202108860016),
 ('high-heels', 0.42868536710739136),
 ('shoes.It', 0.4274202585220337),
 ('shoes.I', 0.42516860365867615),
 ('wear', 0.4195747971534729),
 ('stilettos', 0.41740575432777405),
 ('dress', 0.41310709714889526)]

In [14]:
# Reverse direction: apply the transposed translator to the English vector
# for "can't", then list the closest entries of the Portuguese model.
cant_pt = translator.T @ model_en["can't"]
model_pt.most_similar(cant_pt)

[('não', 0.4933251738548279),
 ('Não', 0.4648018181324005),
 ('nem', 0.4586043357849121),
 ('.Não', 0.42639946937561035),
 ('consigo.Não', 0.4222370386123657),
 ('ainda.Não', 0.42222297191619873),
 ('queria.Não', 0.42034509778022766),
 ('5.Não', 0.4118790626525879),
 ('9.Não', 0.4051348865032196),
 ('aí.Não', 0.4007856249809265)]

# 4-Evaluate (future work)

In [None]:
def avalia(modelo, analogias, tradutor = None, modelo_o = None):
    """Compute the analogy accuracy (in percent) for every category.

    For each analogy ``(w0, w1, w2, expected)`` the query vector
    ``w1 - w0 + w2`` is built from the normalised vectors of ``modelo_o``,
    multiplied by ``tradutor`` and looked up in ``modelo``. The analogy counts
    as a hit when the best-ranked neighbour that is not one of the three query
    words equals ``expected``.

    Parameters
    ----------
    modelo : embedding model searched with ``most_similar``.
    analogias : mapping of category name -> sequence of 4-word analogies.
    tradutor : matrix applied to the query vector before the search;
        defaults to the identity.
    modelo_o : model the query word vectors are read from; defaults to
        ``modelo``.

    Returns
    -------
    dict
        Category name -> accuracy in percent, computed over the analogies whose
        three query words exist in ``modelo_o``. ``nan`` when a category has no
        such analogy.
    """
    tradutor = np.identity(len(modelo[0])) if tradutor is None else tradutor
    modelo_o = modelo if modelo_o is None else modelo_o
    results = {}

    for k in analogias:
        hits, error = 0, 0
        for an in analogias[k]:
            try:
                a = modelo_o.get_vector(an[0], norm = True)
                b = modelo_o.get_vector(an[1], norm = True)
                c = modelo_o.get_vector(an[2], norm = True)
            except KeyError:
                # a query word is out of vocabulary: exclude this analogy
                error += 1
                continue

            # topn = 4 leaves room for the three query words to appear first
            for r in modelo.most_similar([tradutor @ (b - a + c)], topn = 4):
                if r[0] in an[:3]:
                    continue
                if r[0] == an[3]:
                    hits += 1
                break

        # Use the `analogias` argument (not a notebook global) for the
        # denominator, and avoid dividing by zero when nothing could be scored.
        answered = len(analogias[k]) - error
        results[k] = hits/answered*100 if answered else float('nan')

    return results

In [None]:
# Side-by-side table of the evaluation runs, one column per run.
# DataFrame.from_dict needs a mapping, so the four results are keyed by their
# column label instead of being passed as a bare list.
# NOTE(review): br1, br2, br1_br2 and br2_br1 are not defined in the visible
# cells — presumably each is a {category: accuracy} dict; confirm before running.
pd.DataFrame({'Glove': br1, 'W2V': br2, 'Glove->W2V': br1_br2, 'W2V->Glove': br2_br1})