In [39]:
# Import
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import numpy as np
from gensim.models import KeyedVectors
import pickle
import copy
from sklearn.metrics.pairwise import cosine_similarity

In [81]:
# Embedding files for the two languages (wiki.en and wiki.it, judging by the file names).
# NOTE(review): absolute, machine-specific paths; consider deriving them from a configurable data directory.
l1_emb_fn = '/mnt/permanent/Language/Multi/FB/wiki.en/wiki.en.vec'
l2_emb_fn = '/mnt/permanent/Language/Multi/FB/wiki.it/wiki.it.vec'
# Passed as `limit` to read_emb, which forwards it to KeyedVectors.load_word2vec_format.
limit = 200000

In [75]:
def read_emb(emb_fn, limit):
    """Load a text-format word2vec file and scale every vector to unit L2 norm.

    Parameters
    ----------
    emb_fn : path handed to ``KeyedVectors.load_word2vec_format`` (``binary=False``).
    limit : forwarded unchanged as the ``limit`` argument of the loader.

    Returns
    -------
    The loaded model, whose ``syn0`` matrix has been divided row-wise, in
    place, by the Euclidean norm of each row.

    Note: a row consisting only of zeros has norm 0, so the division would
    leave NaNs in that row.
    """
    model = KeyedVectors.load_word2vec_format(emb_fn, binary=False, limit=limit)
    # One norm per row, kept as a column so it broadcasts across the row.
    row_norms = np.sqrt(np.sum(model.syn0 ** 2, axis=1, keepdims=True))
    model.syn0 /= row_norms
    return model

def read_T(T_fn):
    """Return the object stored in the pickle file at ``T_fn``.

    The file is opened in binary mode and closed again before returning.

    NOTE: unpickling can execute arbitrary code, so only pass paths to
    files that come from a trusted source.
    """
    with open(T_fn, mode='rb') as handle:
        return pickle.load(handle)

def translate(emb, T):
    """Map an embedding through ``T`` and re-normalise the result.

    Parameters
    ----------
    emb : object exposing a ``syn0`` matrix; it is deep-copied, so the
        argument itself is left untouched.
    T : matrix that ``syn0`` is right-multiplied by (``np.dot(syn0, T)``).

    Returns
    -------
    A deep copy of ``emb`` whose ``syn0`` holds the projected vectors,
    each row divided by its Euclidean norm.
    """
    univ = copy.deepcopy(emb)
    projected = np.dot(univ.syn0, T)
    # Row-wise L2 normalisation, done in place on the freshly created matrix.
    projected /= np.sqrt(np.sum(projected ** 2, axis=1, keepdims=True))
    univ.syn0 = projected
    return univ

def n_closest(w, u, n):
    """Return the ``n`` words of ``u`` most cosine-similar to the first row of ``w``.

    Parameters
    ----------
    w : 2-D array of query vectors; only row 0 is used for the result.
    u : object exposing ``syn0`` (matrix of vectors) and ``index2word``
        (row index -> word).
    n : number of neighbours to return; an ``n`` larger than the number of
        rows in ``u.syn0`` raises ``IndexError``.

    Returns
    -------
    List of ``n`` words ordered from most to least similar.
    """
    similarities = cosine_similarity(w, u.syn0)
    # Negating the scores makes the ascending argsort rank best-first.
    ranking = np.argsort(-similarities, axis=1)
    return [u.index2word[ranking[0][rank]] for rank in range(n)]

def debug(w1, w2, u1, u2, n):
    """Print and return diagnostics for a word pair across two embedding spaces.

    Parameters
    ----------
    w1, w2 : words looked up in ``u1`` and ``u2`` respectively.
    u1, u2 : embedding objects supporting ``word in u`` and ``u[word]``, and
        usable as the second argument of ``n_closest``.
    n : number of nearest neighbours to retrieve in each direction.

    Returns
    -------
    ``(wv_1, wv_2, cos_sim, closest_1, closest_2)`` where the vectors are
    reshaped to ``(1, dim)``, ``cos_sim`` is their dot product as a 1x1
    array, ``closest_1`` are the neighbours of ``w1`` in ``u2`` and
    ``closest_2`` the neighbours of ``w2`` in ``u1``.

    If either word is missing, a message is printed and ``None`` is
    returned, so callers that unpack the result must check for that first.

    ``cos_sim`` is a plain dot product; it equals the cosine similarity only
    when both vectors have unit length.
    """
    # w1 is checked before w2, so only the first missing word is reported.
    for word, space in ((w1, u1), (w2, u2)):
        if word not in space:
            print('{} is not found'.format(word))
            return None

    # -1 lets numpy infer the dimensionality instead of assuming 300.
    wv_1 = u1[w1].reshape((1, -1))
    wv_2 = u2[w2].reshape((1, -1))
    cos_sim = np.dot(wv_1, wv_2.transpose())
    # Cross-lingual lookup: each word is searched in the *other* space.
    closest_1 = n_closest(wv_1, u2, n)
    closest_2 = n_closest(wv_2, u1, n)

    print(cos_sim)
    print(closest_1)
    print(closest_2)
    # Only the first ten components are shown to keep the output short.
    print(wv_1[0][0:10])
    print(wv_2[0][0:10])

    return wv_1, wv_2, cos_sim, closest_1, closest_2

Reading embeddings

In [9]:
# Load both embedding files; read_emb also scales every vector to unit length.
emb_1 = read_emb(l1_emb_fn, limit)
emb_2 = read_emb(l2_emb_fn, limit)

Transformation trained on Dinu

In [80]:
# Load a pickled transformation and project both embeddings with it:
# T[0] is applied to emb_1 and T[1] to emb_2.
# NOTE(review): absolute, machine-specific path; the pickle must come from a trusted source.
T_fn = '/mnt/permanent/home/eszti/dipterv/panlex/output_results/smith_all/20180308_1927_54/train_mod/T_4000.pickle'
T = read_T(T_fn)
univ_d_1 = translate(emb_1, T[0])
univ_d_2 = translate(emb_2, T[1])

In [91]:
# Query pair: w1 is looked up in univ_d_1, w2 in univ_d_2.
w1 = 'dog'
w2 = 'cane'

# debug() returns None when a word is missing, in which case this unpacking raises TypeError.
wv_1, wv_2, cos_sim, cl1, cl2 = debug(w1, w2, univ_d_1, univ_d_2, 10)

[[ 0.99993038]]
['cani', 'cane', 'dog', 'cagnolino', 'procioni', 'abbaiare', 'testardo', 'socievole', 'socievoli', 'animali']
['dog', 'canine', 'dogs', 'watchdogs', 'puppies', 'rabbit', 'frightened', 'veterinarian', 'kennels', 'puppy']
[ 0.0145114   0.03289737 -0.0520917   0.03513986  0.00192922  0.07308776
 -0.03758637  0.05800073  0.07053109  0.00188078]
[ 0.01377985  0.03418651 -0.05230034  0.0358889   0.00202369  0.07207256
 -0.03770745  0.05677222  0.07036008  0.00236728]


In [93]:
# Second pair for comparison with dog/cane.
# NOTE(review): in the recorded output 'cucina' retrieves 'kitchen' as its nearest word in
# univ_d_1, yet its dot product with 'dog' is still ~0.9998 (vs ~0.9999 for dog/cane),
# so the score barely separates the two pairs; worth investigating.
w1 = 'dog'
w2 = 'cucina'

wv_1, wv_2, cos_sim, cl1, cl2 = debug(w1, w2, univ_d_1, univ_d_2, 10)

[[ 0.99980128]]
['cani', 'cane', 'dog', 'cagnolino', 'procioni', 'abbaiare', 'testardo', 'socievole', 'socievoli', 'animali']
['kitchen', 'cooking', 'dishes', 'food', 'ingredients', 'seasoning', 'pantries', 'crockery', 'condiments', 'meals']
[ 0.0145114   0.03289737 -0.0520917   0.03513986  0.00192922  0.07308776
 -0.03758637  0.05800073  0.07053109  0.00188078]
[ 0.01474648  0.03322181 -0.05158025  0.03659101  0.00309963  0.07147693
 -0.03862757  0.05758141  0.07215406  0.00177409]


Transformation trained on PanLex

In [86]:
# Load a second pickled transformation and project the same embeddings with it:
# T[0] is applied to emb_1 and T[1] to emb_2.
# NOTE: T_fn and T are rebound here, replacing the values set by the earlier cell.
# NOTE(review): absolute, machine-specific path; the pickle must come from a trusted source.
T_fn = '/mnt/permanent/home/eszti/dipterv/panlex/output_results/panlex_eng_ita_7/20180317_1207_53/train_mod/T_50.pickle'
T = read_T(T_fn)
univ_p_1 = translate(emb_1, T[0])
univ_p_2 = translate(emb_2, T[1])

In [94]:
# Same dog/cane query, now against the univ_p_* spaces.
# NOTE(review): the recorded output gives a dot product of ~0.998, but neither neighbour
# list contains the other word of the pair.
w1 = 'dog'
w2 = 'cane'

wv_1, wv_2, cos_sim, cl1, cl2 = debug(w1, w2, univ_p_1, univ_p_2, 10)

[[ 0.9979369]]
['oggettivamente', 'oggettività', 'istintivamente', 'salivazione', 'sagomatura', 'appesantimento', 'affettività', 'sempliciotto', 'interiorizzazione', 'soggettività']
['frighteningly', 'frightening', 'idiotic', 'endearing', 'implication', 'hooves', 'dysregulation', 'pampered', 'summarization', 'contentiousness']
[-0.02737453  0.06105617  0.0059088   0.03615927 -0.07429747  0.00924886
 -0.04875521  0.01311637 -0.06402654  0.13528873]
[-0.02665024  0.06386573  0.00369053  0.04138017 -0.07881358  0.01112839
 -0.03885825  0.00860662 -0.06324127  0.13772528]


In [95]:
# dog/cucina against the univ_p_* spaces, for comparison with the dog/cane output.
# NOTE(review): the recorded dot product is ~0.9975, close to the ~0.998 of dog/cane.
w1 = 'dog'
w2 = 'cucina'

wv_1, wv_2, cos_sim, cl1, cl2 = debug(w1, w2, univ_p_1, univ_p_2, 10)

[[ 0.99747038]]
['oggettivamente', 'oggettività', 'istintivamente', 'salivazione', 'sagomatura', 'appesantimento', 'affettività', 'sempliciotto', 'interiorizzazione', 'soggettività']
['kitchen', 'delighting', 'ingredients', 'honesty', 'scullery', 'powerlessness', 'cooking', 'fridge', 'plumbing', 'ruthlessness']
[-0.02737453  0.06105617  0.0059088   0.03615927 -0.07429747  0.00924886
 -0.04875521  0.01311637 -0.06402654  0.13528873]
[-0.02264532  0.06251199  0.00824265  0.04108921 -0.07344838  0.00098236
 -0.04322457  0.01352201 -0.06740391  0.13101251]


In [97]:
# NOTE(review): 'gardino' may be a typo for 'giardino'; confirm the intended word.
# The recorded output shows no "not found" message, so the token does exist in univ_p_2.
w1 = 'kitchen'
w2 = 'gardino'

wv_1, wv_2, cos_sim, cl1, cl2 = debug(w1, w2, univ_p_1, univ_p_2, 10)

[[ 0.99646086]]
['cucinare', 'bollitura', 'insaporire', 'condimento', 'cucinato', 'panificazione', 'cottura', 'ricettazione', 'aerazione', 'ricottura']
['rebelliousness', 'thoughtfulness', 'honestly', 'cluelessness', 'egalitarianism', 'thoughtfully', 'really', 'demeanor', 'understand', 'maliciously']
[-0.02112803  0.0606989   0.00580154  0.03761325 -0.06886814  0.00220369
 -0.04659207  0.00973079 -0.06481651  0.13105734]
[-0.02200809  0.06149752  0.01117745  0.04631791 -0.06935405  0.00759622
 -0.04819484  0.00892893 -0.05779945  0.12891731]
