In [1]:
import tensorflow as tf
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import os
import copy
from gensim.models import KeyedVectors

In [2]:
# Conf
langs = ['eng', 'deu', 'hun']

Read embeddings and indices

In [3]:
fold = '/home/eszti/projects/dipterv/notebooks/panlex/train_try/t0'

d_models = dict()
for l in langs:
    fn = os.path.join(fold, '{}.vec'.format(l))
    model = KeyedVectors.load_word2vec_format(fn, binary=False)
    d_models[l] = model

In [4]:
# test
print(d_models['hun'].syn0.shape)
print(d_models['hun'].index2word)

(4, 300)
['kutya', 'macska', 'nap', 'hold']


Read words pairs from tsv

In [5]:
def read_word_pairs_tsv(fn, id1, id2):
    with open(fn) as f:
        lines = f.readlines()
        data = [(line.split()[id1], line.split()[id2]) for i, line in enumerate(lines) if i > 0]
    return data

def wp_list_2_dict(lang_pair, wp_l):
    l1 = lang_pair[0]
    l2 = lang_pair[1]
    l12 = dict()
    l21 = dict()
    for (w1, w2) in wp_l:
        if w1 not in l12:
            l12[w1] = [w2]
        else:
            l12[w1] += w2
        if w2 not in l21:
            l21[w2] = [w1]
        else:
            l21[w2] += w1
    return l12, l21

In [6]:
fold = '/home/eszti/projects/dipterv/notebooks/panlex/train_try/t0'
id1 = 2
id2 = 3
# Dict for word pairs
d_wps = dict()
done = set()
for lang1 in langs:
    for lang2 in langs:
        lang_pair = tuple(sorted([lang1, lang2]))
        if lang1 == lang2 or lang_pair in done:
            continue
        done.add(lang_pair)
        l1 = lang_pair[0]
        l2 = lang_pair[1]
        fn = os.path.join(fold, '{0}_{1}.tsv'.format(l1, l2))
        data = read_word_pairs_tsv(fn, id1, id2)
        d_wps[lang_pair] = data

d_dict = dict()
for ((l1, l2), wp_l) in d_wps.items():
    l12, l21 = wp_list_2_dict((l1, l2), wp_l)
    d_dict[(l1, l2)] = l12
    d_dict[(l2, l1)] = l21

In [7]:
d_dict

{('deu', 'eng'): {'hund': ['dog'],
  'katze': ['cat'],
  'mond': ['moon'],
  'sonne': ['sun']},
 ('deu', 'hun'): {'hund': ['kutya'],
  'katze': ['macska'],
  'mond': ['hold'],
  'sonne': ['nap']},
 ('eng', 'deu'): {'cat': ['katze'],
  'dog': ['hund'],
  'moon': ['mond'],
  'sun': ['sonne']},
 ('eng', 'hun'): {'cat': ['macska'],
  'dog': ['kutya'],
  'moon': ['hold'],
  'sun': ['nap']},
 ('hun', 'deu'): {'hold': ['mond'],
  'kutya': ['hund'],
  'macska': ['katze'],
  'nap': ['sonne']},
 ('hun', 'eng'): {'hold': ['moon'],
  'kutya': ['dog'],
  'macska': ['cat'],
  'nap': ['sun']}}

Calculate precision

In [8]:
def calc_precision(precs, model_src_tr, model_tar, dict_scr_2_tar):
    W_src_tr = model_src_tr.syn0
    W_tar = model_tar.syn0
    idx_src_tr = model_src_tr.index2word
    idx_tar = model_tar.index2word
    
    cos_mx = cosine_similarity(W_src_tr, W_tar)
    sim_mx = np.argsort(-cos_mx)
    max_prec = max(precs)
    prec_cnt = np.zeros(shape=(1, max_prec))
    print('word: \ttranslations in dict: \tclosest words after translation: \t')
    for i, r in enumerate(sim_mx):
        key_word = idx_src_tr[i]
        value_words = dict_scr_2_tar[key_word]
        closest_words = []
        for j in range(max_prec):       
            ans = np.where(r==j)
            idx_orig = ans[0][0]
            word = idx_tar[idx_orig]
            closest_words.append(word)
            if word in value_words:
                prec_cnt[0][j] = prec_cnt[0][j] + 1
        print('{}"\t{}\t{}'.format(key_word, value_words, closest_words))
    print(prec_cnt)
    for i, val in enumerate(precs):
        sum_hit = np.sum(prec_cnt[0][0:val])
        print('prec {} : {}'.format(val, float(sum_hit)/sim_mx.shape[0]))

Testing precision calculation

In [9]:
# Precision without translation
calc_precision([1], d_models['eng'], d_models['deu'], d_dict[('eng', 'deu')])
calc_precision([2], d_models['eng'], d_models['deu'], d_dict[('eng', 'deu')])
calc_precision([3], d_models['eng'], d_models['deu'], d_dict[('eng', 'deu')])
calc_precision([4], d_models['eng'], d_models['deu'], d_dict[('eng', 'deu')])

word: 	translations in dict: 	closest words after translation: 	
dog"	['hund']	['mond']
cat"	['katze']	['mond']
sun"	['sonne']	['hund']
moon"	['mond']	['katze']
[[ 0.]]
prec 1 : 0.0
word: 	translations in dict: 	closest words after translation: 	
dog"	['hund']	['mond', 'hund']
cat"	['katze']	['mond', 'katze']
sun"	['sonne']	['hund', 'katze']
moon"	['mond']	['katze', 'hund']
[[ 0.  2.]]
prec 2 : 0.5
word: 	translations in dict: 	closest words after translation: 	
dog"	['hund']	['mond', 'hund', 'sonne']
cat"	['katze']	['mond', 'katze', 'sonne']
sun"	['sonne']	['hund', 'katze', 'sonne']
moon"	['mond']	['katze', 'hund', 'sonne']
[[ 0.  2.  1.]]
prec 3 : 0.75
word: 	translations in dict: 	closest words after translation: 	
dog"	['hund']	['mond', 'hund', 'sonne', 'katze']
cat"	['katze']	['mond', 'katze', 'sonne', 'hund']
sun"	['sonne']	['hund', 'katze', 'sonne', 'mond']
moon"	['mond']	['katze', 'hund', 'sonne', 'mond']
[[ 0.  2.  1.  1.]]
prec 4 : 1.0


Doing training

In [10]:
def train(langs, d_models, d_wps, dim, iters, lr=0.3):
    nb_langs = len(langs)

    # Init graphs
    graph = tf.Graph()
    with graph.as_default():
        # TF variables 
        # Placeholder for 2 words
        tf_w1 = tf.placeholder(tf.float32, shape=[None, dim])
        tf_w2 = tf.placeholder(tf.float32, shape=[None, dim])
        # Placeholder for indexing the T matrix
        tf_idx_l1 = tf.placeholder(tf.int32)
        tf_idx_l2 = tf.placeholder(tf.int32)
        # Translation matrices
        tf_T = tf.Variable(tf.truncated_normal([nb_langs, dim, dim]))

        # Loss
        loss = tf.matmul(tf.matmul(tf_w1, tf_T[tf_idx_l1]), tf.transpose(tf.matmul(tf_w2, tf_T[tf_idx_l2])))
        # Applying SGD
        optimizer = tf.train.GradientDescentOptimizer(lr).minimize(loss)

    with tf.Session(graph=graph) as session:
        tf.global_variables_initializer().run()

        for i in range(iters):
            for ((l1, l2), wp_l) in d_wps.items():
                print('Iterating: {0} - {1}'.format(l1, l2))
                idx_l1 = langs.index(l1)
                idx_l2 = langs.index(l2)
                for (w1, w2) in wp_l:
                    emb1 = d_models[l1][w1].reshape((1, 300))
                    emb2 = d_models[l2][w2].reshape((1, 300))
                     # Run the computations
                    _, l, T = session.run([optimizer, loss, tf_T], 
                                          feed_dict={tf_w1 : emb1, 
                                                     tf_w2 : emb2, 
                                                     tf_idx_l1 : idx_l1, 
                                                     tf_idx_l2 : idx_l2})
                    print(l)
    return T

Testing train function

In [11]:
T = train(langs, d_models, d_wps, 300, 7)

Iterating: deu - eng
[[ 40.94573212]]
[[-19876.484375]]
[[-146514.484375]]
[[-4673988.5]]
Iterating: deu - hun
[[-28717.79296875]]
[[ -2.42939728e+08]]
[[ -2.32002970e+09]]
[[ -8.90352353e+10]]
Iterating: eng - hun
[[  1.67616307e+09]]
[[ -1.52865761e+12]]
[[ -6.07842258e+12]]
[[ -2.39473955e+14]]
Iterating: deu - eng
[[  1.85105352e+13]]
[[ -1.66006083e+15]]
[[ -1.50816477e+16]]
[[ -4.98228060e+17]]
Iterating: deu - hun
[[  9.37692636e+16]]
[[ -1.00782775e+19]]
[[ -1.03338286e+20]]
[[ -4.24081740e+21]]
Iterating: eng - hun
[[  1.23800981e+20]]
[[ -6.84461035e+22]]
[[ -2.66906336e+23]]
[[ -1.07605438e+25]]
Iterating: deu - eng
[[  8.91777218e+23]]
[[ -7.35616862e+25]]
[[ -6.70628651e+26]]
[[ -2.22096296e+28]]
Iterating: deu - hun
[[  4.19260492e+27]]
[[ -4.47655937e+29]]
[[ -4.59196540e+30]]
[[ -1.88518558e+32]]
Iterating: eng - hun
[[  5.51923054e+30]]
[[ -3.04153264e+33]]
[[ -1.18589159e+34]]
[[ -4.78179792e+35]]
Iterating: deu - eng
[[  3.96452254e+34]]
[[ -3.26865355e+36]]
[[ -2.97

In [12]:
m1 = copy.deepcopy(d_models['eng'])
m2 = copy.deepcopy(m1)

W = copy.deepcopy(m2.syn0)
T = np.random.rand(300,300)
# print(W)
# print(T)
m2.syn0 = np.dot(W, T)
# print(m2.syn0)
print(m1.most_similar('dog'))
print(m2.most_similar('dog'))

[('cat', 0.6380517482757568), ('moon', 0.2543850839138031), ('sun', 0.1620684415102005)]
[('cat', 0.5572469234466553), ('moon', 0.5255287885665894), ('sun', -0.3962933123111725)]
