In [32]:
import io
import os
import sys

import numpy as np
import tensorflow as tf
from sklearn.metrics.pairwise import cosine_similarity

In [11]:
# Configuration: language codes to process. They are used below to build the
# per-language '<lang>.vec' and per-pair '<l1>_<l2>.tsv' file names.
langs = ['eng', 'deu', 'hun']

In [4]:
def read_word2vec_text(vectors_fn, vocab_fn="", fmt="float64"):
    """Load a text-format embedding file into a matrix and a word index.

    The first line of the file is skipped (presumably the word2vec
    ``<count> <dim>`` header). Every following non-blank line is split on
    whitespace into a word followed by its vector components.

    Args:
        vectors_fn: Path of the embedding file (read as UTF-8).
        vocab_fn: Unused; kept so that existing calls keep working.
        fmt: NumPy dtype of the returned matrix.

    Returns:
        A ``(W, word2index)`` tuple. ``W`` is a 2-D array with one row per
        data line, and ``word2index`` maps each word to its row index. If a
        word occurs more than once it maps to its last row, and a warning
        is written to stderr.

    Raises:
        ValueError: If a vector component is not a number or the rows do not
            all have the same length.
    """
    words = []
    rows = []
    # Single pass over the file; io.open gives an explicit, locale-independent
    # encoding so non-ASCII words are decoded consistently.
    with io.open(vectors_fn, encoding="utf-8") as f:
        next(f, None)  # skip the header line
        for line in f:
            fields = line.split()
            if not fields:
                continue  # tolerate blank lines
            words.append(fields[0])
            rows.append([float(x) for x in fields[1:]])

    word2index = {w: i for i, w in enumerate(words)}
    # Build the matrix explicitly as 2-D so a file holding a single vector
    # (or none at all) still yields a matrix rather than a 1-D array.
    if rows:
        W = np.array(rows, dtype=fmt)
    else:
        W = np.empty((0, 0), dtype=fmt)

    if len(word2index) != len(W):
        # Fewer distinct words than rows means some word was repeated.
        sys.stderr.write(
            "WARNING: {} disjoint words in embedding of length {}\n".format(
                len(word2index), len(W)))
    return W, word2index

Read embeddings and indices

In [5]:
# Directory holding one '<lang>.vec' embedding file per language.
fold = '/home/eszti/projects/dipterv/notebooks/panlex/train_try/t0'

# Per-language embedding matrices and word -> row-index lookups.
d_embs, d_idxs = {}, {}
for l in langs:
    fn = os.path.join(fold, '{}.vec'.format(l))
    model = read_word2vec_text(fn)
    # The reader returns (matrix, word2index); store each half by language.
    d_embs[l], d_idxs[l] = model

In [9]:
# Sanity check: show the shape of the 'hun' embedding matrix and its
# word -> row-index mapping.
print(d_embs['hun'].shape)
print(d_idxs['hun'])

(2, 300)
{'kutya': 0, 'macska': 1}


Read word pairs from TSV

In [54]:
def read_word_pairs_tsv(fn, id1, id2):
    """Read word pairs from two columns of a tab-separated file.

    The first line is treated as a header and skipped. Fields are separated
    by tabs only, so entries that contain spaces stay intact and empty
    fields do not shift the column positions. Blank lines are ignored.

    Args:
        fn: Path of the TSV file (read as UTF-8).
        id1: Zero-based column index of the first word.
        id2: Zero-based column index of the second word.

    Returns:
        A list of ``(word1, word2)`` tuples in file order, with surrounding
        whitespace stripped from each word.

    Raises:
        IndexError: If a data line has too few columns for ``id1``/``id2``.
    """
    data = []
    with io.open(fn, encoding="utf-8") as f:
        next(f, None)  # skip the header line
        for line in f:
            if not line.strip():
                continue  # a blank line carries no pair
            # Split once per line, on tabs only (not on arbitrary whitespace).
            cols = line.rstrip("\r\n").split("\t")
            data.append((cols[id1].strip(), cols[id2].strip()))
    return data

In [55]:
# Directory holding one '<l1>_<l2>.tsv' word-pair file per language pair.
fold = '/home/eszti/projects/dipterv/notebooks/panlex/train_try/t0'
# Zero-based columns of the two words within each TSV row.
id1, id2 = 2, 3

# Word pairs keyed by the alphabetically sorted language pair.
d_wps = {}
# Language pairs that have already been loaded.
done = set()
for lang1 in langs:
    for lang2 in langs:
        lang_pair = tuple(sorted((lang1, lang2)))
        # Load each unordered pair of distinct languages exactly once.
        if lang1 != lang2 and lang_pair not in done:
            done.add(lang_pair)
            l1, l2 = lang_pair
            fn = os.path.join(fold, '{0}_{1}.tsv'.format(l1, l2))
            data = read_word_pairs_tsv(fn, id1, id2)
            d_wps[lang_pair] = data

In [56]:
# Display the word pairs loaded for each language pair.
d_wps

{('deu', 'eng'): [('katze', 'cat'), ('hund', 'dog')],
 ('deu', 'hun'): [('hund', 'kutya'), ('katze', 'macska')],
 ('eng', 'hun'): [('cat', 'macska'), ('dog', 'kutya')]}

Calculate precision

In [None]:
def calc_precision(precs, trans, orig, trans_list, orig_list, trans_orig_dict):
    """Print precision@k of nearest-neighbour retrieval by cosine similarity.

    For every row of ``trans`` the rows of ``orig`` are ranked by cosine
    similarity. A row counts as a hit at rank ``j`` when the ``j``-th closest
    candidate is the first one listed among its expected translations.
    Precision@k is the fraction of rows with such a hit inside the top ``k``.

    Args:
        precs: Iterable of cut-offs ``k`` to report (e.g. ``[1, 5, 10]``).
        trans: 2-D array of query vectors, one row per entry of ``trans_list``.
        orig: 2-D array of candidate vectors, one row per entry of
            ``orig_list``.
        trans_list: Words corresponding to the rows of ``trans``.
        orig_list: Words corresponding to the rows of ``orig``.
        trans_orig_dict: Maps each word of ``trans_list`` to a collection of
            its accepted translations.

    Returns:
        None. The per-word neighbours, the per-rank hit counts and the
        precision for every cut-off are printed to stdout.

    Raises:
        KeyError: If a word of ``trans_list`` is missing from
            ``trans_orig_dict``.
    """
    cos_mx = cosine_similarity(trans, orig)
    # Row i holds indices into orig_list ordered from most to least similar.
    sim_mx = np.argsort(-cos_mx)
    max_prec = max(precs)
    # There cannot be more neighbours than there are candidates.
    top_k = min(max_prec, sim_mx.shape[1])
    prec_cnt = np.zeros(shape=(1, max_prec))
    print('word: \ttranslations in dict: \tclosest words after translation: \t')
    for i, r in enumerate(sim_mx):
        key_word = trans_list[i]
        value_words = trans_orig_dict[key_word]
        # r[j] is the index of the j-th closest candidate.
        closest_words = [orig_list[idx] for idx in r[:top_k]]
        for rank, word in enumerate(closest_words):
            if word in value_words:
                # Count only the best-ranked correct translation so a word
                # with several translations cannot contribute more than one
                # hit (which would push precision above 1).
                prec_cnt[0][rank] += 1
                break
        print('{}\t{}\t{}'.format(key_word, value_words, closest_words))
    print(prec_cnt)
    n_words = sim_mx.shape[0]
    for val in precs:
        sum_hit = np.sum(prec_cnt[0][0:val])
        print('prec {} : {}'.format(val, float(sum_hit) / n_words))

Doing training

In [8]:
def train(langs, d_embs, d_idxs, d_wordpairs, dim, iters, lr=0.3):
    """Build a TF graph with one ``dim x dim`` matrix per language and run it.

    NOTE(review): this function looks unfinished. In the visible body
    ``d_embs``, ``d_idxs`` and ``d_wordpairs`` are never read, nothing is
    returned, and the training loop does not feed any data (see the notes
    inside the loop).

    Args:
        langs: Sequence of languages; only its length is used here.
        d_embs: Not used in the visible body.
        d_idxs: Not used in the visible body.
        d_wordpairs: Not used in the visible body.
        dim: Width of the two word placeholders and size of each square matrix.
        iters: Number of loop iterations.
        lr: Learning rate passed to ``GradientDescentOptimizer``.
    """
    nb_langs = len(langs)

    # Init graphs
    graph = tf.Graph()
    with graph.as_default():
        # TF variables
        # Placeholder for 2 words
        tf_w1 = tf.placeholder(tf.float64, shape=[None, dim])
        tf_w2 = tf.placeholder(tf.float64, shape=[None, dim])
        # Placeholder for indexing the T matrix
        tf_idx_l1 = tf.placeholder(tf.int32)
        tf_idx_l2 = tf.placeholder(tf.int32)
        # Translation matrices: one dim x dim matrix per language.
        # NOTE(review): no dtype is passed to truncated_normal while the
        # placeholders above are float64 -- confirm the dtypes agree.
        tf_T = tf.Variable(tf.truncated_normal([nb_langs, dim, dim]))

        # Loss: (w1 . T[l1]) . (w2 . T[l2])^T
        # NOTE(review): this is a matrix of raw dot products that is being
        # minimized -- confirm this is the intended objective.
        loss = tf.matmul(tf.matmul(tf_w1, tf_T[tf_idx_l1]), tf.transpose(tf.matmul(tf_w2, tf_T[tf_idx_l2])))

        optimizer = tf.train.GradientDescentOptimizer(lr).minimize(loss)

    with tf.Session(graph=graph) as session:
        tf.global_variables_initializer().run()

        for i in range(iters):
            # Run the computations
            # NOTE(review): no feed_dict is supplied although `loss` depends
            # on the placeholders tf_w1, tf_w2, tf_idx_l1 and tf_idx_l2.
            _, l, T = session.run([optimizer, loss, tf_T])

            # NOTE(review): train_step, x, y, batch_x and batch_y are not
            # defined in this function or in the visible part of the notebook.
            session.run(train_step, feed_dict={x: batch_x, y: batch_y})