In [1]:
import tensorflow as tf
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import os
import copy
from gensim.models import KeyedVectors
import matplotlib.pyplot as plt
import json
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# Configuration: languages to evaluate and the embedding dimensionality
langs = ['eng', 'ita']
dim = 300

# JSON file holding the `sil2fb` lookup, which is indexed by the codes in
# `langs` when the embedding file paths are built
sil2fb_fn = '/home/eszti/projects/dipterv/notebooks/panlex/data/sil2fb.json'
with open(sil2fb_fn) as f:
    sil2fb = json.loads(f.read())

In [3]:
# Load the embedding of every configured language and scale each row to unit length

fold = '/mnt/permanent/Language/Multi/FB'

d_models = dict()
for l in langs:
    fb_code = sil2fb[l]
    fn = os.path.join(fold, 'wiki.{}'.format(fb_code), 'wiki.{}.vec'.format(fb_code))
    print('Reading embedding from {}'.format(fn))
    model = KeyedVectors.load_word2vec_format(fn, binary=False)
    # Euclidean norm of every row, then in-place row-wise division
    row_norms = np.sqrt((model.syn0**2).sum(1))
    model.syn0 /= row_norms[:, None]
    d_models[l] = model
    print('Embedding for {} is read'.format(l))

Reading embedding from /mnt/permanent/Language/Multi/FB/wiki.en/wiki.en.vec
Embedding for eng is read
Reading embedding from /mnt/permanent/Language/Multi/FB/wiki.it/wiki.it.vec
Embedding for ita is read


In [4]:
# Read word pairs from tsv

def read_word_pairs_tsv(fn, id1, id2, header=True):
    """Read word pairs from a whitespace-separated text file.

    Args:
        fn: path of the file to read.
        id1: column index of the first word of each pair.
        id2: column index of the second word of each pair.
        header: when true, the first line of the file is skipped.

    Returns:
        A list of ``(word1, word2)`` tuples in file order.

    Raises:
        IndexError: if a non-blank line has fewer columns than ``id1`` or
            ``id2`` require.
    """
    data = []
    with open(fn) as f:
        if header:
            # Drop the header row; the default keeps an empty file from raising.
            next(f, None)
        for line in f:
            # Split once per line (any whitespace run is a separator).
            cols = line.split()
            if not cols:
                # Blank lines (e.g. a trailing newline) carry no pair.
                continue
            data.append((cols[id1], cols[id2]))
    return data

def wp_list_2_dict(lang_pair, wp_l):
    """Turn a list of word pairs into two translation dictionaries.

    Args:
        lang_pair: the language pair the words belong to. It is not used by
            the conversion and is kept only for interface compatibility.
        wp_l: iterable of ``(w1, w2)`` word pairs.

    Returns:
        A tuple ``(l12, l21)``: ``l12`` maps every first word to the list of
        second words it was paired with, ``l21`` maps every second word to
        the list of first words. Lists keep the input order and duplicates.
    """
    l12 = dict()
    l21 = dict()
    for w1, w2 in wp_l:
        l12.setdefault(w1, []).append(w2)
        l21.setdefault(w2, []).append(w1)
    return l12, l21

# Location and column layout of the word-pair files
fold = '/home/eszti/projects/dipterv/notebooks/panlex/smith/test'
id1 = 0
id2 = 1

# Word pairs per unordered language pair, keyed by the sorted (l1, l2) tuple
d_wps = dict()
done = set()
for lang1 in langs:
    for lang2 in langs:
        lang_pair = tuple(sorted([lang1, lang2]))
        # Skip a language paired with itself
        if lang1 == lang2:
            continue
        # Skip pairs that were already loaded in the other order
        if lang_pair in done:
            continue
        done.add(lang_pair)
        l1, l2 = lang_pair
        fn = os.path.join(fold, '{0}_{1}.tsv'.format(*lang_pair))
        data = read_word_pairs_tsv(fn, id1, id2, False)
        d_wps[lang_pair] = data

# Translation dictionaries in both directions for every loaded language pair
d_dict = dict()
for lang_pair, wp_l in d_wps.items():
    l1, l2 = lang_pair
    l12, l21 = wp_list_2_dict(lang_pair, wp_l)
    d_dict[(l1, l2)] = l12
    d_dict[(l2, l1)] = l21
    
# Dict for filtered models containing only the words used for training
d_tr_mods = dict()
for ((l1, l2), d) in d_dict.items():
    print('Reading {0}-{1} dictionary'.format(l1, l2))
    src_model = d_models[l1]
    # Source-side dictionary words without an embedding, in dictionary order
    nf_list = [w for w in d if w not in src_model]
    print('Words not found in embedding: {}'.format(nf_list))
    # Set copy for constant-time membership tests in the filters below
    nf_set = set(nf_list)
    tr_mod = KeyedVectors()
    tr_mod.index2word = [w for w in d if w not in nf_set]
    # Every row is filled in the loop below, so an uninitialised buffer is enough
    tr_mod.syn0 = np.empty(shape=(len(tr_mod.index2word), dim), dtype=np.float32)
    # Adding embedding to train model
    for i, w in enumerate(tr_mod.index2word):
        tr_mod.syn0[i, :] = src_model[w]
    # Deleting not found words from word pairs list.
    # d_wps is keyed by the sorted language pair, so for the reversed
    # direction the current source word is the second element of each pair.
    change = not l1 < l2
    lang1, lang2 = (l2, l1) if change else (l1, l2)
    src_pos = 1 if change else 0
    d_wps[(lang1, lang2)] = [wp for wp in d_wps[(lang1, lang2)]
                             if wp[src_pos] not in nf_set]
    d_tr_mods[(l1, l2)] = tr_mod

Reading eng-ita dictionary
Words not found in embedding: []
Reading ita-eng dictionary
Words not found in embedding: ['kostunica', 'ridimensioni', 'oligopolistica']


In [15]:
# Sanity checks on the loaded data. Every bare expression below is displayed
# because InteractiveShell.ast_node_interactivity is set to "all".

# Number of eng-ita word pairs left after filtering
len(d_wps[('eng', 'ita')])
# Number of distinct source words in each translation direction
len(d_dict[('eng', 'ita')])
len(d_dict[('ita', 'eng')])
# Vocabulary size and number of embedding rows of the full models
len(d_models['eng'].index2word)
len(d_models['eng'].syn0)
len(d_models['ita'].index2word)
len(d_models['ita'].syn0)
# Number of embedding rows of the filtered training models
len(d_tr_mods[('eng', 'ita')].syn0)
len(d_tr_mods[('ita', 'eng')].syn0)

# Position of 'kostunica' in the eng-ita training vocabulary
# (list.index raises ValueError when the word is absent)
d_tr_mods[('eng', 'ita')].index2word.index('kostunica')

1866

1500

1849

2519370

2519370

871053

871053

1500

1846

619

In [None]:
# Function to calculate precision
def calc_precision(precs, model_src, model_tar, dict_scr_2_tar, verbose=False):
    """Compute precision@k of nearest-neighbour translation retrieval.

    For every source word the target words are ranked by cosine similarity;
    a source word counts as a hit at k when at least one of its dictionary
    translations is among its k most similar target words. Each source word
    is counted at most once, at the rank of its best-placed translation.

    Args:
        precs: iterable of k values (positive ints) to evaluate.
        model_src: source language embeddings (after translation); needs the
            ``syn0`` and ``index2word`` attributes.
        model_tar: target language embeddings; needs the ``syn0`` and
            ``index2word`` attributes. Must live in the same space as
            ``model_src``.
        dict_scr_2_tar: dictionary mapping each source word to the collection
            of its accepted target translations.
        verbose: when true, print the neighbours of every source word, the
            per-rank hit counts and the resulting precisions.

    Returns:
        A list with one float per entry of ``precs``: the fraction of source
        words that have a correct translation within the top k neighbours.

    Raises:
        KeyError: if a word of ``model_src.index2word`` is missing from
            ``dict_scr_2_tar``.
    """
    W_src = model_src.syn0
    W_tar = model_tar.syn0
    idx_src = model_src.index2word
    idx_tar = model_tar.index2word

    cos_mx = cosine_similarity(W_src, W_tar)
    max_prec = max(precs)
    # Never ask for more neighbours than there are target words
    top_k = min(max_prec, cos_mx.shape[1])
    # sim_mx[i, j] is the index of the j-th most similar target word of source word i
    sim_mx = np.argsort(-cos_mx, axis=1)[:, :top_k]
    # prec_cnt[0][j]: number of source words whose best translation sits at rank j
    prec_cnt = np.zeros(shape=(1, max_prec))
    if verbose:
        print('word: \ttranslations in dict: \tclosest words after translation: \t')
    for i, r in enumerate(sim_mx):
        key_word = idx_src[i]
        value_words = dict_scr_2_tar[key_word]
        closest_words = [idx_tar[tar_idx] for tar_idx in r]
        for rank, word in enumerate(closest_words):
            if word in value_words:
                prec_cnt[0][rank] += 1
                # Count each source word once so the cumulative sums below
                # stay a fraction of the source vocabulary
                break
        if verbose:
            print('{}"\t{}\t{}'.format(key_word, value_words, closest_words))
    if verbose:
        print(prec_cnt)
    prec_pcnts = []
    for val in precs:
        # Hits at rank < val are exactly the words translated within the top val
        sum_hit = np.sum(prec_cnt[0][0:val])
        pcnt = float(sum_hit)/sim_mx.shape[0]
        if verbose:
            print('prec {} : {}'.format(val, pcnt))
        prec_pcnts.append(pcnt)
    return prec_pcnts

In [None]:
# Read T mx from file
# TODO: set to the path of the archive that stores the transformation matrices
fn = ''

# np.load is given the path itself: a handle from open(fn) is in text mode and
# cannot be parsed. The keyed access below presumably means an .npz archive;
# using it as a context manager closes the underlying file afterwards.
with np.load(fn) as nzpf:
    T = nzpf['T']

# T stacks one transformation per language: T[0] is applied to the first
# language of the evaluation cell and T[1] to the second
T1 = T[0]
T2 = T[1]

In [None]:
l1 = 'eng'
l2 = 'ita'

# k values for precision@k. Not defined anywhere else in the visible notebook;
# assumed default, adjust as needed.
precs_to_calc = [1, 5, 10]


def _apply_transform(model, T):
    """Return a shallow copy of `model` whose `syn0` is mapped by `T`.

    The models stored in `d_models` / `d_tr_mods` are left untouched, so
    re-running this cell does not apply the transformation twice.
    """
    projected = copy.copy(model)
    projected.syn0 = np.dot(model.syn0, T)
    return projected


# Training words and full vocabularies of both languages in the shared space.
# NOTE: the transformed copies are held next to the original embeddings.
m1_tr = _apply_transform(d_tr_mods[(l1, l2)], T1)
m2_tr = _apply_transform(d_tr_mods[(l2, l1)], T2)
m1 = _apply_transform(d_models[l1], T1)
m2 = _apply_transform(d_models[l2], T2)

# Prec l1 - l2 = eng - ita
precs_1 = calc_precision(precs_to_calc, m1_tr, m2, d_dict[(l1, l2)], verbose=False)
precs_1
# Prec l2 - l1 = ita - eng
precs_2 = calc_precision(precs_to_calc, m2_tr, m1, d_dict[(l2, l1)], verbose=False)
precs_2