In [21]:
import pickle
import numpy as np
import scipy
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize

Load our data

( {lang : swad_list, emb_full (norm), emb_fn, not_found_list, T}, univ(norm)) )

In [3]:
# Path our data 
pickle_path = '/home/eszti/projects/recap/find_univ_proc/find_univ_proc.pickle'

with open(pickle_path) as f:
    data = pickle.load(f)
    
# Our data
eng_data = data[0]['eng']
ita_data = data[0]['ita']

# Swadesh words used for training
sw_en = eng_data[0]
sw_it = ita_data[0]

# Embed paths
eng_emb_fn = eng_data[2]
ita_emb_fn = ita_data[2]
print(eng_emb_fn)
print(ita_emb_fn)

# Not found lists
sw_nf_en = eng_data[3]
sw_nf_it = ita_data[3]

# Transformation mx-s
T_en = eng_data[4]
T_it = ita_data[4]

/mnt/permanent/Language/Multi/FB/wiki.en/wiki.en.vec
/mnt/permanent/Language/Multi/FB/wiki.it/wiki.it.vec


Load Smith's test set

In [4]:
test_pairs_path = '/home/eszti/projects/smith/transmat/data/OPUS_en_it_europarl_test.txt'
en_it_dict = dict()
it_en_dict = dict()

with open(test_pairs_path) as f:
    lines = f.read().splitlines()
    for line in lines:
        words = line.strip().decode('utf-8').split(' ')
        en = words[0]
        it = words[1]
        if en not in en_it_dict.keys():
            en_it_dict[en] = []
        en_it_dict[en].append(it)
        if it not in it_en_dict.keys():
            it_en_dict[it] = []
        it_en_dict[it].append(en)

In [5]:
print('En-It len: {}'.format(len(en_it_dict)))
print('It-En len: {}'.format(len(it_en_dict)))

En-It len: 1500
It-En len: 1849


Load our previosly selected set embeddings

In [6]:
fb_emb_data_path = 'data/our_data_raw_09_29.pickle'

with open(fb_emb_data_path) as f:
    fb_emb_data = pickle.load(f)
    
(wl_en, emb_en, id_en) = fb_emb_data[0]
(wl_it, emb_it, id_it) = fb_emb_data[1]

print('Not found in English: {}'.format(len(wl_en) - len(id_en)))
print('Not found in Italian: {}'.format(len(wl_it) - len(id_it)))

Not found in English: 0
Not found in Italian: 3


In [7]:
def get_not_found_words_idx(wl, idxs):
    nf_i = []
    nf_w = []
    for i in range(len(wl)):
        if i not in idxs:
            nf_i.append(i)
            nf_w.append(wl[i])
    return nf_i, nf_w

In [8]:
nf_en_i, nf_en_w = get_not_found_words_idx(wl_en, id_en)
nf_it_i, nf_it_w = get_not_found_words_idx(wl_it, id_it)

Build excluded words list
- get found swadesh words (words from Swadesh used during training)
- get not found embeddings (words from Smith's test set)
- concat the two list


In [9]:
def get_ex_words(swad_list, nf_idxs, nf_embeddings):
    # get found swadesh words, they were used for training
    ex_words = []
    for i, w in enumerate(swad_list):
        if i not in nf_idxs:
            ex_words.append(w)
    # get not found embedding
    ex_words += nf_embeddings
    return ex_words

In [10]:
# Get filtered dictionary
def get_filt_dict(d1, ex1, ex2):
    fd1 = dict()
    for k, vs in d1.iteritems():
        if k in ex1:
            print('"{}" is removed because it is in ex1'.format(k))
            continue
        zero_stays = True
        for v in vs:
            if v not in ex2:
                zero_stays = False
        if zero_stays:
            print('"{}" is removed because all values "{}" are in ex2'.format(k, vs))
            continue
        fd1[k] = vs
    print('\n')
    return fd1

In [11]:
# English words to delete
ex_en = get_ex_words(sw_en, sw_nf_en, nf_en_w)

# Italian words to delete
ex_it = get_ex_words(sw_it, sw_nf_it, nf_it_w)

# Get filtered d1
fd_en = get_filt_dict(en_it_dict, ex_en, ex_it)
# Get filtered d2
fd_it = get_filt_dict(it_en_dict, ex_it, ex_en)

"green" is removed because it is in ex1
"downsize" is removed because all values "[u'ridimensioni']" are in ex2
"head" is removed because it is in ex1
"cold" is removed because it is in ex1
"woman" is removed because all values "[u'donna']" are in ex2
"kostunica" is removed because all values "[u'kostunica']" are in ex2
"close" is removed because it is in ex1
"sole" is removed because all values "[u'sole']" are in ex2
"oligopolistic" is removed because all values "[u'oligopolistica']" are in ex2
"donna" is removed because all values "[u'donna']" are in ex2
"bird" is removed because it is in ex1
"red" is removed because it is in ex1
"neighbour" is removed because all values "[u'vicino']" are in ex2


"vicino" is removed because it is in ex1
"capo" is removed because it is in ex1
"oligopolistica" is removed because it is in ex1
"stretta" is removed because all values "[u'close']" are in ex2
"verdi" is removed because all values "[u'green']" are in ex2
"mano" is removed because it is in e

In [12]:
print('Len orig en-it: {}'.format(len(en_it_dict)))
print('Len filt en-it: {}'.format(len(fd_en)))
print('Len orig it-en: {}'.format(len(it_en_dict)))
print('Len filt it-en: {}'.format(len(fd_it)))

Len orig en-it: 1500
Len filt en-it: 1487
Len orig it-en: 1849
Len filt it-en: 1832


Get embedding matrix
- empty np array with proper size
- insert embeddings line by line
- in parallel create emb_word_list

In [13]:
def get_np_emb(wl, emb_l, id_emb, allowed, rows, cols):
    emb = np.zeros(shape=(rows, cols))
    emb_idx = 0
    emb_wl = []
    for i, w in enumerate(wl):
        if emb_l[i] is None:
            print('exclude: "{}", embed not found'.format(w))
            continue
        if w not in allowed:
            print('exclude: "{}", used for training'.format(w))
            continue
        emb[emb_idx, :] = emb_l[i]
        emb_idx += 1
        emb_wl.append(w)
    print('\n')
    return emb, emb_wl

In [14]:
en_np_emb, emb_wl_en = get_np_emb(wl_en, emb_en, id_en, fd_en.keys(), len(fd_en), 300)

it_np_emb, emb_wl_it = get_np_emb(wl_it, emb_it, id_it, fd_it.keys(), len(fd_it), 300)

exclude: "green", used for training
exclude: "downsize", used for training
exclude: "head", used for training
exclude: "cold", used for training
exclude: "woman", used for training
exclude: "kostunica", used for training
exclude: "close", used for training
exclude: "sole", used for training
exclude: "oligopolistic", used for training
exclude: "donna", used for training
exclude: "bird", used for training
exclude: "red", used for training
exclude: "neighbour", used for training


exclude: "vicino", used for training
exclude: "capo", used for training
exclude: "oligopolistica", embed not found
exclude: "stretta", used for training
exclude: "verdi", used for training
exclude: "mano", used for training
exclude: "kostunica", embed not found
exclude: "rosso", used for training
exclude: "fredda", used for training
exclude: "verde", used for training
exclude: "grande", used for training
exclude: "sole", used for training
exclude: "donna", used for training
exclude: "ridimensioni", embed not fou

In [34]:
# Normalize
norm_en_np_emb = normalize(en_np_emb.astype(np.float))
norm_it_np_emb = normalize(it_np_emb.astype(np.float))

Translate

In [36]:
# Translate
# (Wen*Ten)*Tit'
en_it_np_emb = np.dot(np.dot(norm_en_np_emb, T_en), np.transpose(T_it))

# (Wit*Tit)*Ten'
it_en_np_emb = np.dot(np.dot(norm_it_np_emb, T_it), np.transpose(T_en))

# Normalize
norm_en_it_np_emb = normalize(en_it_np_emb.astype(np.float))
norm_it_en_np_emb = normalize(it_en_np_emb.astype(np.float))

In [65]:
def calc_precision(precs, trans, orig, trans_list, orig_list, trans_orig_dict):
    cos_mx = cosine_similarity(trans, orig)
    sim_mx = np.argsort(-cos_mx)
    max_prec = max(precs)
    prec_cnt = np.zeros(shape=(1, max_prec))
    print('word: \ttranslations in dict: \tclosest words after translation: \t')
    for i, r in enumerate(sim_mx):
        key_word = trans_list[i]
        value_words = trans_orig_dict[key_word]
        closest_words = []
        for j in range(max_prec):       
            ans = np.where(r==j)
            idx_orig = ans[0][0]
            word = orig_list[idx_orig]
            closest_words.append(word)
            if word in value_words:
                prec_cnt[0][j] = prec_cnt[0][j] + 1
        print('{}"\t{}\t{}'.format(word.encode('utf-8'), value_words, closest_words))
    print prec_cnt
    for i, val in enumerate(precs):
        sum_hit = np.sum(prec_cnt[0][0:val])
        print('prec {} : {}'.format(val, float(sum_hit)/sim_mx.shape[0]))

In [66]:
calc_precision([1,3,5], norm_en_it_np_emb, norm_it_np_emb, emb_wl_en, emb_wl_it, fd_en)

word: 	translations in dict: 	closest words after translation: 	
autunno"	[u'addolcire']	[u'testuale', u'jihad', u'cabilia', u'aborrire', u'autunno']
brasiliana"	[u'sergei']	[u'attigua', u'terni', u'marco', u'risposta', u'brasiliana']
maoisti"	[u'magnetica', u'magnetici', u'magnetico']	[u'tempestoso', u'costituzionalista', u'docilmente', u'agis', u'maoisti']
pluviale"	[u'auspicabile']	[u'illuminazione', u'aghi', u'malgascio', u'gasolio', u'pluviale']
predefinita"	[u'idealizzata', u'idealizzato']	[u'argentini', u'sedie', u'interoperabili', u'centri', u'predefinita']
documento"	[u'bretton']	[u'multilingue', u'omeopatia', u'promossa', u'mary', u'documento']
viareggio"	[u'intercomunicazione']	[u'dichiarazione', u'abdicando', u'vermi', u'ricercatori', u'viareggio']
egon"	[u'auc']	[u'unisono', u'deposit', u'universalmente', u'nanterre', u'egon']
avvelenamenti"	[u'coscienze']	[u'sigla', u'spettro', u'aiuti', u'diserbanti', u'avvelenamenti']
numbers"	[u'sbloccata', u'sbloccate']	[u'identit\xe0

In [37]:
cos_mx_it_en = cosine_similarity(norm_en_it_np_emb, norm_it_np_emb)
sim_mx_it_en = np.argsort(-cos_mx_it_en)

In [38]:
for i, r in enumerate(sim_mx_it_en):
    ans = np.where(sim_mx_it_en[i]==0)
    idx_orig = ans[0][0]
    print('"{}"\t"{}"'.format(emb_wl_en[i].encode('utf-8'), emb_wl_it[idx_orig].encode('utf-8')))
    print(fd_en[emb_wl_en[i]])

"sweetening"	"testuale"
[u'addolcire']
"sergei"	"attigua"
[u'sergei']
"magnetic"	"tempestoso"
[u'magnetica', u'magnetici', u'magnetico']
"desirable"	"illuminazione"
[u'auspicabile']
"idealised"	"argentini"
[u'idealizzata', u'idealizzato']
"woods"	"multilingue"
[u'bretton']
"intercommunication"	"dichiarazione"
[u'intercomunicazione']
"auc"	"unisono"
[u'auc']
"consciences"	"sigla"
[u'coscienze']
"unblocked"	"identità"
[u'sbloccata', u'sbloccate']
"centres"	"richiede"
[u'centri']
"physiological"	"igad"
[u'fisiologica', u'fisiologiche', u'fisiologici', u'fisiologico']
"pensions"	"diluenti"
[u'pensioni']
"censored"	"ricchezza"
[u'censurata', u'censurati', u'censurato']
"bike"	"riaccende"
[u'motocicletta']
"lord"	"obiettivo"
[u'lord']
"worth"	"imprenditore"
[u'pena']
"prefacing"	"deve"
[u'prefazione']
"haughtiness"	"riaccesa"
[u'superbia']
"regional"	"rivalutata"
[u'regionale', u'regionali']
"fragmenting"	"precari"
[u'frammentano', u'frammentare']
"rearguard"	"infondate"
[u'retroguardia']
"a