# Evaluating embeddings

In [4]:
import numpy as np
import os
from os import listdir
from os.path import isfile, join
import scipy.stats
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the trained matrices of the 20170406_0948 run.
# Later cells multiply the English embedding by T1 and every other language's
# embedding by one slice of T, then compare the results against A.
# The paths are handed straight to np.load so NumPy opens the files in binary
# mode itself; wrapping them in a text-mode open() breaks on Python 3 and can
# corrupt the data on platforms that translate line endings.
fn = '/mnt/store/eszti/data/20170406_0948/T1_10000000.npy'
T1 = np.load(fn)

fn = '/mnt/store/eszti/data/20170406_0948/T_10000000.npy'
T = np.load(fn)

fn = '/mnt/store/eszti/data/20170406_0948/A_10000000.npy'
A = np.load(fn)

# Quick sanity check of the loaded shapes.
print(A.shape)
print(T.shape)
print(T1.shape)

(110, 300)
(79, 300, 300)
(300, 300)


In [26]:
emb_dir = '/home/eszti/data/embeddings/fb_trans/embedding/swad_fb_110_20170405_2058'

# Collect the full path of every regular *.npy file directly inside emb_dir,
# in the order os.listdir() reports them (later cells rely on this order).
embed_files = []
for name in listdir(emb_dir):
    path = os.path.join(emb_dir, name)
    if isfile(path) and name.endswith('.npy'):
        embed_files.append(path)
embed_files

['/home/eszti/data/embeddings/fb_trans/embedding/swad_fb_110_20170405_2058/eng_heb.npy',
 '/home/eszti/data/embeddings/fb_trans/embedding/swad_fb_110_20170405_2058/eng_zlm.npy',
 '/home/eszti/data/embeddings/fb_trans/embedding/swad_fb_110_20170405_2058/eng_kir.npy',
 '/home/eszti/data/embeddings/fb_trans/embedding/swad_fb_110_20170405_2058/eng_tel.npy',
 '/home/eszti/data/embeddings/fb_trans/embedding/swad_fb_110_20170405_2058/eng_slv.npy',
 '/home/eszti/data/embeddings/fb_trans/embedding/swad_fb_110_20170405_2058/eng_tat.npy',
 '/home/eszti/data/embeddings/fb_trans/embedding/swad_fb_110_20170405_2058/eng_wrz.npy',
 '/home/eszti/data/embeddings/fb_trans/embedding/swad_fb_110_20170405_2058/eng_spa.npy',
 '/home/eszti/data/embeddings/fb_trans/embedding/swad_fb_110_20170405_2058/eng_bos.npy',
 '/home/eszti/data/embeddings/fb_trans/embedding/swad_fb_110_20170405_2058/eng_cym.npy',
 '/home/eszti/data/embeddings/fb_trans/embedding/swad_fb_110_20170405_2058/eng_fin.npy',
 '/home/eszti/data/em

In [7]:
# Target-language code = last three characters of the file stem,
# e.g. '.../eng_heb.npy' -> 'heb' and '.../eng_eng.npy' -> 'eng'.
lang_codes = [fn.split('.')[0][-3:] for fn in embed_files]
print(len(lang_codes))
# English has to be first (index 0 is treated as English further down).
# os.listdir() gives no ordering guarantee, so look 'eng' up explicitly
# instead of assuming it is the last entry; the other codes keep their order.
lang_codes.insert(0, lang_codes.pop(lang_codes.index('eng')))
# Show the first code as the cell output ('eng').
lang_codes[0]

80


'eng'

In [39]:
output_dir = '/home/eszti/data/embeddings/univ_trans/first_trial/'
lang_cnt = len(embed_files)

# Load English embedding.
# os.listdir() returns entries in arbitrary order, so the English file is
# located by name rather than assumed to be the last element of embed_files.
en_fn = next((path for path in embed_files if 'eng_eng' in path), None)
if en_fn is None:
    raise ValueError('no eng_eng embedding file found in embed_files')
# np.load / np.save are given paths so NumPy handles binary mode itself
# (a text-mode file handle fails on Python 3).
en_emb = np.load(en_fn)

# W[k] holds the original embedding of language k, trans[k] its translated
# version. Both are fully overwritten below, hence np.empty.
W = np.empty(shape=(lang_cnt, en_emb.shape[0], en_emb.shape[1]), dtype=np.float32)
trans = np.empty(shape=(lang_cnt, en_emb.shape[0], en_emb.shape[1]), dtype=np.float32)

# Slot 0 is English; the remaining languages follow in embed_files order.
i = 0
W[i, :, :] = en_emb
for embed_fn in embed_files:
    if 'eng_eng' in embed_fn:
        continue
    i += 1
    emb = np.load(embed_fn)
    W[i, :, :] = emb

# English is mapped with T1, language i >= 1 with T[i - 1].
# NOTE(review): this assumes the slices of T are stored in the same language
# order as W[1:] / lang_codes[1:] -- confirm against the training run.
for i in range(len(lang_codes)):
    if i == 0:
        trans[i, :, :] = np.dot(W[i, :, :], T1)
    else:
        trans[i, :, :] = np.dot(W[i, :, :], T[i - 1])
    # One <lang_code>.npy file per language in output_dir.
    np.save(join(output_dir, '{}.npy'.format(lang_codes[i])), trans[i, :, :])

In [9]:
def get_cos_sim_mx(emb):
    """Return the pairwise cosine-similarity matrix of the rows of ``emb``.

    The whole matrix is computed with a single normalised matrix product
    instead of one library call per (i, j) pair.

    Parameters
    ----------
    emb : array-like with ``emb.shape[0]`` vectors
        Each ``emb[i]`` is treated as one vector (flattened if it has more
        than one dimension).

    Returns
    -------
    numpy.ndarray of shape (cnt, cnt), dtype float32
        Entry ``[i][j]`` is the cosine similarity of ``emb[i]`` and ``emb[j]``.
        A zero vector has similarity 0 with everything, including itself,
        which is also what scikit-learn's ``cosine_similarity`` yields.
    """
    vecs = np.asarray(emb, dtype=np.float64)
    cnt = vecs.shape[0]
    if cnt == 0:
        return np.empty((0, 0), dtype=np.float32)
    # One row per vector, whatever the trailing dimensions were.
    vecs = vecs.reshape(cnt, -1)
    norms = np.linalg.norm(vecs, axis=1, keepdims=True)
    # Avoid division by zero: all-zero rows stay all-zero after scaling.
    norms[norms == 0] = 1.0
    unit = vecs / norms
    return np.dot(unit, unit.T).astype(np.float32)

def calc_values(orig, trans, univ, univ_cos):
    """Compare an original, a translated and the universal embedding.

    Parameters
    ----------
    orig : numpy.ndarray
        Original embedding of one language.
    trans : numpy.ndarray
        Translated embedding of the same language (same shape as ``orig``).
    univ : numpy.ndarray
        Universal embedding, subtracted element-wise from ``orig``/``trans``.
    univ_cos : array-like
        Precomputed cosine-similarity matrix of ``univ``; passed in so it is
        not recomputed for every language.

    Returns
    -------
    list
        Six entries, in this order:
        ``norm(orig - trans)``, ``norm(orig - univ)``, ``norm(trans - univ)``,
        then the ``scipy.stats.pearsonr`` results for the flattened cosine
        matrices of (orig, trans), (orig, univ) and (univ, trans).
    """
    ret = []

    # Diff(orig, trans): frob norm
    ret.append(np.linalg.norm(orig - trans))
    # Diff(orig, univ): frob norm
    ret.append(np.linalg.norm(orig - univ))
    # Diff(trans, univ): frob norm
    ret.append(np.linalg.norm(trans - univ))

    # NOTE(review): the full symmetric matrices (diagonal and both triangles)
    # are flattened, so every pair is counted twice in the correlations below.
    orig_cos_flat = np.ravel(get_cos_sim_mx(orig))
    trans_cos_flat = np.ravel(get_cos_sim_mx(trans))
    univ_cos_flat = np.ravel(univ_cos)

    # Correlation between orig and translated cos sim mx-s
    ret.append(scipy.stats.pearsonr(orig_cos_flat, trans_cos_flat))
    # Correlation between orig and univ cos sim mx-s
    ret.append(scipy.stats.pearsonr(orig_cos_flat, univ_cos_flat))
    # Correlation between trans and univ cos sim mx-s
    ret.append(scipy.stats.pearsonr(univ_cos_flat, trans_cos_flat))

    return ret

In [35]:
# The cosine matrix of the universal embedding is the same for every
# language, so compute it once up front.
univ_cos = get_cos_sim_mx(A)
stats = []

# One row per language: [code, <six values from calc_values>].
for i, code in enumerate(lang_codes):
    print('Calc language: {}'.format(code))
    row = [code] + calc_values(W[i], trans[i], A, univ_cos)
    stats.append(row)
    print(row)


Calc language: eng
['eng', 0.0, 0.40092891, 0.40092891, (1.0, 0.0), (0.99980241, 0.0), (0.99980241, 0.0)]
Calc language: heb
['heb', 14.875912, 14.875926, 0.46582678, (0.53900075, 0.0), (0.53988481, 0.0), (0.99975592, 0.0)]
Calc language: zlm
['zlm', 14.738148, 14.740572, 0.46735415, (0.55985451, 0.0), (0.55939579, 0.0), (0.99975598, 0.0)]
Calc language: kir
['kir', 14.854175, 14.853262, 0.48045668, (0.49459833, 0.0), (0.49531868, 0.0), (0.9997406, 0.0)]
Calc language: tel
['tel', 14.8531, 14.872325, 0.46531272, (0.49177274, 0.0), (0.49484402, 0.0), (0.99975646, 0.0)]
Calc language: slv
['slv', 14.891821, 14.896851, 0.44599283, (0.51929784, 0.0), (0.52284801, 0.0), (0.99977553, 0.0)]
Calc language: tat
['tat', 14.927641, 14.920938, 0.4958545, (0.39927328, 0.0), (0.39941591, 0.0), (0.99972647, 0.0)]
Calc language: wrz
['wrz', 14.918386, 14.918633, 0.48865092, (0.67383468, 0.0), (0.6738286, 0.0), (0.99973494, 0.0)]
Calc language: spa
['spa', 14.876044, 14.890875, 0.52533048, (0.65509766,

In [36]:
import csv

# Dump the per-language rows collected in `stats` to a CSV file.
# 'wb' is the mode the Python 2 csv module expects; non-string cells are
# written using their str() form.
outfn = '/home/eszti/data/embeddings/univ_trans/first_trial/stat.csv'
with open(outfn, 'wb') as resultFile:
    csv.writer(resultFile, dialect='excel').writerows(stats)

In [53]:
# NOTE(review): this cell was saved unfinished -- the loop had no body, which
# is a SyntaxError and stops "Restart & Run All". `pass` is only a placeholder;
# fill in the intended per-language step or delete the cell.
for i in range(len(lang_codes)):
    pass

(80, 6)

In [44]:
# Tiny hand-checkable example to see what pearsonr returns:
# a (correlation coefficient, p-value) pair.
a, b = np.array([[1, 2, 5, -1],
                 [3, 4, 6, 2]])

scipy.stats.pearsonr(a, b)

(0.99541807440750374, 0.0045819255924962618)