In [1]:
# Import
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import numpy as np
from gensim.models import KeyedVectors
import pickle

In [2]:
# Word-pair files (train / valid / test splits) read further down by
# read_word_pairs_tsv, which takes column 0 and column 1 of every line.
# NOTE(review): these are absolute, machine-specific paths; the notebook will
# only run where this exact directory layout exists.
train_fn = '/home/eszti/projects/dipterv/panlex/data/smith/train/eng_ita.tsv'
valid_fn = '/home/eszti/projects/dipterv/panlex/data/smith/valid/eng_ita.tsv'
test_fn = '/home/eszti/projects/dipterv/panlex/data/smith/test/eng_ita.tsv'

# Embedding files, loaded by read_emb via
# KeyedVectors.load_word2vec_format(..., binary=False).
eng_emb = '/mnt/permanent/Language/Multi/FB/wiki.en/wiki.en.vec'
ita_emb = '/mnt/permanent/Language/Multi/FB/wiki.it/wiki.it.vec'

# Forwarded as the `limit` argument of load_word2vec_format.
# Per the gensim documentation None means "no cap on the number of vectors";
# set an int here for a faster partial load while experimenting.
limit = None

# Output pickle files written by get_filtered_embedding for the column-0
# (bound to the *_en names below) word lists, relative to the working directory.
tr_en_fn = 'train_eng.pickle'
va_en_fn = 'valid_eng.pickle'
te_en_fn = 'test_eng.pickle'

# Output pickle files for the column-1 (*_it) word lists.
tr_it_fn = 'train_ita.pickle'
va_it_fn = 'valid_ita.pickle'
te_it_fn = 'test_ita.pickle'

In [3]:
def read_emb(emb_fn, limit):
    """Load a text-format word2vec embedding file and L2-normalise its rows.

    Parameters
    ----------
    emb_fn : str
        Path handed to ``KeyedVectors.load_word2vec_format`` (``binary=False``).
    limit : int or None
        Forwarded unchanged as the loader's ``limit`` argument.

    Returns
    -------
    The loaded model. Its ``syn0`` matrix is rescaled in place so that every
    non-zero row has unit Euclidean norm; all-zero rows are left as zeros.
    """
    model = KeyedVectors.load_word2vec_format(emb_fn, binary=False, limit=limit)
    row_norms = np.sqrt((model.syn0 ** 2).sum(1))
    # An all-zero row would be divided by 0 and turn into NaNs, which then
    # poison every later similarity computation. Dividing by 1 keeps it zero.
    row_norms[row_norms == 0] = 1
    model.syn0 /= row_norms[:, None]
    return model

In [4]:
def read_word_pairs_tsv(fn, id1, id2, encoding='utf-8'):
    """Read word pairs from a whitespace-separated text file.

    Parameters
    ----------
    fn : str
        Path of the file to read.
    id1, id2 : int
        Column indices (after ``str.split()``) of the first and second word.
    encoding : str, optional
        Text encoding used to open the file. Defaults to UTF-8 so that
        non-ASCII words are decoded the same way regardless of the locale.

    Returns
    -------
    data : list of (str, str)
        One ``(word1, word2)`` tuple per non-blank line, in file order.
    wl1, wl2 : set of str
        The distinct first words and the distinct second words.

    Raises
    ------
    IndexError
        If a non-blank line has fewer columns than ``id1`` / ``id2`` require.
    """
    data = []
    with open(fn, encoding=encoding) as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Blank (or whitespace-only) line, e.g. a trailing newline at
                # the end of the file: nothing to read, skip it.
                continue
            data.append((fields[id1], fields[id2]))
    wl1 = {w1 for w1, _ in data}
    wl2 = {w2 for _, w2 in data}
    return data, wl1, wl2

In [5]:
# Saved format: a single {word: vector} dict, filled in the order the words
# appear in emb.index2word (presumably frequency rank, as the original note
# on this cell called it -- confirm against how the embedding file is sorted).
def get_filtered_embedding(emb, wl, fn):
    """Restrict an embedding model to a word list and pickle the result.

    Parameters
    ----------
    emb
        Embedding model supporting ``word in emb``, ``emb[word]`` and an
        ``index2word`` list of its vocabulary.
    wl : iterable of str
        Words to keep. Words missing from ``emb`` are reported on stdout as
        ``not found: <word>`` and skipped.
    fn : str
        Path of the pickle file to write (overwritten if it exists).

    Returns
    -------
    dict
        ``{word: emb[word]}`` for every word of ``wl`` present in ``emb``,
        inserted in ``emb.index2word`` order. The same dict is pickled to ``fn``.
    """
    found = set()
    for w in wl:
        if w in emb:
            found.add(w)
        else:
            print('not found: {}'.format(w))
    # A single pass over the model vocabulary yields the kept words already in
    # model order. Looking each word up with index2word.index(w) instead would
    # rescan the whole vocabulary once per kept word.
    vocab = [w for w in emb.index2word if w in found]
    filtered_mod = {w: emb[w] for w in vocab}
    with open(fn, 'wb') as f:
        pickle.dump(filtered_mod, f)
    return filtered_mod

In [6]:
# Load both embedding models through read_emb, which also rescales the rows of
# syn0 in place. `limit` comes from the configuration cell.
m_en = read_emb(eng_emb, limit)
m_it = read_emb(ita_emb, limit)

In [7]:
# For each split, read_word_pairs_tsv returns (pairs, set of column-0 words,
# set of column-1 words). Column 0 is bound to the *_en names and column 1 to
# the *_it names.
train_wp, tr_en, tr_it = read_word_pairs_tsv(train_fn, 0, 1)
valid_wp, va_en, va_it = read_word_pairs_tsv(valid_fn, 0, 1)
test_wp, te_en, te_it = read_word_pairs_tsv(test_fn, 0, 1)

In [8]:
# Filter each embedding model down to the words of each split and pickle the
# result. The label printed before every call lets the "not found: ..." lines
# emitted by get_filtered_embedding be attributed to a language / split.

# En: m_en restricted to the column-0 word sets
print('en train')
m_en_tr_fil = get_filtered_embedding(m_en, tr_en, tr_en_fn)
print('en valid')
m_en_va_fil = get_filtered_embedding(m_en, va_en, va_en_fn)
print('en test')
m_en_te_fil = get_filtered_embedding(m_en, te_en, te_en_fn)

# It: m_it restricted to the column-1 word sets
print('it train')
m_it_tr_fil = get_filtered_embedding(m_it, tr_it, tr_it_fn)
print('it valid')
m_it_va_fil = get_filtered_embedding(m_it, va_it, va_it_fn)
print('it test')
m_it_te_fil = get_filtered_embedding(m_it, te_it, te_it_fn)

en train
en valid
en test
it train
not found: prelaurea
it valid
it test
not found: ridimensioni
not found: kostunica
not found: oligopolistica


In [9]:
def load(fn):
    """Unpickle and return the object stored in the file at path ``fn``.

    The file is opened in binary mode and closed again before returning.
    Only use this on pickles you produced yourself: unpickling executes
    whatever the file tells it to.
    """
    with open(fn, 'rb') as handle:
        return pickle.load(handle)

In [10]:
def test_load(fn, message):
    """Print ``message``, then the number of keys in the pickle at ``fn``.

    The pickle is read with ``load`` and is expected to hold a mapping (the
    notebook stores word -> vector dicts). Nothing is returned.
    """
    print(message)
    loaded = load(fn)
    n_words = len(loaded.keys())
    print('vocab length: {}'.format(n_words))

In [11]:
# Re-open every pickle written above and report its vocabulary size.
for _pickle_fn, _label in [
    (tr_en_fn, 'train en'),
    (tr_it_fn, 'train it'),
    (va_en_fn, 'valid en'),
    (va_it_fn, 'valid it'),
    (te_en_fn, 'test en'),
    (te_it_fn, 'test it'),
]:
    test_load(_pickle_fn, _label)

train en
vocab length: 3216
train it
vocab length: 4133
valid en
vocab length: 500
valid it
vocab length: 499
test en
vocab length: 1500
test it
vocab length: 1846


In [12]:
# Spot check for a single word after reloading the filtered train pickle.
word = 'for'
m_en_tr_load = load(tr_en_fn)
# m_en_tr_load.keys()
# NOTE(review): the expression below tests membership in tr_en (the word set
# read from the train TSV), not in the freshly loaded m_en_tr_load, which is
# otherwise unused in this cell -- confirm which of the two was intended.
word in tr_en

True

In [21]:
# Sanity check of the row normalisation done in read_emb (English model).
en_row_norms = np.linalg.norm(m_en.syn0, axis=1)
# Number of columns of syn0 (one norm per column).
len(np.linalg.norm(m_en.syn0, axis=0))
# Number of rows of syn0 (one norm per row).
len(en_row_norms)
# Number of rows whose norm is (numerically) 1. This should equal the row
# count above. The earlier `len(norms == 1)` only measured the length of the
# boolean array, so it always equalled the row count and verified nothing;
# an exact `== 1` would also be too strict for floating-point norms.
int(np.isclose(en_row_norms, 1.0).sum())

300

2519370

2519370

In [22]:
# Sanity check of the row normalisation done in read_emb (Italian model).
it_row_norms = np.linalg.norm(m_it.syn0, axis=1)
# Number of columns of syn0 (one norm per column).
len(np.linalg.norm(m_it.syn0, axis=0))
# Number of rows of syn0 (one norm per row).
len(it_row_norms)
# Number of rows whose norm is (numerically) 1. This should equal the row
# count above. The earlier `len(norms == 1)` only measured the length of the
# boolean array, so it always equalled the row count and verified nothing;
# an exact `== 1` would also be too strict for floating-point norms.
int(np.isclose(it_row_norms, 1.0).sum())

300

871053

871053