In [12]:
# Import
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import numpy as np
from gensim.models import KeyedVectors
import pickle

In [13]:
train_fn = '/home/eszti/projects/dipterv/panlex/data/smith/train/eng_ita.tsv'
valid_fn = '/home/eszti/projects/dipterv/panlex/data/smith/valid/eng_ita.tsv'
test_fn = '/home/eszti/projects/dipterv/panlex/data/smith/test/eng_ita.tsv'

eng_emb = '/mnt/permanent/Language/Multi/FB/wiki.en/wiki.en.vec'
ita_emb = '/mnt/permanent/Language/Multi/FB/wiki.it/wiki.it.vec'

limit = None

tr_en_fn = 'train_eng.pickle'
va_en_fn = 'valid_eng.pickle'
te_en_fn = 'test_eng.pickle'

tr_it_fn = 'train_ita.pickle'
va_it_fn = 'val_ita.pickle'
te_it_fn = 'test_ita.pickle'

In [14]:
def read_emb(emb_fn, limit):
    model = KeyedVectors.load_word2vec_format(emb_fn, binary=False, limit=limit)
    return model

In [15]:
def read_word_pairs_tsv(fn, id1, id2, header=True):
    wl1 = set()
    wl2 = set()
    with open(fn) as f:
        lines = f.readlines()
        data = [(line.split()[id1], line.split()[id2]) for i, line in enumerate(lines) if i > 0 or header == False]
    for (w1, w2) in data:
        wl1.add(w1)
        wl2.add(w2)
    return data, wl1, wl2

In [19]:
# saving format: (vocab: ordered list according to frequency, emb_dict: word - nparray dictionary)
def get_filtered_embedding(emb, wl, fn):
    to_save = []
    for w in wl:
        if w in emb:
            to_save.append([w, emb[w], emb.index2word.index(w)])
        else:
            print('not found: {}'.format(w))
    to_save.sort(key=lambda x: x[2])
    dim = 300
    vocab = [l[0] for l in to_save]
    filtered_mod = KeyedVectors()
    filtered_mod.index2word = vocab
    filtered_mod.syn0 = np.ndarray(shape=(len(filtered_mod.index2word), dim), dtype=np.float32)
    for i, w in enumerate(filtered_mod.index2word):
        filtered_mod.syn0[i, :] = emb[w]
    with open(fn, 'wb') as f:
        pickle.dump(file=f, obj=filtered_mod)
    return filtered_mod

In [11]:
m_en = read_emb(eng_emb, limit)
m_it = read_emb(ita_emb, limit)

In [17]:
train_wp, tr_en, tr_it = read_word_pairs_tsv(train_fn, 0, 1)
valid_wp, va_en, va_it = read_word_pairs_tsv(valid_fn, 0, 1)
test_wp, te_en, te_it = read_word_pairs_tsv(test_fn, 0, 1)

In [20]:
# En
print('en train')
m_en_tr_fil = get_filtered_embedding(m_en, tr_en, tr_en_fn)
print('en valid')
m_en_va_fil = get_filtered_embedding(m_en, va_en, va_en_fn)
print('en test')
m_en_te_fil = get_filtered_embedding(m_en, te_en, te_en_fn)

print('it train')
m_it_tr_fil = get_filtered_embedding(m_it, tr_it, tr_it_fn)
print('it valid')
m_it_va_fil = get_filtered_embedding(m_it, va_it, va_it_fn)
print('it test')
m_it_te_fil = get_filtered_embedding(m_it, te_it, te_it_fn)

en train
en valid
en test
it train
not found: prelaurea
it valid
it test
not found: kostunica
not found: ridimensioni
not found: oligopolistica


In [21]:
def load(fn):
    with open(fn, 'rb') as f:
        data = pickle.load(f)
    return data

In [22]:
def test_load(fn, message):
    print(message)
    emb = load(fn)
    print('vocab length: {}'.format(len(emb.index2word)))

In [23]:
test_load(tr_en_fn, 'train en')
test_load(tr_it_fn, 'train it')
test_load(va_en_fn, 'valid en')
test_load(va_it_fn, 'valid it')
test_load(te_en_fn, 'test en')
test_load(te_it_fn, 'test it')

train en
vocab length: 3215
train it
vocab length: 4132
valid en
vocab length: 499
valid it
vocab length: 498
test en
vocab length: 1499
test it
vocab length: 1845


In [29]:
word = 'photo'
m_en_tr_load = load(tr_en_fn)
m_en_tr_load.index2word
m_en_tr_load.syn0[m_en_tr_load.index2word.index(word)] == m_en[word]

['that',
 'with',
 'this',
 'his',
 'not',
 'are',
 'which',
 'also',
 'new',
 'first',
 'page',
 'you',
 'had',
 'article',
 'who',
 'all',
 'their',
 'been',
 'made',
 'its',
 'people',
 'may',
 'after',
 'other',
 'her',
 'can',
 'more',
 'when',
 'time',
 'american',
 'such',
 'discussion',
 'links',
 'only',
 'some',
 'see',
 'united',
 'years',
 'world',
 'university',
 'during',
 'state',
 'states',
 'national',
 'most',
 'city',
 'used',
 'then',
 'than',
 'county',
 'external',
 'where',
 'will',
 'what',
 'any',
 'these',
 'january',
 'march',
 'august',
 'july',
 'being',
 'film',
 'him',
 'many',
 'south',
 'september',
 'between',
 'october',
 'three',
 'june',
 'well',
 'use',
 'war',
 'under',
 'them',
 'april',
 'born',
 'link',
 'while',
 'part',
 'november',
 'players',
 'list',
 'february',
 'known',
 'second',
 'name',
 'group',
 'history',
 'series',
 'just',
 'north',
 'work',
 'before',
 'since',
 'season',
 'both',
 'high',
 'through',
 'district',
 'now',
 'com

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,