In [81]:
# Import
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import numpy as np
import pickle
from gensim.models import KeyedVectors

In [82]:
# Training word-pair file; read below with read_word_pairs_tsv (columns 0 and 1).
train_fn = '/mnt/permanent/home/eszti/dipterv/panlex/data/smith/original/train/eng_ita.tsv'

# Embedding files, loaded below through KeyedVectors.load_word2vec_format(binary=False).
eng_emb = '/mnt/permanent/Language/Multi/FB/wiki.en/wiki.en.vec'
ita_emb = '/mnt/permanent/Language/Multi/FB/wiki.it/wiki.it.vec'

# NOTE(review): defined here but not passed to read() in the visible cells,
# so read() falls back to its own default (limit=None) - confirm this is intended.
limit = None

# Output pickle for the filtered English embedding.
tr_en_fn = 'eng.pickle'

# Output pickle for the filtered Italian embedding.
tr_it_fn = 'ita.pickle'

In [83]:
class EmbeddingModel():
    """Base class holding an embedding matrix and its vocabulary.

    Attributes:
        syn0: 2-D array with one row per word (``None`` until ``read``).
        index2word: list of words; position ``i`` names row ``i`` of ``syn0``
            (``None`` until ``read``).

    Subclasses implement ``_read`` to fill both attributes.
    """

    def __init__(self):
        self.syn0 = None
        self.index2word = None

    def normalize(self):
        """Scale every row of ``syn0`` to unit L2 norm, in place.

        All-zero rows are left untouched: dividing them by their zero norm
        would fill the row with NaN and silently poison later dot products.
        """
        norms = np.sqrt((self.syn0 ** 2).sum(1))
        # Replace zero norms by 1 so that zero rows stay zero instead of NaN.
        norms[norms == 0] = 1
        self.syn0 /= norms[:, None]

    def _read(self, fn, limit=None, lexicon=None):
        """Load ``syn0`` and ``index2word`` from ``fn``; subclass hook."""
        raise NotImplementedError

    def read(self, fn, limit=None, lexicon=None):
        """Load the embedding from ``fn`` via ``_read`` and normalize it.

        Prints the source path before loading and the matrix shape afterwards.
        """
        print('Reading embedding from {}'.format(fn))
        self._read(fn, limit, lexicon)
        self.normalize()
        print('Syn0 size: {0}'.format(self.syn0.shape))

    def get(self, word):
        """Return the ``syn0`` row of ``word``.

        Raises:
            ValueError: if ``word`` is not in ``index2word``.
        """
        # One linear scan instead of two (`in` followed by `.index`).
        try:
            idx = self.index2word.index(word)
        except ValueError:
            raise ValueError('Out of dictionary word: {}'.format(word))
        return self.syn0[idx]

class TextEmbedding(EmbeddingModel):
    """Embedding loaded from a word2vec text-format file through gensim."""

    def __init__(self):
        EmbeddingModel.__init__(self)

    def _read(self, fn, limit=None, lexicon=None):
        """Fill ``syn0`` and ``index2word`` from the text file ``fn``.

        Args:
            fn: path handed to ``KeyedVectors.load_word2vec_format``.
            limit: forwarded to gensim as ``limit``.
            lexicon: accepted for interface compatibility; not used here.
        """
        model = KeyedVectors.load_word2vec_format(fn, binary=False, limit=limit)
        # gensim 4 renamed `syn0` -> `vectors` and `index2word` -> `index_to_key`
        # and removed the old names; fall back to the old names on older gensim.
        self.syn0 = model.vectors if hasattr(model, 'vectors') else model.syn0
        if hasattr(model, 'index_to_key'):
            self.index2word = model.index_to_key
        else:
            self.index2word = model.index2word
        
class PickleEmbedding(EmbeddingModel):
    """Embedding loaded from a pickled ``(syn0, index2word)`` pair."""

    def __init__(self):
        EmbeddingModel.__init__(self)

    def _read(self, fn, limit=None, lexicon=None, encoding='utf-8'):
        """Fill ``syn0`` and ``index2word`` from the pickle file ``fn``.

        Element 0 of the unpickled object becomes ``syn0`` and element 1
        becomes ``index2word``. ``limit``, ``lexicon`` and ``encoding`` are
        accepted for interface compatibility and are not used.
        """
        # The original called a `load_pickle` helper that is not defined in this
        # notebook; read the file directly with the already-imported `pickle`.
        # NOTE: unpickling can execute arbitrary code - only load trusted files.
        with open(fn, 'rb') as f:
            data = pickle.load(f)
        self.syn0 = data[0]
        self.index2word = data[1]

In [84]:
def read_word_pairs_tsv(fn, id1, id2):
    """Read word pairs from two columns of a whitespace-separated file.

    Args:
        fn: path of the file to read.
        id1: column index of the first word of each pair.
        id2: column index of the second word of each pair.

    Returns:
        ``(data, wl1, wl2)`` where ``data`` is the list of ``(w1, w2)`` tuples
        in file order, ``wl1`` is the set of all first words and ``wl2`` the
        set of all second words.

    Raises:
        IndexError: if a non-blank line has too few columns.

    Fields are produced by ``str.split()`` with no argument, i.e. split on any
    run of whitespace, so an entry that itself contains a space is broken
    into several columns.
    """
    data = []
    with open(fn) as f:
        # Stream the file instead of materialising it with readlines().
        for line in f:
            fields = line.split()  # split once per line, not once per column
            if not fields:
                # Blank (e.g. trailing) lines used to raise IndexError.
                continue
            data.append((fields[id1], fields[id2]))
    wl1 = {w1 for w1, _ in data}
    wl2 = {w2 for _, w2 in data}
    return data, wl1, wl2

In [107]:
def get_filtered_embedding(emb, wl, fn):
    """Restrict an embedding to the words of ``wl``.

    Args:
        emb: object exposing ``index2word`` (list of words) and ``syn0``
            (2-D array whose row ``i`` belongs to ``index2word[i]``).
        wl: iterable of words to keep.
        fn: unused; kept so existing calls keep working.

    Returns:
        ``(filtered_mod, vocab)``: ``vocab`` lists the words of ``wl`` that
        occur in ``emb.index2word``, ordered by their position there, and
        ``filtered_mod`` is a float64 array of shape ``(len(vocab), dim)``
        whose row ``i`` is the vector of ``vocab[i]``.

    Prints ``not found: <word>`` for every word of ``wl`` missing from the
    embedding.
    """
    wl = list(wl)  # allow one-shot iterables; we need two passes
    wanted = set(wl)

    # Single pass over the vocabulary instead of several list scans per word.
    # Only the first occurrence is recorded, mirroring list.index().
    first_index = {}
    for idx, word in enumerate(emb.index2word):
        if word in wanted and word not in first_index:
            first_index[word] = idx

    found = []
    for w in wl:
        if w in first_index:
            found.append((first_index[w], w))
        else:
            print('not found: {}'.format(w))
    found.sort(key=lambda pair: pair[0])

    vocab = [w for _, w in found]
    rows = np.asarray([idx for idx, _ in found], dtype=int)
    # Dimensionality comes from the embedding itself rather than a fixed 300;
    # astype(float) keeps the float64 result the original allocation produced.
    filtered_mod = emb.syn0[rows].astype(float)
    return filtered_mod, vocab

In [86]:
# One reader per language; syn0/index2word stay None until read() is called.
m_en = TextEmbedding()
m_it = TextEmbedding()

In [87]:
# data: list of (column 0, column 1) pairs; wl1 / wl2: sets of the words in column 0 / column 1.
data, wl1, wl2 = read_word_pairs_tsv(train_fn, 0, 1)

In [88]:
# read() loads the full file (no limit is passed) and normalizes the rows in place.
m_en.read(fn=eng_emb)
m_it.read(fn=ita_emb)

Reading embedding from /mnt/permanent/Language/Multi/FB/wiki.en/wiki.en.vec
Syn0 size: (2519370, 300)
Reading embedding from /mnt/permanent/Language/Multi/FB/wiki.it/wiki.it.vec
Syn0 size: (871053, 300)


In [89]:
# Sanity check: number of rows whose L2 norm differs from 1 by more than 1e-4.
len(np.where(abs(np.linalg.norm(m_en.syn0, axis=1)-1) > 0.0001)[0] )

0

In [110]:
# Keep only the vectors of the training words; words missing from an embedding
# are reported as "not found". The file-name argument is not used by
# get_filtered_embedding - saving happens in a separate cell.
print('en train')
en_syn0, en_i2w = get_filtered_embedding(m_en, wl1, tr_en_fn)
print('it train')
it_syn0, it_i2r = get_filtered_embedding(m_it, wl2, tr_it_fn)

en train
it train
not found: prelaurea


In [111]:
def save(fn, syn0, i2r):
    """Pickle the pair ``(syn0, i2r)`` into the file ``fn``.

    Args:
        fn: destination path; opened in binary write mode, so an existing
            file is overwritten.
        syn0: embedding matrix to store as the first tuple element.
        i2r: vocabulary list to store as the second tuple element.
    """
    payload = (syn0, i2r)
    with open(fn, 'wb') as out:
        pickle.dump(payload, out)

In [113]:
# Write each filtered (matrix, vocabulary) pair to its pickle file.
save(tr_en_fn, en_syn0, en_i2w)
save(tr_it_fn, it_syn0, it_i2r)

In [99]:
# Reload the English pickle to verify the round trip.
# NOTE(review): this rebinds `data`, which earlier held the word pairs returned
# by read_word_pairs_tsv - re-run that cell before using the pairs again.
with open(tr_en_fn, 'rb') as f:
    data = pickle.load(f)

In [100]:
# data[1] is the saved vocabulary, data[0] the saved matrix; all three values
# are displayed because ast_node_interactivity is set to "all".
len(data[1])
len(data[0])
data[0].shape

3442

3442

(3442, 300)

In [109]:
# Spot check: is the word present in the filtered English vocabulary?
'parties' in en_i2w

True

In [104]:
# Spot check: is the word present in the column-0 training word set?
'parties' in wl1

True