In [1]:
# Import
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import numpy as np
import pickle
from gensim.models import KeyedVectors

In [36]:
train_fn = '/mnt/permanent/home/eszti/dipterv/panlex/data/smith/original/train/eng_ita.tsv'

eng_emb = '/mnt/permanent/Language/Multi/FB/wiki.en/wiki.en.vec'
ita_emb = '/mnt/permanent/Language/Multi/FB/wiki.it/wiki.it.vec'

limit = None

tr_en_fn = 'eng.pickle'

tr_it_fn = 'ita.pickle'

In [3]:
class EmbeddingModel():
    def __init__(self):
        self.syn0 = None
        self.index2word = None

    def normalize(self):
        self.syn0 /= np.sqrt((self.syn0 ** 2).sum(1))[:, None]

    def _read(self, fn, limit=None, lexicon=None):
        raise NotImplementedError

    def read(self, fn, limit=None, lexicon=None):
        print('Reading embedding from {}'.format(fn))
        self._read(fn, limit, lexicon)
        self.normalize()
        print('Syn0 size: {0}'.format(self.syn0.shape))
        
    def get(self, word):
        if word not in self.index2word:
            raise ValueError('Out of dictionary word: {}'.format(word))
        else:
            idx = self.index2word.index(word)
            return self.syn0[idx]

class TextEmbedding(EmbeddingModel):
    def __init__(self):
        EmbeddingModel.__init__(self)

    def _read(self, fn, limit=None, lexicon=None):
        model = KeyedVectors.load_word2vec_format(fn, binary=False, limit=limit)
        self.syn0 = model.syn0
        self.index2word = model.index2word
        
class PickleEmbedding(EmbeddingModel):
    def __init__(self):
        EmbeddingModel.__init__(self)

    def _read(self, fn, limit=None, lexicon=None, encoding='utf-8'):
        data = load_pickle(fn)
        self.syn0 = data[0]
        self.index2word = data[1]

In [4]:
def read_word_pairs_tsv(fn, id1, id2):
    wl1 = set()
    wl2 = set()
    with open(fn) as f:
        lines = f.readlines()
        data = [(line.split()[id1], line.split()[id2]) for i, line in enumerate(lines)]
    for (w1, w2) in data:
        wl1.add(w1)
        wl2.add(w2)
    return data, wl1, wl2

In [61]:
def get_filtered_embedding(emb, wl, fn):
    to_save = []
    for w in wl:
        if w in emb.index2word:
            to_save.append([w, emb.get(w), emb.index2word.index(w)])
        else:
            print('not found: {}'.format(w))
    to_save.sort(key=lambda x: x[2])
    dim = 300
    vocab = [l[0] for l in to_save]
    filtered_mod = dict()
    for i, w in enumerate(vocab):
        filtered_mod[w] = emb.get(w)
    vocab = [v.encode('utf-8') for v in vocab]
    emb.index2word = vocab
    return emb

In [6]:
m_en = TextEmbedding()
m_it = TextEmbedding()

In [7]:
data, wl1, wl2 = read_word_pairs_tsv(train_fn, 0, 1)

In [8]:
m_en.read(fn=eng_emb)
m_it.read(fn=ita_emb)

Reading embedding from /mnt/permanent/Language/Multi/FB/wiki.en/wiki.en.vec
Syn0 size: (2519370, 300)
Reading embedding from /mnt/permanent/Language/Multi/FB/wiki.it/wiki.it.vec
Syn0 size: (871053, 300)


In [34]:
len(np.where(abs(np.linalg.norm(m_en.syn0, axis=1)-1) > 0.0001)[0] )

0

In [55]:
print('en train')
emb_en_f = get_filtered_embedding(m_en, wl1, tr_en_fn)
print('it train')
emb_it_f = get_filtered_embedding(m_it, wl2, tr_it_fn)

en train
it train
not found: prelaurea


In [56]:
def save(fn, emb):
    with open(fn, 'wb') as f:
        pickle.dump(file=f, obj=(emb.syn0, emb.index2word))

In [62]:
save(tr_en_fn, emb_en_f)
save(tr_it_fn, emb_it_f)

In [65]:
with open(tr_en_fn, 'rb') as f:
    data = pickle.load(f)

ValueError: binary mode doesn't take an encoding argument

In [50]:
m_en.index2word

[',',
 '.',
 'the',
 '</s>',
 'of',
 '-',
 'in',
 'and',
 "'",
 ')',
 '(',
 'to',
 'a',
 'is',
 'was',
 'on',
 's',
 'for',
 'as',
 'by',
 'that',
 'it',
 'with',
 'from',
 'at',
 'he',
 'this',
 'be',
 'i',
 'an',
 'utc',
 'his',
 'not',
 '–',
 'are',
 'or',
 'talk',
 'which',
 'also',
 'has',
 'were',
 'but',
 'have',
 '#',
 'one',
 'rd',
 'new',
 'first',
 'page',
 'no',
 'you',
 'they',
 'had',
 'article',
 't',
 'who',
 '?',
 'all',
 'their',
 'there',
 'been',
 'made',
 'its',
 'people',
 'may',
 'after',
 '%',
 'other',
 'should',
 'two',
 'score',
 'her',
 'can',
 'would',
 'more',
 'if',
 'she',
 'about',
 'when',
 'time',
 'team',
 'american',
 'such',
 'th',
 'do',
 'discussion',
 'links',
 'only',
 'some',
 'up',
 'see',
 'united',
 'years',
 'into',
 '/',
 'school',
 'so',
 'world',
 'university',
 'during',
 'out',
 'state',
 'states',
 'national',
 'wikipedia',
 'year',
 'most',
 'city',
 'over',
 'used',
 'then',
 'd',
 'than',
 'county',
 'external',
 'm',
 'where',
 '