In [161]:
# Import
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import numpy as np
from gensim.models import KeyedVectors
import matplotlib.pyplot as plt
import warnings
import copy

In [162]:
# Locations of the two embedding text files read by the cells below.
# NOTE(review): these are absolute, machine-specific paths; the notebook only
# runs where this mount exists.
fn_en = '/mnt/permanent/home/eszti/dipterv/panlex/data/smith/smith_all_data/en_orig.txt'
fn_it = '/mnt/permanent/home/eszti/dipterv/panlex/data/smith/smith_all_data/it.txt'
# Forwarded to read() as `limit`; None disables the cap on how many lines are read.
limit = None
# Text encoding used when opening both embedding files.
encoding = 'latin-1'

In [163]:
class EmbeddingModel():
    """Base class for a word-embedding table.

    Subclasses implement ``_read`` to fill two attributes:

    * ``syn0`` -- array with one row per word (``normalize`` requires it to be
      at least 2-D),
    * ``index2word`` -- list of words, where ``index2word[i]`` labels ``syn0[i]``.
    """

    # Largest deviation from unit length that normalize() does not report.
    # Kept as a single constant so the check and the printed message agree.
    NORM_TOLERANCE = 0.0001

    def __init__(self):
        self.syn0 = None
        self.index2word = None

    def normalize(self):
        """Scale every row of ``syn0`` to unit L2 norm, in place.

        Afterwards prints how many rows still deviate from length 1 by more
        than ``NORM_TOLERANCE``. All-zero rows are divided by zero and become
        NaN; they are not counted here because comparisons with NaN are false.
        """
        self.syn0 /= np.sqrt((self.syn0 ** 2).sum(1))[:, None]
        deviation = np.abs(np.linalg.norm(self.syn0, axis=1) - 1)
        off_unit = int(np.count_nonzero(deviation > self.NORM_TOLERANCE))
        print('normalized, # > {}: {}'.format(self.NORM_TOLERANCE, off_unit))

    def _read(self, fn, limit=None, lexicon=None, encoding='utf-8'):
        """Load ``syn0`` and ``index2word`` from ``fn``; provided by subclasses."""
        raise NotImplementedError

    def read(self, fn, limit=None, lexicon=None, encoding='utf-8'):
        """Load the embedding from ``fn``, normalize its rows and report its shape.

        Args:
            fn: path of the embedding file.
            limit: forwarded to ``_read``.
            lexicon: forwarded to ``_read``.
            encoding: forwarded to ``_read``.
        """
        print('Reading embedding from {}'.format(fn))
        self._read(fn, limit, lexicon, encoding)
        self.normalize()
        print('Syn0 size: {0}'.format(self.syn0.shape))

    def get(self, word):
        """Return the row of ``syn0`` stored for ``word``.

        Raises:
            ValueError: if ``word`` is not in ``index2word``.
        """
        # A single .index() call replaces the former `in` test followed by
        # .index(), which scanned the vocabulary list twice.
        try:
            idx = self.index2word.index(word)
        except ValueError:
            raise ValueError('Out of dictionary word: {}'.format(word)) from None
        return self.syn0[idx]

class TextEmbedding(EmbeddingModel):
    """Embedding stored as a whitespace-separated text file.

    The first line of the file is skipped as a header; every following line
    holds a word followed by its numeric components.
    """

    def __init__(self):
        EmbeddingModel.__init__(self)

    def _read(self, fn, limit=None, lexicon=None, encoding='utf-8'):
        """Parse ``fn`` into ``syn0`` (2-D array) and ``index2word`` (list of words).

        Args:
            fn: path of the text file.
            limit: if not None, only file lines with index <= ``limit`` are
                considered (index 0 is the header).
            lexicon: if not None, only words contained in it are kept.
            encoding: text encoding used to open the file.
        """
        id2row = []

        def filter_lines(f):
            # Yields the data lines to parse and records their words in
            # id2row as a side effect, so both stay in the same order.
            for i, line in enumerate(f):
                if limit is not None and i > limit:
                    break
                if i == 0:
                    # header line
                    continue
                tokens = line.split()
                if not tokens:
                    # np.loadtxt skips blank lines; skip them here as well
                    # instead of failing on tokens[0].
                    continue
                word = tokens[0]
                if lexicon is None or word in lexicon:
                    id2row.append(word)
                    yield line

        # Get the number of columns from the first line after the header.
        with open(fn, encoding=encoding) as f:
            f.readline()
            ncols = len(f.readline().split())

        with open(fn, encoding=encoding) as f:
            # comments=None: words such as '#' must not start a comment.
            # Column 0 is the word, so only columns 1..ncols-1 are numeric.
            vectors = np.loadtxt(filter_lines(f),
                                 comments=None, usecols=range(1, ncols))
        # loadtxt returns a 1-D array when only one row was read; force 2-D
        # (this is what the former np.matrix round trip was used for).
        self.syn0 = np.atleast_2d(vectors)
        self.index2word = id2row

In [164]:
# Build a TextEmbedding from the file at fn_it; read() parses the file and then
# normalizes every row to unit length.
# NOTE(review): the recorded output below has no 'normalized, ...' line although
# read() as defined above prints one -- presumably it was produced by an older
# class definition; re-run on a fresh kernel to confirm.
emb_mod_it = TextEmbedding()
emb_mod_it.read(fn=fn_it, limit=limit, encoding=encoding)

Reading embedding from /mnt/permanent/home/eszti/dipterv/panlex/data/smith/smith_all_data/it.txt
Syn0 size: (200000, 300)


In [165]:
# Build a TextEmbedding from the file at fn_en; read() parses the file and then
# normalizes every row to unit length.
# NOTE(review): the recorded output below has no 'normalized, ...' line although
# read() as defined above prints one -- presumably it was produced by an older
# class definition; re-run on a fresh kernel to confirm.
emb_mod_en = TextEmbedding()
emb_mod_en.read(fn=fn_en, limit=limit, encoding=encoding)

Reading embedding from /mnt/permanent/home/eszti/dipterv/panlex/data/smith/smith_all_data/en_orig.txt
Syn0 size: (200000, 300)


In [166]:
def print_num(num, emb=None):
    """Print the word and the vector stored at row ``num`` of an embedding.

    Args:
        num: row index into ``index2word`` / ``syn0``.
        emb: object exposing ``index2word`` and ``syn0``. When omitted, the
            notebook-level ``emb_mod_en`` is used, which was the only
            behavior before this parameter existed.
    """
    if emb is None:
        emb = emb_mod_en
    print(emb.index2word[num])
    print(emb.syn0[num])
    
def check_nans(emb):
    """Report the rows of ``emb.syn0`` that contain NaN and return a copy without them.

    Prints the number of affected rows, their indices, and the word and vector
    of each one, followed by the sizes of the filtered embedding.

    Args:
        emb: object exposing ``syn0`` (2-D array) and ``index2word`` (list of
            words aligned with the rows of ``syn0``). It is not modified.

    Returns:
        A copy of ``emb`` whose ``syn0`` and ``index2word`` omit the NaN rows.
    """
    nans = np.where(np.isnan(emb.syn0).any(axis=1))[0]
    print(len(nans))
    print(nans)
    for nan in nans:
        # Report from `emb` itself: the former print_num() call always read
        # the global emb_mod_en, whichever embedding was being checked.
        print(emb.index2word[nan])
        print(emb.syn0[nan])
    print('new embedding: ')
    # A shallow copy is enough: both attributes are replaced below by freshly
    # built objects (np.delete returns a new array), so the full matrix is
    # not duplicated first as it was with deepcopy.
    ok_emb = copy.copy(emb)
    ok_emb.syn0 = np.delete(emb.syn0, nans, axis=0)
    # Set membership instead of scanning the index array once per word.
    dropped = set(nans.tolist())
    ok_emb.index2word = [w for i, w in enumerate(emb.index2word) if i not in dropped]
    print(len(ok_emb.syn0))
    print(len(ok_emb.index2word))
    return ok_emb

In [167]:
# Report rows of emb_mod_en containing NaN and keep a copy without them.
ok_en = check_nans(emb_mod_en)

0
[]
new embedding: 
200000
200000


In [168]:
# Report rows of emb_mod_it containing NaN and keep a copy without them.
ok_it = check_nans(emb_mod_it)

0
[]
new embedding: 
200000
200000


In [169]:
# Look up the row stored for 'dog' in emb_mod_en (rows were normalized by read()).
emb_mod_en.get('dog')

array([  3.22591000e-01,   1.04048000e-01,  -9.45600000e-02,
         2.13572000e-01,  -4.74810000e-02,   1.03910000e-02,
         1.93032000e-01,   1.51680000e-02,   9.28640000e-02,
         7.24290000e-02,  -3.52992000e-01,   7.16410000e-02,
        -6.70860000e-02,  -2.02955000e-01,   2.54483000e-01,
        -1.54914000e-01,   3.56023000e-01,  -4.03375000e-01,
         1.10762000e-01,  -1.34364000e-01,  -1.61798000e-01,
         1.52484000e-01,   2.97463000e-01,  -1.30503000e-01,
         1.58495000e-01,   4.00892000e-01,  -9.22500000e-03,
        -3.20443000e-01,   1.03258000e-01,   4.61652000e-01,
        -2.13170000e-01,  -3.48790000e-02,   1.51938000e-01,
         1.01818000e-01,   3.54113000e-01,   1.81026000e-01,
         1.85837000e-01,   2.14690000e-01,  -2.12894000e-01,
        -2.83443000e-01,   1.86400000e-02,   1.44705000e-01,
         1.46270000e-02,   2.40523000e-01,   3.54185000e-01,
        -9.54620000e-02,  -2.26810000e-02,  -8.40850000e-02,
         2.05705000e-01,

In [170]:
# Same lookup on the filtered copy; check_nans reported 0 NaN rows for
# emb_mod_en above, so this should match the previous cell's vector.
ok_en.get('dog')

array([  3.22591000e-01,   1.04048000e-01,  -9.45600000e-02,
         2.13572000e-01,  -4.74810000e-02,   1.03910000e-02,
         1.93032000e-01,   1.51680000e-02,   9.28640000e-02,
         7.24290000e-02,  -3.52992000e-01,   7.16410000e-02,
        -6.70860000e-02,  -2.02955000e-01,   2.54483000e-01,
        -1.54914000e-01,   3.56023000e-01,  -4.03375000e-01,
         1.10762000e-01,  -1.34364000e-01,  -1.61798000e-01,
         1.52484000e-01,   2.97463000e-01,  -1.30503000e-01,
         1.58495000e-01,   4.00892000e-01,  -9.22500000e-03,
        -3.20443000e-01,   1.03258000e-01,   4.61652000e-01,
        -2.13170000e-01,  -3.48790000e-02,   1.51938000e-01,
         1.01818000e-01,   3.54113000e-01,   1.81026000e-01,
         1.85837000e-01,   2.14690000e-01,  -2.12894000e-01,
        -2.83443000e-01,   1.86400000e-02,   1.44705000e-01,
         1.46270000e-02,   2.40523000e-01,   3.54185000e-01,
        -9.54620000e-02,  -2.26810000e-02,  -8.40850000e-02,
         2.05705000e-01,