In [1]:
# Import
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import numpy as np
from gensim.models import KeyedVectors
import pickle

In [2]:
train_fn = '/home/eszti/projects/dipterv/panlex/data/smith/train/eng_ita.tsv'
valid_fn = '/home/eszti/projects/dipterv/panlex/data/smith/valid/eng_ita.tsv'
test_fn = '/home/eszti/projects/dipterv/panlex/data/smith/test/eng_ita.tsv'

eng_emb = '/mnt/permanent/Language/Multi/FB/wiki.en/wiki.en.vec'
ita_emb = '/mnt/permanent/Language/Multi/FB/wiki.it/wiki.it.vec'

limit = 100

tr_en_fn = 'train_eng.pickle'
va_en_fn = 'valid_eng.pickle'
te_en_fn = 'test_eng.pickle'

tr_it_fn = 'train_ita.pickle'
va_it_fn = 'valid_ita.pickle'
te_it_fn = 'test_ita.pickle'

In [54]:
def read_emb(emb_fn, limit):
    model = KeyedVectors.load_word2vec_format(emb_fn, binary=False, limit=limit)
    model.syn0 /= np.sqrt((model.syn0 ** 2).sum(1))[:, None]
    return model

In [4]:
def read_word_pairs_tsv(fn, id1, id2):
    wl1 = set()
    wl2 = set()
    with open(fn) as f:
        lines = f.readlines()
        data = [(line.split()[id1], line.split()[id2]) for i, line in enumerate(lines)]
    for (w1, w2) in data:
        wl1.add(w1)
        wl2.add(w2)
    return data, wl1, wl2

In [5]:
# saving format: (vocab: ordered list according to frequency, emb_dict: word - nparray dictionary)
def get_filtered_embedding(emb, wl, fn):
    to_save = []
    for w in wl:
        if w in emb:
            to_save.append([w, emb[w], emb.index2word.index(w)])
        else:
            print('not found: {}'.format(w))
    to_save.sort(key=lambda x: x[2])
    dim = 300
    vocab = [l[0] for l in to_save]
    filtered_mod = dict()
    for i, w in enumerate(vocab):
        filtered_mod[w] = emb[w]
    with open(fn, 'wb') as f:
        pickle.dump(file=f, obj=filtered_mod)
    return filtered_mod

In [55]:
m_en = read_emb(eng_emb, limit)
m_it = read_emb(ita_emb, limit)

In [53]:
len(np.linalg.norm(m_en.syn0, axis=0))
len(np.linalg.norm(m_en.syn0, axis=1))
np.where(abs((np.linalg.norm(m_en.syn0, axis=1) - 1) < 0.001))[0].shape

300

100

(0,)

In [38]:
norm_1 = np.linalg.norm(m_en.syn0, axis=1)
norm_1

array([ 2.05265021,  2.36800265,  2.0135386 ,  4.93735266,  2.30300355,
        3.1205101 ,  2.2763052 ,  1.84824586,  2.67162514,  2.64436626,
        2.63488674,  2.43568325,  2.33243322,  2.61603403,  2.66848278,
        2.78512669,  2.54240227,  2.49497628,  2.44217229,  2.76239896,
        2.20248604,  2.52166271,  2.51009583,  2.78188396,  2.88394308,
        2.74271393,  2.71097398,  2.90605712,  3.34197187,  3.22211385,
        4.03677797,  2.79201961,  2.77948022,  3.30798578,  2.93071795,
        2.8114984 ,  3.89324832,  2.08943868,  2.2465353 ,  2.85685635,
        3.03525686,  2.16017437,  2.60903144,  4.03101206,  2.3092289 ,
        5.78212929,  3.29784703,  2.55242205,  4.7982769 ,  3.58481169,
        3.34903812,  2.49354959,  2.76574612,  3.68919659,  3.66752005,
        2.83445549,  3.81883478,  2.64196205,  2.68851066,  2.71749902,
        2.91065049,  3.83681989,  2.87836981,  3.88532877,  2.77112293,
        2.66413093,  5.90777588,  2.49742579,  3.88017988,  2.68

In [25]:
len(np.linalg.norm(m_it.syn0, axis=0))
len(np.linalg.norm(m_it.syn0, axis=1))
len(np.linalg.norm(m_it.syn0, axis=1) == 1)

300

100

100

In [7]:
train_wp, tr_en, tr_it = read_word_pairs_tsv(train_fn, 0, 1)
valid_wp, va_en, va_it = read_word_pairs_tsv(valid_fn, 0, 1)
test_wp, te_en, te_it = read_word_pairs_tsv(test_fn, 0, 1)

In [16]:
# En
print('en train')
m_en_tr_fil = get_filtered_embedding(m_en, tr_en, tr_en_fn)
print('en valid')
m_en_va_fil = get_filtered_embedding(m_en, va_en, va_en_fn)
print('en test')
m_en_te_fil = get_filtered_embedding(m_en, te_en, te_en_fn)

print('it train')
m_it_tr_fil = get_filtered_embedding(m_it, tr_it, tr_it_fn)
print('it valid')
m_it_va_fil = get_filtered_embedding(m_it, va_it, va_it_fn)
print('it test')
m_it_te_fil = get_filtered_embedding(m_it, te_it, te_it_fn)

en train
not found: absolute
not found: eye
not found: refused
not found: text
not found: spain
not found: respectively
not found: lay
not found: mouth
not found: candidates
not found: scope
not found: competitions
not found: bills
not found: earth
not found: empire
not found: bass
not found: duties
not found: potential
not found: described
not found: dynamic
not found: variables
not found: meetings
not found: deeply
not found: final
not found: opinion
not found: supporters
not found: credit
not found: bond
not found: view
not found: fair
not found: mainly
not found: pairs
not found: libraries
not found: extended
not found: mixed
not found: muslim
not found: inclusion
not found: sample
not found: free
not found: ridge
not found: bars
not found: annually
not found: presented
not found: raised
not found: sector
not found: kingdom
not found: periods
not found: forget
not found: anna
not found: represented
not found: variable
not found: communist
not found: ethics
not found: length
not fou

not found: function
not found: norwegian
not found: depends
not found: last
not found: argument
not found: restrictions
not found: demonstration
not found: exploration
not found: historical
not found: rank
not found: jimmy
not found: zero
not found: hearing
not found: nuclear
not found: poetry
not found: promotion
not found: although
not found: foods
not found: village
not found: capture
not found: implement
not found: tried
not found: learned
not found: congress
not found: rare
not found: risk
not found: communication
not found: lower
not found: dictionary
not found: seats
not found: none
not found: creation
not found: teaching
not found: depth
not found: plants
not found: resident
not found: control
not found: discrimination
not found: why
not found: archaeological
not found: leader
not found: blair
not found: egypt
not found: great
not found: south
not found: piano
not found: accident
not found: considering
not found: imagine
not found: writers
not found: david
not found: factory
no

not found: ancient
not found: billion
not found: prices
not found: steam
not found: prevention
not found: exercises
not found: grey
not found: investors
not found: relatively
not found: kent
not found: singer
not found: kind
not found: seem
not found: ticket
not found: intellectual
not found: ali
not found: producers
not found: board
not found: cape
not found: accept
not found: hoped
not found: grow
not found: event
not found: stages
not found: questions
not found: pieces
not found: sarah
not found: friendly
not found: organic
not found: mixture
not found: advance
not found: seminars
not found: lessons
not found: today
not found: season
not found: mortgage
not found: roof
not found: province
not found: recorded
not found: rugby
not found: residence
not found: call
not found: tropical
not found: towards
not found: canada
not found: serious
not found: acted
not found: identify
not found: fought
not found: smart
not found: plc
not found: areas
not found: india
not found: guy
not found: as

not found: incontestable
not found: hostage
not found: giulia
not found: sediments
not found: softens
not found: reserved
not found: responsibilities
not found: montenegro
not found: fabs
not found: hands
not found: paradoxical
not found: worth
not found: moratoria
not found: played
not found: polarise
not found: madman
not found: helmsman
not found: princess
not found: sima
not found: cip
not found: altercations
not found: outlying
not found: threefold
not found: orkney
not found: externalised
not found: marburg
not found: bucket
not found: ethnological
not found: equidae
not found: pariahs
not found: unbundling
not found: benes
not found: bloodthirsty
not found: patrol
not found: brazilian
not found: quibble
not found: sari
not found: globalisation
not found: ferment
not found: bookshop
not found: hobart
not found: adopted
not found: signalling
not found: auction
not found: kabila
not found: immobility
not found: totems
not found: poisonings
not found: regionalisation
not found: sele

not found: ala
not found: condizione
not found: iniziali
not found: commenti
not found: trimestre
not found: sofferenze
not found: albero
not found: latina
not found: perseguire
not found: religiose
not found: visto
not found: mai
not found: duca
not found: vuoto
not found: descrivere
not found: momento
not found: popolo
not found: operativa
not found: quercia
not found: promessa
not found: spettacolare
not found: navi
not found: lineare
not found: rotte
not found: sportivo
not found: carriere
not found: africa
not found: interessante
not found: condiviso
not found: romantico
not found: tavole
not found: entry
not found: yorkshire
not found: pulizia
not found: sci
not found: diventando
not found: matematici
not found: calcolato
not found: eccezionale
not found: descrizione
not found: onori
not found: installati
not found: controversia
not found: prospettive
not found: tazza
not found: tecnici
not found: caro
not found: canale
not found: virtuale
not found: nucleari
not found: internazi

not found: procedura
not found: preparare
not found: comandante
not found: scelta
not found: banca
not found: scrittori
not found: secondi
not found: tropicale
not found: revisioni
not found: ordinato
not found: logica
not found: ban
not found: stessa
not found: marito
not found: intelligenti
not found: ossigeno
not found: olandesi
not found: residenza
not found: controlli
not found: appena
not found: appartamento
not found: oratori
not found: meccanica
not found: approcci
not found: energia
not found: uccelli
not found: clausola
not found: preoccupazione
not found: dichiarazione
not found: fed
not found: votare
not found: scozzese
not found: birmingham
not found: made
not found: radio
not found: deve
not found: dramma
not found: distrutte
not found: strettamente
not found: cinema
not found: brigata
not found: les
not found: semplicemente
not found: galleria
not found: vendere
not found: ristorante
not found: colin
not found: orientale
not found: solida
not found: vestiti
not found: ef

not found: morboso
not found: efficiente
not found: insostituibili
not found: barbados
not found: politico
not found: caratterizzano
not found: censurati
not found: deplorare
not found: libraio
not found: ambienti
not found: collegio
not found: affarismo
not found: globalizzazione
not found: poso
not found: inés
not found: inchinati
not found: tipica
not found: bocche
not found: tempistiche
not found: edd
not found: dipendenza
not found: vivai
not found: macchinazioni
not found: subdolamente
not found: spencer
not found: ricattato
not found: sbalordisce
not found: extremis
not found: omosessuali
not found: jaspers
not found: lord
not found: mirza
not found: restyling
not found: rappresaglie
not found: rosso
not found: gideon
not found: passatempo
not found: devastato
not found: apprezzato
not found: chetichella
not found: idilliaco
not found: esternalizzati
not found: carceri
not found: certificate
not found: regolarizzato
not found: winkler
not found: shahid
not found: vincenti
not fo

In [9]:
def load(fn):
    with open(fn, 'rb') as f:
        data = pickle.load(f)
    return data

In [10]:
def test_load(fn, message):
    print(message)
    emb = load(fn)
    print('vocab length: {}'.format(len(emb.keys())))

In [11]:
test_load(tr_en_fn, 'train en')
test_load(tr_it_fn, 'train it')
test_load(va_en_fn, 'valid en')
test_load(va_it_fn, 'valid it')
test_load(te_en_fn, 'test en')
test_load(te_it_fn, 'test it')

train en
vocab length: 42
train it
vocab length: 31
valid en
vocab length: 7
valid it
vocab length: 6
test en
vocab length: 4
test it
vocab length: 3


In [12]:
word = 'for'
m_en_tr_load = load(tr_en_fn)
# m_en_tr_load.keys()
word in tr_en

True