In [29]:
# Import
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the
# last one (cells below rely on this to show several `len(...)` results).
InteractiveShell.ast_node_interactivity = "all"
import numpy as np
import os
from gensim.models import KeyedVectors

In [39]:
# --- Embedding files (text .vec format, read by read_emb) -------------------
eng_emb = '/mnt/permanent/Language/Multi/FB/wiki.en/wiki.en.vec'
ita_emb = '/mnt/permanent/Language/Multi/FB/wiki.it/wiki.it.vec'

# --- Word pair data ---------------------------------------------------------
# Folder that holds the '<l1>_<l2>.tsv' word pair files.
wp_folder = '/mnt/permanent/home/eszti/dipterv/panlex/data/smith/original/train'

# File names for the train / validation split (presumably output names;
# they are not referenced by the cells visible here).
train_fn, valid_fn = 'train_eng_ita.tsv', 'valid_eng_ita.tsv'

# --- Languages and column indices -------------------------------------------
eng, ita = 'eng', 'ita'
langs = [eng, ita]
# Columns of the word pair file that hold the first / second word.
idx1, idx2 = 0, 1

# --- Tunables ---------------------------------------------------------------
limit = 350000  # number of vectors read from each embedding file
tr_rat = 9 # tr_rat*10 % will be train data

In [25]:
def read_emb(emb_fn, limit):
    """Load a text-format word2vec embedding and scale each vector to unit L2 norm.

    Args:
        emb_fn: path of the embedding file (word2vec text format).
        limit: maximum number of vectors to read from the start of the file.

    Returns:
        The loaded ``KeyedVectors`` model; its vector matrix is normalised
        in place, row by row.
    """
    model = KeyedVectors.load_word2vec_format(emb_fn, binary=False, limit=limit)
    # `syn0` was renamed to `vectors` and is gone in gensim 4; fall back to
    # `syn0` only for old releases that do not expose `vectors` yet.
    vectors = model.vectors if hasattr(model, 'vectors') else model.syn0
    norms = np.sqrt((vectors ** 2).sum(1))[:, None]
    # An all-zero row has norm 0; dividing by 1 keeps it at zero instead of
    # turning it into NaN.
    norms[norms == 0] = 1
    vectors /= norms
    return model

In [20]:
def read_word_pairs_tsv(fn, id1, id2, header=True, encoding='utf-8'):
    """Read word pairs from a whitespace/tab separated text file.

    Args:
        fn: path of the file to read.
        id1: column index of the first word of each pair.
        id2: column index of the second word of each pair.
        header: if true, the first line of the file is skipped.
        encoding: text encoding of the file; fixed to UTF-8 by default so
            that accented words are read the same way on every platform.

    Returns:
        A list of ``(word1, word2)`` tuples, one per non-blank data line,
        in file order.

    Raises:
        IndexError: if a non-blank line has fewer columns than ``id1`` or
            ``id2`` requires.
    """
    data = []
    with open(fn, encoding=encoding) as f:
        if header:
            # Consume the header line (no error on an empty file).
            next(f, None)
        for line in f:
            fields = line.split()
            if not fields:
                # Blank lines (e.g. a trailing newline) carry no pair.
                continue
            data.append((fields[id1], fields[id2]))
    return data

def get_word_pairs_dict(langs, wp_folder, idx1, idx2):
    """Read one word pair file for every unordered pair of distinct languages.

    The file for a pair is ``<wp_folder>/<l1>_<l2>.tsv`` where ``l1 < l2`` in
    sorted order; it is read without skipping a header line.

    Args:
        langs: iterable of language codes.
        wp_folder: folder containing the word pair files.
        idx1: column index of the first word in each file.
        idx2: column index of the second word in each file.

    Returns:
        dict mapping the sorted ``(l1, l2)`` tuple to the list of word pairs
        read from the corresponding file.
    """
    pairs_by_langs = {}
    for first in langs:
        for second in langs:
            key = tuple(sorted((first, second)))
            # Skip a language paired with itself and pairs already loaded
            # (each pair shows up twice, once per ordering).
            if first == second or key in pairs_by_langs:
                continue
            path = os.path.join(wp_folder, '{0}_{1}.tsv'.format(*key))
            print('Reading word pair file: {0}'.format(path))
            pairs = read_word_pairs_tsv(path, idx1, idx2, False)
            pairs_by_langs[key] = pairs
            print('Number of word pairs found: {0}'.format(len(pairs)))
    return pairs_by_langs

In [35]:
def get_not_found_list(vocab, embedding):
    """Return the words of ``vocab`` that are not contained in ``embedding``.

    Membership is tested with ``in``; the result keeps the order of ``vocab``.
    """
    return [word for word in vocab if word not in embedding]

def wp_list_2_dict(wp_l):
    """Turn a list of word pairs into two translation dictionaries.

    Args:
        wp_l: iterable of ``(w1, w2)`` pairs.

    Returns:
        ``(l12, l21)`` where ``l12`` maps each first word to the list of its
        second words and ``l21`` maps each second word to the list of its
        first words, both in order of appearance.
    """
    forward = {}
    backward = {}
    for src, tgt in wp_l:
        forward.setdefault(src, []).append(tgt)
        backward.setdefault(tgt, []).append(src)
    return forward, backward

def get_two_lang_dictionaries(embeddings, word_pairs_dict):
    """Drop word pairs that lack an embedding and build both-direction dictionaries.

    Args:
        embeddings: mapping from language code to an object that supports
            ``word in obj`` (used to decide whether a word has an embedding).
        word_pairs_dict: mapping from ``(l1, l2)`` to a list of
            ``(w1, w2)`` word pairs.

    Returns:
        ``(dictionaries, updated_word_pairs)``:
        ``dictionaries[(l1, l2)]`` maps each l1 word to its l2 words and
        ``dictionaries[(l2, l1)]`` the reverse; ``updated_word_pairs[(l1, l2)]``
        is the pair list with every pair removed whose first or second word
        has no embedding.
    """
    dictionaries = dict()
    updated_word_pairs = dict()
    for ((l1, l2), wp_l) in word_pairs_dict.items():
        print('Processing {0}-{1}...'.format(l1, l2))
        # Find words without embeddings.
        # Built with comprehensions (not zip(*wp_l)) so that an empty pair
        # list yields empty vocabularies instead of an unpacking error.
        l1_vocab = list({w1 for (w1, _) in wp_l})
        l2_vocab = list({w2 for (_, w2) in wp_l})
        print('Words in {0}: {1}'.format(l1, len(l1_vocab)))
        print('Words in {0}: {1}'.format(l2, len(l2_vocab)))
        nf_l1 = get_not_found_list(vocab=l1_vocab, embedding=embeddings[l1])
        print('Words not found in embedding {0}: {1}'.format(l1, len(nf_l1)))
        print(nf_l1)
        nf_l2 = get_not_found_list(vocab=l2_vocab, embedding=embeddings[l2])
        print('Words not found in embedding {0}: {1}'.format(l2, len(nf_l2)))
        print(nf_l2)
        # Update word list
        print('Updating word pair list {0}-{1}'.format(l1, l2))
        # Sets give constant-time membership tests in the filter below.
        missing_l1 = set(nf_l1)
        missing_l2 = set(nf_l2)
        updated_wp_l = [(w1, w2) for (w1, w2) in wp_l
                        if w1 not in missing_l1 and w2 not in missing_l2]
        print('Word pairs list legth: {0} ->  {1} '.format(len(wp_l), len(updated_wp_l)))
        updated_word_pairs[(l1, l2)] = updated_wp_l
        # Create dictionary
        print('Creating dictionary for: {0}-{1}'.format(l1, l2))
        l12, l21 = wp_list_2_dict(updated_wp_l)
        dictionaries[(l1, l2)] = l12
        dictionaries[(l2, l1)] = l21
        print('# word in: {0}-{1}:\t{2}'.format(l1.upper(), l2, len(l12)))
        print('# word in: {0}-{1}:\t{2}'.format(l2.upper(), l1, len(l21)))
    return dictionaries, updated_word_pairs

In [30]:
# Load at most `limit` vectors from each embedding file; read_emb also
# scales every vector to unit length.
m_en = read_emb(eng_emb, limit)
m_it = read_emb(ita_emb, limit)

In [31]:
# Read the word pair file of every language pair in `langs`
# (keys are sorted (l1, l2) tuples).
word_pairs_dict = get_word_pairs_dict(langs, wp_folder, idx1, idx2)

Reading word pair file: /mnt/permanent/home/eszti/dipterv/panlex/data/smith/original/train/eng_ita.tsv
Number of word pairs found: 5000


In [36]:
# Language code -> loaded embedding model.
embeddings = {eng: m_en, ita: m_it}

# Filter out pairs without embeddings and build the eng->ita / ita->eng dictionaries.
dictionaries, updated_word_pairs = get_two_lang_dictionaries(embeddings, word_pairs_dict)

Processing eng-ita...
Words in eng: 3442
Words in ita: 4549
Words not found in embedding eng: 0
[]
Words not found in embedding ita: 1
['prelaurea']
Updating word pair list eng-ita
Word pairs list legth: 5000 ->  4999 
Creating dictionary for: eng-ita
# word in: ENG-ita:	3442
# word in: ITA-eng:	4548


In [37]:
# Inspect the eng -> ita dictionary (word -> list of translations).
# NOTE(review): this displays the whole dictionary; consider showing a slice.
dictionaries[(eng, ita)]

{'parties': ['parti', 'partiti'],
 'exposed': ['esposte', 'esposti'],
 'trying': ['cercando'],
 'drinks': ['bevande'],
 'joined': ['aderito'],
 'fail': ['fail'],
 'brother': ['fratello'],
 'component': ['componente'],
 'understanding': ['comprensione'],
 'study': ['studio'],
 'dramatic': ['drammatica', 'drammatiche', 'drammatici', 'drammatico'],
 'ships': ['navi'],
 'committee': ['comitato', 'commissione'],
 'classical': ['classica', 'classico'],
 'permanent': ['permanente', 'permanenti'],
 'imperial': ['imperiale', 'imperiali'],
 'stones': ['pietre'],
 'rocks': ['rocce', 'scogli'],
 'destroyed': ['distrutte', 'distrutti', 'distrutto'],
 'chicago': ['chicago'],
 'honest': ['onesta', 'onesti', 'onesto'],
 'fans': ['tifosi'],
 'investors': ['investitori'],
 'aware': ['consapevole', 'consapevoli'],
 'use': ['uso', 'utilizzo'],
 'peak': ['picco'],
 'contacts': ['contatti'],
 'limit': ['limitare', 'limite'],
 'creating': ['creare'],
 'album': ['album'],
 'worst': ['peggio', 'peggiore', 'peg

In [54]:
def split(dictionaries, lang_pair=None, train_tenths=None):
    """Split a translation dictionary into train and validation word pairs.

    Source words are walked in dictionary order; out of every 10 consecutive
    source words the first ``train_tenths`` go to the train list and the rest
    to the validation list. All translations of a source word stay together,
    so no source word appears on both sides.

    Args:
        dictionaries: mapping from ``(l1, l2)`` to a dict of
            ``word -> list of translations``.
        lang_pair: key of the dictionary to split; defaults to the
            notebook-level ``(eng, ita)``.
        train_tenths: how many of every 10 source words go to train;
            defaults to the notebook-level ``tr_rat``.

    Returns:
        ``(tr, val)``: two lists of ``(source_word, translation)`` tuples.
    """
    # Fall back to the notebook globals only when the caller gives no value,
    # so existing `split(dictionaries)` calls behave as before.
    if lang_pair is None:
        lang_pair = (eng, ita)
    if train_tenths is None:
        train_tenths = tr_rat
    tr = []
    val = []
    for i, (k, vals) in enumerate(dictionaries[lang_pair].items()):
        wp_s = [(k, v) for v in vals]
        if i % 10 < train_tenths:
            tr.extend(wp_s)
        else:
            val.extend(wp_s)
    return tr, val

In [55]:
# Split the eng -> ita dictionary into train and validation word pairs.
train_wps, valid_wps = split(dictionaries)

In [56]:
# Number of word pairs in the train and in the validation list.
len(train_wps)
len(valid_wps)

4498

501

In [57]:
# Inspect the train pairs.
# NOTE(review): this displays the full list; consider showing a slice.
train_wps

[('parties', 'parti'),
 ('parties', 'partiti'),
 ('exposed', 'esposte'),
 ('exposed', 'esposti'),
 ('trying', 'cercando'),
 ('drinks', 'bevande'),
 ('joined', 'aderito'),
 ('fail', 'fail'),
 ('brother', 'fratello'),
 ('component', 'componente'),
 ('understanding', 'comprensione'),
 ('dramatic', 'drammatica'),
 ('dramatic', 'drammatiche'),
 ('dramatic', 'drammatici'),
 ('dramatic', 'drammatico'),
 ('ships', 'navi'),
 ('committee', 'comitato'),
 ('committee', 'commissione'),
 ('classical', 'classica'),
 ('classical', 'classico'),
 ('permanent', 'permanente'),
 ('permanent', 'permanenti'),
 ('imperial', 'imperiale'),
 ('imperial', 'imperiali'),
 ('stones', 'pietre'),
 ('rocks', 'rocce'),
 ('rocks', 'scogli'),
 ('destroyed', 'distrutte'),
 ('destroyed', 'distrutti'),
 ('destroyed', 'distrutto'),
 ('honest', 'onesta'),
 ('honest', 'onesti'),
 ('honest', 'onesto'),
 ('fans', 'tifosi'),
 ('investors', 'investitori'),
 ('aware', 'consapevole'),
 ('aware', 'consapevoli'),
 ('use', 'uso'),
 ('us

In [50]:
def get_wlists(wps):
    """Collect the distinct first and second words of a sequence of pairs.

    Args:
        wps: iterable of ``(w1, w2)`` pairs.

    Returns:
        ``(wl1, wl2)``: the set of all first words and the set of all
        second words.
    """
    # Materialise once so the input is consumed in a single pass.
    pairs = [(first, second) for (first, second) in wps]
    return {first for first, _ in pairs}, {second for _, second in pairs}

In [59]:
# Distinct first-language / second-language words on each side of the split.
tr_en, tr_it = get_wlists(train_wps)
va_en, va_it = get_wlists(valid_wps)

# Vocabulary sizes: train (en, it), then validation (en, it).
len(tr_en)
len(tr_it)
len(va_en)
len(va_it)

3098

4129

344

499

In [61]:
# Words that occur on both the train and the validation side, per language.
overlap_en = set(tr_en).intersection(va_en)
overlap_it = set(tr_it).intersection(va_it)
len(overlap_en)
len(overlap_it)

0

80

In [64]:
def wp_list_2_dict(wp_l):
    """Build forward and backward translation dictionaries from word pairs.

    NOTE(review): this re-defines the `wp_list_2_dict` function declared in an
    earlier cell with the same behaviour; one of the two can be removed.

    Args:
        wp_l: iterable of ``(w1, w2)`` pairs.

    Returns:
        ``(l12, l21)``: ``l12`` maps each ``w1`` to the list of its ``w2``
        words, ``l21`` maps each ``w2`` to the list of its ``w1`` words.
    """
    l12, l21 = {}, {}
    for (w1, w2) in wp_l:
        # Same update rule in both directions: append to an existing entry,
        # otherwise start a new single-element list.
        for table, key, value in ((l12, w1, w2), (l21, w2, w1)):
            if key in table:
                table[key].append(value)
            else:
                table[key] = [value]
    return l12, l21

In [65]:
# Forward and backward translation dictionaries for the train and the
# validation pairs.
tr_en_it, tr_it_en = wp_list_2_dict(train_wps)
va_en_it, va_it_en = wp_list_2_dict(valid_wps)

In [69]:
# For every word in `overlap_it`, print its translations on the train and on
# the validation side; `fatal` counts the words whose two translation lists
# are exactly equal.
fatal = 0
for w in overlap_it:
    print(w)
    w_tr, w_va = tr_it_en[w], va_it_en[w]
    fatal += 1 if w_tr == w_va else 0
    print('train: {}'.format(w_tr))
    print('test: {}'.format(w_va))

print('fatal: {}'.format(fatal))

vendita
train: ['sale', 'sales']
test: ['selling']
ogni
train: ['every', 'any']
test: ['each']
voto
train: ['voting']
test: ['vote']
contatti
train: ['contacts']
test: ['contact']
quota
train: ['proportion']
test: ['share']
vendere
train: ['sell']
test: ['selling']
porta
train: ['brings', 'door']
test: ['leads']
migliore
train: ['better']
test: ['best']
morte
train: ['kill']
test: ['death']
membri
train: ['members']
test: ['member']
animali
train: ['animals']
test: ['animal']
ferroviario
train: ['railway']
test: ['rail']
dettagli
train: ['detail']
test: ['details']
ramo
train: ['arm']
test: ['branch']
maggiore
train: ['greater']
test: ['increased']
aula
train: ['chamber']
test: ['house']
ascolto
train: ['listen']
test: ['listening']
importo
train: ['amount']
test: ['sum']
assemblea
train: ['assembly']
test: ['house']
carriera
train: ['career']
test: ['careers']
ultimi
train: ['recent']
test: ['last']
discussione
train: ['discussion']
test: ['debate']
promozione
train: ['promotion']
tes