In [1]:
# Import
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import numpy as np
from gensim.models import KeyedVectors
import matplotlib.pyplot as plt

In [2]:
# Embedding files, read in word2vec text format by read_emb (binary=False).
eng_emb = '/mnt/permanent/Language/Multi/FB/wiki.en/wiki.en.vec'
ita_emb = '/mnt/permanent/Language/Multi/FB/wiki.it/wiki.it.vec'

# Forwarded to the gensim loader as `limit`; None means no cap on the number of vectors read.
limit = None

# Vocabulary cut-offs at which coverage is measured, in steps of 50 000:
# 50k .. 750k for the train dictionary, 50k .. 2M for the test dictionary.
limits_tr = list(range(50000, 750001, 50000))
limits_te = list(range(50000, 2000001, 50000))

# English-Italian dictionaries (train / test split).
train_fn = '/home/eszti/projects/dipterv/panlex/data/smith/train/eng_ita.tsv'
test_fn = '/home/eszti/projects/dipterv/panlex/data/smith/test/eng_ita.tsv'

# Width and height (inches) shared by every figure in the notebook.
figsize_x, figsize_y = 8, 4

In [3]:
def read_emb(emb_fn, limit):
    """Load word vectors stored in word2vec *text* format.

    :param emb_fn: path of the embedding file
    :param limit: forwarded unchanged as ``limit`` to the gensim loader
    :return: whatever ``KeyedVectors.load_word2vec_format`` returns
    """
    return KeyedVectors.load_word2vec_format(emb_fn, binary=False, limit=limit)

In [4]:
def read_word_pairs_tsv(fn, id1, id2, header=True):
    """Read word pairs from a whitespace-separated dictionary file.

    Each non-blank line is split on whitespace and the fields at positions
    ``id1`` and ``id2`` form one pair.

    :param fn: path of the dictionary file
    :param id1: column index of the first word of the pair
    :param id2: column index of the second word of the pair
    :param header: if true, the first line of the file is skipped
    :return: ``(data, wl1, wl2)`` where ``data`` is the list of
        ``(word1, word2)`` tuples in file order, ``wl1`` is the set of all
        first words and ``wl2`` the set of all second words
    :raises IndexError: if a non-blank line has too few fields for
        ``id1`` / ``id2``
    """
    data = []
    # Decode explicitly as UTF-8 instead of relying on the platform default:
    # the words are later matched against embeddings that gensim decodes as
    # UTF-8 by default, so a different locale would break accented words.
    with open(fn, encoding='utf-8') as f:
        if header:
            next(f, None)  # drop the header row; harmless on an empty file
        for line in f:
            fields = line.split()
            if not fields:
                # Blank (e.g. trailing) lines carry no pair; skipping them
                # avoids an IndexError.
                continue
            data.append((fields[id1], fields[id2]))
    wl1 = {w1 for w1, _ in data}
    wl2 = {w2 for _, w2 in data}
    return data, wl1, wl2

In [5]:
def check_emb_coverage(emb, wl, limits):
    """For each vocabulary cut-off, list the words of ``wl`` it covers.

    A word is covered by a cut-off ``l`` when it is in ``emb`` and its 0-based
    position in ``emb.index2word`` is strictly below ``l``, i.e. it is among
    the first ``l`` entries of the vocabulary.

    :param emb: embedding object supporting ``word in emb`` and exposing the
        ordered vocabulary as ``emb.index2word``
    :param wl: iterable of words to look up
    :param limits: iterable of vocabulary cut-offs (any order)
    :return: list parallel to ``limits``; element ``i`` is the list of words
        from ``wl`` covered by ``limits[i]``, in the iteration order of ``wl``
    """
    found = [[] for _ in limits]
    for w in wl:
        if w not in emb:
            # Out-of-vocabulary words are covered by no cut-off.
            continue
        # Note: list.index is a linear scan of the vocabulary.
        idx = emb.index2word.index(w)
        for bucket, l in zip(found, limits):
            # Strict comparison: a cut-off of l keeps indices 0 .. l-1, so the
            # word at index l itself is NOT covered.
            if idx < l:
                bucket.append(w)
    return found

In [6]:
def check_wp_coverage(wp_l, found1, found2, limits):
    """For each cut-off, list the word pairs whose both sides are covered.

    :param wp_l: iterable of ``(word1, word2)`` pairs
    :param found1: per-limit collections of covered first-language words
        (indexable in parallel with ``limits``)
    :param found2: per-limit collections of covered second-language words
        (indexable in parallel with ``limits``)
    :param limits: the cut-offs; only their count is used here
    :return: list parallel to ``limits``; element ``i`` holds the pairs of
        ``wp_l`` (in their original order) with ``word1`` in ``found1[i]``
        and ``word2`` in ``found2[i]``
    """
    pairs = list(wp_l)  # materialise once so the pairs can be scanned per limit
    found = []
    for i in range(len(limits)):
        # Sets give constant-time membership tests; testing against the raw
        # lists made every pair lookup a linear scan.
        covered1 = set(found1[i])
        covered2 = set(found2[i])
        found.append([(w1, w2) for (w1, w2) in pairs
                      if w1 in covered1 and w2 in covered2])
    return found

In [7]:
def plot(x, y, title, xlab, ylab):
    """Show ``y`` against ``x`` as round markers in a new figure.

    The figure size is taken from the notebook-level ``figsize_x`` and
    ``figsize_y`` settings.

    :param x: x coordinates of the points
    :param y: y coordinates of the points
    :param title: figure title
    :param xlab: label of the x axis
    :param ylab: label of the y axis
    """
    plt.figure(figsize=(figsize_x, figsize_y))
    plt.title(title)
    plt.xlabel(xlab)
    plt.ylabel(ylab)
    plt.grid()
    plt.plot(x, y, 'o')
    # No plt.legend() here: the single series carries no label, so a legend
    # call would only emit matplotlib's "no artists with labels" warning.
    plt.show()

In [None]:
# Load the English embedding first, then the Italian one, both with the configured limit.
m_en, m_it = [read_emb(path, limit) for path in (eng_emb, ita_emb)]

# Vocabulary sizes; with ast_node_interactivity = "all" each bare expression is displayed.
len(m_en.index2word)
len(m_it.index2word)

In [None]:
# Column 0 is read as the English word, column 1 as the Italian word.
train_wp, tr_en, tr_it = read_word_pairs_tsv(fn=train_fn, id1=0, id2=1)
test_wp, te_en, te_it = read_word_pairs_tsv(fn=test_fn, id1=0, id2=1)

# Train split: number of pairs, then distinct English / Italian words.
len(train_wp)
len(tr_en)
len(tr_it)

# Test split: number of pairs, then distinct English / Italian words.
len(test_wp)
len(te_en)
len(te_it)

In [None]:
# Coverage of the train vocabularies at every train cut-off.
found_tr_en = check_emb_coverage(emb=m_en, wl=tr_en, limits=limits_tr)
found_tr_it = check_emb_coverage(emb=m_it, wl=tr_it, limits=limits_tr)

# Number of covered words per cut-off (kept for the summary figures later on).
stat_tr_en = [len(covered) for covered in found_tr_en]
stat_tr_it = [len(covered) for covered in found_tr_it]

# Same report for both languages: total, per-limit counts, the words missing
# even at the largest cut-off, and a plot of the counts.
for heading_fmt, caption, words, covered_per_limit, counts in (
        ('train en : {}', 'train en', tr_en, found_tr_en, stat_tr_en),
        ('train it : {}', 'train it', tr_it, found_tr_it, stat_tr_it)):
    print(heading_fmt.format(len(words)))
    for cutoff, count in zip(limits_tr, counts):
        print('limit - {0} : {1}'.format(cutoff, count))
    print('not found: {}'.format(set(words) - set(covered_per_limit[-1])))
    plot(limits_tr, counts, caption, 'limits', 'found')

In [None]:
# Sanity check that a word listed as "not found" is really absent from the
# Italian embedding (presumably taken from the "not found" output above;
# the expression should display False).
'prelaurea' in m_it

In [None]:
# Coverage of the test vocabularies at every test cut-off.
found_te_en = check_emb_coverage(emb=m_en, wl=te_en, limits=limits_te)
found_te_it = check_emb_coverage(emb=m_it, wl=te_it, limits=limits_te)

# Number of covered words per cut-off (kept for the summary figures later on).
stat_te_en = [len(covered) for covered in found_te_en]
stat_te_it = [len(covered) for covered in found_te_it]

# Same report for both languages: total, per-limit counts, the words missing
# even at the largest cut-off, and a plot of the counts.
for heading_fmt, caption, words, covered_per_limit, counts in (
        ('test en : {}', 'test en', te_en, found_te_en, stat_te_en),
        ('test it : {}', 'test it', te_it, found_te_it, stat_te_it)):
    print(heading_fmt.format(len(words)))
    for cutoff, count in zip(limits_te, counts):
        print('limit - {0} : {1}'.format(cutoff, count))
    print('not found: {}'.format(set(words) - set(covered_per_limit[-1])))
    plot(limits_te, counts, caption, 'limits', 'found')

In [None]:
# Sanity check that words listed as "not found" are really absent from the
# Italian embedding (presumably taken from the "not found" output above;
# each expression should display False).
'ridimensioni' in m_it
'kostunica' in m_it
'oligopolistica' in m_it

In [None]:
# A pair is covered at a cut-off when both of its words are covered there.
found_tr_wp = check_wp_coverage(train_wp, found_tr_en, found_tr_it, limits_tr)
found_te_wp = check_wp_coverage(test_wp, found_te_en, found_te_it, limits_te)

# Number of covered pairs per cut-off (kept for the summary figures later on).
stat_tr_wp = [len(covered) for covered in found_tr_wp]
stat_te_wp = [len(covered) for covered in found_te_wp]

# Same report for both splits: total, per-limit counts, the pairs missing
# even at the largest cut-off, and a plot of the counts.
for heading_fmt, caption, word_pairs, cutoffs, covered_per_limit, counts in (
        ('train : {}', 'train word pairs', train_wp, limits_tr, found_tr_wp, stat_tr_wp),
        ('test : {}', 'test word pairs', test_wp, limits_te, found_te_wp, stat_te_wp)):
    print(heading_fmt.format(len(word_pairs)))
    for cutoff, count in zip(cutoffs, counts):
        print('limit - {0} : {1}'.format(cutoff, count))
    print('not found: {}'.format(set(word_pairs) - set(covered_per_limit[-1])))
    plot(cutoffs, counts, caption, 'limits', 'found')

In [None]:
def print_all():
    """Show three summary figures built from the notebook-level statistics.

    1. 'train'      - covered English (red) and Italian (blue) train words
    2. 'test'       - covered English (red) and Italian (blue) test words
    3. 'word pairs' - covered train (red) and test (blue) word pairs

    Each series is plotted against its cut-offs, and its legend entry carries
    the total number of items in the corresponding word list or pair list.
    """
    def draw(heading, series):
        # One figure: shared axis labels, one labelled marker series per entry.
        plt.figure(figsize=(figsize_x, figsize_y))
        plt.title(heading)
        plt.xlabel('limit')
        plt.ylabel('found')
        plt.grid()
        for xs, ys, marker, caption in series:
            plt.plot(xs, ys, marker, label=caption)
        plt.legend()
        plt.show()

    draw('train', [
        (limits_tr, stat_tr_en, 'ro', 'en: {}'.format(len(tr_en))),
        (limits_tr, stat_tr_it, 'bo', 'it: {}'.format(len(tr_it))),
    ])
    draw('test', [
        (limits_te, stat_te_en, 'ro', 'en: {}'.format(len(te_en))),
        (limits_te, stat_te_it, 'bo', 'it: {}'.format(len(te_it))),
    ])
    draw('word pairs', [
        (limits_tr, stat_tr_wp, 'ro', 'train: {}'.format(len(train_wp))),
        (limits_te, stat_te_wp, 'bo', 'test: {}'.format(len(test_wp))),
    ])

In [None]:
# Render the three summary figures; relies on the stat_* lists computed in the coverage cells above.
print_all()