# Find universal embedding

In [26]:
from __future__ import print_function
from nbformat import current
import io
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
def execute_notebook(nbfile):    
    with io.open(nbfile) as f:
        nb = current.read(f, 'json')
    
    ip = get_ipython()
    
    for cell in nb.worksheets[0].cells:
        if cell.cell_type != 'code':
            continue
        ip.run_cell(cell.input)
        
execute_notebook("functions.ipynb")        
swad_idx = []

Load Eglish embeddings

In [27]:
en_swad_fn = '/home/eszti/data/panlex_swadesh/swadesh110/test/eng-000.txt'
en_embed_fn = '/mnt/permanent/Language/Multi/FB/wiki.en/wiki.en.vec'

en_swad, en_emb, en_nfi = get_embedding(en_swad_fn, swad_idx, en_embed_fn)
print(en_swad)
print(en_emb)
print(en_nfi)

[u'all', u'as', u'bark', u'abdomen', u'big', u'bird', u'bite', u'black', u'blood', u'bone', u'breast', u'burn', u'claw', u'cloud', u'cold', u'come', u'die', u'dog', u'drink', u'dry', u'ear', u'earth', u'eat', u'egg', u'eye', u'fat', u'feather', u'fire', u'fish', u'fly', u'foot', u'full', u'give', u'good', u'green', u'hair', u'hand', u'head', u'hear', u'heart', u'horn', u'i', u'kill', u'knee', u'know', u'leaf', u'lie', u'liver', u'long', u'louse', u'male', u'many', u'flesh', u'moon', u'mountain', u'mouth', u'name', u'neck', u'new', u'night', u'nose', u'not', u'adulterous', u'human', u'rain', u'red', u'path', u'root', u'round', u'sand', u'say', u'see', u'seed', u'sit', u'skin', u'sleep', u'little', u'smoke', u'stand', u'star', u'stone', u'sun', u'swim', u'tail', u'that', u'this', u'you', u'tongue', u'tooth', u'tree', u'two', u'go', u'hot', u'water', u'we', u'what', u'white', u'who', u'female', u'yellow', u'far', u'heavy', u'near', u'salt', u'short', u'snake', u'thin', u'wind', u'worm', u

Load German embeddings

In [29]:
de_swad_fn = '/home/eszti/data/panlex_swadesh/swadesh110/test/deu.txt'
de_embed_fn = '/mnt/permanent/Language/Multi/FB/wiki.de/wiki.de.vec'

de_swad, de_emb, de_nfi = get_embedding(de_swad_fn, swad_idx, de_embed_fn)

print(de_swad)
print(de_emb)
print(de_nfi)

[u'all', u'asche', u'borke', u'bauch', u'gross', u'vogel', u'beissen', u'schwarz', u'blut', u'knochen', u'brust', u'brennen', u'kralle', u'wolke', u'kalt', u'kommen', u'sterben', u'hund', u'trinken', u'trocken', u'ohr', u'erde', u'essen', u'ei', u'auge', u'dick', u'feder', u'feuer', u'fisch', u'fliegen', u'fuss', u'voll', u'geben', u'gut', u'gr\xfcn', u'haar', u'hand', u'kopf', u'h\xf6ren', u'herz', u'horn', u'ich', u'umbringen', u'knie', u'kennen', u'blatt', u'liegen', u'leber', u'lang', u'laus', u'mann', u'viel', u'fleisch', u'mond', u'berg', u'mund', u'name', u'nacken', u'neu', u'nacht', u'nase', u'nicht', u'ein', u'mensch', u'regen', u'rot', u'pfad', u'wurzel', u'rund', u'sand', u'sagen', u'sehen', u'samen', u'sitzen', u'haut', u'schlafen', u'klein', u'rauch', u'stehen', u'stern', u'stein', u'sonne', u'schwimmen', u'schwanz', u'das', u'dieses', u'du', u'zunge', u'zahn', u'baum', u'zwei', u'gehen', u'warm', u'wasser', u'wir', u'was', u'weiss', u'wer', u'frau', u'gelb', u'weit', u'sc

Check what the similar words are in English

In [30]:
_, _, sims_en = get_corr(en_emb, en_swad)
sims_en['dog']

[u'dog',
 u'bite',
 u'louse',
 u'claw',
 u'bird',
 u'snake',
 u'eat',
 u'nose',
 u'hair',
 u'ear',
 u'fat',
 u'tooth',
 u'bone',
 u'fish',
 u'foot',
 u'tail',
 u'egg',
 u'cold',
 u'skin',
 u'hot',
 u'eye',
 u'feather',
 u'tongue',
 u'bark',
 u'mouth',
 u'flesh',
 u'little',
 u'drink',
 u'kill',
 u'mountain',
 u'stand',
 u'sleep',
 u'go',
 u'night',
 u'head',
 u'neck',
 u'big',
 u'hand',
 u'good',
 u'blood',
 u'heart',
 u'horn',
 u'swim',
 u'liver',
 u'human',
 u'stone',
 u'worm',
 u'hear',
 u'moon',
 u'say',
 u'you',
 u'sit',
 u'that',
 u'fire',
 u'black',
 u'know',
 u'tree',
 u'white',
 u'what',
 u'fly',
 u'come',
 u'red',
 u'breast',
 u'sand',
 u'we',
 u'yellow',
 u'short',
 u'adulterous',
 u'smoke',
 u'dry',
 u'name',
 u'male',
 u'long',
 u'not',
 u'heavy',
 u'knee',
 u'rain',
 u'root',
 u'star',
 u'thin',
 u'path',
 u'far',
 u'who',
 u'female',
 u'abdomen',
 u'wind',
 u'leaf',
 u'many',
 u'water',
 u'salt',
 u'this',
 u'year',
 u'die',
 u'two',
 u'all',
 u'lie',
 u'cloud',
 u'sun',

Check what the similar words are in German

In [31]:
_, _, sims_de = get_corr(de_emb, de_swad)
sims_de['hund']

[u'hund',
 u'mensch',
 u'schwanz',
 u'mann',
 u'beissen',
 u'schlange',
 u'frau',
 u'nase',
 u'vogel',
 u'schlafen',
 u'ohr',
 u'bauch',
 u'kralle',
 u'fisch',
 u'fleisch',
 u'knochen',
 u'kopf',
 u'haut',
 u'trinken',
 u'mund',
 u'haar',
 u'umbringen',
 u'zunge',
 u'leber',
 u'blut',
 u'nacken',
 u'brust',
 u'mond',
 u'stein',
 u'baum',
 u'wer',
 u'laus',
 u'sagen',
 u'sterben',
 u'wolke',
 u'knie',
 u'herz',
 u'gut',
 u'hand',
 u'auge',
 u'wurm',
 u'h\xf6ren',
 u'was',
 u'fliegen',
 u'kennen',
 u'nicht',
 u'wir',
 u'viel',
 u'fuss',
 u'stern',
 u'nacht',
 u'ei',
 u'zahn',
 u'rauch',
 u'sehen',
 u'dick',
 u'ich',
 u'ein',
 u'weiss',
 u'essen',
 u'feder',
 u'du',
 u'schwer',
 u'wurzel',
 u'wasser',
 u'erde',
 u'schwimmen',
 u'sonne',
 u'borke',
 u'feuer',
 u'sitzen',
 u'pfad',
 u'wind',
 u'gehen',
 u'kalt',
 u'voll',
 u'schwarz',
 u'berg',
 u'regen',
 u'gross',
 u'brennen',
 u'klein',
 u'geben',
 u'warm',
 u'das',
 u'horn',
 u'nah',
 u'salz',
 u'name',
 u'lang',
 u'zwei',
 u'asche',
 u

Train universal embedding based on English and German

In [32]:
    W = np.ndarray(shape=(2, len(en_swad), en_emb.shape[1]), dtype=np.float32)
    W[0, :, :] = en_emb
    W[1, :, :] = de_emb
    T1, T, A = train(W, num_steps=50000)

Initialized
Loss at step 0: 386.768829
Loss at step 100: 379.394958
Loss at step 200: 372.529663
Loss at step 300: 366.101318
Loss at step 400: 360.043884
Loss at step 500: 354.297607
Loss at step 600: 348.810577
Loss at step 700: 343.538330
Loss at step 800: 338.442688
Loss at step 900: 333.491821
Loss at step 1000: 328.661011
Loss at step 1100: 323.927979
Loss at step 1200: 319.276276
Loss at step 1300: 314.692871
Loss at step 1400: 310.166809
Loss at step 1500: 305.689850
Loss at step 1600: 301.255463
Loss at step 1700: 296.859192
Loss at step 1800: 292.497009
Loss at step 1900: 288.166016
Loss at step 2000: 283.864563
Loss at step 2100: 279.591309
Loss at step 2200: 275.345062
Loss at step 2300: 271.125122
Loss at step 2400: 266.931458
Loss at step 2500: 262.763947
Loss at step 2600: 258.622375
Loss at step 2700: 254.507156
Loss at step 2800: 250.418518
Loss at step 2900: 246.356934
Loss at step 3000: 242.322845
Loss at step 3100: 238.316513
Loss at step 3200: 234.338730
Loss at st

In [33]:
corr_mx, sim_corr, sims_univ = get_corr(A, en_swad)

In [34]:
corr_mx

array([[ 1.00000024,  0.27097768,  0.14398828, ...,  0.22070432,
         0.17658763,  0.25892368],
       [ 0.27097768,  0.99999994,  0.13445655, ...,  0.11244301,
         0.15854077,  0.25677812],
       [ 0.14398828,  0.13445655,  1.00000012, ...,  0.27488026,
         0.28635505,  0.09526135],
       ..., 
       [ 0.22070432,  0.11244301,  0.27488026, ...,  0.99999994,
         0.23121968,  0.15674552],
       [ 0.17658763,  0.15854077,  0.28635505, ...,  0.23121968,
         1.00000012,  0.12600701],
       [ 0.25892368,  0.25677812,  0.09526135, ...,  0.15674552,
         0.12600701,  1.        ]], dtype=float32)

Check what the similar words are in the universal embedding

In [35]:
sims_univ['dog']

[u'dog',
 u'bite',
 u'louse',
 u'claw',
 u'bird',
 u'snake',
 u'eat',
 u'nose',
 u'hair',
 u'ear',
 u'fat',
 u'tooth',
 u'bone',
 u'fish',
 u'foot',
 u'tail',
 u'egg',
 u'cold',
 u'skin',
 u'hot',
 u'eye',
 u'feather',
 u'tongue',
 u'bark',
 u'mouth',
 u'flesh',
 u'little',
 u'drink',
 u'kill',
 u'mountain',
 u'stand',
 u'sleep',
 u'go',
 u'night',
 u'head',
 u'neck',
 u'big',
 u'hand',
 u'good',
 u'blood',
 u'horn',
 u'heart',
 u'swim',
 u'liver',
 u'human',
 u'stone',
 u'worm',
 u'hear',
 u'moon',
 u'say',
 u'you',
 u'sit',
 u'that',
 u'fire',
 u'black',
 u'know',
 u'tree',
 u'white',
 u'what',
 u'fly',
 u'come',
 u'red',
 u'breast',
 u'sand',
 u'we',
 u'yellow',
 u'short',
 u'adulterous',
 u'smoke',
 u'dry',
 u'name',
 u'male',
 u'long',
 u'not',
 u'heavy',
 u'knee',
 u'rain',
 u'root',
 u'star',
 u'thin',
 u'path',
 u'far',
 u'who',
 u'female',
 u'abdomen',
 u'wind',
 u'leaf',
 u'many',
 u'water',
 u'salt',
 u'this',
 u'year',
 u'die',
 u'two',
 u'all',
 u'lie',
 u'cloud',
 u'sun',

In [36]:

de_emb_trans = np.dot(de_emb, T[0])

dog = en_emb[0]
hund = de_emb_trans[0]

print(np.linalg.norm(dog))
print(np.linalg.norm(hund))
sim = cosine_similarity(dog.reshape(1, -1), hund.reshape(1, -1))
# print(sim)
# print(dog)
# print(A[0])

print(np.linalg.norm(dog - A[0]))
de_emb.astype(np.float32)

1.0
0.999773
0.000689033


array([[-0.06213669,  0.05129817, -0.04741741, ..., -0.02501231,
        -0.01856263,  0.06243059],
       [-0.03386743,  0.04923715, -0.03906514, ..., -0.0276785 ,
        -0.02018742,  0.06505576],
       [-0.07601882,  0.07272251, -0.04302153, ...,  0.06049242,
         0.09194499,  0.14450699],
       ..., 
       [-0.10472649,  0.04958409,  0.03859929, ..., -0.03333807,
         0.0022068 ,  0.06766976],
       [-0.00937888,  0.07365981,  0.0020786 , ...,  0.0239687 ,
         0.01208459,  0.13262165],
       [-0.09250893, -0.02600697, -0.09789319, ..., -0.05426902,
         0.03866027,  0.11533482]], dtype=float32)