# Translate

In [2]:
import json
from __future__ import print_function
from nbformat import current
import logging
import io, os, time, sys
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
def execute_notebook(nbfile):    
    with io.open(nbfile) as f:
        nb = current.read(f, 'json')
    
    ip = get_ipython()
    
    for cell in nb.worksheets[0].cells:
        if cell.cell_type != 'code':
            continue
        ip.run_cell(cell.input)
        
execute_notebook("functions.ipynb")   

In [3]:
num = 110
silcodes_fn = '/home/eszti/projects/dipterv/univ_embedding/res/swad_fb_{}.json'.format(num)

with open(silcodes_fn) as f:
    silcodes = json.load(f)
    
sil2fbcodes_fn = '/home/eszti/projects/dipterv/univ_embedding/res/sil2fbcodes.json'
with open(sil2fbcodes_fn) as f:
    sil2fb = json.load(f)

swad_idx = []
en_swad_fn = '/home/eszti/data/panlex_swadesh/swadesh{}/eng-000.txt'.format(num)
en_embed_fn = '/mnt/permanent/Language/Multi/FB/wiki.en/wiki.en.vec'
en_swad, en_emb, en_nfi = get_embedding(en_swad_fn, swad_idx, en_embed_fn)

Not found list len: 0
[u'all', u'ash', u'bark', u'abdomen', u'big', u'bird', u'bite', u'black', u'blood', u'bone', u'breast', u'burn', u'claw', u'cloud', u'cold', u'come', u'dead', u'dog', u'drink', u'dry', u'ear', u'earth', u'eat', u'egg', u'eye', u'fat', u'feather', u'fire', u'fish', u'fly', u'foot', u'full', u'give', u'good', u'green', u'hair', u'hand', u'head', u'hear', u'heart', u'horn', u'i', u'kill', u'knee', u'know', u'leaf', u'lie', u'liver', u'long', u'louse', u'male', u'many', u'flesh', u'moon', u'hill', u'mouth', u'name', u'neck', u'new', u'night', u'nose', u'not', u'adulterous', u'human', u'rain', u'red', u'path', u'root', u'round', u'sand', u'say', u'see', u'seed', u'sit', u'hide', u'sleep', u'little', u'smoke', u'stand', u'star', u'rock', u'sun', u'swim', u'tail', u'that', u'this', u'thou', u'tongue', u'tooth', u'tree', u'grind', u'go', u'hot', u'water', u'we', u'what', u'white', u'who', u'female', u'yellow', u'distant', u'dull', u'close', u'salt', u'short', u'snake', u'



In [14]:
main_folder = '/home/eszti/data/embeddings/fb_trans/'
time_str = time.strftime("%H%M")
date_str = time.strftime("%Y%m%d")
trans_dir = os.path.join(main_folder, 'trans', '{0}_{1}'.format(date_str, time_str))
embed_dir = os.path.join(main_folder, 'embedding', '{0}_{1}'.format(date_str, time_str))

os.makedirs(trans_dir)
os.makedirs(embed_dir)

logging.info('making directory for translation matrices: {}'.format(trans_dir))
logging.info('making directory for embeddings: {}'.format(embed_dir))

for sil in silcodes:   
    if sil == 'eng':
        continue
    logging.info('Translating {} language...'.format(sil))
    swad_fn = '/home/eszti/data/panlex_swadesh/swadesh{0}/{1}-000.txt'.format(num, sil)
    embed_fn = '/mnt/permanent/Language/Multi/FB/wiki.{0}/wiki.{0}.vec'.format(sil2fb[sil])
    
    print('swad file: {}'.format(swad_fn))
    print('embedding file: {}'.format(embed_fn))
    logging.info('swadesh file: {}'.format(swad_fn))
    logging.info('embedding file: {}'.format(embed_fn))
    
    swad, emb, nfi = get_embedding(swad_fn, swad_idx, embed_fn)
    
    missing_words = [w for (i, w) in enumerate(en_swad) if i in nfi]
    logging.info('Missing words: {}'.format(missing_words))
    
    print("EMBED:")
    print(emb)
    
    # Filtered English Swadesh
    en_swad_fil = [w for (i, w) in enumerate(en_swad) if i not in nfi]
    en_emb_fil = np.delete(en_emb, nfi, 0)

    W = np.ndarray(shape=(2, len(swad), emb.shape[1]), dtype=np.float32)
    W[0, :, :] = en_emb_fil
    W[1, :, :] = emb
    T1, T, A = train(W, num_steps=50000)
    
    # Save translation matrix
    trans_fn = os.path.join(trans_dir, 'eng_{}'.format(sil))
    with open(trans_fn, 'w') as f:
        np.save(f, T[0])
    
    # Calculate missing embeddings
    en_emb_mis = np.take(en_emb, nfi, 0)
    emb_mis = np.dot(en_emb_mis, T1[0])
    
    # Modify embedding
    idx_before = nfi - range(len(nfi))
    mod_embed = np.insert(emb, idx_before, emb_mis)
    
    # Save modified embedding
    mod_embed_fn = os.path.join(embed_dir, 'eng_{}'.format(sil))
    with open(mod_embed_fn, 'w') as f:
        np.save(f, mod_embed)
        
    break
    

swad file: /home/eszti/data/panlex_swadesh/swadesh110/tat-000.txt
embedding file: /mnt/permanent/Language/Multi/FB/wiki.tt/wiki.tt.vec
Not found list len: 6
[u'\u0431\u0430\u0440\u044b', u'\u043a\u04e9\u043b', u'\u043a\u0430\u0431\u044b\u043a', u'\u043a\u043e\u0440\u0441\u0430\u043a', u'\u0437\u0443\u0440', u'\u043a\u043e\u0448', u'\u0442\u0435\u0448\u043b\u04d9\u0440\u0433\u04d9', u'\u043a\u0430\u0440\u0430', u'kan', u'seyak', u'\u0438\u043c\u0447\u04d9\u043a', u'\u0431\u043e\u043b\u044b\u0442', u'\u0441\u0430\u043b\u043a\u044b\u043d', u'kil', u'il', u'et', u'ec', u'\u043a\u043e\u0440\u044b', u'kolak', u'\u0442\u0443\u0444\u0440\u0430\u043a', u'\u0430\u0448\u0430\u0440\u0433\u0430', u'\u0439\u043e\u043c\u044b\u0440\u043a\u0430', u'kiz', u'\u043a\u0430\u0443\u0440\u044b\u0439', u'ut', u'balok', u'\u043e\u0447\u0430\u0440\u0433\u0430', u'\u0430\u044f\u043a', u'tulo', u'\u0431\u0438\u0440\u0435\u0440\u0433\u04d9', u'\u044f\u0445\u0448\u044b', u'\u044f\u0448\u0435\u043b', u'\u043a\u044b\u

ValueError: shapes (24,) and (300,) not aligned: 24 (dim 0) != 300 (dim 0)

In [37]:
en_emb_mis = np.take(en_emb, nfi, 0)
print(en_emb.shape)
print(emb.shape)
print(en_emb_mis.shape)
print(emb_mis.shape)
print(T[0].shape)

(110, 300)
(86, 300)
(24, 300)
(24,)
(300, 300)


In [44]:
with open(mod_embed_fn) as f:
    a = np.load(f)
mod_embed.shape

trans_fn = os.path.join(trans_dir, 'eng_{}'.format(sil))
with open(trans_fn, 'w') as f:
    np.save(f, T[0])

with open(trans_fn) as f:
    b = np.load(f)

In [50]:
print(a[7])
print(emb[6])

[-0.00102911 -0.01292466 -0.02002739 -0.07683681 -0.01574634  0.12186155
 -0.0443708  -0.09487347  0.0615467  -0.02135201  0.08542475  0.06812111
  0.06824581  0.00152703 -0.00650354 -0.04498217  0.05246906 -0.05029584
 -0.0375698  -0.1642462   0.11949519 -0.00243541 -0.03330092 -0.00301179
  0.04701243  0.08603003  0.09571903 -0.10634333  0.13593654 -0.05885337
  0.10986245 -0.02022966 -0.02585813  0.04824732  0.05251316 -0.07307283
 -0.04734549 -0.07056655 -0.12404237 -0.10819566  0.02274202 -0.017001
  0.01467388 -0.0551046  -0.0472588  -0.03024867 -0.02453047 -0.119883
  0.02405598 -0.0824379   0.01190526 -0.01121893  0.03161435  0.01167349
  0.00277546  0.03274126 -0.13349566 -0.01576459  0.05750442  0.01746485
  0.02760097 -0.00511384  0.02710366  0.04140221 -0.0013046  -0.00644241
  0.05162653  0.02720708  0.00279355  0.04033613  0.01744964 -0.10749914
 -0.00332036 -0.06976509 -0.05806103  0.0842507   0.03002208  0.05475178
  0.04726032 -0.03520191 -0.01965784 -0.11635474  0.109