In [198]:
import json
import pandas as pd
import os

In [199]:
# JSON file with the language-code mapping. Later cells index it with the
# three-letter codes from `langs` to get the code used in the embedding
# file names.
sil2fb_fn = '/home/eszti/projects/dipterv/notebooks/panlex/data/sil2fb.json'

with open(sil2fb_fn) as mapping_file:
    sil2fb = json.load(mapping_file)

In [200]:
# Folder containing the pairwise PanLex dictionaries (`<lang1>_<lang2>.tsv`)
# and folder containing the word-embedding files, in that order.
pan_fold, emb_fold = (
    '/home/eszti/projects/dipterv/notebooks/panlex/data',
    '/mnt/permanent/Language/Multi/FB',
)

# Three-letter codes of the languages to process.
langs = ['eng', 'hun', 'deu']

Read all panlex dictionaries

In [201]:
print('will process these langs: {0}'.format(langs))

# Enumerate every unordered pair of distinct languages exactly once.
# Each pair is stored in alphabetical order because that is how the
# dictionary files are named.
lang_pairs = []
for first in langs:
    for second in langs:
        candidate = tuple(sorted([first, second]))
        if first != second and candidate not in lang_pairs:
            lang_pairs.append(candidate)

# (lang_a, lang_b) -> DataFrame holding the dictionary for that pair.
df_dicts = dict()
for lang_pair in lang_pairs:
    pan_fn = os.path.join(pan_fold, '{0}_{1}.tsv'.format(lang_pair[0], lang_pair[1]))
    print('Reading dictionary from:', pan_fn)
    # The files have no header row. The two word columns are named after
    # the language codes so later cells can address them by language.
    # NOTE(review): read_csv's default NA parsing turns entries such as
    # 'null' or 'NA' into NaN - confirm whether that matters for this data.
    df_dicts[lang_pair] = pd.read_csv(
        pan_fn,
        sep='\t',
        header=None,
        names=['lang 1', "lang 2", lang_pair[0], lang_pair[1], 'score'],
    )

will process these langs: ['eng', 'hun', 'deu']
Reading dictionary from: /home/eszti/projects/dipterv/notebooks/panlex/data/eng_hun.tsv
Reading dictionary from: /home/eszti/projects/dipterv/notebooks/panlex/data/deu_eng.tsv
Reading dictionary from: /home/eszti/projects/dipterv/notebooks/panlex/data/deu_hun.tsv


Create tables for each language

In [202]:
# Build one single-column table per language holding every distinct word
# that occurs for that language in any of the loaded dictionaries.

# language code -> list of single-column frames, one per dictionary
# in which the language takes part.
word_columns = dict()
for lang_pair, df in df_dicts.items():
    for lang in lang_pair:
        word_columns.setdefault(lang, []).append(df[[lang]])

# De-duplicate unconditionally: a language that occurs in only one
# dictionary must not keep repeated words, otherwise the word counts are
# inflated and the later merge on the word column multiplies rows.
# The index is reset so that row labels are unique after concatenation.
df_langs = dict()
for lang, columns in word_columns.items():
    df_langs[lang] = (
        pd.concat(columns, ignore_index=True)
        .drop_duplicates()
        .reset_index(drop=True)
    )

Add found column

In [203]:
# Give every per-language table a boolean `found` column, initialised to
# False; the embedding search sets it to True for the words it matches.
for lang, word_table in list(df_langs.items()):
    df_langs[lang] = word_table.assign(found=False)

Search for embeddings

In [204]:
# Mark the dictionary words that appear among the first `lookfor` entries of
# each language's embedding file.
lookfor = 10  # number of embedding entries inspected per language
for sil in langs:
    fb = sil2fb[sil]
    emb_fn = os.path.join(emb_fold, 'wiki.{0}/wiki.{0}.vec'.format(fb))

    # Collect the tokens first and update the table once afterwards,
    # instead of scanning the whole word column for every embedding line.
    emb_words = set()
    # Explicit encoding: the files contain non-ASCII words, so decoding
    # must not depend on the locale default.
    with open(emb_fn, encoding='utf-8') as f:
        for i, line in enumerate(f):
            if i == 0:
                # The first line is skipped (presumably the header of the
                # .vec file - confirm).
                continue
            if i > lookfor:
                break
            # The token is the first space-separated field of the line.
            # NOTE(review): tokens are lower-cased but dictionary entries are
            # compared as-is, so capitalised dictionary words (e.g.
            # 'Romance') can never match - confirm this is intended.
            emb_words.add(line.strip().split(' ')[0].lower())

    df_langs[sil].loc[df_langs[sil][sil].isin(emb_words), 'found'] = True

Merge tables

In [205]:
# Attach the per-language `found` flags to every dictionary row.
df_merged_dicts = {}
for lang_pair, pair_df in df_dicts.items():
    first_lang, second_lang = lang_pair[0], lang_pair[1]
    # Two successive inner joins on the word columns. The second join sees
    # a `found` column on both sides, so pandas' default suffixes produce
    # `found_x` (first language) and `found_y` (second language).
    df_merged_dicts[lang_pair] = (
        pair_df
        .merge(df_langs[first_lang], on=first_lang)
        .merge(df_langs[second_lang], on=second_lang)
    )

df_merged_dicts[('eng', 'hun')].head()

Unnamed: 0,lang 1,lang 2,eng,hun,score,found_x,found_y
0,eng,hun,Romance,román stílusú,9.0,False,False
1,eng,hun,Romance,román,7.0,False,False
2,eng,hun,Latin,román,5.0,False,False
3,eng,hun,Romanian,román,7.0,False,False
4,eng,hun,Romanic,román,7.0,False,False


Language statistics

In [206]:
# Per-language summary: number of rows in the language table and how many of
# them were flagged as found in the embeddings.
header = ['lang', 'words', 'found']

# Build all rows first and create the frame once. `DataFrame.append` was
# removed in pandas 2.0, and appending in a loop also gave every row the
# index label 0 and object-typed columns.
lang_stat_rows = [
    [lang, len(table.index), int((table['found'] == True).sum())]
    for lang, table in df_langs.items()
]
df_lang_stat = pd.DataFrame(lang_stat_rows, columns=header)

df_lang_stat

Unnamed: 0,lang,words,found
0,deu,227517,7
0,eng,278482,7
0,hun,181405,5


Panlex statistics

In [207]:
# Per-dictionary summary: number of word pairs and how many pairs have both
# of their words flagged as found in the embeddings.
header = ['lang1', 'lang2', 'word_pairs', 'found']

# Build all rows first and create the frame once. `DataFrame.append` was
# removed in pandas 2.0, and appending in a loop also gave every row the
# index label 0 and object-typed columns.
plx_stat_rows = [
    [
        lang_pair[0],
        lang_pair[1],
        len(merged.index),
        int(((merged['found_x'] == True) & (merged['found_y'] == True)).sum()),
    ]
    for lang_pair, merged in df_merged_dicts.items()
]
df_plx_stat = pd.DataFrame(plx_stat_rows, columns=header)

df_plx_stat

Unnamed: 0,lang1,lang2,word_pairs,found
0,eng,hun,324949,6
0,deu,hun,86978,7
0,deu,eng,461283,10


In [213]:
# Show the German-English pairs where both words were found in the embeddings.
df_merged_dicts[('deu', 'eng')].pipe(
    lambda pairs: pairs.loc[pairs['found_x'].eq(True) & pairs['found_y'].eq(True)]
)

Unnamed: 0,lang 1,lang 2,deu,eng,score,found_x,found_y
3626,deu,eng,in,in,7.0,True,True
4364,deu,eng,der,of,7.0,True,True
39470,deu,eng,",",",",8.0,True,True
39471,deu,eng,.,",",8.0,True,True
39475,deu,eng,",",.,8.0,True,True
39477,deu,eng,.,.,8.0,True,True
66468,deu,eng,-,-,8.0,True,True
199837,deu,eng,der,the,4.0,True,True
199838,deu,eng,die,the,5.0,True,True
248145,deu,eng,und,and,5.0,True,True
