List of datasets:

## Everything

Sebjan.pickle, Kamchatka.pickle -- dictionaries of the following format:

`{
    doc:{    # in a dict, named
        meta:[meta],    # metainfo at the head
        text:[
            {        # sentence (in a list, numbered)
                layer:[
                    morphs    # for mb, ge, ps
                    ],
                layer:''    # e.g. translation
            }
        ]
    }
}`

## Converbs

converbs.pickle -- table of converbs + log-likelihood


## Morphemes

### Raw

Sebjan_morphemes.pickle, Kamchatka_morphemes.pickle -- raw succession of words like this

`[(0,),    # root
 ('-B', '-med', 'v'),
 ('-DEŋ', '-pst.ptc', 'v'),
 ('-E', '-ep', 'v'),
 (1,)]     # end of the word`

### Single morphemes
 
Sebjan_morphemes_only.pickle, Kamchatka_morphemes_only.pickle -- pd of morphemes with counts only

morpheme_counts.pickle -- pd of morphemes with counts only, for the two corpora together

total_morphemes.pickle -- table of morphemes + log-likelihood


 
 ### Pairs
 
 Sebjan_pairs_dict.pickle, Kamchatka_pairs_dict.pickle -- dicts of (morph): {morph_after: N}
 
 pairs.pickle -- pd with pairs and count only
 
 pairs_by_ll.pickle -- pd with pairs and all the stats
 


In [1]:
import os, re, pickle
from pprint import pprint
import numpy as np
import pandas as pd 

In [107]:
# Load the two input tables: the morpheme-pair counts and the per-morpheme counts.
pairs, morphs = (
    pd.read_pickle(filename)
    for filename in ('pairs.pickle', 'morpheme_counts.pickle')
)

In [108]:
# Kept for reference only (disabled): the two lines below would rebuild
# 'morpheme_counts.pickle' by taking the identifying and count columns of
# 'total_morphemes.pickle' and pickling that subset.
# morphs = pd.read_pickle('total_morphemes.pickle')[['morpheme', 'gloss', 'pos', 's_count', 'k_count']]
# morphs.to_pickle('morpheme_counts.pickle')

In [109]:
# Strip the '1' prefix from the first-morpheme columns so that they share their
# names with the columns of `morphs`. Each column is popped and re-assigned, so
# the renamed columns end up at the right-hand end of the frame, in this order.
for field in ('morpheme', 'gloss', 'pos'):
    pairs[field] = pairs.pop('1' + field)

# Attach the stand-alone counts of the first morpheme to every pair. Column names
# present in both tables receive the '_pair' / '_one' suffixes.
pairs = pd.merge(
    pairs,
    morphs,
    on=['morpheme', 'gloss', 'pos'],
    suffixes=('_pair', '_one'),
)

In [110]:
# Show the first five rows of the merged table.
pairs.head(n=5)

Unnamed: 0,2gloss,2morpheme,2pos,s_count_pair,k_count_pair,morpheme,gloss,pos,s_count_one,k_count_one
0,END,END,END,1.0,,BIt,1pl.Y,ptl,1.0,0.0
1,END,END,END,5.0,,BIt,1pl.Y,v,5.0,0.0
2,END,END,END,,4.0,(R)U,1pl.ex,?,0.0,4.0
3,END,END,END,,1.0,(R)U,1pl.ex,n,0.0,1.0
4,ptl,tIt,v,,1.0,(R)U,1pl.ex,v,0.0,322.0


In [111]:
# Clean up: missing pair counts become 0, then unwanted rows are filtered out.
for count_column in ('s_count_pair', 'k_count_pair'):
    pairs[count_column] = pairs[count_column].fillna(0)

# Build one boolean mask instead of re-slicing the frame once per condition.
# A row survives only if its pair count is not 49803 and its gloss is none of
# the placeholder values listed below.
keep = pairs['s_count_pair'] != 49803
for unwanted_gloss in (0, '', '*', '***', '******', 'ep'):
    keep = keep & (pairs['gloss'] != unwanted_gloss)
pairs = pairs.loc[keep]

In [112]:
# Calculate the log-likelihood (log_l) of every pair between the two corpora and
# the ELL effect size (ell), then sort the table by log_l, largest first.
#
# expected = n_of_wds_in_crp_a*(count_in_a+count_in_b)/n_of_wds_in_both
# In this cell the stand-alone counts of the first morpheme (s_count_one /
# k_count_one) are used in place of the corpus sizes in that formula.

# Normalising constant of ELL.
# NOTE(review): presumably the combined word count of both corpora -- confirm.
ELL_TOTAL_WORDS = 82582

pair_total = pairs['s_count_pair'] + pairs['k_count_pair']
one_total = pairs['s_count_one'] + pairs['k_count_one']
s_expected = pairs['s_count_one'] * pair_total / one_total
k_expected = pairs['k_count_one'] * pair_total / one_total

# np.log(0) is expected for pairs that are absent from one corpus; the guard
# only silences NumPy's warnings, the resulting values are unchanged.
with np.errstate(divide='ignore', invalid='ignore'):
    # count * ln(count / expected). A zero count yields 0 * -inf = NaN, which is
    # replaced by 0, so such rows contribute nothing to the sum.
    k_semi_l = (pairs['k_count_pair'] * np.log(pairs['k_count_pair'] / k_expected)).fillna(0)
    s_semi_l = (pairs['s_count_pair'] * np.log(pairs['s_count_pair'] / s_expected)).fillna(0)
    pairs['log_l'] = 2 * (s_semi_l + k_semi_l)

    # ell = log_l / (N * ln(min expected)).
    # np.minimum also handles the tie s_expected == k_expected; the previous
    # `s*(s<k) + k*(k<s)` expression evaluated to 0 there, which turned ell
    # into -0.0 for those rows.
    # ell is negative when the smaller expected value is below 1 and infinite
    # when it equals exactly 1, because of the logarithm in the denominator.
    min_expected = np.minimum(s_expected, k_expected)
    pairs['ell'] = pairs['log_l'] / (ELL_TOTAL_WORDS * np.log(min_expected))

# The intermediate series (expected values, per-corpus terms) are deliberately
# not stored in the table; only log_l and ell are added.
pairs = pairs.sort_values('log_l', ascending=False)

In [113]:
# Move the three columns at positions 5-7 (the first-morpheme key columns after
# the merge) to the front; all other columns keep their relative order.
columns = list(pairs.columns)
key_columns = columns[5:8]
columns = key_columns + [
    name for position, name in enumerate(columns) if not 5 <= position < 8
]
pairs = pairs.loc[:, columns]

In [114]:
# Show the twenty pairs with the highest log-likelihood.
pairs.head(n=20)

Unnamed: 0,morpheme,gloss,pos,2gloss,2morpheme,2pos,s_count_pair,k_count_pair,s_count_one,k_count_one,s_expected,k_expected,log_l,ell
2365,R(E),nonfut,v,1pl.in,p,v,384.0,52.0,2084.0,2956.0,180.28254,255.71746,415.043799,0.000968
1679,WEːČ,gnr,v,nonfut,R(E),v,19.0,1506.0,335.0,1891.0,229.503594,1295.496406,358.820077,0.000799
3164,D,prog,v,gnr,WEːČ,v,0.0,458.0,765.0,1675.0,143.594262,314.405738,344.585345,0.00084
3285,RI,pst,v,poss.1pl.in,t(I),v,170.0,9.0,1308.0,2712.0,58.241791,120.758209,317.468213,0.000946
1681,WEːČ,gnr,v,hab,Gr(E),v,67.0,0.0,335.0,1891.0,10.083109,56.916891,253.773354,0.00133
3287,RI,pst,v,poss.1pl.ex,WUn,v,0.0,318.0,1308.0,2712.0,103.468657,214.531343,250.326759,0.000653
3154,D,prog,v,sim.cvb,nIkEn,v,112.0,3.0,765.0,1675.0,36.055328,78.944672,234.270697,0.000791
3157,D,prog,v,cond.cvb,REk,v,111.0,32.0,765.0,1675.0,44.834016,98.165984,129.517872,0.000412
3152,D,prog,v,impf.ptc,RI,v,94.0,21.0,765.0,1675.0,36.055328,78.944672,124.531704,0.000421
3148,D,prog,v,inch,L,v,2.0,165.0,765.0,1675.0,52.358607,114.641393,107.10518,0.000328


In [89]:
# This is where I found and removed the duplicates (disabled, kept for reference).
# The first line lists every (morpheme, gloss, pos) group of `morphs` that has
# more than one row. The remaining lines overwrite k_count at four index labels,
# sort by index and drop four rows by position.
# NOTE(review): the indices and counts are hard-coded, so they presumably match
# one specific version of the data -- verify before re-enabling. The chained
# assignment morphs['k_count'][i] = ... should become morphs.loc[i, 'k_count'] = ...
# pd.concat(g for _, g in morphs.groupby(['morpheme', 'gloss', 'pos']) if len(g) > 1)

# morphs['k_count'][731] = 134
# morphs['k_count'][74] = 233
# morphs['k_count'][494] = 42
# morphs['k_count'][742] = 39
# morphs = morphs.sort_index()
# morphs = morphs.drop(morphs.index[75]).drop(morphs.index[732]).drop(morphs.index[495]).drop(morphs.index[743])

In [117]:
# Display the pairs grouped by their first morpheme: the groups are concatenated
# in group-key order, and rows inside a group keep their current order.
groups_by_first_morpheme = pairs.groupby(['morpheme', 'gloss', 'pos'])
pd.concat([group for _key, group in groups_by_first_morpheme])

Unnamed: 0,morpheme,gloss,pos,2gloss,2morpheme,2pos,s_count_pair,k_count_pair,s_count_one,k_count_one,s_expected,k_expected,log_l,ell
688,(E)hEgEr,as.if.about.to.do,v,END,END,END,1.0,0.0,1.0,0.0,1.000000,0.000000,0.000000,-0.000000e+00
3630,(E)ldE,soc,v,pst,RI,v,4.0,4.0,46.0,12.0,6.344828,1.655172,3.368349,8.094366e-05
3617,(E)ldE,soc,v,nr,ČEk,v,7.0,0.0,46.0,12.0,5.551724,1.448276,3.245223,1.061008e-04
3631,(E)ldE,soc,v,1pl.ex,(R)U,v,0.0,1.0,46.0,12.0,0.793103,0.206897,3.151073,-2.421835e-05
3613,(E)ldE,soc,v,3sg,n(I),v,1.0,2.0,46.0,12.0,2.379310,0.620690,2.946664,-7.481625e-05
3624,(E)ldE,soc,v,hab,Gr(E),v,5.0,0.0,46.0,12.0,3.965517,1.034483,2.318016,8.279641e-04
3623,(E)ldE,soc,v,ant.cvb,RIdʒI,v,3.0,0.0,46.0,12.0,2.379310,0.620690,1.390810,-3.531287e-05
3622,(E)ldE,soc,v,nonfut,R(E),v,3.0,0.0,46.0,12.0,2.379310,0.620690,1.390810,-3.531287e-05
3628,(E)ldE,soc,v,prog,D,v,2.0,0.0,46.0,12.0,1.586207,0.413793,0.927206,-1.272421e-05
3618,(E)ldE,soc,v,fut,DʒI,v,2.0,0.0,46.0,12.0,1.586207,0.413793,0.927206,-1.272421e-05
