List of datasets:

## Everything

Sebjan.pickle, Kamchatka.pickle -- словари формата

```
{
    doc:{    # in a dict, named
        meta:[meta],    # metainfo at the head
        text:[
            {        # sentence (in a list, numbered)
                layer:[
                    morphs    # for mb, ge, ps
                    ],
                layer:''    # e.g. translation
            }
        ]
    }
}
```

## Converbs

converbs.pickle -- таблица с конвербами + логлайк


## Morphemes

### Raw

Sebjan_morphemes.pickle, Kamchatka_morphemes.pickle -- raw succession of words like this

```
[('ROOT', 'ROOT', 'ROOT'),    # root
 ('-B', '-med', 'v'),
 ('-DEŋ', '-pst.ptc', 'v'),
 ('-E', '-ep', 'v'),
 ('END', 'END', 'END')]     # end of the word
```

### Single morphemes
 
Sebjan_morphemes_only.pickle, Kamchatka_morphemes_only.pickle -- pandas DataFrames of morphemes with their counts only (one per corpus)

morpheme_counts.pickle -- pandas DataFrame of morphemes with their counts only, for the two corpora together

total_morphemes.pickle -- таблица с морфемами + логлайк


 
 ### Pairs
 
 Sebjan_pairs_dict.pickle, Kamchatka_pairs_dict.pickle -- dicts of (morph): {morph_after: N}
 
 pairs.pickle -- pd with pairs and count only
 
 pairs_by_ll.pickle -- pd with pairs and all the stats
 


In [33]:
import os, re, pickle
from pprint import pprint
import numpy as np
import pandas as pd 

In [8]:
# Load the two input tables described in the dataset list above:
# pair counts (pairs.pickle) and per-morpheme counts (morpheme_counts.pickle).
pairs, morphs = (
    pd.read_pickle(pickle_name)
    for pickle_name in ('pairs.pickle', 'morpheme_counts.pickle')
)

In [108]:
# Disabled one-off step, kept for provenance: it shows how morpheme_counts.pickle
# was derived from total_morphemes.pickle by keeping only the identifying columns
# and the two per-corpus count columns.
# morphs = pd.read_pickle('total_morphemes.pickle')[['morpheme', 'gloss', 'pos', 's_count', 'k_count']]
# morphs.to_pickle('morpheme_counts.pickle')

In [10]:
# Give the first morpheme of every pair the same column names that `morphs`
# uses, so the two tables can be joined on (morpheme, gloss, pos).
# The copies are appended first and the '1*' originals removed afterwards,
# which leaves the three key columns at the end of `pairs`.
FIRST_MORPHEME_COLUMNS = (
    ('1morpheme', 'morpheme'),
    ('1gloss', 'gloss'),
    ('1pos', 'pos'),
)
for prefixed_name, plain_name in FIRST_MORPHEME_COLUMNS:
    pairs[plain_name] = pairs[prefixed_name]
for prefixed_name, _plain_name in FIRST_MORPHEME_COLUMNS:
    del pairs[prefixed_name]

# Count columns present in both tables are disambiguated as *_pair / *_one.
pairs = pairs.merge(
    morphs,
    on=[plain_name for _prefixed_name, plain_name in FIRST_MORPHEME_COLUMNS],
    suffixes=['_pair', '_one'],
)

In [25]:
# Preview the first rows of the pair table after the merge with `morphs`.
pairs.head()

Unnamed: 0,2gloss,2morpheme,2pos,s_count_pair,k_count_pair,morpheme,gloss,pos,s_count_one,k_count_one,s_expected,k_expected,k_semi_l,s_semi_l
0,END,END,END,1.0,0.0,BIt,1pl.Y,ptl,1.0,0.0,1.0,0.0,0.0,0.0
1,END,END,END,5.0,0.0,BIt,1pl.Y,v,5.0,0.0,5.0,0.0,0.0,0.0
2,END,END,END,0.0,4.0,(R)U,1pl.ex,?,0.0,4.0,0.0,4.0,0.0,0.0
3,END,END,END,0.0,1.0,(R)U,1pl.ex,n,0.0,1.0,0.0,1.0,0.0,0.0
4,ptl,tIt,v,0.0,1.0,(R)U,1pl.ex,v,0.0,322.0,0.0,1.0,0.0,0.0


In [26]:
# Clean up the merged pair table, then score every (first morpheme, next morpheme)
# pair with a log-likelihood statistic (`log_l`) and an effect-size variant (`ell`)
# comparing the s_* and k_* counts (presumably the Sebjan and Kamchatka corpora
# from the dataset list -- confirm).

# --- clean up ---------------------------------------------------------------

# Treat a missing pair count as zero occurrences.
for count_col in ('s_count_pair', 'k_count_pair'):
    pairs[count_col] = pairs[count_col].fillna(0)

# NOTE(review): 49803 looks like an artefact row (e.g. a grand total) rather than
# a real pair count -- confirm what this row is.
ARTEFACT_S_COUNT = 49803
# Gloss values that are dropped outright (empty / placeholder / 'ep' glosses).
JUNK_GLOSSES = (0, '', '*', '***', '******', 'ep')

keep = pairs['s_count_pair'] != ARTEFACT_S_COUNT
for junk_gloss in JUNK_GLOSSES:
    keep &= pairs['gloss'] != junk_gloss
# Drop pairs whose first morpheme is absent from one of the corpora: there the
# whole difference lies in the first morpheme and the log-likelihood is zero.
keep &= pairs['s_count_one'] != 0
keep &= pairs['k_count_one'] != 0
# .copy() so the column assignments below write to an independent frame
# instead of a slice of the unfiltered one.
pairs = pairs.loc[keep].copy()

# --- log-likelihood ---------------------------------------------------------

# General form: expected = n_of_wds_in_crp_a*(count_in_a+count_in_b)/n_of_wds_in_both
# Here the "corpus size" is the count of the first morpheme in each corpus, so the
# expectation is for the pair given how often its first morpheme occurs.
pair_total = pairs['s_count_pair'] + pairs['k_count_pair']
first_total = pairs['s_count_one'] + pairs['k_count_one']
s_expected = pairs['s_count_one'] * pair_total / first_total
k_expected = pairs['k_count_one'] * pair_total / first_total

# observed * ln(observed / expected); a zero observed count yields 0 * -inf = NaN,
# which is defined as a zero contribution.
k_semi_l = (pairs['k_count_pair'] * np.log(pairs['k_count_pair'] / k_expected)).fillna(0)
s_semi_l = (pairs['s_count_pair'] * np.log(pairs['s_count_pair'] / s_expected)).fillna(0)
pairs['log_l'] = 2 * (s_semi_l + k_semi_l)

# ell = log_l / (N * ln(min expected)).
# np.minimum is used so that rows where both expected values are equal still get
# that value as the minimum (a pair of strict '<' comparisons yields 0 for ties,
# and therefore ln(0)).
# NOTE(review): the next cell uses 49800 + 33112 = 82912 as the combined size;
# 82582 is kept here unchanged -- confirm which value is intended.
ELL_TOTAL_SIZE = 82582
min_expected = np.minimum(s_expected, k_expected)
pairs['ell'] = pairs['log_l'] / (ELL_TOTAL_SIZE * np.log(min_expected))

pairs = pairs.sort_values('log_l', ascending=False)

# Intermediate values are not kept in the table; also remove them if they were
# already present in the input frame.
scratch_cols = [
    col for col in ('k_semi_l', 's_semi_l', 's_expected', 'k_expected')
    if col in pairs.columns
]
pairs = pairs.drop(scratch_cols, axis=1)

# Put the first-morpheme key columns in front. Selecting them by name (rather
# than by position) keeps this correct regardless of the incoming column order
# and makes the cell safe to re-run.
key_cols = ['morpheme', 'gloss', 'pos']
pairs = pairs[key_cols + [col for col in pairs.columns if col not in key_cols]]

In [29]:
# Sum of the two per-corpus constants used in the next cell's expected-count
# formula (presumably the sizes of the two corpora -- confirm).
49800+33112

82912

In [30]:
# Second log-likelihood variant (`log_l_j`): expected pair counts are derived
# from fixed per-corpus sizes instead of the first-morpheme counts.
S_SIZE = 49800
K_SIZE = 33112
BOTH_SIZE = S_SIZE + K_SIZE  # 82912

observed_total = pairs['s_count_pair'] + pairs['k_count_pair']
s_exp_j = S_SIZE * observed_total / BOTH_SIZE
k_exp_j = K_SIZE * observed_total / BOTH_SIZE

# observed * ln(observed / expected); NaN (from a zero observed count) counts as 0.
k_half = (pairs['k_count_pair'] * np.log(pairs['k_count_pair'] / k_exp_j)).fillna(0)
s_half = (pairs['s_count_pair'] * np.log(pairs['s_count_pair'] / s_exp_j)).fillna(0)
pairs['log_l_j'] = 2 * (s_half + k_half)

# Disabled: effect-size variant with the combined size 82912.
# pairs['ell'] = pairs['log_l']/(82912*np.log(
#         pairs['s_expected']*(pairs['s_expected']<pairs['k_expected'])+pairs['k_expected']*(pairs['k_expected']<pairs['s_expected'])
#                                      )
#                               )
pairs = pairs.sort_values('log_l_j', ascending=False)

# The intermediate values live in local variables only; if columns with the
# scratch names already exist in the table they are removed, so the resulting
# columns are the input columns plus `log_l_j`.
leftover_cols = [
    col for col in ('k_semi_l', 's_semi_l', 's_exp_j', 'k_exp_j')
    if col in pairs.columns
]
pairs = pairs.drop(leftover_cols, axis=1)

In [35]:
# Show the 20 pairs with the highest log_l_j (the table was sorted by it, descending).
pairs.head(20)

Unnamed: 0,morpheme,gloss,pos,2gloss,2morpheme,2pos,s_count_pair,k_count_pair,s_count_one,k_count_one,log_l,ell,log_l_j
1649,WEːČ,gnr,v,nonfut,R(E),v,19.0,1506.0,332.0,1901.0,352.851941,0.000788,2579.633725
2999,D,prog,v,gnr,WEːČ,v,0.0,458.0,764.0,1676.0,344.038642,0.000839,840.78179
3403,nIkEn,sim.cvb,v,END,END,END,1039.0,34.0,1076.0,59.0,10.35692,3.1e-05,820.07031
2421,čE,pf.ptc,v,END,END,END,1083.0,73.0,1886.0,210.0,19.989164,5.1e-05,693.58156
3122,RI,pst,v,poss.1pl.ex,WUn,v,0.0,318.0,1302.0,2706.0,249.83405,0.000652,583.774256
1702,Gr(E),hab,v,3sg,n(I),v,463.0,0.0,1731.0,41.0,21.677267,0.000111,472.042224
1697,Gr(E),hab,v,3pl,r,v,417.0,0.0,1731.0,41.0,19.523586,0.000104,425.14386
3111,RI,pst,v,poss.3sg,n(I),v,641.0,1145.0,1302.0,2706.0,9.283717,1.8e-05,423.72795
2982,D,prog,v,pst,RI,v,27.0,301.0,764.0,1676.0,102.24666,0.000267,393.532141
2289,R(E),nonfut,v,3sg,n(I),v,706.0,1178.0,2284.0,3016.0,24.587166,4.4e-05,390.063858


In [38]:
# Keep only `log_l` as the score: discard the two alternative measures and
# order the table by `log_l`, ascending.
pairs = pairs.drop(['ell', 'log_l_j'], axis=1).sort_values('log_l')

In [39]:
# Persist the scored pair table (sorted by log_l) to pairs_by_ll.pickle.
pairs.to_pickle('pairs_by_ll.pickle')

In [34]:
# Reload the scored pair table from pairs_by_ll.pickle.
pairs = pd.read_pickle('pairs_by_ll.pickle')

In [89]:
# This is where I found and removed duplicates (disabled; kept as a record of
# the manual fixes applied to `morphs`).
# pd.concat(g for _, g in morphs.groupby(['morpheme', 'gloss', 'pos']) if len(g) > 1)

# morphs['k_count'][731] = 134
# morphs['k_count'][74] = 233
# morphs['k_count'][494] = 42
# morphs['k_count'][742] = 39
# morphs = morphs.sort_index()
# morphs = morphs.drop(morphs.index[75]).drop(morphs.index[732]).drop(morphs.index[495]).drop(morphs.index[743])

In [124]:
# Keep every first morpheme (morpheme, gloss, pos) that has at least one pair
# with log_l above 10; for each such morpheme keep all of its pairs, ordered
# by log_l from highest to lowest.
rel_pairs = pd.concat([
    group.sort_values('log_l', ascending=False)
    for _key, group in pairs.groupby(['morpheme', 'gloss', 'pos'])
    if (group['log_l'] > 10).any()
])

In [126]:
# Number of distinct first-morpheme glosses among the selected pairs
# (missing values, if any, are counted as one value).
rel_pairs['gloss'].nunique(dropna=False)

54

In [129]:
def _to_one_decimal(value):
    """Round a single log-likelihood value to one decimal place."""
    return round(value, 1)


# Order the selected pairs by log_l (highest first), round the score for
# presentation and export the table to Excel.
rel_pairs = rel_pairs.sort_values('log_l', ascending=False)
rel_pairs['log_l'] = rel_pairs['log_l'].map(_to_one_decimal)
rel_pairs.to_excel('pairs_by_ll.xlsx')

Здесь будет всякий байес и ми

$$P(B \mid A) = \frac{P(A \mid B)\,P(B)}{P(A)}$$