List of datasets:

## Everything

Sebjan.pickle, Kamchatka.pickle -- dictionaries of the following format:

`{
    doc:{    # in a dict, named
        meta:[meta],    # metainfo at the head
        text:[
            {        # sentence (in a list, numbered)
                layer:[
                    morphs    # for mb, ge, ps
                    ],
                layer:''    # e.g. translation
            }
        ]
    }
}`

## Converbs

converbs.pickle -- table of converbs + log-likelihood


## Morphemes

### Raw

Sebjan_morphemes.pickle, Kamchatka_morphemes.pickle -- raw sequence of words, each word being a run of (morpheme, gloss, pos) tuples like this

`[('ROOT', 'ROOT', 'ROOT'),    # root
 ('-B', '-med', 'v'),
 ('-DEŋ', '-pst.ptc', 'v'),
 ('-E', '-ep', 'v'),
 ('END', 'END', 'END')]     # end of the word`

### Single morphemes
 
Sebjan_morphemes_only.pickle, Kamchatka_morphemes_only.pickle -- pandas DataFrames of morphemes with counts only (one per corpus)

morpheme_counts.pickle -- pandas DataFrame of morphemes with counts only, for the two corpora together

total_morphemes.pickle -- table of morphemes + log-likelihood


 
 ### Pairs
 
 Sebjan_pairs_dict.pickle, Kamchatka_pairs_dict.pickle -- dicts of (morph): {morph_after: N}
 
 pairs.pickle -- pandas DataFrame with pairs and counts only
 
 pairs_by_ll.pickle -- pandas DataFrame with pairs and all the stats
 



In [1]:
import os, re, pickle
from pprint import pprint
import numpy as np
import pandas as pd 

# Grouping parts of speech into noun-like / verb-like

## First, for single morphemes

In [9]:
def _load_pickle(path):
    """Unpickle and return the object stored in the file at ``path``."""
    # pickle can run arbitrary code while loading; only open files you produced yourself.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Raw morpheme sequences of the two corpora.
kam_mor = _load_pickle('Kamchatka_morphemes.pickle')
seb_mor = _load_pickle('Sebjan_morphemes.pickle')

In [11]:
def group(corp):
    """Collapse fine-grained part-of-speech tags into 'verbal' / 'nominal'.

    Parameters
    ----------
    corp : iterable
        Entries whose third item (index 2) is a part-of-speech tag,
        e.g. ``('-B', '-med', 'v')``.

    Returns
    -------
    list
        One element per input entry, in the same order. An entry tagged
        'v', 'aux' or 'vb' becomes ``(first, second, 'verbal')``; an entry
        tagged 'n', 'rel.n', 'adj', 'rel', 'num' or 'quant' becomes
        ``(first, second, 'nominal')``. Every other entry (including the
        ROOT / END markers and entries with fewer than three items) is
        returned unchanged.
    """
    v_like = ('v', 'aux', 'vb')
    n_like = ('n', 'rel.n', 'adj', 'rel', 'num', 'quant')
    res = []
    for line in corp:
        # Entries shorter than three items carry no POS tag; pass them through
        # instead of raising IndexError (the pair code downstream also expects
        # entries whose length is not 3).
        if len(line) < 3:
            res.append(line)
            continue
        pos = line[2]
        if pos in v_like:
            res.append((*line[:2], 'verbal'))
        elif pos in n_like:
            res.append((*line[:2], 'nominal'))
        else:
            res.append(line)
    return res

# Apply the verbal/nominal grouping to both corpora (Sebjan first, then Kamchatka).
seb_gr, kam_gr = (group(corpus) for corpus in (seb_mor, kam_mor))

In [17]:
from collections import Counter

# Frequency of every distinct grouped entry in each corpus.
seb_count = Counter(seb_gr)
kam_count = Counter(kam_gr)

# Corpus sizes used to compute expected frequencies.
# NOTE(review): presumably the number of words per corpus -- confirm against the source data.
SEB_SIZE = 49804
KAM_SIZE = 32778
BOTH_SIZE = SEB_SIZE + KAM_SIZE    # 82582


def _counts_frame(counter, count_col):
    """Build a morpheme/gloss/pos/count table from ``counter``, skipping ROOT and END markers."""
    keys = [key for key in counter if 'ROOT' not in key and 'END' not in key]
    return pd.DataFrame({
        'morpheme': [key[0] for key in keys],
        'gloss': [key[1] for key in keys],
        'pos': [key[2] for key in keys],
        count_col: [counter[key] for key in keys],
    })


kam_pd = _counts_frame(kam_count, 'k_count')
seb_pd = _counts_frame(seb_count, 's_count')

seb_pd = seb_pd.sort_values('s_count', ascending=False)
kam_pd = kam_pd.sort_values('k_count', ascending=False)

# Outer merge keeps morphemes attested in only one corpus; their missing count becomes 0.
total = pd.merge(seb_pd, kam_pd, how='outer', on=['morpheme', 'gloss', 'pos'])
total = total.fillna(0)
# Counts-only table; the pair statistics later read it back from this file.
total.to_pickle('morpheme_counts_grouped.pickle')

# Log-likelihood: expected = corpus_size * (count_s + count_k) / size_of_both_corpora
both_counts = total['s_count'] + total['k_count']
s_expected = SEB_SIZE*both_counts/BOTH_SIZE
k_expected = KAM_SIZE*both_counts/BOTH_SIZE

# A zero count gives 0*log(0) = NaN, which is treated as a zero contribution;
# errstate only silences the numpy warnings, the values are unchanged.
with np.errstate(divide='ignore', invalid='ignore'):
    k_semi_l = (total['k_count']*np.log(total['k_count']/k_expected)).fillna(0)
    s_semi_l = (total['s_count']*np.log(total['s_count']/s_expected)).fillna(0)
total['log_l'] = 2*(s_semi_l + k_semi_l)

# Effect size: log_l / (N * ln(min expected)). np.minimum takes the row-wise
# minimum of the two expected values and, unlike a pair of strict '<' masks,
# does not collapse to 0 when the two values are equal.
total['ell'] = total['log_l']/(BOTH_SIZE*np.log(np.minimum(s_expected, k_expected)))

total = total.sort_values('log_l', ascending=False)

total.head()

In [15]:
# Persist the single-morpheme statistics table, both as a pickle and as an Excel sheet.
total.to_pickle('total_morphemes_grouped.pickle')
total.to_excel('total_morphemes_grouped.xlsx')

In [None]:
# Reload the single-morpheme statistics table from the pickle written by the save cell,
# so work can resume from the file without recomputing the statistics.
total = pd.read_pickle('total_morphemes_grouped.pickle')



## Now for pairs

In [None]:
def remove_ep(corp):
    """Return a list of the entries of ``corp`` that do not contain 'ep'.

    Containment is tested with ``in``: for a tuple entry this means that
    some item equals 'ep' exactly (so a gloss such as '-ep' is kept).
    The order of the surviving entries is preserved.
    """
    return [entry for entry in corp if 'ep' not in entry]

# Drop the entries that contain 'ep' before pairing, so the neighbours of such
# an entry become adjacent.
seb_gr = remove_ep(seb_gr)
kam_gr = remove_ep(kam_gr)

# Bigram-style: count every pair of adjacent entries in each corpus.
k_pairs = Counter(zip(kam_gr, kam_gr[1:]))
s_pairs = Counter(zip(seb_gr, seb_gr[1:]))


def _field(entry, idx):
    """Return ``entry[idx]``; entries whose length is not 3 reuse their first item for every field."""
    return entry[idx] if len(entry) == 3 else entry[0]


def _pairs_frame(pair_counter, count_col):
    """Build a table with the fields of both pair members plus the pair count in ``count_col``."""
    pair_keys = list(pair_counter)
    return pd.DataFrame({
        'morpheme': [_field(first, 0) for first, _ in pair_keys],
        'gloss': [_field(first, 1) for first, _ in pair_keys],
        'pos': [_field(first, 2) for first, _ in pair_keys],
        '2morpheme': [_field(second, 0) for _, second in pair_keys],
        '2gloss': [_field(second, 1) for _, second in pair_keys],
        '2pos': [_field(second, 2) for _, second in pair_keys],
        count_col: [pair_counter[key] for key in pair_keys],
    })


kpd_pairs = _pairs_frame(k_pairs, 'k_count')
spd_pairs = _pairs_frame(s_pairs, 's_count')

# Outer merge keeps pairs attested in only one corpus (their other count is NaN for now).
pairs = pd.merge(spd_pairs, kpd_pairs, how='outer', on=['morpheme', 'gloss', 'pos', '2morpheme', '2gloss', '2pos'])

pairs = pairs.sort_values(['gloss', 'morpheme', 'pos'])

# Attach the corpus-wide counts of the first member of each pair
# (read from the counts-only table written by the single-morpheme section).
morphs = pd.read_pickle('morpheme_counts_grouped.pickle')
pairs = pd.merge(pairs, morphs, on=['morpheme', 'gloss', 'pos'], suffixes=('_pair', '_one'))

pairs['s_count_pair'] = pairs['s_count_pair'].fillna(0)
pairs['k_count_pair'] = pairs['k_count_pair'].fillna(0)
# NOTE(review): 49803 is one less than the 49804 corpus size used for the single-morpheme
# statistics; presumably this drops the word-boundary (END, ROOT) pair -- confirm.
pairs = pairs.loc[pairs['s_count_pair'] != 49803]
# Discard rows whose first gloss is a placeholder or the epenthesis label.
for junk_gloss in (0, '', '*', '***', '******', 'ep'):
    pairs = pairs.loc[pairs['gloss'] != junk_gloss]

# calculate log-likelihood
# expected = count_of_first_member_in_corpus*(pair_count_in_a+pair_count_in_b)/count_of_first_member_in_both
pair_total = pairs['s_count_pair'] + pairs['k_count_pair']
one_total = pairs['s_count_one'] + pairs['k_count_one']
s_expected = pairs['s_count_one']*pair_total/one_total
k_expected = pairs['k_count_one']*pair_total/one_total

# A zero count gives 0*log(0) = NaN, which is treated as a zero contribution;
# errstate only silences the numpy warnings, the values are unchanged.
with np.errstate(divide='ignore', invalid='ignore'):
    k_semi_l = (pairs['k_count_pair']*np.log(pairs['k_count_pair']/k_expected)).fillna(0)
    s_semi_l = (pairs['s_count_pair']*np.log(pairs['s_count_pair']/s_expected)).fillna(0)
pairs['log_l'] = 2*(s_semi_l + k_semi_l)

# Effect size: log_l / (N * ln(min expected)), with N = 82582 as in the single-morpheme
# statistics. np.minimum is the row-wise minimum; the previous pair of strict '<' masks
# produced 0 (and hence log(0)) whenever the two expected values were equal, i.e.
# whenever the first member is equally frequent in both corpora.
pairs['ell'] = pairs['log_l']/(82582*np.log(np.minimum(s_expected, k_expected)))
pairs = pairs.sort_values('log_l', ascending=False)

In [26]:
# Put the descriptive columns first, in the order shown in the recorded output:
# first member of the pair, then second member, then everything else in its
# existing order. Selecting by name (instead of by position) gives the same
# result regardless of how the frame's columns were initially ordered, and
# re-running the cell does not shuffle them again.
leading = ['morpheme', 'gloss', 'pos', '2morpheme', '2gloss', '2pos']
columns = leading + [col for col in pairs.columns.tolist() if col not in leading]
pairs = pairs[columns]

pairs.head(20)

Unnamed: 0,morpheme,gloss,pos,2morpheme,2gloss,2pos,s_count_pair,k_count_pair,s_count_one,k_count_one,log_l,ell
2203,R(E),nonfut,verbal,p,1pl.in,verbal,384.0,52.0,2084.0,2956.0,415.043799,0.000968
1550,WEːČ,gnr,verbal,R(E),nonfut,verbal,19.0,1506.0,335.0,1891.0,358.820077,0.000799
2928,D,prog,verbal,WEːČ,gnr,verbal,0.0,458.0,765.0,1675.0,344.585345,0.00084
3021,RI,pst,verbal,t(I),poss.1pl.in,verbal,170.0,9.0,1308.0,2712.0,317.468213,0.000946
1568,WEːČ,gnr,verbal,Gr(E),hab,verbal,67.0,0.0,335.0,1891.0,253.773354,0.00133
3026,RI,pst,verbal,WUn,poss.1pl.ex,verbal,0.0,318.0,1308.0,2712.0,250.326759,0.000653
2910,D,prog,verbal,nIkEn,sim.cvb,verbal,112.0,3.0,765.0,1675.0,234.270697,0.000791
2896,D,prog,verbal,REk,cond.cvb,verbal,111.0,32.0,765.0,1675.0,129.517872,0.000412
2904,D,prog,verbal,RI,impf.ptc,verbal,94.0,21.0,765.0,1675.0,124.531704,0.000421
2902,D,prog,verbal,L,inch,verbal,2.0,165.0,765.0,1675.0,107.10518,0.000328


In [27]:
# Keep all pairs of every first member (morpheme, gloss, pos) that has at least
# one pair with log_l above 10; each retained group is sorted by log_l first.
first_member_keys = ['morpheme', 'gloss', 'pos']
retained_groups = (
    chunk.sort_values('log_l', ascending=False)
    for _, chunk in pairs.groupby(first_member_keys)
    if (chunk['log_l'] > 10).any()
)
rel_pairs = pd.concat(retained_groups).sort_values('log_l', ascending=False)
# Round log_l to one decimal for the exported sheet.
rel_pairs['log_l'] = rel_pairs['log_l'].map(lambda value: round(value, 1))
rel_pairs.to_excel('pairs_by_ll_grouped.xlsx')

In [28]:
# Number of distinct first-member glosses among the retained pairs.
len(pd.unique(rel_pairs['gloss']))

55