Наброски кода для
0. сведения алломорфов морфем в одно (хотя бы в таблице)
1. подсчёта сочетаний морфем
2. вытаскивания конвербов
3. вытаскивания цепочек частей речи

Все манипуляции можно тестировать на корпусах, которые в почте.

# 0
Действительно надо вытащить всё, положить в словарину и радоваться

Формат:

`корпус{
    название:документ{    # done
        мета:мета,    # done
        текст:[
            предложение{
                слой:[
                    морфемы
                    ],
                перевод:''
            }
        ]
    }
}`

In [4]:
# Corpus names; each one is formatted into the folder and pickle file names used below.
corpora = ['Kamchatka', 'Sebjan']

In [3]:
import os, re, pickle
from pprint import pprint

In [4]:
def morphs_2_words(line):
    '''Group a flat list of morpheme tokens into words.

    A token that starts with '-' or '=' is attached to the word that is
    currently being built; any other token starts a new word.  A leading
    affix token with no preceding word forms a word of its own.

    line: list of tokens taken from one layer line.
    Returns a list of words, each word being the list of its tokens.
    An empty input yields an empty list (no phantom empty word), and a
    blank token is treated as a word start instead of raising IndexError.
    '''
    words = []
    for morph in line:
        # [:1] instead of [0] so that a blank token cannot raise; the tuple
        # (rather than the string '=-') keeps '' from matching as a substring.
        if words and morph.strip()[:1] in ('-', '='):
            words[-1].append(morph)
        else:
            words.append([morph])
    return words


def handle_startline(line, res, current_layer):
    '''Store the content of one marker line (a line starting with a backslash) in res.

    line: raw text line; its first token is the layer marker.
    res: dict of layers collected so far for the sentence; updated in place.
    current_layer: name of the layer that continuation lines belong to.

    Layers 'tx', 'mb', 'ge' and 'ps' are split into words of morphemes via
    morphs_2_words; any other layer is stored as one joined string and
    becomes the new current_layer.  A marker without content changes nothing.
    Returns the pair (res, current_layer).
    '''
    tokens = line.split()
    marker, payload = tokens[0].strip('\\'), tokens[1:]
    if not payload:
        # bare marker: nothing to record
        return res, current_layer
    if marker in ('tx', 'mb', 'ge', 'ps'):
        parsed = morphs_2_words(payload)  # words made of morphemes
    else:
        parsed = [' '.join(payload)]  # whole line as one string (comments etc.)
        current_layer = marker
    if marker in res and res[marker][0] != '':
        # the layer was already seen in this sentence: extend it
        res[marker].extend(parsed)
    else:
        res[marker] = parsed
    return res, current_layer


def lines_2_dict(part):
    '''
    Convert the text of one sentence record into a dict of layers.

    i: multi-line chunk for one sentence; the first line longer than one
       character carries the record id (the text after the last '_'), the
       following lines either start with a backslash marker or continue the
       current free-text layer
    o: dict like {'index': [id], 'layer': [wo, r, ds], 'layer': [content]}
    additional constraint (from the original notes): all word-split layers
    are expected to have equal length; this function does not enforce it
    raises IndexError when the chunk has no line longer than one character
    '''
    res = {}
    lines = [line for line in part.split('\n') if len(line) > 1]
    res['index'] = [lines[0].split('_')[-1]]
    current_layer = ''  # layer that wrapped (continuation) lines belong to
    for line in lines[1:]:
        if line.startswith('\\'):
            res, current_layer = handle_startline(line, res, current_layer)
        elif current_layer:
            # A continuation follows the most recent marker line of the layer,
            # whose content is the last entry of the list, not the first.
            res[current_layer][-1] += ' ' + line
    return res


def check_len(p_sent, fil):
    '''Print a report when the word-split layers of one sentence are misaligned.

    p_sent: parsed sentence dict (layer name -> list of words).
    fil: file name, used only in the printed messages.

    Only the layers 'mb', 'ge' and 'ps' that hold more than one word are
    compared.  Returns the set of their word counts; more than one element
    means the layers disagree.
    '''
    aligned = ('mb', 'ge', 'ps')
    selected_layers = [name for name in p_sent
                       if name in aligned and len(p_sent[name]) > 1]
    # 1. the number of words must be the same in every selected layer
    lengths = {len(p_sent[name]) for name in selected_layers}
    if len(lengths) > 1:
        print('Error in {}, here:'.format(fil))
        print(lengths)
        for name in selected_layers:
            print(len(p_sent[name]))
            pprint(p_sent[name])
        return lengths
    # 2. every word must have as many glosses as morphemes
    if 'mb' not in selected_layers or 'ge' not in selected_layers:
        return lengths
    for i, glosses in enumerate(p_sent['ge']):
        if len(glosses) != len(p_sent['mb'][i]):
            print('што-то слиплось в {}'.format(fil))
            pprint(p_sent)
    return lengths


def make_readable(corp):
    """Parse the text files of one corpus into a nested dict and pickle it.

    corp: corpus name; files are read from 'Corpus_Text_<corp>_postagged'
        and the result is written to '<corp>.pickle' in the working directory.

    Files whose name contains 'pyzhik' or does not end in '.txt' are skipped.
    Each file is split on the '\\id' marker: the first chunk after the marker
    is kept unparsed as 'meta', every later chunk is parsed with lines_2_dict
    and checked with check_len.
    Returns {file name: {'meta': str, 'text': [sentence dict, ...]}}.
    Raises IndexError for a file that contains no '\\id' marker.
    """
    folder = 'Corpus_Text_{}_postagged'.format(corp)
    corpus_dict = {}
    for fil in os.listdir(folder):
        if 'pyzhik' in fil or not fil.endswith('.txt'):
            continue
        # The corpus contains IPA characters, so do not rely on the platform
        # default encoding.
        with open(os.path.join(folder, fil), 'r', encoding='utf-8') as f:
            text = f.read()
        # '\\id' spells the backslash explicitly; the former '\id' relied on an
        # invalid escape sequence being passed through unchanged.
        sents = text.split('\\id')
        text_content = []
        for sent in sents[2:]:
            sent_content = lines_2_dict(sent)
            check_len(sent_content, fil)
            text_content.append(sent_content)
        corpus_dict[fil] = {
            'meta': sents[1],  # metainfo at the beginning of the file; not parsed
            'text': text_content,
        }
    with open('{}.pickle'.format(corp), 'wb') as f:
        pickle.dump(corpus_dict, f)
    return corpus_dict

In [60]:
# (Re)build the '<corpus>.pickle' file of every corpus from its text files.
for corp in corpora:
    make_readable(corp)

In [27]:
def detect(what, where, replace=False, corpora=corpora, show=True):
    """Report occurrences of `what` in one layer and optionally replace it in the source files.

    what: value looked up with `what in word` in every word of the layer.
        For word-split layers a word is a list of tokens, so this is an exact
        token match; for free-text layers it is a substring test.
    where: layer name (e.g. 'ge').
    replace: replacement text, or False to only report and wait for manual edits.
    corpora: corpus names; '<corpus>.pickle' is read for each of them.
    show: when true, pretty-print every sentence that contains a hit.

    After the user confirms with Enter, the files with hits are rewritten
    (only if `replace` is given) and the corpus pickle is rebuilt with
    make_readable.  If the literal text matches more often in a file than
    there were hits, the user is asked for a more precise `what`.
    """
    for corpus in corpora:
        hits = []
        with open('{}.pickle'.format(corpus), 'rb') as f:
            content = pickle.load(f)
        for doc in content:
            for i, sent in enumerate(content[doc]['text']):  # looping through sentences
                if where not in sent:
                    continue
                for word in sent[where]:
                    if what in word:
                        print('Found in {}, {} at {}'.format(corpus, doc, i))
                        if show:
                            pprint(sent)
                        hits.append(doc)
        if not hits:
            print('noth found. cool! or not?')
            continue
        input('correct all these ({} hits) & press Enter and I\'ll reload.\n'.format(len(hits)))
        if replace:
            target = replace
            folder = 'Corpus_Text_{}_postagged'.format(corpus)
            for doc in set(hits):
                path = os.path.join(folder, doc)
                with open(path, 'r', encoding='utf-8') as f:
                    text = f.read()
                # `what` is literal text: escape it everywhere, so that the
                # preview count and the substitution use the same pattern.
                pattern = re.compile(re.escape(what) + r'\b')
                a_hits = pattern.findall(text)
                # NOTE(review): len(hits) counts hits in the whole corpus, not
                # in this file - confirm that this comparison is intended.
                while len(a_hits) > len(hits):
                    print('what is not accurate. see what\'d be affected')
                    pprint(a_hits[:15])
                    what = input('change the what: ')
                    pattern = re.compile(re.escape(what) + r'\b')
                    a_hits = pattern.findall(text)
                # A callable replacement inserts `target` verbatim (no
                # backslash or group-reference processing).
                text = pattern.sub(lambda match: target, text)
                with open(path, 'w', encoding='utf-8') as f:
                    f.write(text)
        make_readable(corpus)
        print('done!')

In [43]:
# Replace non-future glosses fused with a 3rd-person ending by the bare gloss,
# first for plain '-nonfut', then for the '-hab.' and '-caus.' variants.
person_tails = ['3pl', '3sg', '.3pl', '.3sg']
nfs = ['-nonfut' + tail for tail in person_tails]
for nf in nfs:
    detect(nf, 'ge', replace='-nonfut', show=False)
print('\n\nregalar nfs done\n')
for bare_gloss in ('-hab.nonfut', '-caus.nonfut'):
    other_nfs = [bare_gloss + tail for tail in person_tails]
    for nf in other_nfs:
        detect(nf, 'ge', replace=bare_gloss, show=False)

noth found. cool! or not?
noth found. cool! or not?
noth found. cool! or not?
noth found. cool! or not?
noth found. cool! or not?
noth found. cool! or not?
noth found. cool! or not?
noth found. cool! or not?


regalar nfs done

noth found. cool! or not?
noth found. cool! or not?
noth found. cool! or not?
noth found. cool! or not?
noth found. cool! or not?
Found in Sebjan, SlepcovaNA_her_class.txt at 27
Found in Sebjan, Zaxarova_JP_pear_story.txt at 21
correct all these (2 hits) & press Enter and I'll reload.

done!
noth found. cool! or not?
noth found. cool! or not?
noth found. cool! or not?
noth found. cool! or not?
noth found. cool! or not?
noth found. cool! or not?
noth found. cool! or not?
Found in Sebjan, Krivoshapkina_AE_childhood.txt at 114
Found in Sebjan, Krivoshapkin_DM_Segen.txt at 40
Found in Sebjan, Krivoshapkin_DM_Segen.txt at 40
correct all these (3 hits) & press Enter and I'll reload.

done!
noth found. cool! or not?
noth found. cool! or not?


Здесь повторяю таблицу, которая была раньше.

In [2]:
import numpy as np
import pandas as pd 

In [108]:
def extract_morphs(corpus):
    """Flatten a pickled corpus into one list of affix tuples and pickle that list.

    corpus: corpus name; reads '<corpus>.pickle' and writes
        '<corpus>_morphemes.pickle'.

    Only sentences that have the 'mb', 'ge' and 'ps' layers are used.  Every
    word contributes the marker (0,), then one (morpheme, gloss, pos) tuple
    per morpheme after the first one (index 0 of the word is skipped), then
    the marker (1,).  pos is the first item of the word's 'ps' entry.
    Prints the length of the list and its first 15 items; returns None.
    """
    with open('{}.pickle'.format(corpus), 'rb') as f:
        content = pickle.load(f)
    morphs = []
    for doc in content:
        for sent in content[doc]['text']:
            if not ('mb' in sent and 'ge' in sent and 'ps' in sent):
                continue
            for i, word in enumerate(sent['mb']):
                morphs.append((0,))  # word start
                ps = sent['ps'][i][0]
                for l in range(1, len(word)):
                    try:
                        morphs.append((word[l], sent['ge'][i][l], ps))
                    except IndexError:
                        # gloss layer shorter than the morpheme layer: report
                        # the sentence instead of swallowing every exception
                        print(doc)
                        pprint(sent)
                morphs.append((1,))  # word end
    with open('{}_morphemes.pickle'.format(corpus), 'wb') as f:
        pickle.dump(morphs, f)
    print(len(morphs))
    pprint(morphs[:15])

In [109]:
# Write '<corpus>_morphemes.pickle' for every corpus; the function prints a short preview.
for corp in corpora:
    extract_morphs(corp)

105395
[(0,),
 ('-L', '-pl', 'n'),
 ('-W', '-acc', 'n'),
 (1,),
 (0,),
 ('-L', '-pl', 'n'),
 ('-W', '-acc', 'n'),
 (1,),
 (0,),
 (1,),
 (0,),
 ('-W', '-poss.1sg', 'n'),
 (1,),
 (0,),
 ('-WkEn', '-caus', 'v')]
149704
[(0,),
 ('-WEːČ', '-gnr', 'v'),
 ('-RI', '-impf.ptc', 'v'),
 (1,),
 (0,),
 (1,),
 (0,),
 (1,),
 (0,),
 ('-(dU)LE', '-loc', 'n'),
 (1,),
 (0,),
 ('-B', '-med', 'v'),
 ('-DEŋ', '-pst.ptc', 'v'),
 ('-E', '-ep', 'v')]


In [47]:
# Load the flat morpheme lists of both corpora.
with open('Kamchatka_morphemes.pickle', 'rb') as kam_file:
    kam_mor = pickle.load(kam_file)
with open('Sebjan_morphemes.pickle', 'rb') as seb_file:
    seb_mor = pickle.load(seb_file)

In [8]:
from collections import Counter

# Frequency of every distinct tuple in the flat lists, including the (0,) and (1,) markers.
seb_count = Counter(seb_mor)
kam_count = Counter(kam_mor)

In [9]:
# number of words in each corpus (extract_morphs adds one (0,) marker per word)
# last time it was 34933 for Kamchatka and 51148 for Sebjan
print(seb_count[(0,)])
print(kam_count[(0,)])

49804
32778


ЛАДНО

In [113]:
# One row per distinct (morpheme, gloss, pos) triple with its count;
# the one-element marker tuples are left out.
kam_affixes = [(key, n) for key, n in kam_count.items() if len(key) == 3]
kam_pd = pd.DataFrame({
    'morpheme': [key[0] for key, _ in kam_affixes],
    'gloss': [key[1] for key, _ in kam_affixes],
    'pos': [key[2] for key, _ in kam_affixes],
    'k_count': [n for _, n in kam_affixes],
})

seb_affixes = [(key, n) for key, n in seb_count.items() if len(key) == 3]
seb_pd = pd.DataFrame({
    'morpheme': [key[0] for key, _ in seb_affixes],
    'gloss': [key[1] for key, _ in seb_affixes],
    'pos': [key[2] for key, _ in seb_affixes],
    's_count': [n for _, n in seb_affixes],
})

In [None]:
# Most frequent first, then drop the leading '-' / '=' from glosses and morphemes.
seb_pd = seb_pd.sort_values('s_count', ascending=False)
kam_pd = kam_pd.sort_values('k_count', ascending=False)
for frame in (seb_pd, kam_pd):
    for column in ('gloss', 'morpheme'):
        frame[column] = frame[column].map(lambda x: x.strip('-='))

In [117]:
kam_pd.head()

Unnamed: 0,gloss,k_count,morpheme,pos
120,nonfut,2956,R(E),v
283,pst,2712,RI,v
161,gnr,1891,WEːČ,v
296,3sg,1786,n(I),v
270,prog,1675,D,v


In [118]:
seb_pd.to_pickle('Sebjan_morphemes_only.pickle')
kam_pd.to_pickle('Kamchatka_morphemes_only.pickle')

In [10]:
seb_pd = pd.read_pickle('Sebjan_morphemes_only.pickle')
kam_pd = pd.read_pickle('Kamchatka_morphemes_only.pickle')

А теперь надо объединить (join) таблицы двух корпусов.

In [11]:
# Outer join: a morpheme attested in only one corpus is kept, with NaN as its count in the other.
total = pd.merge(seb_pd, kam_pd, how='outer', on=['morpheme', 'gloss', 'pos'])

In [12]:
total = total.fillna(0)

In [14]:
# Relative frequency per word; the divisors are the word counts printed above.
total['s_freq'] = total['s_count'] / 49804
total['k_freq'] = total['k_count'] / 32778

In [70]:
total.head()

Unnamed: 0,gloss,morpheme,pos,s_count,k_count,s_freq,k_freq,s_expected,k_expected,log_l,ell
37,gnr,WEːČ,v,335.0,1891.0,0.006726,0.057691,1342.468141,883.531859,1947.801162,0.003477
4,hab,Gr(E),v,1747.0,41.0,0.035078,0.001251,1078.316728,709.683272,1452.048547,0.002678
7,pst,RI,v,1308.0,2712.0,0.026263,0.082738,2424.403381,1595.596619,1262.801694,0.002073
3,pf.ptc,čE,v,1895.0,211.0,0.038049,0.006437,1270.097891,835.902109,935.525501,0.001684
17,prog,D,v,765.0,1675.0,0.01536,0.051101,1471.52842,968.47158,834.398174,0.00147


In [25]:
# Expected counts if the morpheme were spread proportionally to corpus size
# (49804 and 32778 words; 82582 is their sum).
combined = total['s_count'] + total['k_count']
total['s_expected'] = 49804*combined/(82582)
total['k_expected'] = 32778*combined/(82582)

In [38]:
# Needs the 's_semi_l' / 'k_semi_l' columns, which are computed in the In [37] cell further down.
total['log_l'] = 2*(total['s_semi_l'] + total['k_semi_l'])

In [39]:
# 'ell' = log_l / (N * ln(min expected count)), N = 82582 words in both corpora
# (presumably the ELL effect size - confirm).
# np.minimum takes the element-wise minimum of the two columns; the former
# boolean-mask sum evaluated to 0 whenever the two values were equal.
total['ell'] = total['log_l']/(
    82582*np.log(np.minimum(total['s_expected'], total['k_expected']))
)

In [40]:
# Remove the intermediate columns that are no longer needed.
for column in ('s_freq', 'k_freq', 'k_semi_l', 's_semi_l'):
    del total[column]

In [57]:
total.head()

Unnamed: 0,gloss,morpheme,pos,s_count,k_count,s_expected,k_expected,log_l,ell
37,gnr,WEːČ,v,335.0,1891.0,1342.468141,883.531859,1947.801162,0.003477
4,hab,Gr(E),v,1747.0,41.0,1078.316728,709.683272,1452.048547,0.002678
7,pst,RI,v,1308.0,2712.0,2424.403381,1595.596619,1262.801694,0.002073
3,pf.ptc,čE,v,1895.0,211.0,1270.097891,835.902109,935.525501,0.001684
17,prog,D,v,765.0,1675.0,1471.52842,968.47158,834.398174,0.00147


In [56]:
total = total.sort_values('log_l', ascending=False)

In [43]:
total.to_pickle('total_morphemes.pickle')

In [35]:
total = pd.read_pickle('total_morphemes.pickle')

In [37]:
# Per-corpus term observed * ln(observed / expected); a zero count gives NaN
# (0 * -inf), which is replaced by 0.
for corpus_prefix in ('k', 's'):
    observed = total[corpus_prefix + '_count']
    expected = total[corpus_prefix + '_expected']
    total[corpus_prefix + '_semi_l'] = observed*np.log(observed/expected)
for corpus_prefix in ('k', 's'):
    total[corpus_prefix + '_semi_l'] = total[corpus_prefix + '_semi_l'].fillna(0)
total.tail()

Unnamed: 0,gloss,morpheme,pos,s_count,k_count,s_freq,k_freq,s_expected,k_expected,log_l,ell,k_semi_l,s_semi_l
1061,dim.def,jEkEn,adv,0.0,1.0,0.0,3.1e-05,0.603085,0.396915,,,0.924034,0.0
1062,restr,mEːk,rel.n,0.0,1.0,0.0,3.1e-05,0.603085,0.396915,,,0.924034,0.0
1063,ord,(G)I,v,0.0,1.0,0.0,3.1e-05,0.603085,0.396915,,,0.924034,0.0
1064,adjr,pčVn,adj,0.0,1.0,0.0,3.1e-05,0.603085,0.396915,,,0.924034,0.0
1065,agnr,mŋE,pron,0.0,1.0,0.0,3.1e-05,0.603085,0.396915,,,0.924034,0.0


In [58]:
total.to_excel('morphemes_new.xlsx')

# разбираюсь с алломорфами

In [13]:
# Read the morpheme lexicon; records are separated by blank lines.
# The file holds non-ASCII characters (e.g. 'ː'), so the encoding is given explicitly.
with open('morpheme_2.txt', encoding='utf-8') as f:
    content = f.read().split('\n\n')
print(len(content))
pprint(content[39])  # one sample record

360
'\\lx =eː\n\\a =ej\n\\a =eːj\n\\a =aː\n\\a =oː\n\\ge =emph\n\\dt 29/Feb/2012'


In [32]:
# Build {(lexeme, gloss): [allomorph, ...]} from the lexicon records.
# Raw strings keep \W and \S as regex classes without relying on invalid
# string escapes being passed through.
morphs = {}
m_re = re.compile(r'\Wlx (\S+)\n')
g_re = re.compile(r'\Wge (\S+)\n')
a_re = re.compile(r'\Wa (\S+)\n')
for part in content:
    lexeme = m_re.search(part)
    if not lexeme:
        continue
    gloss = g_re.search(part)
    if not gloss:
        # a record without a gloss line used to crash on None.group(1)
        print('no \\ge line for {}'.format(lexeme.group(1)))
        continue
    morphs[(lexeme.group(1), gloss.group(1))] = a_re.findall(part)

In [41]:
# For every listed allomorph, show the rows of `total` with the same form and the same gloss.
for key, allomorphs in morphs.items():
    for allo in allomorphs:
        same_form = total['morpheme'] == allo.strip('-=')
        same_gloss = total['gloss'] == key[1]
        res = total.loc[same_form & same_gloss]
        if len(res) > 0:
            print(key)
            print(res)

('-E', 'ep')
    gloss morpheme pos  s_count  k_count   s_freq  k_freq  s_expected  \
642    ep        i   v      1.0      0.0  0.00002     0.0    0.603085   

     k_expected  log_l  ell  
642    0.396915    NaN  NaN  
('-G', 'advr')
    gloss morpheme pos  s_count  k_count   s_freq  k_freq  s_expected  \
660  advr        k   v      1.0      0.0  0.00002     0.0    0.603085   

     k_expected  log_l  ell  
660    0.396915    NaN  NaN  
('-n(I)', '3sg')
    gloss morpheme pos  s_count  k_count   s_freq  k_freq  s_expected  \
573   3sg        n   v      1.0      0.0  0.00002     0.0    0.603085   

     k_expected  log_l  ell  
573    0.396915    NaN  NaN  
('-G(I)', 'vr')
    gloss morpheme pos  s_count  k_count   s_freq    k_freq  s_expected  \
681    vr        ŋ   v      1.0     12.0  0.00002  0.000366     7.84011   

     k_expected     log_l       ell  
681     5.15989  16.13729  0.000119  
('-ssO', 'dur')
    gloss morpheme pos  s_count  k_count   s_freq  k_freq  s_expected  \
40

Они ничего не делают

# разбираюсь с конвербами

In [48]:
# Every distinct gloss that contains 'cvb', without the leading '-' / '='.
cvbs = {
    morph[1].strip('-=')
    for corp in [seb_mor, kam_mor]
    for morph in corp
    if len(morph) > 1 and 'cvb' in morph[1]
}
pprint(cvbs)

{'ant.cvb',
 'caus.neg.cvb',
 'cond.cvb',
 'des.neg.cvb',
 'dur.cvb',
 'imm.cvb',
 'impf.cvb',
 'mult.cvb',
 'neg.cvb',
 'nmdl.cvb',
 'pf.cvb.Y',
 'purp.cvb',
 'sim.cvb',
 'trm.cvb',
 'vr.cvb.Y'}


In [49]:
# Rows of `total` for every converb gloss, one slice per gloss, joined in a
# single concat instead of growing a frame inside the loop.
cvb_pd = pd.concat([total.loc[total['gloss'] == cvb] for cvb in cvbs])
cvb_pd.head()

Unnamed: 0,gloss,morpheme,pos,s_count,k_count,s_expected,k_expected,log_l,ell
404,trm.cvb,klEːkE,v,4.0,0.0,2.412342,1.587658,4.045571,0.0001059761
171,trm.cvb,dle,v,31.0,9.0,24.123417,15.876583,5.332687,2.335554e-05
230,trm.cvb,kEn,v,14.0,9.0,13.870965,9.129035,0.003029,1.658669e-08
269,pf.cvb.Y,An,v,10.0,0.0,6.030854,3.969146,10.113929,8.884064e-05
605,pf.cvb.Y,An,?,1.0,0.0,0.603085,0.396915,1.011393,-1.325398e-05


In [59]:
cvb_pd = cvb_pd.sort_values(['log_l'], ascending=False)

In [60]:
cvb_pd

Unnamed: 0,gloss,morpheme,pos,s_count,k_count,s_expected,k_expected,log_l,ell
12,sim.cvb,nIkEn,v,1083.0,59.0,688.723548,453.276452,739.842694,0.001464707
88,mult.cvb,ntEkEn,v,101.0,0.0,60.911627,40.088373,102.15068,0.000335121
20,ant.cvb,RIdʒI,v,721.0,237.0,577.755831,380.244169,95.300264,0.0001942508
39,purp.cvb,DE,v,307.0,405.0,429.396818,282.603182,85.455394,0.0001833427
138,impf.cvb,ŋsI,v,44.0,0.0,26.535758,17.464242,44.501286,0.0001884072
26,cond.cvb,REk,v,488.0,197.0,413.113511,271.886489,35.655486,7.702568e-05
256,nmdl.cvb,ŋE,v,11.0,33.0,26.535758,17.464242,22.626085,9.579313e-05
29,cond.cvb,mI,v,442.0,223.0,401.051803,263.948197,10.754677,2.335654e-05
269,pf.cvb.Y,An,v,10.0,0.0,6.030854,3.969146,10.113929,8.884064e-05
171,trm.cvb,dle,v,31.0,9.0,24.123417,15.876583,5.332687,2.335554e-05


In [61]:
cvb_pd.to_pickle('converbs.pickle')

In [62]:
cvb_pd.to_excel('converbs.xlsx')

# 1
это уже начато там
морфемы лежат в отдельном словаре, кажется; нужно приложить это знание на таблицу?
или создать уже наконец промежуточное представление всего

# 2
сочетания морфем

сначала надо вытащить представления (кажется это уже сделано)

пройтись по всему, растащить каждое слово на пары морфем, первая (ROOT, morph), последняя (morph, END)


Сочетания: хи-квадрат vs. то же самое, что с одиночными морфемами, но для биграмм.

In [88]:
def list_afters(corp_m_list):
    """Map every item of a list to the list of items that directly follow it.

    corp_m_list: list of hashable items.
    Returns a dict {item: [next item at each occurrence, in order]}; the last
    item of the list gets no entry for its final position.
    """
    followers = {}
    for current, following in zip(corp_m_list, corp_m_list[1:]):
        followers.setdefault(current, []).append(following)
    return followers

In [89]:
from collections import Counter
# dict {morpheme: list of every following item}, then the frequency of each follower
k_morph_dl = list_afters(kam_mor)
k_morph_pairs_dict = {morph: Counter(afters) for morph, afters in k_morph_dl.items()}

s_morph_dl = list_afters(seb_mor)
s_morph_pairs_dict = {morph: Counter(afters) for morph, afters in s_morph_dl.items()}

In [90]:
s_morph_pairs_dict[('-WEːČ', '-gnr', 'v')]

Counter({('-RIdʒI', '-ant.cvb', 'v'): 5,
         ('-nIkEn', '-sim.cvb', 'v'): 26,
         ('-mEČ', '-rec', 'v'): 2,
         ('-RI', '-pst', 'v'): 35,
         ('-mI', '-cond.cvb', 'v'): 15,
         ('-REk', '-cond.cvb', 'v'): 3,
         ('-R(E)', '-nonfut', 'v'): 19,
         ('-nE', '-intent', 'v'): 1,
         ('-čE', '-pf.ptc', 'v'): 19,
         ('-DEŋ', '-pst.ptc', 'v'): 6,
         (1,): 10,
         ('-m', '-des', 'v'): 1,
         ('-R', '-neg.cvb', 'v'): 11,
         ('-DʒI', '-fut', 'v'): 14,
         ('-Gr(E)', '-hab', 'v'): 67,
         ('-D', '-prog', 'v'): 3,
         ('-RI', '-impf.ptc', 'v'): 33,
         ('-E', '-ep', 'v'): 58,
         ('-DU', '-dat', 'v'): 1,
         ('-Gr(E)', '-hab.nonfut', 'v'): 5,
         ('-sčI', '-conat', 'v'): 1})

In [83]:
# bigram style: count every pair of adjacent items in the flat lists
k_pairs = Counter(zip(kam_mor, kam_mor[1:]))
s_pairs = Counter(zip(seb_mor, seb_mor[1:]))

In [105]:
def _pairs_frame(pair_counts, count_column):
    # One row per bigram: morpheme/gloss/pos of both members plus the count.
    # A marker tuple (0,) or (1,) fills all three of its fields with its value.
    def field(morph, position):
        return morph[position] if len(morph) == 3 else morph[0]

    columns = {}
    for slot, prefix in enumerate(('1', '2')):
        for position, label in enumerate(('morpheme', 'gloss', 'pos')):
            columns[prefix + label] = [field(pair[slot], position) for pair in pair_counts]
    columns[count_column] = [pair_counts[pair] for pair in pair_counts]
    return pd.DataFrame(columns)


kpd_pairs = _pairs_frame(k_pairs, 'k_count')

spd_pairs = _pairs_frame(s_pairs, 's_count')

In [89]:
kpd_pairs.head()

Unnamed: 0,1gloss,1morpheme,1pos,2gloss,2morpheme,2pos,k_count
0,-abl,-DUk(U),rel.n,1,1,1,1
1,0,0,0,=q,=GU,adv,1
2,-poss.3sg,-n(I),ptl,-ep,-E,ptl,1
3,-poss.3sg,-n(I),adv,-acc,-W,adv,2
4,0,0,0,-desig,-GE,pron,2


In [114]:
total_pairs = pd.merge(spd_pairs, kpd_pairs, how='outer', on=['1morpheme', '1gloss', '1pos', '2morpheme', '2gloss', '2pos'])

In [127]:
def _bare(value):
    # the markers 0 and 1 are kept as they are; strings lose their leading '-' / '='
    return value if value in [0, 1] else value.strip('-=')


for column in ['1morpheme', '1gloss', '1pos', '2morpheme', '2gloss', '2pos']:
    total_pairs[column] = total_pairs[column].map(_bare)

In [116]:
total_pairs = total_pairs.sort_values(['1gloss', '1morpheme', '1pos'])

In [109]:
total_pairs.head()

Unnamed: 0,1gloss,1morpheme,1pos,2gloss,2morpheme,2pos,s_count,k_count
4,0,0,0,ptl,DE,interj,1.0,
5,0,0,0,sml,mdEs,pron,17.0,
10,0,0,0,ints,mEkEn,adj,6.0,4.0
16,0,0,0,restr,ńUn,adv,10.0,1.0
18,0,0,0,attn,sUkEn,adv,1.0,3.0


In [14]:
total_pairs.to_pickle('pairs.pickle')

In [63]:
total_pairs.head()

Unnamed: 0,1gloss,1morpheme,1pos,2gloss,2morpheme,2pos,s_count,k_count
1016,1pl.Y,BIt,ptl,1,1,1,1.0,
2875,1pl.Y,BIt,v,1,1,1,5.0,
4064,1pl.ex,(R)U,?,1,1,1,,4.0
3880,1pl.ex,(R)U,n,1,1,1,,1.0
3061,1pl.ex,(R)U,v,ptl,DE,v,,17.0


49804
32778

In [12]:
# Drop the row with 49803 Sebjan occurrences (presumably the word-end -> word-start
# transition - confirm), then every pair whose first gloss is the start marker,
# empty, or a run of asterisks.
total_pairs = total_pairs.loc[total_pairs['s_count'] != 49803]
for unwanted in (0, '', '*', '***', '******'):
    total_pairs = total_pairs.loc[total_pairs['1gloss'] != unwanted]

In [5]:
total_pairs = pd.read_pickle('pairs.pickle')

In [69]:
# A pair missing from one corpus has NaN after the outer merge; count it as 0.
for count_column in ('s_count', 'k_count'):
    total_pairs[count_column] = total_pairs[count_column].fillna(0)

In [75]:
# Same statistics as for single morphemes, computed for morpheme bigrams.
S_WORDS = 49804   # Sebjan word count
K_WORDS = 32778   # Kamchatka word count
ALL_WORDS = 82582  # sum of the two

pair_totals = total_pairs['s_count'] + total_pairs['k_count']
total_pairs['s_expected'] = S_WORDS*pair_totals/ALL_WORDS
total_pairs['k_expected'] = K_WORDS*pair_totals/ALL_WORDS
# observed * ln(observed / expected); a zero count gives NaN, replaced by 0
total_pairs['k_semi_l'] = total_pairs['k_count']*np.log(total_pairs['k_count']/total_pairs['k_expected'])
total_pairs['s_semi_l'] = total_pairs['s_count']*np.log(total_pairs['s_count']/total_pairs['s_expected'])
total_pairs['k_semi_l'] = total_pairs['k_semi_l'].fillna(0)
total_pairs['s_semi_l'] = total_pairs['s_semi_l'].fillna(0)
total_pairs['log_l'] = 2*(total_pairs['s_semi_l']+total_pairs['k_semi_l'])
# np.minimum is the element-wise minimum; the former boolean-mask sum gave 0 on ties
total_pairs['ell'] = total_pairs['log_l']/(
    ALL_WORDS*np.log(np.minimum(total_pairs['s_expected'], total_pairs['k_expected']))
)
del total_pairs['k_semi_l']
del total_pairs['s_semi_l']
total_pairs = total_pairs.sort_values('log_l', ascending=False)
total_pairs.head()

Unnamed: 0,1gloss,1morpheme,1pos,2gloss,2morpheme,2pos,s_count,k_count,s_expected,k_expected,log_l,ell
1558,gnr,WEːČ,v,nonfut,R(E),v,19.0,1506.0,919.705263,605.294737,2598.00332,0.004911
4680,prog,D,v,gnr,WEːČ,v,0.0,458.0,276.213122,181.786878,846.415309,0.00197
2445,sim.cvb,nIkEn,v,1,1,1,1039.0,34.0,647.110654,425.889346,812.034418,0.001624
999,pf.ptc,čE,v,1,1,1,1083.0,73.0,697.166743,458.833257,685.667361,0.001355
4769,poss.1pl.ex,WUn,v,1,1,1,0.0,327.0,197.208932,129.791068,604.318354,0.001504


In [77]:
total_pairs.to_pickle('pairs_by_ll.pickle')
total_pairs.to_excel('pairs_by_ll.xlsx')

In [86]:
# Reload the ranked pairs and keep only the identifying columns and the raw counts.
pairs = pd.read_pickle('pairs_by_ll.pickle')
for column in ('s_expected', 'k_expected', 'log_l', 'ell'):
    del pairs[column]

Unnamed: 0,1gloss,1morpheme,1pos,2gloss,2morpheme,2pos,s_count,k_count
1558,gnr,WEːČ,v,nonfut,R(E),v,19.0,1506.0
4680,prog,D,v,gnr,WEːČ,v,0.0,458.0
2445,sim.cvb,nIkEn,v,1,1,1,1039.0,34.0
999,pf.ptc,čE,v,1,1,1,1083.0,73.0
4769,poss.1pl.ex,WUn,v,1,1,1,0.0,327.0


In [None]:
# вернёмся к вложенной структуре
# k_morph_pairs_dict
# s_morph_pairs_dict
from scipy import stats


def cramers_corrected_stat(confusion_matrix):
    """ calculate Cramers V statistic for categorial-categorial association.
        uses correction from Bergsma and Wicher,
        Journal of the Korean Statistical Society 42 (2013): 323-328

        confusion_matrix: 2-D table of observed counts (NumPy array,
            DataFrame such as pd.crosstab output, or nested lists).
        Returns the bias-corrected V as a float, or NaN for a degenerate
        table in which the corrected number of rows or columns is not above 1.
    """
    # Work on a plain array: DataFrame.sum() returns per-column sums, not the
    # grand total that the formula needs.
    observed = np.asarray(confusion_matrix, dtype=float)
    chi2 = stats.chi2_contingency(observed)[0]
    n = observed.sum()
    phi2 = chi2/n
    r, k = observed.shape
    phi2corr = max(0, phi2 - ((k-1)*(r-1))/(n-1))
    rcorr = r - ((r-1)**2)/(n-1)
    kcorr = k - ((k-1)**2)/(n-1)
    denominator = min((kcorr-1), (rcorr-1))
    if denominator <= 0:
        # single row or column: V is undefined, avoid dividing by zero
        return float('nan')
    return np.sqrt(phi2corr / denominator)


# For every morpheme attested in both corpora, compare the distribution of
# what follows it in Sebjan vs. Kamchatka and keep Cramer's V when the
# chi-square approximation is acceptable (all expected counts above 5).
cramers_v = {}
for key in k_morph_pairs_dict:
    if key not in s_morph_pairs_dict:
        continue
    lines = list(set(k_morph_pairs_dict[key].keys())|set(s_morph_pairs_dict[key].keys()))
    if len(lines) < 2:
        # a single follower gives a one-column table with nothing to test
        continue
    # Counter returns 0 for a follower that is absent from one corpus
    obs_s = [s_morph_pairs_dict[key][x] for x in lines]
    obs_k = [k_morph_pairs_dict[key][x] for x in lines]
    obs = np.array([obs_s, obs_k])
    chi2, p, dof, expected = stats.chi2_contingency(obs)
    if (expected > 5).all():
        cramers_v[key] = cramers_corrected_stat(obs)
            

In [96]:
# Scratch check: comparing an array with a scalar gives an element-wise boolean array.
a = np.array([24, 7, 37, 445])
a>5

array([ True,  True,  True,  True], dtype=bool)