In [25]:
import re
import pickle
from pprint import pprint
from collections import Counter
import os
import numpy as np
import pandas as pd


class MorphemeError(Exception):
    '''raised if morpheme is empty'''
    def __init__(self, message='Length of lists not equal'):
        super(MorphemeError, self).__init__()
        self.message = message
    def __str__(self):
        return repr(self.message)


class Morphemes():


    def __init__(self, content, corpus):
        self.content = content
        self.corpus = corpus
        self.morphemes = self.morpheme_count()
        if '0' in self.morphemes or 0 in self.morphemes:
            raise MorphemeError('0 morphemes in morpheme_count')
        self.raw_speakers = self.raw_speaker_count()
        self.speakers = self.speaker_count()


    def morpheme_count(self):
        '''returns dict with overall count for morphemes (ignoring speakers)'''
        items = [x[:3] for x in self.content if x[0][0] in '-=' and not '-ep' in x]
        return Counter(items)

    
    def raw_morpheme_by_speaker_count(self):
#     '''generates dict morheme: Counter(by speaker) with number of times speaker said this morpheme'''
        alll = {}
        for morpheme in self.morphemes:
            count = Counter([x[-1] for x in self.content if x[:3]==morpheme])
            res = {x: count[x] for x in count}
            mor = '-'.join(list(map(lambda x: x.strip('-='), morpheme)))
            alll[mor] = res
        return alll
    
    
    def morpheme_by_speaker_count(self):
        '''generates dict morheme: Counter(by speaker) with proportion of this speaker in the 
        overall count of this morpheme'''
        for morpheme in self.morphemes:
            count = Counter([x[-1] for x in self.content if x[:3]==morpheme])
            res = {x: count[x]/self.morphemes[morpheme] for x in count}
            yield (morpheme, res)

    def speaker_by_morpheme_count(self):
        '''generates dict morpheme: Counter(by speaker) with proportion of this morpheme in this speakers speech'''
        alll = {}
        for morpheme in self.morphemes:
            count = Counter([x[-1] for x in self.content if x[:3]==morpheme])
            res = {x: count[x]/self.raw_speakers[x] for x in count}
            alll[morpheme] = res
        return alll


    def calculate_dp(self):
        '''well, calculates dp for all morphemes'''
        res = {'morpheme': [], 'count': [], 'dp': []}
        for sp in self.speakers:
            res[sp] = []
        sp_by_mor = self.speaker_by_morpheme_count()
        for mor, counts in self.morpheme_by_speaker_count():
            sp_count = sp_by_mor[mor]
            res['morpheme'].append('-'.join(list(map(lambda x: x.strip('-='), mor))))
            res['count'].append(self.morphemes[mor])
            res['dp'].append(dp(self.speakers, counts))
            if not sum(counts.values()) > 0.999:
                print(mor)
                print(self.morphemes[mor])
                pprint(counts)
                print(sum(counts.values()))
                raise MorphemeError('just stopping')
            for sp in self.speakers:
                if sp in sp_count:
                    res[sp].append(sp_count[sp])
                else:
                    res[sp].append(0)
        return res


    def raw_speaker_count(self):
        '''returns dict with the number of words every speaker said in a corpus'''
        items = [x[-1] for x in self.content if x[0]=='END']
        return Counter(items)


    def speaker_count(self, totals = {'Kamchatka': 33112, 'Sebjan': 49800}):
        '''returns dict with proportions of every speaker in a corpus'''
        res = {x: self.raw_speakers[x]/totals[self.corpus] for x in self.raw_speakers}
        # pprint(res)
        # print(sum(res.values()))
        return res


def dp(expected, observed):
    # step = {sp: expected[sp]-observed[sp] if sp in observed else expected[sp] for sp in expected}
    deltas = []
    for sp in expected:
        if sp in observed:
            delta = np.abs(expected[sp]-observed[sp])
        else:
            delta = 0
        deltas.append(delta)
    dp = sum(deltas) / 2
    return dp



def main():
    corpora = ['Kamchatka', 'Sebjan']
    corp_pd = []
    for corp in corpora:
        with open('../{}_wds_by_mor.pickle'.format(corp), 'rb') as f:
            mor = Morphemes(pickle.load(f), corp)
        res = mor.calculate_dp()
        columns = ['morpheme', 'count', 'dp'] + sorted(list(mor.speakers.keys()))
        a = pd.DataFrame(res).loc[pd.DataFrame(res)['count']>15].sort_values('dp', ascending=False)[columns] 
        #.to_excel('{}_dp_new.xlsx'.format(corp))
        corp_pd.append(a)
        print('{} done'.format(corp))
    print('all done')
    return corp_pd


if __name__ == '__main__':
    # unittest.main()
    a = main()



Kamchatka done
Sebjan done
all done


In [20]:
a[0]['AAS_k'] = a[0]["AAS"]
del a[0]["AAS"]

Надо понять что мне нужно

мне нужен

- список всех спикеров +
- список нужных морфем +

надо почистить спикеров по тому, что говорила Бригитта

ptc и ptcp не исправлены! аааааа


In [16]:
# здесь достаются морфемы с которыми работаем
n = ['ptc', 'cvb', 'loc', 'dat', 'all']
n_0 = [x for x in list(a[0]['morpheme']) if any([y in x for y in n])]
n_1 = [x for x in list(a[1]['morpheme']) if any([y in x for y in n])]
print(len(n_0))
print(len(n_1))
needed = sorted(list(set(n_0)&set(n_1)))
pprint(needed)

20
22
['(dU)LE-loc-n',
 '(dU)LE-loc-v',
 'DE-purp.cvb-v',
 'DU-dat-n',
 'R-neg.cvb-v',
 'REk-cond.cvb-v',
 'RI-impf.ptc-v',
 'RIdʒI-ant.cvb-v',
 'mI-cond.cvb-v',
 'nIkEn-sim.cvb-v',
 'skI-advb.all-rel.n',
 't(E)kI-all-n',
 't(E)kI-all-pron',
 'čE-pf.ptc-v']


In [26]:
k_speakers = list(a[0].columns[3:])
s_speakers = list(a[1].columns[4:])
all_speakers = k_speakers + s_speakers

In [27]:
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
def sort_to_two(A, B):
    C = {}
    C.update(A)
    C.update(B)
    res_keys = sorted(C, key=C.get)
    res_values = sorted(C.values())
    res_A = [res_values[i] if res_keys[i] in A else 0 for i in range(len(res_values))]
    res_B = [res_values[i] if res_keys[i] in B else 0 for i in range(len(res_values))]
    i = delete_zeros(res_A, res_B)
    return res_A[i:], res_B[i:], res_keys[i:]
    

def delete_zeros(res_A, res_B):
    i = 0
    while res_A[i] == 0 and res_B[i] == 0:
        i+= 1
#         print(i)
    return i

def make_dict(row, speakers):
    '''делает словарь спикер: частотность'''
    res = {}
    for sp in speakers:
        try:
            res[sp] = row[sp].iloc[0]
        except:
            print(row)
            print(speakers)
            raise Exception
#             res[sp] = 0
    return res


def calc_dp(k_count, s_count):
    '''считает дипи для двух корпусов как двух коробок'''
    total = k_count + s_count
    exp = np.array([33112/(49800+33112), 49800/(49800+33112)])
    obs = np.array(k_count/total, s_count/total)
    res = np.sum(np.abs(exp-obs))/2
    return res

from scipy.stats import beta

def binom_interval(success, total, confint=0.95):
    quantile = (1 - confint) / 2.
    lower = beta.ppf(quantile, success, total - success + 1)
    upper = beta.ppf(1 - quantile, success + 1, total - success)
    return (lower, upper)

Для конфинта нужны

- сырые частотности по спикерам (не делённые!)
- количество слов по спикерам

их надо подготовить (в след. ячейке)

считается proportion_confint(raw_count, raw_speaker)

In [5]:
corpora = ['Kamchatka', 'Sebjan']
morphemes_cl = []
for corp in corpora:
    with open('../{}_wds_by_mor.pickle'.format(corp), 'rb') as f:
        mor = Morphemes(pickle.load(f), corp)
    morphemes_cl.append(mor)
    
raw_m_k = morphemes_cl[0].raw_morpheme_by_speaker_count()
raw_m_s = morphemes_cl[1].raw_morpheme_by_speaker_count()
k_raw_speakers = morphemes_cl[0].raw_speaker_count()
s_raw_speakers = morphemes_cl[1].raw_speaker_count()

Собственно конфинты

In [6]:
def yerrs(morpheme, res_x):
    k_m_by_s = raw_m_k[morpheme]
    s_m_by_s = raw_m_s[morpheme]
    k_yerr = get_yerr(res_x, k_m_by_s, k_raw_speakers)
    s_yerr = get_yerr(res_x, s_m_by_s, s_raw_speakers)
    print(res_x)
    pprint(len(s_yerr))
    for x, y in zip(res_x, k_yerr):
        print(x, y)
    k_yerr = [[x[0] for x in k_yerr], [x[1] for x in k_yerr]]
    s_yerr = [[x[0] for x in s_yerr], [x[1] for x in s_yerr]]
    return k_yerr, s_yerr
    

def get_yerr(order, m_by_s, raw_speakers):
    yerr = []
    for sp in order:
        if sp == "AAS_k":
            sp = "AAS"
        if sp in m_by_s:
#             if 'AMG' in raw_speakers and sp == 'AAS':
#                 print('found')
#                 sp = "AAS"
#                 print(sp)
            raw_count = m_by_s[sp]
            raw_sp = raw_speakers[sp]
            ci = binom_interval(raw_count, raw_sp)
            print(sp, ci)
            yerr.append(ci)
        else:
            yerr.append((0, 0))
    return yerr

Здесь кусок кода который делает графики

In [23]:
for mor in needed:
    row_k = a[0].loc[a[0]['morpheme']==mor]
    k = make_dict(row_k, k_speakers)
    row_s = a[1].loc[a[1]['morpheme']==mor]
    s = make_dict(row_s, s_speakers)
#     dp = calc_dp(row_k['count'].iloc[0], row_s['count'].iloc[0])
    dp = calc_dp_together(k, s)
    print(dp)
    # now sort
    res_k, res_s, res_x = sort_to_two(k, s)
    fig, ax = plt.subplots(figsize=(25, 10))
    x = np.arange(len(res_x))
    yerr = yerrs(mor, res_x)
    plt.xticks(rotation=70)
    ax.bar(x, res_k, color="red", yerr=yerr[0])
    ax.bar(x, res_s, color="cyan", yerr=yerr[1])
    plt.xticks(x, res_x)
    dp_k = row_k['dp'].iloc[0]
    dp_s = row_s['dp'].iloc[0]
    diff = dp - min(dp_k, dp_s)
    plt.title("{}".format(mor))
    plt.text(0, max(*res_k, *res_s)/2, "DP:{}, \ndiff: {}, \nDP K: {}, \nDP S: {}".format(dp, diff, dp_k, dp_s))
    plt.legend(['Kamchatka', 'Sebjan'])
    if not os.path.exists("pics"):
        os.mkdir('pics')
#     fig.savefig('pics/{}{}.png'.format(diff, mor))
    plt.show()
#     break


KeyError: 'AAS_k'

In [24]:
row_k

Unnamed: 0,morpheme,count,dp,AEI,AFI,AGK,AL,AMG,AS,ASA,...,NIG,NMK,ONI,PMB,RME,RMS,TEB,VIA,rh,AAS_k
10,(dU)LE-loc-n,1044,0.095194,0.022712,0.061947,0.031746,0,0.0125,0.032334,0.014563,...,0.027682,0.036508,0.073099,0.041359,0.02976,0.034907,0.044601,0.028743,0.0,0.032023


In [16]:
k_raw_speakers

Counter({'AAS': 687,
         'AEI': 1497,
         'AFI': 113,
         'AGK': 189,
         'AL': 4,
         'AMG': 80,
         'AS': 1268,
         'ASA': 412,
         'BP': 61,
         'DBA': 225,
         'EGA': 1286,
         'EIA': 2566,
         'EPA': 484,
         'GAS': 612,
         'GIK': 1547,
         'INB': 198,
         'JET': 458,
         'JIP': 502,
         'LGT': 761,
         'NA': 4,
         'NAT': 614,
         'NFI': 840,
         'NIG': 5202,
         'NMK': 630,
         'ONI': 342,
         'PMB': 677,
         'RME': 7695,
         'RMS': 2034,
         'TEB': 426,
         'VIA': 1670,
         'rh': 15})

In [13]:
6500/sum(s_raw_speakers.values())

0.1305194674805727

In [327]:
print(raw_m_k['(dU)LE-loc-n'])
print(raw_m_s['(dU)LE-loc-n'])

{'EIA': 113, 'NIG': 144, 'RMS': 71, 'NAT': 18, 'EPA': 11, 'RME': 229, 'AFI': 7, 'AAS': 22, 'TEB': 19, 'ONI': 25, 'BP': 6, 'DBA': 6, 'AMG': 1, 'GAS': 24, 'JET': 15, 'NMK': 23, 'AGK': 6, 'JIP': 12, 'AS': 41, 'VIA': 48, 'AEI': 34, 'ASA': 6, 'LGT': 25, 'INB': 3, 'PMB': 28, 'GIK': 65, 'EGA': 30, 'NFI': 12}
{'EIA': 113, 'NIG': 144, 'RMS': 71, 'NAT': 18, 'EPA': 11, 'RME': 229, 'AFI': 7, 'AAS': 22, 'TEB': 19, 'ONI': 25, 'BP': 6, 'DBA': 6, 'AMG': 1, 'GAS': 24, 'JET': 15, 'NMK': 23, 'AGK': 6, 'JIP': 12, 'AS': 41, 'VIA': 48, 'AEI': 34, 'ASA': 6, 'LGT': 25, 'INB': 3, 'PMB': 28, 'GIK': 65, 'EGA': 30, 'NFI': 12}


In [231]:
# morphemes_cl[0].raw_morpheme_by_speaker_count()

In [101]:
a[0].loc[a[0]['morpheme']==mor]

Unnamed: 0,morpheme,count,dp,AEI,AFI,AGK,AL,AMG,AS,ASA,...,NIG,NMK,ONI,PMB,RME,RMS,TEB,VIA,rh,AAS_k
55,čE-pf.ptc-v,214,0.185538,0.008016,0.0,0.031746,0,0.0,0.005521,0.01699,...,0.006536,0.003175,0.020468,0.014771,0.004548,0.009341,0.0,0.006587,0.0,0.004367


In [150]:
from statsmodels.stats.proportion import proportion_confint

In [300]:
from scipy import nan

In [248]:
"AMG" in s_raw_speakers

False

In [252]:
raw_m_k["(dU)LE-loc-n"]

{'AAS': 22,
 'AEI': 34,
 'AFI': 7,
 'AGK': 6,
 'AMG': 1,
 'AS': 41,
 'ASA': 6,
 'BP': 6,
 'DBA': 6,
 'EGA': 30,
 'EIA': 113,
 'EPA': 11,
 'GAS': 24,
 'GIK': 65,
 'INB': 3,
 'JET': 15,
 'JIP': 12,
 'LGT': 25,
 'NAT': 18,
 'NFI': 12,
 'NIG': 144,
 'NMK': 23,
 'ONI': 25,
 'PMB': 28,
 'RME': 229,
 'RMS': 71,
 'TEB': 19,
 'VIA': 48}

А теперь мне нужно сделать табличку с десятью самыми частотными морфемами и DP для них.

- найти 10 самых частотных +
- дропать эпентезы +
- написать фц для подсчёта DP по-Даниэлевски
- сделать табличку

In [6]:
corp = 'Kamchatka'
with open('../{}_wds_by_mor.pickle'.format(corp), 'rb') as f:
    kam = Morphemes(pickle.load(f), corp)
corp = "Sebjan"
with open('../{}_wds_by_mor.pickle'.format(corp), 'rb') as f:
    seb = Morphemes(pickle.load(f), corp)

kam_counts = kam.morpheme_count()
sorted(kam_counts, key=kam_counts.get, reverse=True)[:11]

[('-R(E)', '-nonfut', 'v'),
 ('-RI', '-pst', 'v'),
 ('-WEːČ', '-gnr', 'v'),
 ('-W', '-acc', 'n'),
 ('-n(I)', '-3sg', 'v'),
 ('-D', '-prog', 'v'),
 ('-n(I)', '-poss.3sg', 'v'),
 ('-(dU)LE', '-loc', 'n'),
 ('-m', '-1sg', 'v'),
 ('-n(I)', '-poss.3sg', 'n'),
 ('-L', '-pl', 'n')]

In [11]:
np.array([*[2, 45], *[14, 24]])

array([ 2, 45, 14, 24])

In [13]:
k_sp_prop = kam.speaker_count()
s_sp_prop = seb.speaker_count()

def calc_dp_together(k, s):
    '''на вход словари по спикерам, на выходе число - DP для корпуса из всех спикеров'''
    k_order, s_order = k.keys(), s.keys()
    k, s = np.array([k[x] for x in k_order])*33112/(49800+33112), np.array([s[x] for x in s_order])*49800/(49800+33112)
    total = np.sum(k) + np.sum(s)
    exp = np.array([*[k_sp_prop[sp] for sp in k_order], *[s_sp_prop[sp] for sp in s_order]])
    obs = np.array([*k, *s])
    res = np.sum(np.abs(exp-obs))/2
    return res

In [None]:
plt.bar(kam, c="red", xticks=)
plt.bar(seb, c="blue")