In [87]:
import re
import pickle
from pprint import pprint
from collections import Counter
import os
import numpy as np
import pandas as pd


class MorphemeError(Exception):
    '''raised if morpheme is empty'''
    def __init__(self, message='Length of lists not equal'):
        super(MorphemeError, self).__init__()
        self.message = message
    def __str__(self):
        return repr(self.message)


class Morphemes():


    def __init__(self, content, corpus):
        self.content = content
        self.corpus = corpus
        self.morphemes = self.morpheme_count()
        if '0' in self.morphemes or 0 in self.morphemes:
            raise MorphemeError('0 morphemes in morpheme_count')
        self.raw_speakers = self.raw_speaker_count()
        self.speakers = self.speaker_count()


    def morpheme_count(self):
        '''returns dict with overall count for morphemes (ignoring speakers)'''
        items = [x[:3] for x in self.content if x[0][0] in '-=']
        return Counter(items)

    
    def morpheme_by_speaker_count(self):
        '''generates dict morheme: Counter(by speaker)'''
        for morpheme in self.morphemes:
            count = Counter([x[-1] for x in self.content if x[:3]==morpheme])
            res = {x: count[x]/self.morphemes[morpheme] for x in count}
            yield (morpheme, res)

    def speaker_by_morpheme_count(self):
        '''generates dict morpheme: Counter(by speaker)'''
        alll = {}
        for morpheme in self.morphemes:
            count = Counter([x[-1] for x in self.content if x[:3]==morpheme])
            res = {x: count[x]/self.raw_speakers[x] for x in count}
            alll[morpheme] = res
        return alll


    def calculate_dp(self):
        '''well, calculates dp for all morphemes'''
        res = {'morpheme': [], 'count': [], 'dp': []}
        for sp in self.speakers:
            res[sp] = []
        sp_by_mor = self.speaker_by_morpheme_count()
        for mor, counts in self.morpheme_by_speaker_count():
            sp_count = sp_by_mor[mor]
            res['morpheme'].append('-'.join(list(map(lambda x: x.strip('-='), mor))))
            res['count'].append(self.morphemes[mor])
            res['dp'].append(dp(self.speakers, counts))
            if not sum(counts.values()) > 0.999:
                print(mor)
                print(self.morphemes[mor])
                pprint(counts)
                print(sum(counts.values()))
                raise MorphemeError('just stopping')
            for sp in self.speakers:
                if sp in sp_count:
                    res[sp].append(sp_count[sp])
                else:
                    res[sp].append(0)
        return res


    def raw_speaker_count(self):
        '''returns dict with the number of words every speaker said in a corpus'''
        items = [x[-1] for x in self.content if x[0]=='END']
        return Counter(items)


    def speaker_count(self, totals = {'Kamchatka': 33112, 'Sebjan': 49800}):
        '''returns dict with proportions of every speaker in a corpus'''
        res = {x: self.raw_speakers[x]/totals[self.corpus] for x in self.raw_speakers}
        # pprint(res)
        # print(sum(res.values()))
        return res


def dp(expected, observed):
    # step = {sp: expected[sp]-observed[sp] if sp in observed else expected[sp] for sp in expected}
    deltas = []
    for sp in expected:
        if sp in observed:
            delta = np.abs(expected[sp]-observed[sp])
        else:
            delta = 0
        deltas.append(delta)
    dp = sum(deltas) / 2
    return dp



def main():
    corpora = ['Kamchatka', 'Sebjan']
    corp_pd = []
    for corp in corpora:
        with open('../{}_wds_by_mor.pickle'.format(corp), 'rb') as f:
            mor = Morphemes(pickle.load(f), corp)
        res = mor.calculate_dp()
        columns = ['morpheme', 'count', 'dp'] + sorted(list(mor.speakers.keys()))
        a = pd.DataFrame(res).loc[pd.DataFrame(res)['count']>15].sort_values('dp', ascending=False)[columns] #.to_excel('{}_dp_new.xlsx'.format(corp))
        corp_pd.append(a)
        print('{} done'.format(corp))
    print('all done')
    return corp_pd


if __name__ == '__main__':
    # unittest.main()
    a = main()



Kamchatka done
Sebjan done
all done


In [100]:
a[0]['AAS_k'] = a[0]["AAS"]
del a[0]["AAS"]

Надо понять что мне нужно

мне нужен

- список всех спикеров +
- список нужных морфем

надо почистить спикеров по тому, что говорила Бригитта

ptc и ptcp не исправлены! аааааа


In [96]:
n = ['ptc', 'cvb', 'loc', 'dat', 'all']
n_0 = [x for x in list(a[0]['morpheme']) if any([y in x for y in n])]
n_1 = [x for x in list(a[1]['morpheme']) if any([y in x for y in n])]
print(len(n_0))
print(len(n_1))
needed = sorted(list(set(n_0)&set(n_1)))
pprint(needed)

20
22
['(dU)LE-loc-n',
 '(dU)LE-loc-v',
 'DE-purp.cvb-v',
 'DU-dat-n',
 'R-neg.cvb-v',
 'REk-cond.cvb-v',
 'RI-impf.ptc-v',
 'RIdʒI-ant.cvb-v',
 'mI-cond.cvb-v',
 'nIkEn-sim.cvb-v',
 'skI-advb.all-rel.n',
 't(E)kI-all-n',
 't(E)kI-all-pron',
 'čE-pf.ptc-v']


In [113]:
k_speakers = list(a[0].columns[3:])
k_speakers

['AEI',
 'AFI',
 'AGK',
 'AL',
 'AMG',
 'AS',
 'ASA',
 'BP',
 'DBA',
 'EGA',
 'EIA',
 'EPA',
 'GAS',
 'GIK',
 'INB',
 'JET',
 'JIP',
 'LGT',
 'NA',
 'NAT',
 'NFI',
 'NIG',
 'NMK',
 'ONI',
 'PMB',
 'RME',
 'RMS',
 'TEB',
 'VIA',
 'rh',
 'AAS_k']

In [117]:
s_speakers = list(a[1].columns[4:])
all_speakers = k_speakers + s_speakers

In [None]:
needed = ["(dU)LE-loc-n", "DUk(U)-abl-n", "RIdʒI-ant.cvb-v", "nIkEn-sim.cvb-v", "DEŋ-pst.ptc-v", "REk-cond.cvb-v", \
          "RI-impf.ptc-v", "DU-dat-n", "mI-cond.cvb-v", "Ar-prs.ptcp.Y-v", ]

In [14]:
import matplotlib.pyplot as plt

  'Matplotlib is building the font cache using fc-list. '


In [104]:
row_k

Unnamed: 0,morpheme,count,dp,AEI,AFI,AGK,AL,AMG,AS,ASA,...,NIG,NMK,ONI,PMB,RME,RMS,TEB,VIA,rh,AAS_k
55,čE-pf.ptc-v,214,0.185538,0.008016,0.0,0.031746,0,0.0,0.005521,0.01699,...,0.006536,0.003175,0.020468,0.014771,0.004548,0.009341,0.0,0.006587,0.0,0.004367


In [None]:
sort_to_two()

In [124]:
def sort_to_two(A, B):
    C = {}
    C.update(A)
    C.update(B)
    res_keys = sorted(C, key=C.get)
    res_values = sorted(C.values())
    res_A = [res_values[i] if res_keys[i] in A else 0 for i in range(len(res_values))]
    res_B = [res_values[i] if res_keys[i] in B else 0 for i in range(len(res_values))]
    return res_A, res_B, res_keys
    

In [125]:
def make_dict(row, speakers):
    res = {}
    for sp in speakers:
        try:
            res[sp] = row[sp].iloc[0]
        except:
            print(row)
            print(speakers)
            raise Exception
#             res[sp] = 0
    return res

In [149]:
for mor in needed:
    row_k = a[0].loc[a[0]['morpheme']==mor]
    k = make_dict(row_k, k_speakers)
    row_s = a[1].loc[a[1]['morpheme']==mor]
    s = make_dict(row_s, s_speakers)
    # now sort
    res_k, res_s, res_x = sort_to_two(k, s)
    fig, ax = plt.subplots(figsize=(20, 10))
    x = np.arange(len(res_x))
    ax.bar(x, res_k, color="red")
    ax.bar(x, res_s, color="blue")
    plt.xticks(x, res_x)
    plt.title("{}, DP K: {}, DP S: {}".format(mor, row_k['dp'].iloc[0], row_s['dp'].iloc[0]))
    plt.legend(['Kamchatka', 'Sebjan'])
    if not os.path.exists("pics"):
        os.mkdir('pics')
    fig.savefig('pics/{}.png'.format(mor))
#     plt.show()


[0, 0.0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.012500000000000001, 0.014285714285714285, 0.014563106796116505, 0.015151515151515152, 0, 0, 0.022712090848363394, 0.022727272727272728, 0, 0.023328149300155521, 0.023904382470119521, 0, 0, 0.026666666666666668, 0, 0.02768166089965398, 0.02874251497005988, 0, 0.029315960912052116, 0.029759584145549058, 0, 0, 0, 0.031746031746031744, 0.032023289665211063, 0.032334384858044164, 0.032751091703056769, 0.032851511169513799, 0, 0, 0.034906588003933134, 0, 0, 0.03650793650793651, 0, 0, 0, 0, 0.039215686274509803, 0, 0, 0, 0, 0.041358936484490398, 0.04189944134078212, 0.042016806722689079, 0, 0, 0, 0, 0.044037412314886983, 0.044600938967136149, 0, 0, 0, 0, 0, 0, 0, 0, 0.061946902654867256, 0, 0, 0, 0.073099415204678359, 0, 0.11940298507462686, 0]
[0.0, 0.0, 0.0, 0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.00038986354775828459, 0, 0, 0, 0.0011

[0.0, 0, 0.0, 0.0, 0.0, 0.0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.00064641241111829345, 0, 0.0019920318725099601, 0.0020661157024793389, 0, 0.0031746031746031746, 0.0032679738562091504, 0.0043668122270742356, 0.0045484080571799868, 0.0047619047619047623, 0.0050505050505050509, 0.0052562417871222077, 0.0055205047318611991, 0, 0.0065146579804560263, 0.0065359477124183009, 0.0065502183406113534, 0.0065868263473053889, 0, 0.0080160320641282558, 0, 0, 0.0093312597200622092, 0.0093411996066863328, 0.0097427903351519872, 0, 0, 0, 0, 0, 0, 0.013333333333333334, 0.014771048744460856, 0, 0.016990291262135922, 0, 0, 0.019553072625698324, 0, 0.02046783625730994, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.031746031746031744, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [101]:
a[0].loc[a[0]['morpheme']==mor]

Unnamed: 0,morpheme,count,dp,AEI,AFI,AGK,AL,AMG,AS,ASA,...,NIG,NMK,ONI,PMB,RME,RMS,TEB,VIA,rh,AAS_k
55,čE-pf.ptc-v,214,0.185538,0.008016,0.0,0.031746,0,0.0,0.005521,0.01699,...,0.006536,0.003175,0.020468,0.014771,0.004548,0.009341,0.0,0.006587,0.0,0.004367


In [150]:
from statsmodels.stats.proportion import proportion_confint

Для конфинта нужны

- сырые частотности по спикерам (не делённые!)
- количество слов по спикерам

их надо подготовить выше

In [None]:
plt.bar(kam, c="red", xticks=)
plt.bar(seb, c="blue")