Наброски кода для
0. сведения алломорфов морфем в одно (хотя бы в таблице)
1. подсчёта сочетаний морфем
2. вытаскивания конвербов
3. вытаскивания цепочек частей речи

Все манипуляции можно тестировать на корпусах, которые в почте.

# 0
Действительно надо вытащить всё, положить в словарину и радоваться

Формат:

`корпус{
    название:документ{    # done
        мета:мета,    # done
        текст:[
            предложение{
                слой:[
                    морфемы
                    ],
                перевод:''
            }
        ]
    }
}`

In [1]:
corpora = ['Kamchatka', 'Sebjan']

In [24]:
import os, re, pickle
from pprint import pprint


def morphs_2_words(line):
    """Group a flat list of morpheme tokens into words.

    A token whose first non-blank character is ``-`` or ``=`` is attached to
    the word currently being built; any other token starts a new word.

    Args:
        line: list of morpheme strings (a whitespace-split layer line without
            its layer marker).

    Returns:
        A list of words, each word being the list of its morpheme tokens.
        An empty ``line`` yields an empty list, and blank tokens are ignored.
    """
    words = []
    word = []
    for morph in line:
        stripped = morph.strip()
        if not stripped:
            # A blank token has no first character to classify; skip it
            # instead of failing with IndexError.
            continue
        if stripped[0] in '=-':
            word.append(morph)
        else:
            if word:
                words.append(word)
            word = [morph]
    # Flush the last word, but never emit an empty one (empty input).
    if word:
        words.append(word)
    return words


def handle_startline(line, res, current_layer):
    """Store the content of one marker line in the sentence dict.

    The first whitespace-separated token of ``line`` is the layer marker
    (its backslashes are stripped); the remaining tokens are the content.

    Args:
        line: a raw line that starts with a layer marker.
        res: dict of layers collected so far; it is updated in place.
        current_layer: name of the layer that continuation lines belong to.

    Returns:
        ``(res, current_layer)``. ``current_layer`` changes only when the
        line belongs to a layer that is kept as one whole string.
    """
    tokens = line.split()
    layer = tokens[0].strip('\\')
    payload = tokens[1:]

    if len(tokens) == 1:
        # A marker without content: nothing to store.
        return res, current_layer

    if layer in ('tx', 'mb', 'ge', 'ps'):
        # Interlinear layers are split into words made of morphemes.
        line_content = morphs_2_words(payload)
    else:
        # Everything else (comments etc.) is kept as a single string.
        line_content = [' '.join(payload)]
        current_layer = layer

    already_filled = layer in res and res[layer][0] != ''
    if already_filled:
        res[layer].extend(line_content)
    else:
        res[layer] = line_content
    return res, current_layer


def lines_2_dict(part):
    """Convert one sentence chunk of a corpus file into a dict of layers.

    Args:
        part: multi-line text of one sentence. Lines of length <= 1 are
            dropped. The first remaining line carries the sentence id; the
            following lines either start with a backslash layer marker or
            continue the most recent whole-string layer.

    Returns:
        A dict such as ``{'index': [id], 'layer': [words...], 'layer':
        [text]}``. ``'index'`` holds whatever follows the last underscore of
        the first line, or ``''`` when the chunk has no usable lines.
    """
    res = {}
    lines = [line for line in part.split('\n') if len(line) > 1]
    # Guard against an empty chunk instead of failing on lines[0].
    res['index'] = [lines[0].split('_')[-1] if lines else '']
    current_layer = ''  # layer that receives wrapped (continuation) lines
    for line in lines[1:]:
        if line.startswith('\\'):
            res, current_layer = handle_startline(line, res, current_layer)
        elif current_layer:
            # A line without a marker continues the first entry of the
            # current whole-string layer.
            res[current_layer][0] += ' ' + line
    return res


def check_len(p_sent, fil):
    """Report alignment problems between the layers of a parsed sentence.

    Only the ``mb``, ``ge`` and ``ps`` layers that contain more than one
    word take part in the checks.

    Args:
        p_sent: dict of layers for one sentence.
        fil: file name, used only in the printed diagnostics.

    Returns:
        The set of word counts found among the checked layers. More than one
        element means the layers disagree on the number of words.
    """
    candidates = ('mb', 'ge', 'ps')
    selected_layers = []
    for key in p_sent:
        if key in candidates and len(p_sent[key]) > 1:
            selected_layers.append(key)

    # 1. every checked layer must have the same number of words
    lengths = {len(p_sent[key]) for key in selected_layers}
    if len(lengths) > 1:
        print('Error in {}, here:'.format(fil))
        print(lengths)
        for layer in selected_layers:
            print(len(p_sent[layer]))
            pprint(p_sent[layer])
        return lengths

    # 2. every word must have as many glosses as morphemes
    if {'mb', 'ge'}.issubset(selected_layers):
        # Both layers passed check 1, so they have the same number of words.
        for gloss_word, morph_word in zip(p_sent['ge'], p_sent['mb']):
            if len(gloss_word) != len(morph_word):
                print('што-то слиплось в {}'.format(fil))
                pprint(p_sent)
    return lengths


def make_readable(corp):
    """Parse the text files of a corpus into a nested dict and pickle it.

    Reads every ``.txt`` file in ``Corpus_Text_<corp>_postagged`` (files
    whose name contains ``pyzhik`` are skipped), splits it on the ``\\id``
    marker and converts each sentence chunk with ``lines_2_dict``.

    Args:
        corp: corpus name; it selects the input folder and names the output
            file ``<corp>.pickle``.

    Returns:
        ``{file name: {'meta': <raw text of the first chunk>,
        'text': [<sentence dict>, ...]}}``. The same dict is written to
        ``<corp>.pickle``.
    """
    folder = 'Corpus_Text_{}_postagged'.format(corp)
    corpus_dict = {}
    for fil in os.listdir(folder):
        if 'pyzhik' in fil or not fil.endswith('.txt'):
            continue
        # NOTE(review): the file is decoded with the platform default
        # encoding; confirm that matches how the corpus files are stored.
        with open(os.path.join(folder, fil), 'r') as f:
            text = f.read()
        # '\\id' is the literal backslash-id marker (the unescaped '\id'
        # form is an invalid escape sequence).
        sents = text.split('\\id')
        if len(sents) < 2:
            # No marker at all: there is no meta chunk to read.
            print('No \\id marker in {}, file skipped'.format(fil))
            continue
        text_content = []
        for sent in sents[2:]:
            sent_content = lines_2_dict(sent)
            check_len(sent_content, fil)  # prints diagnostics only
            text_content.append(sent_content)
        corpus_dict[fil] = {
            'meta': sents[1],  # metainfo at the beginning of the file; not parsed
            'text': text_content,
        }
    with open('{}.pickle'.format(corp), 'wb') as f:
        pickle.dump(corpus_dict, f)
    return corpus_dict

In [60]:
# Parse every corpus and (re)write its <name>.pickle dump.
for corp in corpora:
    make_readable(corp)

In [41]:
def detect(what, where, replace=False, corpora=corpora, show=True):
    """Find ``what`` in one layer of the pickled corpora and optionally fix the source files.

    For every corpus the function loads ``<corpus>.pickle``, reports each
    sentence whose ``where`` layer has an element containing ``what``, then
    waits for Enter. If ``replace`` is given, ``what`` is replaced by it in
    the text files of the affected documents. Finally the corpus is
    re-parsed with ``make_readable``.

    Args:
        what: string to look for. Elements that are lists are tested by
            membership, elements that are strings by substring.
        where: layer name (key of the sentence dict) to search in.
        replace: replacement string, or a falsy value to only report.
        corpora: names of the corpora to process.
        show: pretty-print every matching sentence when true.

    Returns:
        None. Results are printed; files may be rewritten on disk.
    """
    # Literal string used for the in-file replacement. The user may refine it
    # interactively; it is kept apart from `what` so that a refinement does
    # not change what is searched for in the remaining corpora.
    needle = what
    for corpus in corpora:
        folder = 'Corpus_Text_{}_postagged'.format(corpus)
        hits = []
        # The pickle is the one written by make_readable for this corpus.
        with open('{}.pickle'.format(corpus), 'rb') as f:
            content = pickle.load(f)
        for doc in content:
            for i, sent in enumerate(content[doc]['text']):  # looping through sentences
                if where not in sent:
                    continue
                for word in sent[where]:
                    if what in word:
                        print('Found in {}, {} at {}'.format(corpus, doc, i))
                        if show:
                            pprint(sent)
                        hits.append(doc)
        if len(hits) == 0:
            print('noth found. cool! or not?')
        else:
            input('correct all these ({} hits) & press Enter and I\'ll reload.\n'.format(len(hits)))
            if replace:
                target = replace
                for doc in list(set(hits)):
                    path = os.path.join(folder, doc)
                    with open(path, 'r') as f:
                        text = f.read()
                    # re.escape makes the whole needle literal: dots in
                    # glosses such as '-hab.nonfut.3sg' must not act as
                    # regex wildcards.
                    a_hits = re.findall('{}\\b'.format(re.escape(needle)), text)
                    # More matches in the raw file than hits found in the
                    # parsed corpus: the needle is too broad, ask for a
                    # better one.
                    while len(a_hits) > len(hits):
                        print('what is not accurate. see what\'d be affected')
                        pprint(a_hits[:15])
                        needle = input('change the what: ')
                        a_hits = re.findall('{} '.format(re.escape(needle)), text)
                    text = re.sub('{}(\\b)'.format(re.escape(needle)), '{}\\1'.format(target), text)
                    with open(path, 'w') as f:
                        f.write(text)
            make_readable(corpus)
            print('done!')

In [43]:
# Non-future glosses with a fused person/number tag are reduced to the bare
# non-future gloss: first the plain ones, then the habitual and causative ones.
person_tags = ('3pl', '3sg', '.3pl', '.3sg')

nfs = ['-nonfut' + tag for tag in person_tags]
for nf in nfs:
    detect(nf, 'ge', replace='-nonfut', show=False)
print('\n\nregalar nfs done\n')

for derivation in ('hab', 'caus'):
    other_nfs = ['-{}.nonfut{}'.format(derivation, tag) for tag in person_tags]
    for nf in other_nfs:
        detect(nf, 'ge', replace='-{}.nonfut'.format(derivation), show=False)

noth found. cool! or not?
noth found. cool! or not?
noth found. cool! or not?
noth found. cool! or not?
noth found. cool! or not?
noth found. cool! or not?
noth found. cool! or not?
noth found. cool! or not?


regalar nfs done

noth found. cool! or not?
noth found. cool! or not?
noth found. cool! or not?
noth found. cool! or not?
noth found. cool! or not?
Found in Sebjan, SlepcovaNA_her_class.txt at 27
Found in Sebjan, Zaxarova_JP_pear_story.txt at 21
correct all these (2 hits) & press Enter and I'll reload.

done!
noth found. cool! or not?
noth found. cool! or not?
noth found. cool! or not?
noth found. cool! or not?
noth found. cool! or not?
noth found. cool! or not?
noth found. cool! or not?
Found in Sebjan, Krivoshapkina_AE_childhood.txt at 114
Found in Sebjan, Krivoshapkin_DM_Segen.txt at 40
Found in Sebjan, Krivoshapkin_DM_Segen.txt at 40
correct all these (3 hits) & press Enter and I'll reload.

done!
noth found. cool! or not?
noth found. cool! or not?


Здесь повторяю таблицу которая была

In [45]:
import numpy as np
import pandas as pd 

In [108]:
def extract_morphs(corpus):
    """Collect the non-initial morphemes of a pickled corpus with gloss and POS.

    Loads ``<corpus>.pickle`` and walks every sentence that has the ``mb``,
    ``ge`` and ``ps`` layers. For each word it emits the marker ``(0,)``,
    one ``(morpheme, gloss, pos)`` triple for every morpheme except the
    first, and the marker ``(1,)``. ``pos`` is the first element of the
    word's ``ps`` entry. The list is written to ``<corpus>_morphemes.pickle``;
    its length and first 15 items are printed.

    Args:
        corpus: corpus name used to build both file names.

    Returns:
        None.
    """
    with open('{}.pickle'.format(corpus), 'rb') as f:
        content = pickle.load(f)
    morphs = []
    for doc in content:
        for sent in content[doc]['text']:
            if not ('mb' in sent and 'ge' in sent and 'ps' in sent):
                continue
            for i, word in enumerate(sent['mb']):
                morphs.append((0,))  # word start
                # NOTE(review): raises IndexError when 'ps' has fewer words
                # than 'mb'; such sentences are not filtered out here.
                ps = sent['ps'][i][0]
                # Index 0 is skipped: only the morphemes after the first one
                # are collected.
                for j in range(1, len(word)):
                    try:
                        morphs.append((word[j], sent['ge'][i][j], ps))
                    except IndexError:
                        # The gloss layer is shorter than the morpheme layer:
                        # report the sentence and go on.
                        print(doc)
                        pprint(sent)
                morphs.append((1,))  # word end
    with open('{}_morphemes.pickle'.format(corpus), 'wb') as f:
        pickle.dump(morphs, f)
    print(len(morphs))
    pprint(morphs[:15])

In [109]:
# Build <name>_morphemes.pickle for every corpus.
for corp in corpora:
    extract_morphs(corp)

105395
[(0,),
 ('-L', '-pl', 'n'),
 ('-W', '-acc', 'n'),
 (1,),
 (0,),
 ('-L', '-pl', 'n'),
 ('-W', '-acc', 'n'),
 (1,),
 (0,),
 (1,),
 (0,),
 ('-W', '-poss.1sg', 'n'),
 (1,),
 (0,),
 ('-WkEn', '-caus', 'v')]
149704
[(0,),
 ('-WEːČ', '-gnr', 'v'),
 ('-RI', '-impf.ptc', 'v'),
 (1,),
 (0,),
 (1,),
 (0,),
 (1,),
 (0,),
 ('-(dU)LE', '-loc', 'n'),
 (1,),
 (0,),
 ('-B', '-med', 'v'),
 ('-DEŋ', '-pst.ptc', 'v'),
 ('-E', '-ep', 'v')]


In [110]:
# Load the morpheme lists of both corpora back from their pickle files.
with open('Kamchatka_morphemes.pickle', 'rb') as f:
    kam_mor = pickle.load(f)
with open('Sebjan_morphemes.pickle', 'rb') as f:
    seb_mor = pickle.load(f)

In [111]:
from collections import Counter

# Frequency of every distinct item in each morpheme list.
seb_count = Counter(seb_mor)
kam_count = Counter(kam_mor)

In [121]:
# number of words in each corpus (occurrences of the (0,) item)
# last time it was 34933 for Kamchatka and 51148 for Sebjan
print(seb_count[(0,)])
print(kam_count[(0,)])

49804
32778


ЛАДНО

In [113]:
# One row per distinct (morpheme, gloss, pos) triple with its frequency;
# the one-element items of the counters are left out.
kam_triples = [key for key in kam_count if len(key) == 3]
kam_pd = pd.DataFrame({
    'morpheme': [morpheme for morpheme, _, _ in kam_triples],
    'gloss': [gloss for _, gloss, _ in kam_triples],
    'pos': [pos for _, _, pos in kam_triples],
    'k_count': [kam_count[key] for key in kam_triples],
})

seb_triples = [key for key in seb_count if len(key) == 3]
seb_pd = pd.DataFrame({
    'morpheme': [morpheme for morpheme, _, _ in seb_triples],
    'gloss': [gloss for _, gloss, _ in seb_triples],
    'pos': [pos for _, _, pos in seb_triples],
    's_count': [seb_count[key] for key in seb_triples],
})

In [None]:
# Most frequent rows first.
seb_pd = seb_pd.sort_values('s_count', ascending=False)
kam_pd = kam_pd.sort_values('k_count', ascending=False)
# Drop the leading/trailing '-' and '=' boundary symbols from the labels.
for frame in (seb_pd, kam_pd):
    for column in ('gloss', 'morpheme'):
        frame[column] = frame[column].map(lambda label: label.strip('-='))

In [117]:
kam_pd.head()

Unnamed: 0,gloss,k_count,morpheme,pos
120,nonfut,2956,R(E),v
283,pst,2712,RI,v
161,gnr,1891,WEːČ,v
296,3sg,1786,n(I),v
270,prog,1675,D,v


In [118]:
# Save the per-corpus frequency tables.
seb_pd.to_pickle('Sebjan_morphemes_only.pickle')
kam_pd.to_pickle('Kamchatka_morphemes_only.pickle')

А теперь надо джойнить ))00

In [133]:
# Outer join on (morpheme, gloss, pos): rows found in only one table are kept.
total = pd.merge(seb_pd, kam_pd, how='outer', on=['morpheme', 'gloss', 'pos'])

In [134]:
# Missing values become 0.
total = total.fillna(0)

In [135]:
total

Unnamed: 0,gloss,morpheme,pos,s_count,k_count
0,nonfut,R(E),v,2084.0,2956.0
1,ep,E,v,2080.0,1552.0
2,3sg,n(I),v,2069.0,1786.0
3,pf.ptc,čE,v,1895.0,211.0
4,hab,Gr(E),v,1747.0,41.0
5,acc,W,n,1590.0,1100.0
6,poss.3sg,n(I),v,1363.0,1449.0
7,pst,RI,v,1308.0,2712.0
8,ep,E,n,1181.0,1063.0
9,pl,L,n,1119.0,732.0


In [None]:
total

In [136]:
# Save the combined table.
total.to_pickle('total_morphemes.pickle')

# 1
это уже начато там
морфемы лежат в отдельном словаре, кажется; нужно приложить это знание на таблицу?
или создать уже наконец промежуточное представление всего

# 2
сочетания морфем

сначала надо вытащить представления (кажется это уже сделано)

пройтись по всему, растащить каждое слово на пары морфем, первая (ROOT, morph), последняя (morph, END)

`from collections import Counter
# вытаскиваю все морфемы из первого слота, считаю, с чем они сочетаются, записываю в словарь
morph_pairs_dict = {x: Counter([pair for pair in morph_pairs_list if pair[0]==x]) for x in set(map(lambda pair: pair[0], morph_pairs_list))}
`
