Наброски кода для
0. сведения алломорфов морфем в одно (хотя бы в таблице)
1. подсчёта сочетаний морфем
2. вытаскивания конвербов
3. вытаскивания цепочек частей речи

Все манипуляции можно тестировать на корпусах, которые в почте.

# 0
Действительно надо вытащить всё, положить в словарину и радоваться

Формат:

`корпус{
    название:документ{    # done
        мета:мета,    # done
        текст:[
            предложение{
                слой:[
                    морфемы
                    ],
                перевод:''
            }
        ]
    }
}`

In [1]:
# Names of the corpora to process. make_readable() turns each name into the
# folder 'Corpus_Text_<name>_postagged' and the output file '<name>.pickle'.
corpora = ['Kamchatka', 'Sebjan']

In [24]:
import os, re, pickle
from pprint import pprint


def morphs_2_words(line):
    '''Group a flat list of morpheme tokens into words.

    A token whose first non-blank character is ``=`` or ``-`` is attached to
    the word currently being built; any other token starts a new word.

    Args:
        line: list of morpheme strings (one tokenised layer line).

    Returns:
        A list of words, each word being the list of its morpheme strings.
        An empty input yields an empty list.
    '''
    words = []
    word = []
    for morph in line:
        # startswith() rather than [0]: a blank token must not raise
        # IndexError; it is simply treated as the start of a new word.
        if morph.strip().startswith(('=', '-')):
            word.append(morph)
        else:
            if word:
                words.append(word)
            word = [morph]
    # Flush the last word. Guarded so that empty input does not produce a
    # phantom empty word ([[]]).
    if word:
        words.append(word)
    return words


def handle_startline(line, res, current_layer):
    '''Store the content of one marker line (a line starting with a backslash).

    The first whitespace-separated token, stripped of backslashes, is the
    layer name. For the layers 'tx', 'mb', 'ge' and 'ps' the remaining tokens
    are grouped into words with morphs_2_words(); for any other layer the
    remainder is kept as a single joined string and that layer becomes the
    current layer (the one later continuation lines are glued to).

    Args:
        line: the raw marker line.
        res: dict of layers collected so far; it is modified in place.
        current_layer: name of the layer that receives continuation lines.

    Returns:
        The tuple (res, current_layer). A marker with no content leaves both
        unchanged.
    '''
    tokens = line.split()
    marker, payload = tokens[0].strip('\\'), tokens[1:]
    if not payload:
        # A bare marker carries nothing worth storing.
        return res, current_layer

    if marker in ('tx', 'mb', 'ge', 'ps'):
        # Layers that are split into words made of morphemes.
        content = morphs_2_words(payload)
    else:
        # Free-text layers (comments etc.) are kept as one whole string.
        content = [' '.join(payload)]
        current_layer = marker

    already_filled = marker in res and res[marker][0] != ''
    if already_filled:
        # A repeated marker extends what was stored before.
        res[marker].extend(content)
    else:
        res[marker] = content
    return res, current_layer


def lines_2_dict(part):
    '''
    Convert one sentence chunk of a corpus file into a dict of layers.

    i: a multi-line piece of text (one sentence); lines that start with a
       backslash carry a layer, other lines continue the current layer.
    o: a dict such as {'index': [id], 'layer': [words...], 'layer': [content]}.
       'index' holds whatever follows the last '_' on the first kept line.
    Lines of length 0 or 1 are ignored.
    NOTE(review): the original note says all word-array layers are expected
    to have equal length; this function does not enforce that.
    '''
    rows = [row for row in part.split('\n') if len(row) > 1]
    header, body = rows[0], rows[1:]
    res = {'index': [header.split('_')[-1]]}
    current_layer = ''  # target layer for wrapped (continuation) lines
    for row in body:
        if row.startswith('\\'):
            res, current_layer = handle_startline(row, res, current_layer)
            continue
        if current_layer:
            # A wrapped line is glued to the first entry of the current layer.
            res[current_layer][0] += ' ' + row
    return res


def check_len(p_sent, fil):
    '''Report misaligned layers of one parsed sentence.

    Only the layers 'mb', 'ge' and 'ps' that hold more than one word are
    inspected.

    Args:
        p_sent: dict of layers for one sentence.
        fil: file name, used only in the printed messages.

    Returns:
        The set of distinct word counts found among the inspected layers
        (more than one element means the layers disagree).
    '''
    checked = ('mb', 'ge', 'ps')
    selected_layers = [name for name in p_sent
                       if name in checked and len(p_sent[name]) > 1]

    # 1. every inspected layer must contain the same number of words
    lengths = {len(p_sent[name]) for name in selected_layers}
    if len(lengths) > 1:
        print('Error in {}, here:'.format(fil))
        print(lengths)
        for name in selected_layers:
            print(len(p_sent[name]))
            pprint(p_sent[name])
        return lengths

    # 2. each word must have as many glosses as morphemes
    if {'mb', 'ge'} <= set(selected_layers):
        for gloss_word, morph_word in zip(p_sent['ge'], p_sent['mb']):
            if len(gloss_word) != len(morph_word):
                print('што-то слиплось в {}'.format(fil))
                pprint(p_sent)
    return lengths


def make_readable(corp):
    """Convert the text files of a corpus into nested dicts and pickle them.

    Reads every '.txt' file in 'Corpus_Text_<corp>_postagged' (files whose
    name contains 'pyzhik' are skipped), splits it on the '\\id' marker,
    parses each sentence with lines_2_dict(), runs check_len() on it, and
    writes the whole structure to '<corp>.pickle' in the working directory.

    Args:
        corp: corpus name, e.g. 'Kamchatka'.

    Returns:
        dict mapping file name -> {'meta': str, 'text': [sentence dicts]}.

    Raises:
        IndexError: if a file contains no '\\id' marker at all.
    """
    folder = 'Corpus_Text_{}_postagged'.format(corp)
    corpus_dict = {}
    for fil in os.listdir(folder):
        if 'pyzhik' in fil:
            continue
        if not fil.endswith('.txt'):
            continue
        # NOTE(review): opened with the platform default encoding - confirm
        # the corpus encoding and pass it explicitly if it is known.
        with open(os.path.join(folder, fil), 'r') as f:
            text = f.read()
        file_content, text_content = {}, []
        # Backslash is escaped explicitly: '\id' is an invalid escape sequence
        # that newer Python versions warn about.
        sents = text.split('\\id')
        # sents[0] is whatever precedes the first marker; sents[1] is the
        # meta information at the beginning of the file (stored unparsed).
        file_content['meta'] = sents[1]
        for sent in sents[2:]:
            sent_content = lines_2_dict(sent)
            check_len(sent_content, fil)
            text_content.append(sent_content)
        file_content['text'] = text_content
        corpus_dict[fil] = file_content
    with open('{}.pickle'.format(corp), 'wb') as f:
        pickle.dump(corpus_dict, f)
    return corpus_dict

In [31]:
# Rebuild the pickled representation of every corpus listed in `corpora`.
for corp in corpora:
    make_readable(corp)

In [41]:
def detect(what, where, replace=False, corpora=corpora, show=True):
    """Find an item in one layer of the pickled corpora and optionally fix it.

    For each corpus the pickle written by make_readable() is loaded and every
    sentence that has the layer `where` is scanned. Each hit is printed. If
    there are hits, the user is prompted (so the source files can be fixed by
    hand); with `replace` set, the source text files are also rewritten
    automatically. Afterwards the corpus pickle is rebuilt.

    Args:
        what: item to look for. In word-split layers each word is a list of
            morphemes, so this is an exact morpheme match; in free-text
            layers it is a substring match.
        where: layer key, e.g. 'ge'.
        replace: replacement string, or False to skip automatic rewriting.
        corpora: list of corpus names to process.
        show: pretty-print every sentence that contains a hit.

    Returns:
        None. Results are printed; files on disk may be rewritten.
    """
    def as_pattern(literal):
        # Only the dot is escaped (as before), so other regex syntax typed at
        # the prompt stays live; \b keeps the match from running into a
        # longer token.
        return '{}\\b'.format(literal.replace('.', '\\.'))

    for corpus in corpora:
        hits = []
        # NOTE: pickle.load is only safe on files this notebook wrote itself.
        with open('{}.pickle'.format(corpus), 'rb') as f:
            content = pickle.load(f)
        for doc in content:
            for i, sent in enumerate(content[doc]['text']):
                if where not in sent:
                    continue
                for word in sent[where]:
                    if what in word:
                        print('Found in {}, {} at {}'.format(corpus, doc, i))
                        if show:
                            pprint(sent)
                        hits.append(doc)
        if not hits:
            print('noth found. cool! or not?')
        else:
            # Pause so the files can be corrected manually before reloading.
            input('correct all these ({} hits) & press Enter and I\'ll reload.\n'.format(len(hits)))
            if replace:
                target = replace
                for doc in set(hits):
                    path = os.path.join('Corpus_Text_{}_postagged'.format(corpus), doc)
                    with open(path, 'r') as f:
                        text = f.read()
                    # The count uses exactly the pattern that is substituted
                    # below, so "what'd be affected" is what will be changed.
                    a_hits = re.findall(as_pattern(what), text)
                    # NOTE(review): len(hits) counts hits across the whole
                    # corpus while a_hits is per file - confirm the intended
                    # threshold.
                    while len(a_hits) > len(hits):
                        print('what is not accurate. see what\'d be affected')
                        pprint(a_hits[:15])
                        # NOTE(review): the new value also applies to the
                        # remaining files and corpora of this call.
                        what = input('change the what: ')
                        a_hits = re.findall(as_pattern(what), text)
                    text = re.sub(as_pattern(what), target, text)
                    with open(path, 'w') as f:
                        f.write(text)
            make_readable(corpus)
            print('done!')

In [29]:
# Look for the gloss '-nonfut.3pl' in the 'ge' layer of every corpus and
# rewrite it as '-nonfut' in the source files (sentences are not printed).
detect('-nonfut.3pl', 'ge', replace='-nonfut', show=False)

Found in Kamchatka, Egorova_RM_Tvajan.txt at 23
Found in Kamchatka, Egorova_RM_Tvajan.txt at 29
Found in Kamchatka, Egorova_RM_Tvajan.txt at 37
Found in Kamchatka, Amganov_EI_pear_story.txt at 11
Found in Kamchatka, Ichanga_AF_pear_story.txt at 14
Found in Kamchatka, Egorova_RM_Yakutia.txt at 0
Found in Kamchatka, Egorova_RM_Yakutia.txt at 39
Found in Kamchatka, Egorova_RM_Yakutia.txt at 42
Found in Kamchatka, Egorova_RM_Yakutia.txt at 81
Found in Kamchatka, Egorova_RM_Yakutia.txt at 91
Found in Kamchatka, Egorova_RM_Yakutia.txt at 97
Found in Kamchatka, Indanova_ON_Moscow.txt at 12
Found in Kamchatka, Axmetova_VI_childhood.txt at 1
Found in Kamchatka, Axmetova_VI_childhood.txt at 19
Found in Kamchatka, Indanova_ON_tabun.txt at 4
Found in Kamchatka, Ichanga_Adukanov_museum.txt at 19
Found in Kamchatka, Amganovy_rybalka_tabun.txt at 2
Found in Kamchatka, Amganovy_rybalka_tabun.txt at 3
Found in Kamchatka, Amganovy_rybalka_tabun.txt at 5
Found in Kamchatka, Amganovy_rybalka_tabun.txt at 

In [42]:
# Run the same search-and-replace for several spellings of the gloss,
# rewriting each of them as '-nonfut'.
for nf in ['-nonfut3pl', '-nonfut3sg', '-nonfut.3pl']:
    detect(nf, 'ge', replace='-nonfut', show=False)

noth found. cool! or not?
noth found. cool! or not?
noth found. cool! or not?
noth found. cool! or not?
noth found. cool! or not?
Found in Sebjan, Krivoshapkina_Marta_bear.txt at 3
Found in Sebjan, Krivoshapkina_Marta_bear.txt at 56
Found in Sebjan, Krivoshapkina_Marta_bear.txt at 69
Found in Sebjan, Stepanov_AA_his_life.txt at 17
Found in Sebjan, Krivoshapkina_AE_childhood.txt at 63
Found in Sebjan, Krivoshapkina_AE_childhood.txt at 64
Found in Sebjan, Stepanova_ZA_1_svatovstvo.txt at 32
Found in Sebjan, Kejmetinova_TV_pear_story_new.txt at 18
Found in Sebjan, Kejmetinova_TV_pear_story_new.txt at 52
Found in Sebjan, Kejmetinova_TV_pear_story_new.txt at 58
Found in Sebjan, Kejmetinova_AA_headmistress_Yakutsk_310310_LZ.txt at 69
Found in Sebjan, Nikitin_Mitja_pearstory.txt at 9
Found in Sebjan, Nikitin_Mitja_pearstory.txt at 10
Found in Sebjan, Zavarov_VN_poselok.txt at 53
Found in Sebjan, SlepcovaNA_her_class.txt at 2
Found in Sebjan, SlepcovaNA_her_class.txt at 7
Found in Sebjan, Slep

In [40]:
# Scan the raw corpus files (not the pickles) and print the name of every
# file that still contains the token as a plain substring.
token = '-nonfut.3pl'
for corp in corpora:
    folder = 'Corpus_Text_{}_postagged'.format(corp)
    # Every file in the folder is read here; there is no '.txt' or 'pyzhik'
    # filter, unlike in make_readable().
    for fil in os.listdir(folder):
        with open(os.path.join(folder, fil)) as f:
            text = f.read()
        if token in text:
            print(fil)
    # Printed once per corpus, after all of its files have been scanned.
    print('done')

Axmetova_VI_pyzhik_LZ.txt
Axmetova_VI_pyzhik_LZ_RM.txt
done
Kejmetinova_AA_headmistress_Yakutsk_310310_LZ.txt
Kejmetinova_TV_pear_story_new.txt
Krivoshapkin_DM_Segen.txt
Krivoshapkin_IN_pearstory_new.txt
Krivoshapkina_AE_childhood.txt
Krivoshapkina_Marta_bear.txt
Krivoshapkina_Sofija_life.txt
Nikitin_Mitja_pearstory.txt
SlepcovaNA_her_class.txt
Stepanov_AA_his_life.txt
Stepanova_ZA_1_svatovstvo.txt
Zavarov_VN_poselok.txt
Zaxarova_JP_pear_story.txt
done


In [94]:
# pprint(content['Alekseeva_RD_lost_tapes_znatoki_NA.txt']['text'][55]['ge'])
# Re-run the alignment check on a single file so that check_len() prints the
# sentences whose layers disagree.
fil = 'Kejmetinova_AA_headmistress_Yakutsk_310310_LZ.txt'
with open(os.path.join('Corpus_Text_Sebjan_postagged', fil), 'r') as f:
    text = f.read()
# Backslash is escaped explicitly: '\id' is an invalid escape sequence that
# newer Python versions warn about.
sents = text.split('\\id')
# NOTE(review): unlike make_readable(), this loop also feeds sents[0] and
# sents[1] (the part before the first marker and the meta block) to the parser.
for i in range(len(sents)):
    sent_content = lines_2_dict(sents[i])
    check_len(sent_content, fil)

# 1
это уже начато там
морфемы, кажется, лежат в отдельном словаре; нужно применить это знание к таблице?
или создать уже наконец промежуточное представление всего

# 2
сочетания морфем

сначала надо вытащить представления (кажется, это уже сделано)

пройтись по всему, растащить каждое слово на пары морфем, первая (ROOT, morph), последняя (morph, END)

`from collections import Counter
# вытаскиваю все морфемы из первого слота, считаю, с чем они сочетаются, записываю в словарь
morph_pairs_dict = {x: Counter([pair for pair in morph_pairs_list if pair[0]==x]) for x in set(map(lambda pair: pair[0], morph_pairs_list))}
`
