In [1]:
import json
import pandas as pd
import re
from tqdm.notebook import tqdm

### Данные

In [2]:
roles_df = pd.read_csv('dative_roles.csv')

In [3]:
roles_df.head()

Unnamed: 0,Unnamed: 1,ExIndex,Example_x,ConstrIndex,Place_x,PhraseGen,WordDep,Form_x,Role_x,Rank_x,...,KeyLexemes,Unnamed: 15,Place_y,Form_y,Role_y,New_role,Rank_y,SemClass,KeyLexemes_x,Example_y
0,0,1021,"<p><se> <w><ana lex=""Васька"" sem=""d:dimb t:hu...",201.0,1.0,Ваське,Ваське,Sdat,субъект психологического состояния,Несобственный,...,волноваться,,,,,экспериенцер,,,,
1,1,1142,"<p><se> <w><ana lex=""тогда"" sem=""t:time r:dem...",228.0,1.0,им,им,SPROdat,субъект перемещения,Несобственный,...,"выступать,выступить",,,,,экспериенцер,,,,
2,6,29231,"<se> <w><ana lex=""режиссер"" gr=""S,m,anim=sg,no...",303.0,4.0,к кулисе,к кулисе,к + Sdat,конечная точка,Периферия,...,"вывести,выводить",,,,,направление,,,,
3,8,36593,"<p><se> <w><ana lex=""Буш"" sem=""t:hum r:propn ...",1001.0,1.0,публике,публике,Sdat,субъект поведения,Несобственный,...,аплодировать,,,,,экспериенцер,,,,
4,9,42237,"<p><se>-- <w>Итак</w>, -- <w><ana lex=""бодро""...",1002.0,3.0,комиссару,комиссару,Sdat,адресат,Периферия,...,аплодировать,0.0,3.0,Sdat,адресат,адресат,Периферия,лицо,аплодировать,Все аплодировали хору [артисту]. Зал аплодиров...


In [4]:
with open('annotated_corpus_fixed+syntaxnet.json') as file:
    corpus = json.load(file)

### Признаки объекта в дативе

1. Часть речи
2. Одушевленность
2. Число
3. Семантический класс
4. Само существительное

In [6]:
def parse_noun_morphology(morph): # для субъекта и датива
    morph = morph.split()
    if len(morph) == 1:
        morph = re.split(r',|=', morph[0])
    pos = morph[0]
    if len(morph) == 1:
        animacy = 'none'
        number = 'none'
    elif pos == 'S':
        animacy = morph[-3]
        number = morph[-2] 
    else:
        animacy = 'none'
        number = morph[-2] 
    return pos, animacy, number

In [7]:
def parse_noun_semantics(sem): # для субъекта и датива
    sem_dict = {'r': '', 't': '', 'pt': '', 'top': '', 'd': ''}
    for s in sem.split():
        for c in sem_dict:
            if s.startswith(c):
                sem_dict[c] = s
    return list(sem_dict.values())

In [8]:
def noun_features(word):
    if 'feat' in word:
        pos, animacy, number = parse_noun_morphology(word['feat'])
    else:
        pos = word['postag_p']
        animacy = 'none'
        number = 'none'
    if 'sem' in word:
        r, t, pt, top, d = parse_noun_semantics(word['sem'])
    else:
        r, t, pt, top, d = ['none'] * 5
    form = word['form']
    if 'lemma' in word:
        lem_form = word['lemma']
    else:
        lem_form = ''
    return lem_form, pos, animacy, number, r, t, pt, top, d, form

### Признаки субъекта

1. Часть речи
2. Одушевленность
2. Число
3. Семантический класс
4. Само существительное

### Признаки предиката, который управляет дативом

1. Семантический класс
2. Часть речи
3. Вид глагола, если глагол
4. Наклонение, если глагол
5. Сам глагол

In [9]:
def parse_verb_morphology(morph):
    morph = morph.split()
    if len(morph) == 1:
        morph = morph[0].split(',')
    pos = morph[0]
    if pos == 'V':
        aspect = morph[1]
        mood = morph[-1]
    else:
        aspect = 'none'
        mood = 'none'
    return pos, aspect, mood

In [10]:
def parse_verb_semantics(sem):
    sem_dict = {'t': '', 'ca': '', 'aux': '', 'd': ''}
    for s in sem.split():
        for c in sem_dict:
            if s.startswith(c):
                sem_dict[c] = s
    return list(sem_dict.values())

In [11]:
def verb_features(word):
    if 'feat' in word:
        pos, aspect, mood = parse_verb_morphology(word['feat'])
    else:
        pos = word['postag_p']
        aspect = 'none'
        mood = 'none'
    if 'sem' not in word:
        t, ca, aux, d = [''] * 4
    else:
        t, ca, aux, d = parse_verb_semantics(word['sem'])
    if 'lemma' in word:
        lem_form = word['lemma']
    else:
        lem_form = ''
    return lem_form, pos, aspect, mood, t, ca, aux, d

### Дополнительные признаки

1. Наличие предлога: *бинарный*
2. Наличие прямого объекта: *бинарный*

In [12]:
def find_syntactic_head(sent, dative):
    idx = dative['parent']
    head = sent[idx]
    while 'feat' in head and 'dat' in head['feat']:
        dative = head
        idx = head['parent']
        head = sent[idx]
        if 'feat' not in head or 'dat' not in head['feat']:
            break
        if sent[head['parent']] == dative:
            break 
    dat_features = noun_features(dative)
    d_lemma = dat_features[0]
    dat_features = dat_features[1:]
    prep = False
    if idx == -1:
        verb = sent[sent.index(dative)-1]
    else:
        verb = head
    if head['link_name'] == 'case':
        prep = True
        idx = head['parent']
        if idx == -1:
            verb = sent[head.index()-1]
        else:
            verb = sent[idx]
        verb = sent[idx]
    if verb['link_name'] != 'punct':
        features = verb_features(verb)
        lemma = features[0]
        features = features[1:]
    else:
        lemma = [0] * 300
        features = tuple(['punct'] * 7)
    return idx, lemma, prep, features, d_lemma, dat_features

In [13]:
def verb_dependencies(sent, verb):
    dobj = False
    subject_features = ['no_subject'] * 9
    for i in sent:
        if 'link_name' in i:
            if i['link_name'] == 'dobj' and i['parent'] == verb:
                dobj = True
            if i['link_name'] == 'nsubj' and i['parent'] == verb:
                subject_features = noun_features(i)
    lemma = subject_features[0]
    if lemma == 'no_subject':
        lemma = [0] * 300
    subject_features = subject_features[1:]
    return dobj, lemma, subject_features

### Создание датасета

In [14]:
def check_dative(sent):
    # проверка, есть ли объект в дативе в этом предложении 
    dative_sentence = False
    dative = ''
    for i in sent:
        if 'feat' in i.keys() and 'dat' in i['feat']:
            dative_sentence = True
            dative = i
            break
    return dative_sentence, dative

In [15]:
def parse_sentence(sent, dative):
    # обработка предложений с дативом
    sentence = [i['form'] for i in sent]
    verb, prep, verb_f, d_lemma, dat_features = find_syntactic_head(sent, dative)
    dobj, subj_f = verb_dependencies(sent, verb)
    features = (prep, dobj)
    features += dat_features + verb_f + tuple(subj_f)
    return sentence, features

In [16]:
def dataset(corpus):
    idx = 0
    sent_list = []
    lemma_dict = {}
    annotated = {}
    columns = ['key', 'prep', 'dobj', 'd.lemma', 'd.pos', 'd.animacy', 'd.number', 
               'd.r', 'd.t', 'd.pt', 'd.top', 'd.d', 'd.form', 'v.lemma', 'v.pos', 'v.aspect', 'v.mood', 'v.r', 'v.ca', 'v.aux', 'v_d', 
              's.lemma', 's.pos', 's.animacy', 's.number', 's.r', 's.t', 's.pt', 's.top', 's.d', 's.form']
    df_raws = []
    for key, value in tqdm(corpus.items()):
        for n, sent in enumerate(value):
            dative, word = check_dative(sent)
            if dative == True:
                sentence, features = parse_sentence(sent, word)
                sent_list.append([idx, key, ' '.join(sentence)])
                annotated[key] = value
                idx = str(key) + '_' + str(n)
                raw = [idx] + list(features)
                df_raws.append(raw)
    df = pd.DataFrame(df_raws, columns=columns)
    return sent_list, df, annotated

In [17]:
def dataset_with_annotation(df):
    columns = ['key', 'prep', 'dobj', 'd.lemma', 'd.pos', 'd.animacy', 'd.number', 
               'd.r', 'd.t', 'd.pt', 'd.top', 'd.d', 'd.form', 'v.lemma', 'v.pos', 'v.aspect', 'v.mood', 'v.r', 'v.ca', 'v.aux', 'v_d', 
              's.lemma', 's.pos', 's.animacy', 's.number', 's.r', 's.t', 's.pt', 's.top', 's.d', 's.form']
    df_raws = []
    for i in tqdm(range(len(df))):
        all_features = []
        idx = ''
        for n, sent in enumerate(df['syntax_net'].iloc[i]):
            dative, word = check_dative(sent)
            if dative == True:
                sentence, features = parse_sentence(sent, word)
                if list(features) != [] and type(df['WordDep'].iloc[i]) != float:
                    aim_form = features[11].lower().strip()
                    form = df['WordDep'].iloc[i].lower()
                    if aim_form == form.strip() or (' ' in form and aim_form == form.split()[1]):
                        all_features = list(features)
                        idx = df['ExIndex'].iloc[i].astype(str) + '_' + str(n)
        df_raws.append([idx, all_features, df['New_role'].iloc[i]])         
    return df_raws

In [18]:
sentences, df, annotated = dataset(corpus)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=59860.0), HTML(value='')))




In [19]:
df.head()

Unnamed: 0,key,prep,dobj,d.lemma,d.pos,d.animacy,d.number,d.r,d.t,d.pt,...,s.lemma,s.pos,s.animacy,s.number,s.r,s.t,s.pt,s.top,s.d,s.form
0,89370_1,False,False,пытка,S,inan,pl,r:abstr,t:impact,,...,Бухарин,S,anim,sg,r:propn,t:famn,,,,Бухарина
1,89371_0,False,False,капитан,S,anim,sg,r:concr,t:armpos,,...,патруль,S,inan,sg,r:concr,t:group,pt:set,,,патруль
2,89371_2,False,True,население,S,inan,sg,r:concr,,pt:aggr,...,no_subject,no_subject,no_subject,no_subject,no_subject,no_subject,no_subject,no_subject,,
3,89371_17,False,True,самоубийца,S,anim,sg,r:concr,t:hum,,...,no_subject,no_subject,no_subject,no_subject,no_subject,no_subject,no_subject,no_subject,,
4,89371_18,False,True,патруль,S,inan,sg,r:concr,t:group,pt:set,...,арестовать,V,none,inf,,,,,der:s,арестовать


In [20]:
sent_df = pd.DataFrame(sentences, columns=['idx', 'key', 'sentence'])

In [21]:
sent_df.to_csv('sent.csv')

In [22]:
roles_df['syntax_net'] = [corpus[str(roles_df['ExIndex'].iloc[i])] for i in range(len(roles_df))]

In [23]:
raws = dataset_with_annotation(roles_df)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2664.0), HTML(value='')))




In [24]:
ann = pd.DataFrame(raws, columns=['key', 'features', 'role'])

In [25]:
train_data = ann[ann['role'].notna()]

In [27]:
from sklearn.preprocessing import LabelEncoder
roles = LabelEncoder().fit_transform(train_data['role'].to_list())

In [28]:
prev = []
for i in range(len(train_data)):
    if train_data['features'].iloc[i] != []:
        prev.append([train_data['key'].iloc[i]] + train_data['features'].iloc[i])
    else:
        prev.append([0, False, False] + [''] * 26)

In [35]:
columns = ['key', 'prep', 'dobj', 'd.lemma', 'd.pos', 'd.animacy', 'd.number', 
               'd.r', 'd.t', 'd.pt', 'd.top', 'd.d', 'd.form', 'v.lemma', 'v.pos', 'v.aspect', 'v.mood', 'v.r', 'v.ca', 'v.aux', 'v_d', 
              's.lemma', 's.pos', 's.animacy', 's.number', 's.r', 's.t', 's.pt', 's.top', 's.d', 's.form']
final_dataset = pd.DataFrame(prev, columns=columns)

In [36]:
final_dataset['role'] = train_data['role'].tolist()

In [39]:
final_dataset = final_dataset.drop('d.form', axis=1).drop('s.form', axis=1)

In [40]:
final_dataset.to_csv('annotated_data.csv')

In [48]:
updated = pd.merge(df, final_dataset, how='outer')
updated = updated[updated['key'] != 0]
updated = updated.fillna('none')

In [52]:
updated.to_csv('dative.csv')