In [39]:
import json
import pandas as pd
import re
from tqdm.notebook import tqdm

### Данные из FrameBank

In [40]:
roles_df = pd.read_csv('data_from_framebank.csv')

In [41]:
roles_df.head()

Unnamed: 0,Unnamed: 1,ExIndex,Example_x,ConstrIndex,Place_x,PhraseGen,WordDep,Form_x,Role_x,Rank_x,...,KeyLexemes,Unnamed: 15,Place_y,Form_y,Role_y,New_role,Rank_y,SemClass,KeyLexemes_x,Example_y
0,0,1021,"<p><se> <w><ana lex=""Васька"" sem=""d:dimb t:hu...",201.0,1.0,Ваське,Ваське,Sdat,субъект психологического состояния,Несобственный,...,волноваться,,,,,экспериенцер,,,,
1,1,1142,"<p><se> <w><ana lex=""тогда"" sem=""t:time r:dem...",228.0,1.0,им,им,SPROdat,субъект перемещения,Несобственный,...,"выступать,выступить",,,,,экспериенцер,,,,
2,6,29231,"<se> <w><ana lex=""режиссер"" gr=""S,m,anim=sg,no...",303.0,4.0,к кулисе,к кулисе,к + Sdat,конечная точка,Периферия,...,"вывести,выводить",,,,,направление,,,,
3,8,36593,"<p><se> <w><ana lex=""Буш"" sem=""t:hum r:propn ...",1001.0,1.0,публике,публике,Sdat,субъект поведения,Несобственный,...,аплодировать,,,,,экспериенцер,,,,
4,9,42237,"<p><se>-- <w>Итак</w>, -- <w><ana lex=""бодро""...",1002.0,3.0,комиссару,комиссару,Sdat,адресат,Периферия,...,аплодировать,0.0,3.0,Sdat,адресат,адресат,Периферия,лицо,аплодировать,Все аплодировали хору [артисту]. Зал аплодиров...


In [42]:
with open('annotated_corpus_fixed+syntaxnet.json') as file:
    corpus = json.load(file)

### Признаки

#### Непрямой объект:

1. Часть речи
2. Одушевленность
2. Число
3. Семантическая классификация НКРЯ
4. Лемма

####  Субъект:

1. Часть речи
2. Одушевленность
2. Число
3. Семантическая классификация НКРЯ
4. Лемма

#### Предикат:

1. Семантическая классификация НКРЯ
2. Часть речи
3. Вид глагола, если глагол
4. Наклонение, если глагол
5. Лемма

#### Дополнительные признаки

1. Наличие предлога: *бинарный*
2. Наличие прямого объекта: *бинарный*

In [43]:
class FeatureExtractor():
    def __init__(self, corpus):
        self.corpus = corpus
        
        
    def parse_noun_morphology(self, morph: str) -> tuple:
        """
        This function extracts nominal morphology from RNC annotation. 

        Arguments:
        morph: morphology information from Russian National Corpora

        The function returns the part of speech, animacy and number. 
        """

        morph = morph.split()
        if len(morph) == 1:
            morph = re.split(r',|=', morph[0])
        pos = morph[0]
        if len(morph) == 1:
            animacy = 'none'
            number = 'none'
        elif pos == 'S':
            animacy = morph[-3]
            number = morph[-2] 
        else:
            animacy = 'none'
            number = morph[-2] 
        return pos, animacy, number
    
    
    def parse_noun_semantics(self, sem: str) -> list:
        """
        This function extracts the nominal semantics 
        from RNC annotation.

        Arguments:
        sem: the string with the semantic annotation 
        from Russian National Corpora

        The function returns the semantic class of the nouns, 
        their taxonomic class, meteological class, topological 
        class and information about their derivational structure
        """
        sem_dict = {'r': '', 't': '', 'pt': '', 'top': '', 'd': ''}
        for s in sem.split():
            for c in sem_dict:
                if s.startswith(c):
                    sem_dict[c] = s
        return list(sem_dict.values())
    

    def noun_features(self, word: str) -> tuple:
        """
        This function extracts morphological and semantic 
        annotation about a noun from the raw data
        
        Arguments:
        word: a string with the information about a word
        
        The function returns a lemma, a part of speech, information 
        about animacy, number, semantic class, taxonomic class, 
        meteological class, topological class, derivational 
        structure and a form itself
        """
        if 'feat' in word:
            pos, animacy, number = self.parse_noun_morphology(word['feat'])
        else:
            pos = word['postag_p']
            animacy = 'none'
            number = 'none'
        if 'sem' in word:
            r, t, pt, top, d = self.parse_noun_semantics(word['sem'])
        else:
            r, t, pt, top, d = ['none'] * 5
        form = word['form']
        if 'lemma' in word:
            lem_form = word['lemma']
        else:
            lem_form = ''
        return lem_form, pos, animacy, number, r, t, pt, top, d, form
    
    
    def parse_verb_morphology(self, morph: str) -> tuple:
        """
        This function extracts verbal morphology 
        from RNC annotation. 

        Arguments:
        morph: morphology information 
        from Russian National Corpora

        The function returns the part of speech, 
        aspect and mood if the predicate is a verb
        """
        morph = morph.split()
        if len(morph) == 1:
            morph = morph[0].split(',')
        pos = morph[0]
        if pos == 'V':
            aspect = morph[1]
            mood = morph[-1]
        else:
            aspect = 'none'
            mood = 'none'
        return pos, aspect, mood

    
    def parse_verb_semantics(self, sem: str) -> tuple:
        """
        This function extracts the verbal semantics 
        from RNC annotation.

        Arguments:
        sem: the string with the semantic annotation 
        from Russian National Corpora
        
        The function returns information about taxonomic class, 
        causative / non-causative verb, auxiliary / non-auxiliary verb, 
        derivational structure
        """
        sem_dict = {'t': '', 'ca': '', 'aux': '', 'd': ''}
        for s in sem.split():
            for c in sem_dict:
                if s.startswith(c):
                    sem_dict[c] = s
        return list(sem_dict.values())
    

    def verb_features(self, word: str) -> tuple:
        """
        This function extracts morphological and semantic annotation 
        about a verb from the raw data
        
        Arguments:
        word: a string with the information about a word
        
        The function returns a lemma, a part of speech, aspect, mood,
        information about taxonomic class, causative / non-causative verb,
        auxiliary / non-auxiliary verb, derivational structure
        """
        if 'feat' in word:
            pos, aspect, mood = self.parse_verb_morphology(word['feat'])
        else:
            pos = word['postag_p']
            aspect = 'none'
            mood = 'none'
        if 'sem' not in word:
            t, ca, aux, d = [''] * 4
        else:
            t, ca, aux, d = self.parse_verb_semantics(word['sem'])
        if 'lemma' in word:
            lem_form = word['lemma']
        else:
            lem_form = ''
        return lem_form, pos, aspect, mood, t, ca, aux, d

    
    def verb_dependencies(self, sent: str, verb: str) -> tuple:
        """
        This function parses the arguments of a verb
        
        Arguments:
        sent: the raw data about a sentence
        verb: the verb form 
        
        The function returns Boolen variable (true if the 
        verb has a direct object) and the features of a verb subject
        """
        dobj = False
        subject_features = ['no_subject'] * 9
        for i in sent:
            if 'link_name' in i:
                if i['link_name'] == 'dobj' and i['parent'] == verb:
                    dobj = True
                if i['link_name'] == 'nsubj' and i['parent'] == verb:
                    subject_features = self.noun_features(i)
        return dobj, subject_features


    def find_syntactic_head(self, sent: str, dative: str):
        """
        This function finds the syntactic head of a noun phrase 
        in the Dative case
        
        Arguments:
        sent: the raw data about a sentence
        dative: a word marked by the Dative case
        
        The function returns information about the preposition, 
        the features of a predicate and the features of a noun in Dative
        """
        idx = dative['parent']
        head = sent[idx]
        while 'feat' in head and 'dat' in head['feat']:
            dative = head
            idx = head['parent']
            head = sent[idx]
            if 'feat' not in head or 'dat' not in head['feat']:
                break
            if sent[head['parent']] == dative:
                break 
                
        dat_features = self.noun_features(dative)
        prep = False
        if idx == -1:
            verb = sent[sent.index(dative)-1]
        else:
            verb = head
        if head['link_name'] == 'case':
            prep = True
            idx = head['parent']
            if idx == -1:
                verb = sent[head.index()-1]
            else:
                verb = sent[idx]
            verb = sent[idx]
        if verb['link_name'] != 'punct':
            features = self.verb_features(verb)
        else:
            features = tuple(['punct'] * 7)
            
        return idx, prep, features, dat_features
    

    def parse_sentence(self, sent: str, dative: str) -> tuple:
        """
        This functions parses the sentence and returns 
        the sentence and its features
        
        Arguments:
        sent: the raw data about the sentence
        dative: the word in the Dative case
        """
        sentence = [i['form'] for i in sent]
        verb, prep, verb_f, dat_features = self.find_syntactic_head(sent, dative)
        dobj, subj_f = self.verb_dependencies(sent, verb)
        features = (prep, dobj)
        features += dat_features + verb_f + tuple(subj_f)
        return sentence, features
    
    
    def check_dative(self, sent: str) -> tuple:
        """
        This function checks if there is a noun phrase 
        marked by Dative in the sentence
        
        Arguments:
        sent: a sentence to check in
        
        The function returns Boolen variable (true if 
        there is a Dative noun phrase) and, if it is true,
        the word in the Dative case
        """
        dative_sentence = False
        dative = ''
        for i in sent:
            if 'feat' in i.keys() and 'dat' in i['feat']:
                dative_sentence = True
                dative = i
                break
        return dative_sentence, dative


    def dataset(self):
        """
        The function extracts features from unlabelled data
        and returns a DataFrame with features
        """
        idx = 0
        sent_list = []
        annotated = {}
        columns = ['key', 'prep', 'dobj', 'd.lemma', 'd.pos', 'd.animacy', 
                   'd.number', 'd.r', 'd.t', 'd.pt', 'd.top', 'd.d', 'd.form',
                   'v.lemma', 'v.pos', 'v.aspect', 'v.mood', 'v.r', 'v.ca', 'v.aux', 
                   'v_d', 's.lemma', 's.pos', 's.animacy', 's.number', 's.r', 
                   's.t', 's.pt', 's.top', 's.d', 's.form']
        df_raws = []
        for key, value in tqdm(self.corpus.items()):
            for n, sent in enumerate(value):
                dative, word = self.check_dative(sent)
                if dative == True:
                    sentence, features = self.parse_sentence(sent, word)
                    sent_list.append([idx, key, ' '.join(sentence)])
                    annotated[key] = value
                    idx = str(key) + '_' + str(n)
                    raw = [idx] + list(features)
                    df_raws.append(raw)
        df = pd.DataFrame(df_raws, columns=columns)
        return sent_list, df, annotated
    
    
    def dataset_with_annotation(self):
        """
        The function extracts features from unannotated data and finds the labels
        from FrameBank by Example Index
        
        The function returns a DataFrame with features 
        """
        df_raws = []
        for i in tqdm(range(len(self.corpus))):
            all_features = []
            idx = ''
            for n, sent in enumerate(self.corpus['syntax_net'].iloc[i]):
                dative, word = self.check_dative(sent)
                if dative == True:
                    sentence, features = self.parse_sentence(sent, word)
                    if list(features) != [] and type(self.corpus['WordDep'].iloc[i]) != float:
                        aim_form = features[11].lower().strip()
                        form = self.corpus['WordDep'].iloc[i].lower()
                        if aim_form == form.strip() or (' ' in form and aim_form == form.split()[1]):
                            all_features = list(features)
                            idx = self.corpus['ExIndex'].iloc[i].astype(str) + '_' + str(n)
            df_raws.append([idx, all_features, self.corpus['New_role'].iloc[i]])    
            ann = pd.DataFrame(df_raws, columns=['key', 'features', 'role'])
        return ann[ann['role'].notna()]

### Создание датасета

Создание датасета из неаннотированных данных: 

In [44]:
feature_extractor = FeatureExtractor(corpus)
sentences, df, annotated = feature_extractor.dataset()

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=59860.0), HTML(value='')))




In [45]:
df.head()

Unnamed: 0,key,prep,dobj,d.lemma,d.pos,d.animacy,d.number,d.r,d.t,d.pt,...,s.lemma,s.pos,s.animacy,s.number,s.r,s.t,s.pt,s.top,s.d,s.form
0,89370_1,False,False,пытка,S,inan,pl,r:abstr,t:impact,,...,Бухарин,S,anim,sg,r:propn,t:famn,,,,Бухарина
1,89371_0,False,False,капитан,S,anim,sg,r:concr,t:armpos,,...,патруль,S,inan,sg,r:concr,t:group,pt:set,,,патруль
2,89371_2,False,True,население,S,inan,sg,r:concr,,pt:aggr,...,no_subject,no_subject,no_subject,no_subject,no_subject,no_subject,no_subject,no_subject,no_subject,
3,89371_17,False,True,самоубийца,S,anim,sg,r:concr,t:hum,,...,no_subject,no_subject,no_subject,no_subject,no_subject,no_subject,no_subject,no_subject,no_subject,
4,89371_18,False,True,патруль,S,inan,sg,r:concr,t:group,pt:set,...,арестовать,V,none,inf,,,,,der:s,арестовать


Создание csv-файла с индексами и предложениями: 

In [46]:
sent_df = pd.DataFrame(sentences, columns=['idx', 'key', 'sentence'])
sent_df.to_csv('sent.csv')

Создание датасета из аннотированных примеров:

In [47]:
roles_df['syntax_net'] = [corpus[str(roles_df['ExIndex'].iloc[i])] for i in range(len(roles_df))]
ann_feature_extractor = FeatureExtractor(roles_df)
ann = ann_feature_extractor.dataset_with_annotation()

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2664.0), HTML(value='')))




In [48]:
prev = []
for i in range(len(ann)):
    if ann['features'].iloc[i] != []:
        prev.append([ann['key'].iloc[i]] + ann['features'].iloc[i])
    else:
        prev.append([0, False, False] + [''] * 26)

In [49]:
columns = ['key', 'prep', 'dobj', 'd.lemma', 'd.pos', 'd.animacy',
           'd.number', 'd.r', 'd.t', 'd.pt', 'd.top', 'd.d', 'd.form', 
           'v.lemma', 'v.pos', 'v.aspect', 'v.mood', 'v.r', 'v.ca', 
           'v.aux', 'v_d', 's.lemma', 's.pos', 's.animacy', 's.number', 
           's.r', 's.t', 's.pt', 's.top', 's.d', 's.form']

final_dataset = pd.DataFrame(prev, columns=columns)
final_dataset['role'] = ann['role'].tolist()
final_dataset = final_dataset.drop('d.form', axis=1).drop('s.form', axis=1)
final_dataset.to_csv('annotated_data.csv')

Создание объединенного датасета, где аннотированные примеры имеют метку класса, а остальные имеют класс 'none':

In [50]:
updated = pd.merge(df, final_dataset, how='outer')
updated = updated[updated['key'] != 0].fillna('none')
updated.to_csv('dative.csv')

In [51]:
updated.head()

Unnamed: 0,key,prep,dobj,d.lemma,d.pos,d.animacy,d.number,d.r,d.t,d.pt,...,s.pos,s.animacy,s.number,s.r,s.t,s.pt,s.top,s.d,s.form,role
0,89370_1,False,False,пытка,S,inan,pl,r:abstr,t:impact,,...,S,anim,sg,r:propn,t:famn,,,,Бухарина,none
1,89371_0,False,False,капитан,S,anim,sg,r:concr,t:armpos,,...,S,inan,sg,r:concr,t:group,pt:set,,,патруль,none
2,89371_2,False,True,население,S,inan,sg,r:concr,,pt:aggr,...,no_subject,no_subject,no_subject,no_subject,no_subject,no_subject,no_subject,no_subject,none,none
3,89371_17,False,True,самоубийца,S,anim,sg,r:concr,t:hum,,...,no_subject,no_subject,no_subject,no_subject,no_subject,no_subject,no_subject,no_subject,none,none
4,89371_18,False,True,патруль,S,inan,sg,r:concr,t:group,pt:set,...,V,none,inf,,,,,der:s,арестовать,none
