In [1]:
from pymystem3 import Mystem
import pandas as pd
import os
from tqdm.notebook import tqdm
import natasha
from scipy import  stats
from collections import Counter
import pymorphy2
import time
import selenium
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import itertools
import json
from natasha import (
    Segmenter,
    MorphVocab,
    NewsEmbedding,
    NewsMorphTagger,
    NewsSyntaxParser,
    NewsNERTagger,
    PER,
    NamesExtractor,
    Doc
)
segmenter = Segmenter()
morph_vocab = MorphVocab()
emb = NewsEmbedding()
morph_tagger = NewsMorphTagger(emb)
syntax_parser = NewsSyntaxParser(emb)
ner_tagger = NewsNERTagger(emb)
tqdm.pandas()

morph = pymorphy2.MorphAnalyzer()
m = Mystem()

import warnings
warnings.filterwarnings('ignore')

# Create dataset

1. Open original dataset "src/original_dataset.csv"
2. Find grammatical base of each sentence using 'find_base' function -> "result/congruent_sentences.csv"
3. Manually correct the base sentence and save it in the column "sent_corrected" -> "result/congruent_sentences_corrected.csv"
4. Find syntactic structure of the sentences and words' frequeincies and length in the "sent_corrected" column using "find_struct" function -> "congruent_sentences_corrected_marked.csv"
5. For missing values (words' frequencies, gender) in "result/congruent_sentences_corrected_marked.csv" add values manually or using find_ipm_in_ruscorpora function
6. Manually select appropriate sentences in "result/congruent_sentences_corrected_marked.csv" (add column "selected") and split dataset in two groups: "to_evaluate" (experimental stimuli) and "control" (control stimuli) -> "result/congruent_sentences_corrected_marked+.csv"
7. Estimate the distribution of 'length' and 'IPM' stimuli parameters between sentences with different syntactic structure
8. Generate incongruent sentences based on selected stimuli -> "result/incongruent_sentences_toloka.csv"
9. Evaluate sentences via TOLOKA -> "result/toloka_evaluation.csv"
10. Based on TOLOKA evaluation results, select only those sentences which error types were correctly assessed by participants for each of four error types -> "result/final_candidates.csv"
11. Manually check and select most appropriate stimuli for the corpus ->  "result/FINAL_STIMULI.csv"


In [2]:
def find_base(text: str):
    """
    Find grammatical base of the sentence (Subject + Verb + Object)
    and checks whether it is complete (contains three arguments)
    :param text: text of the sentence to check
    :return: (text of grammatical base (str), mark whether base is complete (bool))
    """
    res = {'nsubj': '',
           'root': '',
          'dobj': ''}
    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    doc.parse_syntax(syntax_parser) 
    tokens = {tok.id: tok for tok in doc.tokens}
    ok = 0
    for tok in doc.tokens:
        if tok.rel == 'root':
            res['root'] = tok.text
            ok += tok.pos == 'VERB'
        elif tok.rel == 'nsubj' and tokens[tok.head_id].rel == 'root':
            res['nsubj'] = tok.text
            ok += tok.pos == 'NOUN'
        elif tok.rel == 'obj' and tokens[tok.head_id].rel == 'root':
            res['dobj'] = tok.text
            ok += tok.pos == 'NOUN'        
    return (' '.join(res.values()).strip().capitalize(), ok == 3)

def find_struct(text: str) -> str:
    """
    Fuction to find syntactic structure of the input sentence
    :param text: text of the sentence to check
    :return: syntactic structure of the sentence
    """
    
    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    doc.parse_syntax(syntax_parser) 
    tokens = {tok.id: tok for tok in doc.tokens}
    ok = 0
    if (doc.tokens[0].pos == 'NOUN' and 
        doc.tokens[1].pos == 'VERB' and 
        doc.tokens[2].pos == 'NOUN' and 
        len(doc.tokens) == 3):
        return 'Subject - Verb - Object'
    elif len(doc.tokens) == 3:
        return 'check me'
    elif (doc.tokens[0].pos == 'NOUN' and 
        doc.tokens[1].pos == 'VERB' and 
        doc.tokens[2].pos == 'NOUN' and 
        doc.tokens[3].pos == 'NOUN' and 
        doc.tokens[3].feats['Case'] == 'Gen' and 
        len(doc.tokens) == 4):
        return 'Subject - Verb - Object - Gen'
    elif (doc.tokens[0].pos == 'NOUN' and 
        doc.tokens[1].pos == 'VERB' and 
        doc.tokens[2].pos == 'ADJ' and 
        doc.tokens[3].pos == 'NOUN' and 
        len(doc.tokens) == 4):
        return 'Subject - Verb - Adj - Object'
    elif (doc.tokens[0].pos == 'NOUN' and 
        doc.tokens[1].pos == 'VERB' and 
        doc.tokens[2].pos == 'NOUN' and 
        doc.tokens[3].pos == 'ADP' and 
        doc.tokens[4].pos == 'NOUN' and 
        len(doc.tokens) == 5):
        return 'Subject - Verb - Object - PP'
    
def count_syllables(word):
    """
    Function to count syllables in the word
    :param word: word
    :return: length of word in syllables
    """
    if not isinstance(word, str):
        return
    counter = 0
    vowels = 'аеиюэоыуея'
    for letter in word:
        if letter in vowels:
            counter += 1
    return counter

def normalize(word):
    """
    Function to get lemma (initial form) of the word
    :param word: word
    :return: lemma (initial form) of the word
    """
    if not isinstance(word, str):
        return
    return morph.parse(word)[0].normal_form

In [3]:
def process_table(df: pd.DataFrame, column: str, df_name=None, sheet=None) -> pd.DataFrame:
    """
    A function to find sentence syntactic structure and words' length in syllables
    :param df: pd.DataFrame
    :param df_name: path to df
    :param sheet: sheet_name
    :param column: column name
    :return: DataFrame
    """
    ruscorpora_frequencies = pd.read_csv('src/ruscorpora_frequencies.csv')

    frequencies = {word: ipm for word, ipm in zip(ruscorpora_frequencies['lex'],
                                                  ruscorpora_frequencies['ipm'])}
    
    if df_name is not None:
        df = pd.read_excel(df_name, sheet_name=sheet)
    df['length'] = df[column].apply(lambda x: len(x.split(' ')))
    df['structure'] = df[column].apply(find_struct)
    df['words'] = df[column].apply(lambda x: x.split())

    subjects = []
    verbs = []
    objectives = []
    genitives = []
    adjectives = []
    prepositions = []
    objectives2 = []

    for sent, struct in zip(df['words'], df['structure']):
        subjects.append(sent[0].lower())
        verbs.append(sent[1])
        obj = sent[3] if struct == 'Subject - Verb - Adj - Object' else sent[2]
        objectives.append(obj)
        gen = sent[3] if struct == 'Subject - Verb - Object - Gen' else None
        genitives.append(gen)
        adj = sent[2] if struct == 'Subject - Verb - Adj - Object' else None
        adjectives.append(adj)
        prep = sent[3] if struct == 'Subject - Verb - Object - PP' else None
        prepositions.append(prep)
        if struct == 'Subject - Verb - Object - PP':
            obj2 = sent[4] 
        else:
            obj2 = None
        objectives2.append(obj2)


    df['subject'] = subjects
    df['verb'] = verbs
    df['object'] = objectives
    df['gen'] = genitives
    df['adj'] = adjectives
    df['preposition'] = prepositions
    df['object2'] = objectives2

    for column in ['subject', 'verb', 'object', 'gen', 'adj',
                   'preposition', 'object2']:
        df[f'{column}_length'] = df[column].map(count_syllables)
        df[f'{column}_lemma'] = df[column].map(lambda x: normalize(x))
        df[f'{column}_ipm'] = df[f'{column}_lemma'].apply(lambda x: frequencies.get(x))
    
    return df

In [4]:
def find_ipm_in_ruscorpora(df, target_columns):
    """
    Function to collect words' IPM and gender in ruscorpora.ru
    """

    def find_ipm(word, driver):
        try:
            driver.get(f'https://ruscorpora.ru/explore?req={word}')  
            time.sleep(0.5)
            driver.find_elements(By.CLASS_NAME, 'link--accent')[1].click()
            time.sleep(0.5)
            button = [button for button in driver.find_elements(By.TAG_NAME, 'button')
                     if button.text == 'Частотность'][0]
            button.click()
            time.sleep(0.5)
            return float(driver.find_element(By.XPATH, '/html/body/div[4]/main/div/div[3]/div[1]/div[1]/div[2]/div/table/tbody/tr/td[5]/span').text.replace(',', '.')) 
        except Exception as e:
            print(e, word)
            return 

    driver = webdriver.Chrome()
    
    words = []
    
    for column in target_columns: 
        words += df[f'{column} lemma'].tolist()

    words_ipm_dict = {w: find_ipm(w, driver) for w in tqdm(set(words))}

    for col in ['Subject', 'Object', 'Object 2',
                'Verb', 'Object', 'Gen', 'Adj',
                'Object 2']:
        df[f'ipm_{col}'] = df[f'{col} lemma'].map(words_ipm_dict)
        df[f'{col} gender'] = df[f'{col} lemma'].map(
            lambda x: morph.parse(x)[0].tag.gender if isinstance(x, str) else None)
        
    return df

# Evaluate syntactic structures of original sentences

- Open original dataset "src/original_dataset.csv"
- Find grammatical base of each sentence using 'find_base' function -> "result/congruent_sentences.csv"

In [5]:
if not os.path.exists('result'):
    os.mkdir('result')
    
df = pd.read_csv('src/original_dataset.csv')
df['base'] = df['congruent'].apply(find_base)
df[['base_sent', 'criterion']] = pd.DataFrame(df.base.tolist(), index = df.index)
df = df.drop_duplicates()
df = df[df['criterion']][['congruent', 'base_sent']]
df.to_csv('result/congruent_sentences.csv', index=False)
df.head()

Unnamed: 0,congruent,base_sent
0,Компания начала внутреннее расследование произ...,Компания начала расследование
4,Пара имеет ныне трёх дочерей .,Пара имеет дочерей
8,Тогда пираты похитили четырех человек .,Пираты похитили человек
14,В ассоциации организации комментировать ситуац...,Отказались ситуацию
15,Многие тысячи покинули места погромов .,Тысячи покинули места


- Manually correct the base sentence and save it in the column "sent_corrected" -> "result/congruent_sentences_corrected.csv"
- Find syntactic structure of the sentences and words' frequeincies and length in the "sent_corrected" column using "find_struct" function -> "congruent_sentences_corrected_marked.csv"
- For missing values (words' frequencies, gender) in "result/congruent_sentences_corrected_marked.csv" add values manually or using find_ipm_in_ruscorpora function


In [6]:
df = pd.read_csv('result/congruent_sentences_corrected.csv')
df_marked = process_table(df, 'sent_corrected')
df_marked = df_marked[df_marked['structure'].notnull()]
df_marked.to_csv('result/congruent_sentences_corrected_marked.csv')
df_marked.head()

Unnamed: 0,congruent,base_sent,sent_corrected,length,structure,words,subject,verb,object,gen,...,gen_ipm,adj_length,adj_lemma,adj_ipm,preposition_length,preposition_lemma,preposition_ipm,object2_length,object2_lemma,object2_ipm
2,Авиакомпания официально подтвердила факт круше...,Авиакомпания подтвердила факт,Авиакомпания подтвердила факт крушения,4,Subject - Verb - Object - Gen,"[Авиакомпания, подтвердила, факт, крушения]",авиакомпания,подтвердила,факт,крушения,...,,,,,,,,,,
3,Также массовую дезинфекцию проходят городские ...,Автобусы проходят дезинфекцию,Автобусы проходят массовую дезинфекцию,4,Subject - Verb - Adj - Object,"[Автобусы, проходят, массовую, дезинфекцию]",автобусы,проходят,дезинфекцию,,...,,4.0,массовый,50.834561,,,,,,
4,В результате автомобиль получил технические по...,Автомобиль получил повреждения,Автомобиль получил технические повреждения,4,Subject - Verb - Adj - Object,"[Автомобиль, получил, технические, повреждения]",автомобиль,получил,повреждения,,...,,5.0,технический,73.606628,,,,,,
5,Однако сами авторы выдвинули более осторожное ...,Авторы выдвинули предположение,Авторы выдвинули осторожное предположение,4,Subject - Verb - Adj - Object,"[Авторы, выдвинули, осторожное, предположение]",авторы,выдвинули,предположение,,...,,5.0,осторожный,,,,,,,
6,Авторы письма выразили возмущение перебоями ра...,Авторы выразили возмущение,Авторы выразили возмущение,3,Subject - Verb - Object,"[Авторы, выразили, возмущение]",авторы,выразили,возмущение,,...,,,,,,,,,,


In [7]:
df_marked.value_counts('structure')

structure
Subject - Verb - Object          297
Subject - Verb - Adj - Object    184
Subject - Verb - Object - Gen    149
check me                          18
Subject - Verb - Object - PP      14
Name: count, dtype: int64

In [8]:
# target_columns = ['Subject', 'Object','Verb', 'Gen', 'Adj']
# df_processed =  find_ipm_in_ruscorpora(df_marked, target_columns)

# Stimuli evaluation and selection

- Manually select appropriate sentences in "result/congruent_sentences_corrected_marked.csv" (add column "selected") and split dataset in two groups: "to_evaluate" (experimental stimuli) and "control" (control stimuli) -> "result/congruent_sentences_corrected_marked+.csv"
- Estimate the distribution of 'length' and 'IPM' stimuli parameters between sentences with different syntactic structure

In [9]:
df = pd.read_csv('result/congruent_sentences_corrected_marked+.csv')
df = df[(df['task_type'] == 'to_evaluate') & (
    df['selected'] == 1) & (
    df['structure'].isin(['Subject - Verb - Object',
                          'Subject - Verb - Object - Gen',
                          'Subject - Verb - Adj - Object']))]

for item in ['subject', 'verb', 'object']:
    print(item)
    df[f'{item}_ipm'] = df[f'{item}_ipm'].dropna()
    for comb in itertools.combinations(df['structure'].unique(), 2):
        print(comb)
        print('ipm', 
              stats.ttest_ind(df[df['structure'] == comb[0]][f'{item}_ipm'].dropna().tolist(),
                              df[df['structure'] == comb[1]][f'{item}_ipm'].dropna().tolist()).pvalue > 0.05)
        print('length', 
              stats.ttest_ind(df[df['structure'] == comb[0]][f'{item}_length'].dropna().tolist(),
                              df[df['structure'] == comb[1]][f'{item}_length']).pvalue > 0.05)
        print()

subject
('Subject - Verb - Adj - Object', 'Subject - Verb - Object')
ipm True
length True

('Subject - Verb - Adj - Object', 'Subject - Verb - Object - Gen')
ipm True
length True

('Subject - Verb - Object', 'Subject - Verb - Object - Gen')
ipm True
length True

verb
('Subject - Verb - Adj - Object', 'Subject - Verb - Object')
ipm True
length True

('Subject - Verb - Adj - Object', 'Subject - Verb - Object - Gen')
ipm True
length True

('Subject - Verb - Object', 'Subject - Verb - Object - Gen')
ipm True
length True

object
('Subject - Verb - Adj - Object', 'Subject - Verb - Object')
ipm True
length True

('Subject - Verb - Adj - Object', 'Subject - Verb - Object - Gen')
ipm True
length True

('Subject - Verb - Object', 'Subject - Verb - Object - Gen')
ipm True
length True



In [10]:
df['structure'].value_counts()

structure
Subject - Verb - Adj - Object    75
Subject - Verb - Object          75
Subject - Verb - Object - Gen    75
Name: count, dtype: int64

In [11]:
stats.ttest_ind(df[df['structure'] == 'Subject - Verb - Adj - Object'][f'adj_ipm'].dropna().tolist(),
                df[df['structure'] == 'Subject - Verb - Object - Gen'][f'gen_ipm'].dropna().tolist()).pvalue

np.float64(0.8337370955973877)

In [12]:
df_control = df[df['task_type'] == 'control']
df_evaluate = df[df['task_type'] == 'to_evaluate']

for item in ['subject', 'verb', 'object']:
    print(item)
    print('ipm', 
              stats.ttest_ind(df_evaluate[f'{item}_ipm'].dropna().tolist(),
                              df_control[f'{item}_ipm'].dropna().tolist()).pvalue > 0.05)
    print('length', 
              stats.ttest_ind(df_evaluate[f'{item}_length'].dropna().tolist(),
                              df_control[f'{item}_length']).pvalue > 0.05)
    print()

subject
ipm False
length False

verb
ipm False
length False

object
ipm False
length False



## Stimuli selection based on Toloka assessment

- Generate incongruent sentences based on selected stimuli -> "result/incongruent_sentences_toloka.csv"
- Evaluate sentences via TOLOKA -> "result/toloka_evaluation.csv"

In [13]:
incongruent_stimuli = pd.read_csv('result/incongruent_sentences_toloka.csv')
evaluation_res = pd.read_csv('result/toloka_evaluation.csv')
stimuli_properties = pd.read_csv('result/congruent_sentences_corrected_marked+.csv')

df = pd.merge(incongruent_stimuli, evaluation_res, on=['sentence'])
df = pd.merge(df, stimuli_properties, on=['sentence_id', 'congruent', 'position'])
df.head()

Unnamed: 0,sentence_id,congruent,position,sentence,target,semantics_grammar,semantics,grammar,no,unknown,...,gen_length,gen_lemma,gen_ipm,gen_gender,adj_length,adj_lemma,adj_ipm,adj_gender,task_type,selected
0,0,Автобусы проходят массовую дезинфекцию,3,Автобусы проходят массовую дезинфекцией,grammar,,,1.0,,,...,,,0.02,,4.0,массовый,50.834561,masc,to_evaluate,1
1,0,Автобусы проходят массовую дезинфекцию,3,Автобусы проходят массовую фортуну,semantics,,1.0,,,,...,,,0.02,,4.0,массовый,50.834561,masc,to_evaluate,1
2,0,Автобусы проходят массовую дезинфекцию,3,Автобусы проходят массовую фортуной,semantics_grammar,1.0,,,,,...,,,0.02,,4.0,массовый,50.834561,masc,to_evaluate,1
3,0,Автобусы проходят массовую дезинфекцию,3,Автобусы проходят массовую дезинфекцию,no,,0.166667,,0.833333,,...,,,0.02,,4.0,массовый,50.834561,masc,to_evaluate,1
4,2,Авторы получали подарки,2,Авторы получали районах,semantics_grammar,1.0,,,,,...,,,0.02,,,,0.02,,to_evaluate,1


 - Based on TOLOKA evaluation results, select only those sentences which error types were correctly assessed by participants for each of four error types -> "result/final_candidates.csv"

In [14]:
df = df[(df['selected'] == True) & (df['task_type'] == 'to_evaluate'
       ) & (df['percent'] >= 0.75) & (df['target'] == df['most_popular'])].drop_duplicates()
selected_ids = []

for sent_id, sent_group in df.groupby('sentence_id'):
    if sent_group['target'].unique().shape[0] == 4:
        selected_ids.append(sent_id)
df = df[df['sentence_id'].isin(selected_ids)]
df.to_csv('result/final_candidates.csv', index=False)
df.head()

Unnamed: 0,sentence_id,congruent,position,sentence,target,semantics_grammar,semantics,grammar,no,unknown,...,gen_length,gen_lemma,gen_ipm,gen_gender,adj_length,adj_lemma,adj_ipm,adj_gender,task_type,selected
0,0,Автобусы проходят массовую дезинфекцию,3,Автобусы проходят массовую дезинфекцией,grammar,,,1.0,,,...,,,0.02,,4.0,массовый,50.834561,masc,to_evaluate,1
1,0,Автобусы проходят массовую дезинфекцию,3,Автобусы проходят массовую фортуну,semantics,,1.0,,,,...,,,0.02,,4.0,массовый,50.834561,masc,to_evaluate,1
2,0,Автобусы проходят массовую дезинфекцию,3,Автобусы проходят массовую фортуной,semantics_grammar,1.0,,,,,...,,,0.02,,4.0,массовый,50.834561,masc,to_evaluate,1
3,0,Автобусы проходят массовую дезинфекцию,3,Автобусы проходят массовую дезинфекцию,no,,0.166667,,0.833333,,...,,,0.02,,4.0,массовый,50.834561,masc,to_evaluate,1
4,2,Авторы получали подарки,2,Авторы получали районах,semantics_grammar,1.0,,,,,...,,,0.02,,,,0.02,,to_evaluate,1


- Manually check and select most appropriate stimuli for the corpus ->  "result/FINAL_STIMULI.csv"

In [15]:
df = pd.read_csv('STIMULI_FINAL.csv')

for_stats = df[['sentence_id','structure', 'subject_length', 'subject_ipm',
    'verb_length', 'verb_ipm', 'object_length', 'object_ipm',
    'gen_ipm', 'gen_length', 'adj_length', 'adj_ipm']].drop_duplicates()

stats_df = []
for arg in ['subject', 'verb', 'object', 'gen', 'adj']:
    for struct, struct_df in for_stats.groupby('structure'):
        stats_df.append({'Sentence structure': struct,
                        'Sentence argument': arg,
                        'Mean length': struct_df[f'{arg}_length'].mean(),
                        'SD length': struct_df[f'{arg}_length'].std(),
                        'Mean IPM': struct_df[f'{arg}_ipm'].mean(),
                        'SD IPM': struct_df[f'{arg}_ipm'].std()})
pd.DataFrame(stats_df).round(2)

Unnamed: 0,Sentence structure,Sentence argument,Mean length,SD length,Mean IPM,SD IPM
0,Subject - Verb - Adj - Object,subject,3.2,1.01,162.41,427.71
1,Subject - Verb - Object,subject,3.18,0.98,102.75,136.13
2,Subject - Verb - Object - Gen,subject,3.6,1.28,117.35,187.18
3,Subject - Verb - Adj - Object,verb,3.5,0.97,154.37,180.9
4,Subject - Verb - Object,verb,3.64,0.94,113.02,159.37
5,Subject - Verb - Object - Gen,verb,3.76,0.89,123.96,201.76
6,Subject - Verb - Adj - Object,object,3.26,1.4,94.79,87.65
7,Subject - Verb - Object,object,3.3,1.2,120.05,168.54
8,Subject - Verb - Object - Gen,object,3.32,1.32,129.81,209.4
9,Subject - Verb - Adj - Object,gen,,,0.02,0.0
