In [7]:
from pymystem3 import Mystem
import pandas as pd
from tqdm.notebook import tqdm
import natasha
from scipy import  stats
from collections import Counter
import pymorphy2
import time
import selenium
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import itertools
import json
from natasha import (
    Segmenter,
    MorphVocab,
    
    NewsEmbedding,
    NewsMorphTagger,
    NewsSyntaxParser,
    NewsNERTagger,
    
    PER,
    NamesExtractor,

    Doc
)
segmenter = Segmenter()
morph_vocab = MorphVocab()
emb = NewsEmbedding()
morph_tagger = NewsMorphTagger(emb)
syntax_parser = NewsSyntaxParser(emb)
ner_tagger = NewsNERTagger(emb)
tqdm.pandas()

morph = pymorphy2.MorphAnalyzer()

In [3]:
def find_base(text):
    res = {'nsubj': '',
           'root': '',
          'dobj': ''}
    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    doc.parse_syntax(syntax_parser) 
    tokens = {tok.id: tok for tok in doc.tokens}
    ok = 0
    for tok in doc.tokens:
        if tok.rel == 'root':
            res['root'] = tok.text
            ok += tok.pos == 'VERB'
        elif tok.rel == 'nsubj' and tokens[tok.head_id].rel == 'root':
            res['nsubj'] = tok.text
            ok += tok.pos == 'NOUN'
        elif tok.rel == 'obj' and tokens[tok.head_id].rel == 'root':
            res['dobj'] = tok.text
            ok += tok.pos == 'NOUN'        
    return (' '.join(res.values()).strip().capitalize(), ok == 3)

m = Mystem()

In [None]:
df_noun = pd.read_excel('Neurointerfaces_ incongruent sentences.xlsx', 'all_NOUN')
df_noun['table_name'] = 'all_NOUN'
df_verb = pd.read_excel('Neurointerfaces_ incongruent sentences.xlsx', 'all_VERB')
df_verb['table_name'] = 'all_VERB'
df_adj = pd.read_excel('Neurointerfaces_ incongruent sentences.xlsx', 'all_ADJ')
df_adj['table_name'] = 'all_ADJ'
df = pd.concat([df_noun, df_verb, df_adj])

In [None]:
df['base'] = df['congruent'].progress_apply(find_base)
df[['base sent', 'criterion']] = pd.DataFrame(df.base.tolist(), index = df.index)
df['criterion'].sum()

In [None]:
df = df.drop_duplicates()
df[df['criterion']].to_excel('stimuli_AZH.xlsx', index=False)

# Проверка

In [4]:
def find_struct(text):
    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    doc.parse_syntax(syntax_parser) 
    tokens = {tok.id: tok for tok in doc.tokens}
    ok = 0
    if (doc.tokens[0].pos == 'NOUN' and 
        doc.tokens[1].pos == 'VERB' and 
        doc.tokens[2].pos == 'NOUN' and 
        len(doc.tokens) == 3):
        return 'Subject - Verb - Object'
    elif len(doc.tokens) == 3:
        return 'check me'
    elif (doc.tokens[0].pos == 'NOUN' and 
        doc.tokens[1].pos == 'VERB' and 
        doc.tokens[2].pos == 'NOUN' and 
        doc.tokens[3].pos == 'NOUN' and 
        doc.tokens[3].feats['Case'] == 'Gen' and 
        len(doc.tokens) == 4):
        return 'Subject - Verb - Object - Gen'
    elif (doc.tokens[0].pos == 'NOUN' and 
        doc.tokens[1].pos == 'VERB' and 
        doc.tokens[2].pos == 'ADJ' and 
        doc.tokens[3].pos == 'NOUN' and 
        len(doc.tokens) == 4):
        return 'Subject - Verb - Adj - Obj'
    elif (doc.tokens[0].pos == 'NOUN' and 
        doc.tokens[1].pos == 'VERB' and 
        doc.tokens[2].pos == 'NOUN' and 
        doc.tokens[3].pos == 'ADP' and 
        doc.tokens[4].pos == 'NOUN' and 
        len(doc.tokens) == 5):
        return 'Subject - Verb - Obj - PP'
    
def count_syllables(word):
    if not isinstance(word, str):
        return
    counter = 0
    vowels = 'аеиюэоыуея'
    for letter in word:
        if letter in vowels:
            counter += 1
    return counter

def normalize(word):
    if not isinstance(word, str):
        return
    return morph.parse(word)[0].normal_form

In [5]:
def process_table(df: pd.DataFrame, column: str, df_name=None, sheet=None) -> pd.DataFrame:
    """
    A function to find sentence syntactic structure,
    words' length in syllables and word frequencies
    :param df: pd.DataFrame
    :param df_name: path to df
    :param sheet: sheet_name
    :param column: column name
    :return: DataFrame
    """
    if df_name is not None:
        df = pd.read_excel(df_name, sheet_name=sheet)
    df['length'] = df[column].apply(lambda x: len(x.split(' ')))
    df['Structure'] = df[column].progress_apply(find_struct)
    df['words'] = df[column].apply(lambda x: x.split())

    subjects = []
    verbs = []
    objectives = []
    genitives = []
    adjectives = []
    prepositions = []
    objectives2 = []

    for sent, struct in zip(df['words'], df['Structure']):
        subjects.append(sent[0].lower())
        verbs.append(sent[1])
        obj = sent[2] if struct != 'Subject - Verb - Adj - Object' else sent[3]
        objectives.append(obj)
        gen = sent[3] if struct == 'Subject - Verb - Object - Gen' else None
        genitives.append(gen)
        adj = sent[2] if struct == 'Subject - Verb - Adj - Object' else None
        adjectives.append(adj)
        prep = sent[3] if struct == 'Subject - Verb - Object - PP' else None
        prepositions.append(prep)
        if struct == 'Subject - Verb - Object - PP':
            obj2 = sent[4] 
        else:
            obj2 = None
        objectives2.append(obj2)


    df['Subject'] = subjects
    df['Verb'] = verbs
    df['Object'] = objectives
    df['Gen'] = genitives
    df['Adj'] = adjectives
    df['Preposition'] = prepositions
    df['Object 2'] = objectives2

    for column in ['Subject', 'Verb',
                   'Object', 'Gen', 'Adj',
                   'Preposition', 'Object 2']:
        df[f'{column} length'] = df[column].map(count_syllables)
        df[f'{column} lemma'] = df[column].map(lambda x: normalize(x))
    
    return df

In [6]:
def find_ipm_in_dict(df):
    frequency_noun = pd.read_excel('ruscorpora_content_noun.xlsx')
    frequency_adj = pd.read_excel('ruscorpora_content_adj.xlsx')
    frequency_verb = pd.read_excel('ruscorpora_content_verb.xlsx')

    frequency_noun = frequency_noun.rename(columns={'ipm': 'ipm_noun'})
    frequency_adj = frequency_adj.rename(columns={'ipm': 'ipm_adj'})
    frequency_verb = frequency_verb.rename(columns={'ipm': 'ipm_verb'})

    df = pd.merge(df, frequency_noun[['lex', 'ipm_noun']], how='left',
                  left_on='Subject lemma', right_on='lex')
    df = pd.merge(df, frequency_noun[['lex', 'ipm_noun']], how='left',
                  left_on='Gen lemma', right_on='lex')
    df = pd.merge(df, frequency_noun[['lex', 'ipm_noun']], how='left', 
                  left_on='Object lemma', right_on='lex')
    df = pd.merge(df, frequency_noun[['lex', 'ipm_noun']], how='left', 
                  left_on='Object 2 lemma', right_on='lex')
    df = pd.merge(df, frequency_verb[['lex', 'ipm_verb']], how='left', 
                  left_on='Verb lemma', right_on='lex')
    df = pd.merge(df, frequency_adj[['lex', 'ipm_adj']], how='left', 
                  left_on='Adj lemma', right_on='lex')
    
    return df

In [7]:
def find_ipm_in_ruscorpora(df, target_columns):    

    def find_ipm(word, driver):
        try:
            driver.get(F'https://ruscorpora.ru/explore?req={word}')  
            time.sleep(0.5)
            driver.find_elements(By.CLASS_NAME, 'link--accent')[1].click()
            time.sleep(0.5)
            button = [button for button in driver.find_elements(By.TAG_NAME, 'button')
                     if button.text == 'Частотность'][0]
            button.click()
            time.sleep(0.5)
            return float(driver.find_element(By.XPATH, '/html/body/div[4]/main/div/div[3]/div[1]/div[1]/div[2]/div/table/tbody/tr/td[5]/span').text.replace(',', '.')) 
        except Exception as e:
            print(e, word)
            return 

    driver = webdriver.Chrome()
    
    words = []
    
    for column in target_columns: 
        words += df[f'{column} lemma'].tolist()

    words_ipm_dict = {w: find_ipm(w, driver) for w in tqdm(set(words))}

    for col in ['Subject', 'Object', 'Object 2',
                'Verb', 'Object', 'Gen', 'Adj',
                'Object 2']:
        df[f'ipm_{col}'] = df[f'{col} lemma'].map(words_ipm_dict)
        df[f'{col} gender'] = df[f'{col} lemma'].map(
            lambda x: morph.parse(x)[0].tag.gender if isinstance(x, str) else None)
        
    return df

# Выбор контрольных стимулов

In [8]:
my_df = pd.read_excel('stimuli_versions\stimuli_AZH.xlsx', sheet_name='stimuli')
katya_df = pd.read_excel('stimuli_versions\stimuli_new.xlsx', sheet_name='break_everything')
result_df = pd.merge(katya_df, my_df, how='left', left_on='congruent', right_on='sent corrected' )

## Объединение датасетов

In [9]:
result_df_processed = result_df[result_df['Structure'].notnull()]

result_df_unprocessed = result_df[result_df['Structure'].isnull()].drop_duplicates(subset=['congruent'])
df_processed = process_table(result_df_unprocessed, 'congruent')

df_processed = df_processed[df_processed['Structure'].isin(['Subject - Verb - Object - Gen',
                                               'Subject - Verb - Adj - Obj',
                                               'Subject - Verb - Obj']
                                               )]
target_columns = ['Subject', 'Object','Verb', 'Gen', 'Adj']
df_processed =  find_ipm_in_ruscorpora(df_processed, target_columns)
df_processed.head()

  0%|          | 0/12 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f'ipm_{col}'] = df[f'{col} lemma'].map(words_ipm_dict)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f'{col} gender'] = df[f'{col} lemma'].map(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f'ipm_{col}'] = df[f'{col} lemma'].map(words_ipm_dict)
A value is trying to be set on a copy of a 

Unnamed: 0,congruent,sem_incongruent,position,gram_incongruent_CASE,gram_incongruent_NUMBER,sem_incongruent_CASE,sem_incongruent_NUMBER,sent corrected,selected,Structure,...,Adj gender,Preposition,Object 2,Preposition length,Preposition lemma,Object 2 length,Object 2 lemma,ipm_Object 2,Object 2 gender,Verb gender
572,Полиция ведет поиски заявителя,Полиция ведет округа заявителя,2,Полиция ведет поискам заявителя,Полиция ведет поиск заявителя,Полиция ведет округ заявителя,Полиция ведет округов заявителя,,,Subject - Verb - Object - Gen,...,,,,,,,,0.04,,


In [10]:
result_df = pd.concat([df_processed, result_df_processed]).reset_index()

result_df = result_df.dropna(subset=['sem_incongruent_CASE', 'sem_incongruent_NUMBER',
                                    'gram_incongruent_NUMBER', 'gram_incongruent_CASE'],
                            how='any')
result_df = result_df.sort_values('congruent').reset_index()

# Берем первый вариант сломанной семантики

duplicates_first = [1]
for idx in range(1, result_df.shape[0]):
    if result_df['congruent'][idx-1] != result_df['congruent'][idx]:
        duplicates_first.append(1)
    else:
        duplicates_first.append(0)
result_df['duplicates_first']  = duplicates_first  
result_df = result_df[result_df['duplicates_first'] == 1]
result_df['sentence_id'] = list(range(result_df.shape[0]))
target_columns = ['sentence_id', 'congruent', 'position', 'sem_incongruent', 
                  'gram_incongruent_CASE', 'gram_incongruent_NUMBER',
                  'sem_incongruent_CASE', 'sem_incongruent_NUMBER',
                  'Structure', 'length', 'words', 
                  'Subject', 'Verb', 'Object', 'Gen', 'Adj', 
                  'Subject length', 'Subject lemma', 
                  'Verb length', 'Verb lemma', 
                  'Object length', 'Object lemma', 
                  'Gen length', 'Gen lemma', 
                  'Adj length', 'Adj lemma', 
                  'ipm_Subject', 'ipm_Gen', 'ipm_Object', 'ipm_Verb', 'ipm_Adj', 
                  'Subject gender', 'Object gender', 'Gen gender', 'Adj gender']
result_df = result_df.sort_values('congruent')[target_columns]


In [11]:
df_selected = pd.read_excel('stimuli_versions/stimuli_selected.xlsx')
result_df = pd.merge(result_df, df_selected[['task_type', 'selected', 'congruent']],
                     how='left', on='congruent')
result_df.head(2)

Unnamed: 0,sentence_id,congruent,position,sem_incongruent,gram_incongruent_CASE,gram_incongruent_NUMBER,sem_incongruent_CASE,sem_incongruent_NUMBER,Structure,length,...,ipm_Gen,ipm_Object,ipm_Verb,ipm_Adj,Subject gender,Object gender,Gen gender,Adj gender,task_type,selected
0,0,Автобусы проходят массовую дезинфекцию,3,Автобусы проходят массовую фортуну,Автобусы проходят массовую дезинфекцией,Автобусы проходят массовую дезинфекции,Автобусы проходят массовую фортуной,Автобусы проходят массовую фортуны,Subject - Verb - Adj - Object,4.0,...,0.02,1.63,181.770609,50.834561,masc,femn,,masc,to_evaluate,1
1,1,Автомобиль получил технические повреждения,3,Автомобиль получил технические поколения,Автомобиль получил технические повреждение,Автомобиль получил технические повреждений,Автомобиль получил технические поколением,Автомобиль получил технические поколений,Subject - Verb - Adj - Object,4.0,...,0.02,14.89,552.890944,73.606628,masc,neut,,masc,,0


In [None]:
result_df.to_excel('stimuli_all.xlsx', index=False)

## Тесты

In [12]:
df = result_df
df = df[(df['task_type'] == 'to_evaluate')]

#next 5 lines to check distribution in after toloka evaluation
# selected = pd.read_excel('toloka_selected.xlsx')
# df = df[df['congruent'].isin(selected['sentence'].tolist())]

# comp1 = mc.MultiComparison(dataframe[ValueColumn], dataframe[CategoricalColumn])
# tbl, a1, a2 = comp1.allpairtest(stats.ttest_ind, method= "bonf")

for item in ['Subject', 'Verb', 'Object']:
    print(item)
    df[f'ipm_{item}'] = df[f'ipm_{item}'].dropna()
    for comb in itertools.combinations(df['Structure'].unique(), 2):
        print(comb)
        print('ipm', 
              stats.ttest_ind(df[df['Structure'] == comb[0]][f'ipm_{item}'].dropna().tolist(),
                              df[df['Structure'] == comb[1]][f'ipm_{item}'].dropna().tolist()).pvalue > 0.05)
        print('length', 
              stats.ttest_ind(df[df['Structure'] == comb[0]][f'{item} length'].dropna().tolist(),
                              df[df['Structure'] == comb[1]][f'{item} length']).pvalue > 0.05)
        print()

Subject
('Subject - Verb - Adj - Object', 'Subject - Verb - Object')
ipm True
length True

('Subject - Verb - Adj - Object', 'Subject - Verb - Object - Gen')
ipm True
length True

('Subject - Verb - Object', 'Subject - Verb - Object - Gen')
ipm True
length True

Verb
('Subject - Verb - Adj - Object', 'Subject - Verb - Object')
ipm True
length True

('Subject - Verb - Adj - Object', 'Subject - Verb - Object - Gen')
ipm True
length True

('Subject - Verb - Object', 'Subject - Verb - Object - Gen')
ipm True
length True

Object
('Subject - Verb - Adj - Object', 'Subject - Verb - Object')
ipm True
length True

('Subject - Verb - Adj - Object', 'Subject - Verb - Object - Gen')
ipm True
length True

('Subject - Verb - Object', 'Subject - Verb - Object - Gen')
ipm True
length True



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f'ipm_{item}'] = df[f'ipm_{item}'].dropna()


In [13]:
df['Structure'].value_counts()

Subject - Verb - Adj - Object    75
Subject - Verb - Object          75
Subject - Verb - Object - Gen    75
Name: Structure, dtype: int64

In [14]:
stats.ttest_ind(df[df['Structure'] == 'Subject - Verb - Adj - Object'][f'ipm_Adj'].dropna().tolist(),
                df[df['Structure'] == 'Subject - Verb - Object - Gen'][f'ipm_Gen'].dropna().tolist()).pvalue

0.8638782979165149

In [15]:
df1 = result_df[result_df['task_type'] == 'control']
df2 = result_df[result_df['task_type'] == 'to_evaluate']

# comp1 = mc.MultiComparison(dataframe[ValueColumn], dataframe[CategoricalColumn])
# tbl, a1, a2 = comp1.allpairtest(stats.ttest_ind, method= "bonf")

for item in ['Subject', 'Verb', 'Object']:
    print(item)
    df1[f'ipm_{item}'] = df1[f'ipm_{item}'].dropna()
    df2[f'ipm_{item}'] = df2[f'ipm_{item}'].dropna()
    print('ipm', 
              stats.ttest_ind(df1[f'ipm_{item}'].dropna().tolist(),
                              df2[f'ipm_{item}'].dropna().tolist()).pvalue > 0.05)
    print('length', 
              stats.ttest_ind(df1[f'{item} length'].dropna().tolist(),
                              df2[f'{item} length']).pvalue > 0.05)
    print()

Subject
ipm True
length True

Verb
ipm True
length True

Object
ipm True
length True



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1[f'ipm_{item}'] = df1[f'ipm_{item}'].dropna()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2[f'ipm_{item}'] = df2[f'ipm_{item}'].dropna()


## Формирование выборки для толоки

In [9]:
df = pd.read_excel('stimuli_versions/stimuli_all.xlsx')
# df = df[df['selected'] == 1].sort_values('task_type', ascending=False).reset_index()
my_pull = []
for idx, row in df.dropna(subset='task_type').iterrows():
    my_pull.append({'sentence_id': row['sentence_id'],
                    'sentence': row['congruent'], 
                    'target': 'no'})
    my_pull.append({'sentence_id': row['sentence_id'],
                    'sentence': row['sem_incongruent'], 
                    'target': 'semantics'})
    my_pull.append({'sentence_id': row['sentence_id'],
                    'sentence': row['gram_incongruent_CASE'], 
                    'target': 'grammar'})
    my_pull.append({'sentence_id': row['sentence_id'],
                    'sentence': row['sem_incongruent_CASE'], 
                    'target': 'semantics_grammar'})
my_pull = pd.DataFrame(my_pull)     

# Для второй итерации
# my_pull = my_pull[my_pull['error_type'].isin(['gram_incongruent_CASE', 'sem_incongruent_CASE'])]
# my_pull2 = my_pull[['sentence']].rename(columns={'sentence': 'INPUT:comment'})
# my_pull2.to_excel('toloka_iter2_case_only.xlsx', index=False)

In [10]:
df2 = pd.read_excel('stimuli_versions/stimuli_edited.xlsx')
new_stimuli = pd.concat([my_pull, df2])

In [11]:
df2.head(2)

Unnamed: 0,sentence_id,target,sentence
0,4,semantics_grammar,Адвокат обжаловал воспоминанием властей
1,6,grammar,Администратор нажала кнопке сигнализации


In [12]:
my_pull.head(2)

Unnamed: 0,sentence_id,sentence,target
0,0,Автобусы проходят массовую дезинфекцию,no
1,0,Автобусы проходят массовую фортуну,semantics


In [13]:
new_stimuli.head(2)

Unnamed: 0,sentence_id,sentence,target
0,0,Автобусы проходят массовую дезинфекцию,no
1,0,Автобусы проходят массовую фортуну,semantics


In [14]:
new_stimuli.to_excel('stimuli_versions/toloka.xlsx', index=False)

In [15]:
new_stimuli[new_stimuli['sentence_id'] == 8]

Unnamed: 0,sentence_id,sentence,target
24,8,Артиллеристы подготовили орудия,no
25,8,Артиллеристы подготовили посещения,semantics
26,8,Артиллеристы подготовили орудию,grammar
27,8,Артиллеристы подготовили посещение,semantics_grammar
97,8,Артиллеристы подготовили телевидения,semantics
98,8,Артиллеристы подготовили телевидению,semantics_grammar
262,8,Артиллеристы подготовили знакомстве,semantics_grammar
263,8,Артиллеристы подготовили знакомства,semantics


In [2]:
import pandas as pd

df = pd.read_excel('FINAL.xlsx')

In [3]:
df.head(2)

Unnamed: 0,sentence_id,sentence,congruent,target,Structure,percent,position,semantics_grammar,semantics,grammar,...,Subject gender,Object gender,Gen gender,Adj gender,task_type,target==eval,percent>75&evalCorrect,percent>75&evalCorrect&normEvalCorrect&SemEvalCorrect,percent>75&evalCorrect&normEvalCorrect&GramEvalCorrect,percent>75&evalCorrect&normEvalCorrect&SemGramEvalCorrect
0,0,Автобусы проходят массовую дезинфекцией,Автобусы проходят массовую дезинфекцию,grammar,Subject - Verb - Adj - Object,1.0,3,,,1.0,...,masc,femn,,masc,to_evaluate,True,True,False,False,False
1,0,Автобусы проходят массовую дезинфекцию,Автобусы проходят массовую дезинфекцию,no,Subject - Verb - Adj - Object,0.833333,3,,0.166667,,...,masc,femn,,masc,to_evaluate,True,True,False,False,False


In [9]:
exp_table = pd.DataFrame(df['sentence'].apply(lambda x:x.split()).tolist())
exp_table['sent_id'] = df['sentence_id']
exp_table['target'] = df['target']
exp_table = exp_table.rename(columns={0: 'w1', 1: 'w2', 2: 'w3', 3:'w4'})

In [11]:
exp_table.to_excel('stimuli_list.xlsx', index=False)

In [17]:
exp_table2 = pd.DataFrame(df['sentence'].apply(lambda x:x.split()))
exp_table2['sent_id'] = df['sentence_id']
exp_table2['target'] = df['target']
exp_table2.to_excel('stimuli_list2.xlsx', index=False)

In [11]:
import pandas as pd

df = pd.read_csv('stimuli_AZh - FINAL.csv')
df[['ipm_Subject',
       'ipm_Gen', 'ipm_Object', 'ipm_Verb', 'ipm_Adj', ]] = df[['ipm_Subject',
       'ipm_Gen', 'ipm_Object', 'ipm_Verb', 'ipm_Adj', ]].map(lambda x: 
                                                              x.replace(',', '.') if 
                                                              isinstance(x, str) else x).astype(float)
df.columns

Index(['sentence_id', 'sentence', 'congruent', 'target', 'Structure',
       'percent', 'position', 'semantics_grammar', 'semantics', 'grammar',
       'no', 'unknown', 'most_popular', 'iter', 'length', 'words', 'Subject',
       'Verb', 'Object', 'Gen', 'Adj', 'Subject length', 'Subject lemma',
       'Verb length', 'Verb lemma', 'Object length', 'Object lemma',
       'Gen length', 'Gen lemma', 'Adj length', 'Adj lemma', 'ipm_Subject',
       'ipm_Gen', 'ipm_Object', 'ipm_Verb', 'ipm_Adj', 'Subject gender',
       'Object gender', 'Gen gender', 'Adj gender', 'task_type',
       'target==eval', 'percent>75&evalCorrect',
       'percent>75&evalCorrect&normEvalCorrect&SemEvalCorrect',
       'percent>75&evalCorrect&normEvalCorrect&GramEvalCorrect',
       'percent>75&evalCorrect&normEvalCorrect&SemGramEvalCorrect'],
      dtype='object')

In [23]:
df[[ 'Structure', 'Subject length', 'Subject lemma',
       'Verb length', 'Verb lemma', 'Object length', 'Object lemma',
       'Gen length', 'Gen lemma', 'Adj length', 'Adj lemma', 'ipm_Subject',
       'ipm_Gen', 'ipm_Object', 'ipm_Verb', 'ipm_Adj', ]].groupby('Structure').describe().to_excel('stats.xlsx')

In [39]:
df = pd.read_excel('stats.xlsx')

In [41]:
df.round(2)

Unnamed: 0,Sentence structure,Sentence argument,"Mean length, syllables","Mean length, syllables.1","Mean frequency, IPM","SD frequency, IPM"
0,Subject - Verb - Adj - Object,Subject,3.2,1.0,162.41,424.47
1,Subject - Verb - Object,Subject,3.18,0.98,102.75,135.1
2,Subject - Verb - Object - Gen,Subject,3.6,1.27,117.35,185.7
3,Subject - Verb - Adj - Object,Verb,3.5,0.97,154.37,179.53
4,Subject - Verb - Object,Verb,3.64,0.94,113.02,158.14
5,Subject - Verb - Object - Gen,Verb,3.76,0.89,123.96,200.2
6,Subject - Verb - Adj - Object,Object,3.26,1.39,94.79,86.98
7,Subject - Verb - Object,Object,3.3,1.19,120.05,167.27
8,Subject - Verb - Object - Gen,Object,3.32,1.31,129.81,207.81
9,Subject - Verb - Adj - Object,Genitive,,,0.02,0.0


In [44]:
text = []
for _, row in df.round(2).iterrows():
    text.append(' & '.join(row.values.astype(str)))

In [47]:
print(' \\\\\n'.join(text))

Subject - Verb - Adj - Object & Subject & 3.2 & 1.0 & 162.41 & 424.47 \\
Subject - Verb - Object & Subject & 3.18 & 0.98 & 102.75 & 135.1 \\
Subject - Verb - Object - Gen & Subject & 3.6 & 1.27 & 117.35 & 185.7 \\
Subject - Verb - Adj - Object & Verb & 3.5 & 0.97 & 154.37 & 179.53 \\
Subject - Verb - Object & Verb & 3.64 & 0.94 & 113.02 & 158.14 \\
Subject - Verb - Object - Gen & Verb & 3.76 & 0.89 & 123.96 & 200.2 \\
Subject - Verb - Adj - Object & Object & 3.26 & 1.39 & 94.79 & 86.98 \\
Subject - Verb - Object & Object & 3.3 & 1.19 & 120.05 & 167.27 \\
Subject - Verb - Object - Gen & Object & 3.32 & 1.31 & 129.81 & 207.81 \\
Subject - Verb - Adj - Object & Genitive & nan & nan & 0.02 & 0.0 \\
Subject - Verb - Object & Genitive & nan & nan & 0.02 & 0.0 \\
Subject - Verb - Object - Gen & Genitive & 3.78 & 1.38 & 137.51 & 248.08 \\
Subject - Verb - Adj - Object & Adjective & 4.0 & 1.13 & 141.28 & 274.98 \\
Subject - Verb - Object & Adjective & nan & nan & 0.02 & 0.0 \\
Subject - Verb - 