In [1]:
import pandas as pd
import os
import nltk
import numpy as np
from collections import Counter

c_d = os.getcwd()
a_d = os.path.join(c_d, 'acquisition')
t_d = os.path.join(c_d, 'test')
a_files = [os.path.join(a_d, file) for file in os.listdir(a_d)]
t_files = [os.path.join(t_d, file) for file in os.listdir(t_d)]

In [2]:
def afc_answer(a, b, c, d):
    afc_dict = {'right': d,
               'down': c,
               'left': b}
    return afc_dict[a] == 'green'

def check_reg(x, y):
    if (x == 1 and y == 'left') or (x == 0 and y == 'right'):
        return True
    return False

def recall_text(keys):
    if not isinstance(keys, str):
        return ''
    res = ''
    keys = keys.strip('[]')
    keys = keys.split('\"')
    keys = [key for key in keys if key != ',']
    for key in keys:
        key = key.strip()
        if key == 'backspace':
            if len(res) > 0:
                res = res[:-1]
        elif key in 'qwertyuiopasdfghjklzxcvbnm':
            res += key
    return res
  
def first_rt(x):
    if isinstance(x, str):
        return  float(x.strip('[]').split(',')[0]) 
    return 7

def recall_distance(x, y, words):
    my_words = list(words)
    my_words.remove(y)
    if x == '' :
        return -1
    elif x in my_words:
        return -2
    return nltk.edit_distance(x, y) 
    
def sem_answer(key, category):
    if isinstance(key, str):
        sem_dict = {'left':'природа', 'right':'человек'}
        return sem_dict[key] == category   
    return False

In [3]:
def afc(table):
    AFC = table[['AL', 'key_resp_2.keys', 'key_resp_2.rt', 
                 'AFC1_color', 'AFC2_color', 'AFC3_color']].dropna()
    AFC['afc_answer'] = [afc_answer(a, b, c, d) for a, b, c, d in zip(
        AFC['key_resp_2.keys'], AFC['AFC1_color'], AFC['AFC2_color'], AFC['AFC3_color'])]
    AFC = AFC[['AL', 'afc_answer', 'key_resp_2.rt']]

    AFC.rename(columns={'key_resp_2.rt': 'afc_rt'}, inplace=True)

    return AFC

def recall_acquisition(table):
    recall_a0 = table[['AL', 'key_resp.keys',
                       'recall_text', 'key_resp.rt',
                       'recall_rt' ]][19:29]
    recall_a1 = table[['AL', 'key_resp.keys',
                       'recall_text', 'key_resp.rt',
                       'recall_rt' ]][47:56]
    recall_a2 = table[['AL', 'key_resp.keys',
                       'recall_text', 'key_resp.rt',
                       'recall_rt' ]][74:83]
    recall_a3 = table[['AL', 'key_resp.keys', 
                       'recall_text', 'key_resp.rt',
                       'recall_rt']][101:110]

    recall_a = pd.concat([recall_a0, recall_a1],axis=0)
    recall_a = pd.concat([recall_a, recall_a2], axis=0)
    recall_a = pd.concat([recall_a, recall_a3], axis=0)

    recall_a['recall_answers'] = [recall_text(x) 
                                  for x in recall_a['key_resp.keys']]
    recall_a['recall_first_rt'] = [first_rt(x) for x in recall_a['key_resp.rt']]

    recall_a['recall_answer_dist']  = [recall_distance(x, y, recall_a['AL'])
                                           for x, y in zip(recall_a['recall_answers'],
                                                           recall_a['AL'])]

    recall_a = recall_a[['AL', 'recall_answers', 'recall_answer_dist',
                        'recall_first_rt', 'recall_rt']]

    return recall_a



def recall_test(table):
    recall_t = table[['recall_AL', 'key_resp_4.keys', 
                      'recall_test_text', 'key_resp_4.rt', 
                      'recall_test_rt' ]][150:190]

    recall_t['recall_test_answers'] = [recall_text(x) for x in recall_t['key_resp_4.keys']]
    recall_t['recall_test_first_rt'] = [first_rt(x) for x in recall_t['key_resp_4.rt']]

    recall_t['recall_test_answer_dist']  = [recall_distance(x, y, recall_t['recall_AL'])
                                           for x, y in zip(recall_t['recall_test_answers'],
                                                           recall_t['recall_AL'])]

    recall_t = recall_t[['recall_AL', 
                         'recall_test_answers', 
                         'recall_test_answer_dist',
                        'recall_test_first_rt', 
                         'recall_test_rt']]
    
    

    recall_t.rename(columns={'recall_AL': 'AL'}, inplace=True)
    return recall_t

def recall_test_t(table):
    recall_t = table[['recall_AL', 'key_resp_2.keys', 
                      'recall_test_text', 'key_resp_2.rt', 
                      'recall_test_rt' ]][40:80]

    recall_t['recall_test2_answers'] = [recall_text(x) 
                                  for x in recall_t['key_resp_2.keys']]
    recall_t['recall_test2_first_rt'] = [first_rt(x) for x in recall_t['key_resp_2.rt']]

    recall_t['recall_test2_answer_dist']  = [recall_distance(x, y, recall_t['recall_AL'])
                                                 for x, y in zip(recall_t['recall_test2_answers'],
                                                                 recall_t['recall_AL'])]

    recall_t = recall_t[['recall_AL', 
                         'recall_test2_answers', 
                         'recall_test2_answer_dist',
                        'recall_test2_first_rt', 
                         'recall_test_rt']]

    recall_t.rename(columns={'recall_AL': 'AL', 'recall_test_rt': 'recall_test2_rt'}, inplace=True)
    return recall_t

def recognition(table):
    recog = table[['AL','recognition_text', 
                   'recognition_correctness',
                   'key_resp_3.keys', 'key_resp_3.rt']][110:150]

    recog['recog_answers'] = [check_reg(x, y) \
                            for x, y in zip(recog['recognition_correctness'], 
                                            recog['key_resp_3.keys'])]
    recog = recog[['AL', 'recognition_text', 
                   'recog_answers', 
                  'key_resp_3.rt']]

    recog.rename(columns={'key_resp_3.rt': 'recog_rt'}, inplace=True)

    return recog

def recognition_t(table):
    recog = table[['AL','recognition_text', 
                   'recognition_correctness',
                   'key_resp.keys', 'key_resp.rt']][:40]

    recog['recog2_answers'] = [check_reg(x, y) \
                            for x, y in zip(recog['recognition_correctness'], 
                                            recog['key_resp.keys'])]
    recog = recog[['AL', 'recog2_answers', 'key_resp.rt']]

    recog.rename(columns={'key_resp.rt': 'recog2_rt'}, inplace=True)

    return recog

def semantic(table, word_dict):
    semcat = pd.read_csv('semantic_categories.csv', sep=';')
    semcat.rename(columns={'word': 'L1'}, inplace=True)
    semcat = pd.merge(semcat, word_dict, on='L1')
    sem = table[['SEM_AL', 'key_resp_5.keys',
                 'key_resp_5.rt']][190:230]
    sem.rename(columns={'key_resp_5.rt': 'sem_rt'}, inplace=True)
    sem.rename(columns={'SEM_AL': 'AL'}, inplace=True)
    sem = pd.merge(sem, semcat, on='AL')
    sem['sem_answers'] = [sem_answer(x, y) for x, y 
                          in zip(sem['key_resp_5.keys'], 
                                 sem['category'])]
    sem = sem[['AL', 'sem_answers',  'sem_rt']]
    return sem

def semantic_t(table, word_dict):
    semcat = pd.read_csv('semantic_categories.csv', sep=';')
    semcat.rename(columns={'word': 'L1'}, inplace=True)
    semcat = pd.merge(semcat, word_dict, on='L1')
    sem = table[['SEM_AL', 'key_resp_3.keys',
                 'key_resp_3.rt']][80:120]
    sem.rename(columns={'key_resp_3.rt': 'sem2_rt'}, inplace=True)
    sem.rename(columns={'SEM_AL': 'AL'}, inplace=True)
    sem = pd.merge(sem, semcat, on='AL')
    sem['sem2_answers'] = [sem_answer(x, y) for x, y 
                          in zip(sem['key_resp_3.keys'], 
                                 sem['category'])]
    sem = sem[['AL', 'sem2_answers', 'sem2_rt']]
    return sem

def all_acquisition(table):
    word_dict = table[['AL', 'L1']][110:150]
    AFC = afc(table)
    recall_a = recall_acquisition(table)
    recall_t = recall_test(table)
    recog = recognition(table)
    sem = semantic(table, word_dict)

    table = pd.merge(word_dict, AFC, on='AL', how='outer')
    table = pd.merge(table, recall_a, on='AL', how='outer')
    table = pd.merge(table, recog, on='AL', how='outer')
    table = pd.merge(table, recall_t, on='AL', how='outer')
    table = pd.merge(table, sem, on='AL', how='outer')
    return table

def all_test(table):
    word_dict = table[['AL', 'L1']][:40]
    recall_t = recall_test_t(table)
    recog = recognition_t(table)
    sem = semantic_t(table, word_dict)

    table = pd.merge(word_dict, recog, on='AL', how='outer')
    table = pd.merge(table, recall_t, on='AL', how='outer')
    table = pd.merge(table, sem, on='AL', how='outer')
    table.drop('L1', axis=1, inplace=True)
    return table

def all_exp(table1, table2, var):
    file = pd.read_csv(table1)
    file_2 = pd.read_csv(table2, sep=',')
    table = pd.merge(all_acquisition(file), 
                     all_test(file_2), 
                     on="AL")
    s = stats(table, var)
    table = table.fillna('').append(s[0])

    return table, s[1]
    
def stats(table, variant):
    
    recalls = [Counter(table['recall_answer_dist'].dropna()),
              Counter(table['recall_test_answer_dist'].dropna()),
              Counter(table['recall_test2_answer_dist'].dropna())]
    rec_totals = [i[0] for i in recalls]
    
    recalls = [sorted(recalls[0].items(), key = lambda x: x[0]),
              sorted(recalls[1].items(), key = lambda x: x[0]),
              sorted(recalls[2].items(), key = lambda x: x[0])]
   
    recs = []
    
    for idx, key in enumerate(['recall_a_dist', 'recall_t_dist', 'recall_t2_dist']):
        t = ''
        for p in recalls[idx]:
            t += f'{str(p[0])}\t{p[1]}\n'
        recs.append(t)
    
    t1 = np.zeros(shape=(2,25))
    to_mean = ['afc_rt', 'recall_answer_distance', 'recall_first_rt', 'recall_rt',
               'recog_rt', 
               'recall_test_answer_distance', 'recall_test_first_rt', 'recall_test_rt', 
               'sem_rt', 'recog2_rt', 
               'recall_test2_answer_distance',
               'recall_test2_first_rt', 'recall_test2_rt','sem_rt2']
    to_sum = ['afc_answer', 'recog_answers', 'sem_answers', 'recog2_answers', 'sem2_answers']
    rec_names = ['recall_answers', 'recall_test_answers', 'recall_test2_answers']
    stats = [['variant', variant]]
    for idx, key in enumerate(table.keys()):
        if key in to_sum:
            k = table[key].dropna().sum()
            t1[0][idx] = k
            stats.append([key + '_sum', k])
        elif key in to_mean:
            k = table[key].dropna().mean()
            t1[1][idx]  = k
            stats.append([key+'_mean', k])
            
            
    t1 = pd.DataFrame(t1, columns=table.keys())
    for key in t1.keys():
        for idx in range(2):
            if t1[key][idx] == 0:
                t1[key][idx] = ''
    
    t1['AL'][0] = 'SUM'
    t1['AL'][1] = 'MEAN'
    
    t1['recall_answers'][0] = recs[0]
    t1['recall_test_answers'][0] = recs[1]
    t1['recall_test2_answers'][0] = recs[2]
    stats.append(['recall_correct', rec_totals[0]])
    stats.append(['recall_test_correct', rec_totals[1]])
    stats.append(['recall_test2_correct', rec_totals[2]])
    
    st = pd.DataFrame(stats, columns=['parameter', 'value'])
    
                
    return t1, st
    

In [4]:
p5 = all_exp(a_files[4], t_files[4], 5)

p3 = all_exp(a_files[2], t_files[1], 3)

p2 = all_exp(a_files[3], t_files[3], 2)

p1 = all_exp(a_files[0], t_files[0], 0)

p0 = all_exp(a_files[1], t_files[2], 0)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [5]:
all_stats = pd.merge(p0[1], p1[1], on='parameter')
all_stats = pd.merge(all_stats, p2[1], on='parameter')
all_stats = pd.merge(all_stats, p3[1], on='parameter')
all_stats = pd.merge(all_stats, p5[1], on='parameter')

In [None]:
p0[0].to_excel('p0.xlsx', index=False)
p1[0].to_excel('p1.xlsx', index=False)
p2[0].to_excel('p2.xlsx', index=False)
p3[0].to_excel('p3.xlsx', index=False)
p5[0].to_excel('p5.xlsx', index=False)
all_stats.to_excel('all_stats.xlsx')