In [1]:
import pandas as pd
import os
import nltk
import numpy as np
from collections import Counter
from collections import defaultdict

ALL_RT = defaultdict(list)

c_d = os.getcwd()
a_d = os.path.join(c_d, 'acquisition')
t_d = os.path.join(c_d, 'test')
a_files = [os.path.join(a_d, file) for file in os.listdir(a_d)]
t_files = [os.path.join(t_d, file) for file in os.listdir(t_d)]

In [2]:
def afc_answer(a, b, c, d):
    afc_dict = {'right': d,
               'down': c,
               'left': b}
    return afc_dict[a] == 'green'

def check_reg(x, y):
    if (x == 1 and y == 'left') or (x == 0 and y == 'right'):
        return True
    return False

def recall_text(keys):
    if not isinstance(keys, str):
        return ''
    res = ''
    keys = keys.strip('[]')
    keys = keys.split('\"')
    keys = [key for key in keys if key != ',']
    for key in keys:
        key = key.strip()
        if key == 'backspace':
            if len(res) > 0:
                res = res[:-1]
        elif key in 'qwertyuiopasdfghjklzxcvbnm':
            res += key
    return res
  
def first_rt(x):
    if isinstance(x, str):
        return  float(x.strip('[]').split(',')[0]) 
    return 7

def recall_distance(x, y, words):
    my_words = list(words)
    my_words.remove(y)
    if x == '' :
        return -1
    elif x in my_words:
        return -2
    return nltk.edit_distance(x, y) 
    
def sem_answer(key, category):
    if isinstance(key, str):
        sem_dict = {'left':'природа', 'right':'человек'}
        return sem_dict[key] == category   
    return False

In [3]:
def afc(table):
    AFC = table[['AL', 'key_resp_2.keys', 'key_resp_2.rt', 
                 'AFC1_color', 'AFC2_color', 'AFC3_color']].dropna()
    AFC['afc'] = [afc_answer(a, b, c, d) for a, b, c, d in zip(
        AFC['key_resp_2.keys'], AFC['AFC1_color'], AFC['AFC2_color'], AFC['AFC3_color'])]
    AFC = AFC[['AL', 'afc', 'key_resp_2.rt']]

    AFC.rename(columns={'key_resp_2.rt': 'afc_rt'}, inplace=True)

    return AFC

def recall_acquisition(table):
    recall_a0 = table[['AL', 'key_resp.keys',
                       'recall_text', 'key_resp.rt',
                       'recall_rt' ]][19:29]
    recall_a1 = table[['AL', 'key_resp.keys',
                       'recall_text', 'key_resp.rt',
                       'recall_rt' ]][47:56]
    recall_a2 = table[['AL', 'key_resp.keys',
                       'recall_text', 'key_resp.rt',
                       'recall_rt' ]][74:83]
    recall_a3 = table[['AL', 'key_resp.keys', 
                       'recall_text', 'key_resp.rt',
                       'recall_rt']][101:110]

    recall_a = pd.concat([recall_a0, recall_a1],axis=0)
    recall_a = pd.concat([recall_a, recall_a2], axis=0)
    recall_a = pd.concat([recall_a, recall_a3], axis=0)

    recall_a['recall'] = [recall_text(x) 
                                  for x in recall_a['key_resp.keys']]
    recall_a['recall_first_rt'] = [first_rt(x) for x in recall_a['key_resp.rt']]

    recall_a['recall_dist']  = [recall_distance(x, y, recall_a['AL'])
                                           for x, y in zip(recall_a['recall'],
                                                           recall_a['AL'])]

    recall_a = recall_a[['AL', 'recall', 'recall_dist',
                        'recall_first_rt', 'recall_rt']]

    return recall_a



def recall_test(table):
    recall_t = table[['recall_AL', 'key_resp_4.keys', 
                      'recall_test_text', 'key_resp_4.rt', 
                      'recall_test_rt' ]][150:190]

    recall_t['recall_1'] = [recall_text(x) for x in recall_t['key_resp_4.keys']]
    recall_t['recall_1_first_rt'] = [first_rt(x) for x in recall_t['key_resp_4.rt']]

    recall_t['recall_1_dist']  = [recall_distance(x, y, recall_t['recall_AL'])
                                           for x, y in zip(recall_t['recall_1'],
                                                           recall_t['recall_AL'])]

    recall_t = recall_t[['recall_AL', 
                         'recall_1', 
                         'recall_1_dist',
                        'recall_1_first_rt', 
                         'recall_test_rt']]
    
    

    recall_t.rename(columns={'recall_AL': 'AL', 'recall_test_rt': 'recall_1_rt'}, inplace=True)
    return recall_t

def recall_test_t(table):
    recall_t = table[['recall_AL', 'key_resp_2.keys', 
                      'recall_test_text', 'key_resp_2.rt', 
                      'recall_test_rt' ]][40:80]

    recall_t['recall_2'] = [recall_text(x) 
                                  for x in recall_t['key_resp_2.keys']]
    recall_t['recall_2_first_rt'] = [first_rt(x) for x in recall_t['key_resp_2.rt']]

    recall_t['recall_2_dist']  = [recall_distance(x, y, recall_t['recall_AL'])
                                                 for x, y in zip(recall_t['recall_2'],
                                                                 recall_t['recall_AL'])]

    recall_t = recall_t[['recall_AL', 
                         'recall_2', 
                         'recall_2_dist',
                        'recall_2_first_rt', 
                         'recall_test_rt']]

    recall_t.rename(columns={'recall_AL': 'AL', 'recall_test_rt': 'recall_2_rt'}, inplace=True)
    return recall_t

def recognition(table):
    recog = table[['AL','recognition_text', 
                   'recognition_correctness',
                   'key_resp_3.keys', 'key_resp_3.rt']][110:150]

    recog['recog'] = [check_reg(x, y) \
                            for x, y in zip(recog['recognition_correctness'], 
                                            recog['key_resp_3.keys'])]
    recog = recog[['AL', 'recognition_text', 
                   'recog', 
                  'key_resp_3.rt']]

    recog.rename(columns={'key_resp_3.rt': 'recog_rt'}, inplace=True)

    return recog

def recognition_t(table):
    recog = table[['AL','recognition_text', 
                   'recognition_correctness',
                   'key_resp.keys', 'key_resp.rt']][:40]

    recog['recog2'] = [check_reg(x, y) \
                            for x, y in zip(recog['recognition_correctness'], 
                                            recog['key_resp.keys'])]
    recog = recog[['AL', 'recog2', 'key_resp.rt']]

    recog.rename(columns={'key_resp.rt': 'recog2_rt'}, inplace=True)

    return recog

def semantic(table, word_dict):
    semcat = pd.read_csv('semantic_categories.csv', sep=';')
    semcat.rename(columns={'word': 'L1'}, inplace=True)
    semcat = pd.merge(semcat, word_dict, on='L1')
    sem = table[['SEM_AL', 'key_resp_5.keys',
                 'key_resp_5.rt']][190:230]
    sem.rename(columns={'key_resp_5.rt': 'sem_rt'}, inplace=True)
    sem.rename(columns={'SEM_AL': 'AL'}, inplace=True)
    sem = pd.merge(sem, semcat, on='AL')
    sem['sem'] = [sem_answer(x, y) for x, y 
                          in zip(sem['key_resp_5.keys'], 
                                 sem['category'])]
    sem = sem[['AL', 'sem',  'sem_rt']]
    return sem

def semantic_t(table, word_dict):
    semcat = pd.read_csv('semantic_categories.csv', sep=';')
    semcat.rename(columns={'word': 'L1'}, inplace=True)
    semcat = pd.merge(semcat, word_dict, on='L1')
    sem = table[['SEM_AL', 'key_resp_3.keys',
                 'key_resp_3.rt']][80:120]
    sem.rename(columns={'key_resp_3.rt': 'sem2_rt'}, inplace=True)
    sem.rename(columns={'SEM_AL': 'AL'}, inplace=True)
    sem = pd.merge(sem, semcat, on='AL')
    sem['sem2'] = [sem_answer(x, y) for x, y 
                          in zip(sem['key_resp_3.keys'], 
                                 sem['category'])]
    sem = sem[['AL', 'sem2', 'sem2_rt']]
    return sem

def all_acquisition(table):
    word_dict = table[['AL', 'L1']][110:150]
    AFC = afc(table)
    recall_a = recall_acquisition(table)
    recall_t = recall_test(table)
    recog = recognition(table)
    sem = semantic(table, word_dict)

    table = pd.merge(word_dict, AFC, on='AL', how='outer')
    table = pd.merge(table, recall_a, on='AL', how='outer')
    table = pd.merge(table, recog, on='AL', how='outer')
    table = pd.merge(table, recall_t, on='AL', how='outer')
    table = pd.merge(table, sem, on='AL', how='outer')
    return table

def all_test(table):
    word_dict = table[['AL', 'L1']][:40]
    recall_t = recall_test_t(table)
    recog = recognition_t(table)
    sem = semantic_t(table, word_dict)

    table = pd.merge(word_dict, recog, on='AL', how='outer')
    table = pd.merge(table, recall_t, on='AL', how='outer')
    table = pd.merge(table, sem, on='AL', how='outer')
    table.drop('L1', axis=1, inplace=True)
    return table

def all_exp(table1, table2, var):
    file = pd.read_csv(table1)
    file_2 = pd.read_csv(table2, sep=',')
    table = pd.merge(all_acquisition(file), 
                     all_test(file_2), 
                     on="AL")
    s = stats(table, var)
    table = table.fillna('').append(s[0])

    return table, s[1]
    
def stats(table, variant):
    
    recalls = [Counter(table['recall_dist'].dropna()),
              Counter(table['recall_1_dist'].dropna()),
              Counter(table['recall_2_dist'].dropna())]
    rec_totals = [i[0] for i in recalls]
    
    recalls = [sorted(recalls[0].items(), key = lambda x: x[0]),
              sorted(recalls[1].items(), key = lambda x: x[0]),
              sorted(recalls[2].items(), key = lambda x: x[0])]
   
    recs = []
    
    for idx, key in enumerate(['recall_a_dist', 'recall_t_dist', 'recall_t2_dist']):
        t = ''
        for p in recalls[idx]:
            t += f'{str(p[0])}\t{p[1]}\n'
        recs.append(t)
    
    t1 = np.zeros(shape=(2,25))
    to_mean = ['afc_rt', 'recall_dist', 'recall_first_rt', 'recall_rt',
               'recog_rt', 
               'recall_1_dist', 'recall_1_first_rt', 'recall_1_rt',
               'sem_rt', 'recog2_rt', 
               'recall_2_dist',
               'recall_2_first_rt', 'recall_2_rt','sem2_rt']
    to_sum = ['afc', 'recog', 'sem', 'recog2', 'sem2']
    rec_names = ['recall', 'recall_1', 'recall_2']
    stats = [['variant', variant]]
    for idx, key in enumerate(table.keys()):
        if key in to_sum:
            k = table[key].dropna().sum()
            t1[0][idx] = k
            stats.append([key, k, ''])
        elif key in to_mean:
            k = table[key].dropna()
            t1[1][idx]  = k.mean()
            if 'dist'not in key:
                stats.append([key, k.mean(), np.std(k)])
                ALL_RT[key].append(k)
            
    t1 = pd.DataFrame(t1, columns=table.keys())
    for key in t1.keys():
        for idx in range(2):
            if t1[key][idx] == 0:
                t1[key][idx] = ''
    
    t1['AL'][0] = 'SUM'
    t1['AL'][1] = 'MEAN'
    
    t1['recall'][0] = recs[0]
    t1['recall_1'][0] = recs[1]
    t1['recall_2'][0] = recs[2]
    stats.insert(3, ['recall_0', rec_totals[0]])
    stats.insert(8, ['recall_1', rec_totals[1]])
    stats.insert(15, ['recall_2', rec_totals[2]])
    
    st = pd.DataFrame(stats, columns=['parameter', f'value_{variant}', f'SD_{variant}'])
    
                
    return t1, st
  
def saving(*args):
    for idx, i in enumerate(args):
        i.to_excel(f'p{idx + 1}.xlsx', index=False)


In [4]:
def all_stats():
    all_stats = pd.merge(p0[1], p1[1], on='parameter', how='outer')
    all_stats = pd.merge(all_stats, p2[1], on='parameter', how='outer')
    all_stats = pd.merge(all_stats, p3[1], on='parameter', how='outer')
    all_stats = pd.merge(all_stats, p5[1], on='parameter', how='outer')
    MEANS = []
    for key in ALL_RT:
        ALL_RT[key] = [item for inner in ALL_RT[key] for item in inner]
        t = [f'{key}', round(np.mean(ALL_RT[key]), 3), round(np.std(ALL_RT[key]), 3)]
        MEANS.append(t)

    values = [key for key in all_stats.keys() if 'value' in key]
    for idx, key in enumerate(all_stats['parameter']):
        if  'rt' not in key:
            m = [all_stats[key][idx] for key in values]
            t = [key, round(np.mean(m), 2), round(np.std(m), 2)]
            MEANS.append(t)


    MEANS = pd.DataFrame(MEANS, columns=['parameter','MEAN', 'SD_mean'])
    all_stats = pd.merge(all_stats, MEANS, on='parameter', how='outer')
    all_stats['parameter'] = ['Participant ID', 'AFC accuracy, %', 'AFC RT, s',
                             'Recall accuracy, %', 'Recall first-key RT, s', 'Recall last-key RT, s',
                             'Recognition accuracy, %', 'Recognition RT, s',
                             'Recall accuracy, %', 'Recall first-key RT, s', 'Recall last-key RT, s',
                             'Sem. dec. accuracy, %', 'Sem. dec. RT, s',
                             'Recognition accuracy, %', 'Recognition RT, s',
                             'Recall accuracy, %', 'Recall first-key RT, s', 'Recall last-key RT, s',
                             'Sem. dec. accuracy, %', 'Sem. dec. RT, s']
    all_stats = all_stats.fillna('')
    def to_mean(x, y):
        if isinstance(x, float) and 'RT' not in y and 'dist' not in y:
            return round(x / 40 * 100, 2)
        elif 'RT' in y:
            return round(x, 2)
        elif not isinstance(x, float):
            return ''
        elif y == 'variant':
            return x


    for key in all_stats.keys()[1:]:
        all_stats[key]= [to_mean(i, all_stats['parameter'][idx]) for idx, i in enumerate(all_stats[key])]

    all_stats.to_excel('all_stats.xlsx', index=False)
    return all_stats

In [5]:
p5 = all_exp(a_files[4], t_files[4], 5)

p3 = all_exp(a_files[2], t_files[1], 3)

p2 = all_exp(a_files[3], t_files[3], 2)

p1 = all_exp(a_files[0], t_files[0], 0)

p0 = all_exp(a_files[1], t_files[2], 0)

all_stats()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


Unnamed: 0,parameter,value_0_x,SD_0_x,value_0_y,SD_0_y,value_2,SD_2,value_3,SD_3,value_5,SD_5,MEAN,SD_mean
0,Participant ID,0.0,,0.0,,5.0,,7.5,,12.5,,5.0,4.75
1,"AFC accuracy, %",82.5,,60.0,,90.0,,82.5,,80.0,,79.0,10.08
2,"AFC RT, s",2.18,0.62,2.13,0.51,1.79,0.39,1.88,0.38,2.09,0.51,2.01,0.51
3,"Recall accuracy, %",40.0,,0.0,,52.5,,17.5,,10.0,,24.0,19.4
4,"Recall first-key RT, s",1.63,0.99,1.59,1.13,2.62,1.53,1.95,0.71,3.94,1.87,2.35,1.58
5,"Recall last-key RT, s",3.76,0.88,2.38,1.57,5.15,1.28,4.75,1.15,6.86,0.59,4.58,1.88
6,"Recognition accuracy, %",55.0,,50.0,,90.0,,90.0,,62.5,,69.5,17.2
7,"Recognition RT, s",2.28,0.53,0.94,0.29,1.14,0.21,1.62,0.37,2.43,0.54,1.68,0.72
8,"Recall accuracy, %",25.0,,0.0,,32.5,,20.0,,2.5,,16.0,12.7
9,"Recall first-key RT, s",4.71,1.79,0.73,0.67,2.37,1.1,1.79,0.55,4.49,2.04,2.82,2.06


In [None]:
p0[0]

In [None]:
saving(p0[0], p1[0], p2[0], p3[0], p5[0])