# **LoopsResultsExploration**

## **1. Importing and First Proccesing**

In [1]:
import pandas as pd
from scipy import stats
import plotly.express as px
import ProcessingConfig as config
import ProcessingFuncs as process
import AnalysisFuncs as analyze

pd.options.display.max_columns = 50

In [2]:
raw_data = pd.read_excel(config.cleaning_config['raw_data_path'])

first_lines_data = process.clean_data(raw_data
                                      , drop_assign_steps=True
                                      , drop_first_loop_steps=True
                                      , only_first_lines=True
                                      , filter_subjects=False
                                      , filter_trials=False
                                      , filter_steps=False)

analyze.get_sample_size(first_lines_data)

original shape: (21074, 32)
threshold for outliers detection: 3 (iqr).
drop_assign: 552 rows were filtered out.
drop_first_loop: 2589 rows were filtered out.
is_first_line: 14068 lines were filtered.
final shape: (3865, 26)
There are 46 sessions from 23 subjects.


## **2. Exploring Outliers**

### 2.1. Response-Time Between Subjects

In [3]:
# filtering only necessary columns
response_times = first_lines_data[['subject', 'step_num', 'rt']].copy()

In [4]:
# calculating mean response time per subject
mean_rt_per_subject = response_times[['rt', 'subject']].groupby('subject').mean()
mean_rt_per_subject.columns = ['mean_rt']

# creating a histogram of mean response times
px.histogram(mean_rt_per_subject, nbins=10, x='mean_rt', title='Mean Response Time Per Subject'
             , labels={'mean_rt': 'mean response time (ms)'})

In [5]:
mean_rt_q1, mean_rt_q3 = mean_rt_per_subject['mean_rt'].quantile([0.25, 0.75])
mean_rt_iqr = mean_rt_q3 - mean_rt_q1

mean_rt_per_subject['anomaly_grade'] = mean_rt_per_subject['mean_rt'].apply(process.get_single_anomaly_grade
                                                                                            , args=(mean_rt_q1, mean_rt_q3, mean_rt_iqr))

# creating a histogram of mean response times anomaly grades
px.histogram(mean_rt_per_subject['anomaly_grade'], nbins=10, x='anomaly_grade'
             , title='Mean Response Time Anomaly Grade per Subject', labels={'anomaly_grade': 'anomaly grade'})

In [6]:
# calculating response time variance per subject
std_rt_per_subject = response_times[['rt', 'subject']].groupby('subject').std()
std_rt_per_subject.columns = ['rt_std']

# creating a histogram of response times variances
px.histogram(std_rt_per_subject, nbins=10, x='rt_std'
             , title='Response Time Variance Per Subject', labels={'rt_std': 'response time std (ms)'})

In [7]:
std_rt_q1, std_rt_q3 = std_rt_per_subject['rt_std'].quantile([0.25, 0.75])
std_rt_iqr = std_rt_q3 - std_rt_q1

std_rt_per_subject['anomaly_grade'] = std_rt_per_subject['rt_std'].apply(process.get_single_anomaly_grade
                                                                                            , args=(std_rt_q1, std_rt_q3, std_rt_iqr))

# creating a histogram of mean response times anomaly grades
px.histogram(std_rt_per_subject, nbins=10, x='anomaly_grade'
             , title='Response Time Variance Anomaly Grade per Subject'
             , labels={'anomaly_grade': 'anomaly grade'})

In [8]:
most_varied_subject = std_rt_per_subject['anomaly_grade'].idxmax()
max_variance = round(std_rt_per_subject['anomaly_grade'].max(), 2)
slowest_subject = mean_rt_per_subject['anomaly_grade'].idxmax()
slowest_mean_rt = round(mean_rt_per_subject['anomaly_grade'].max(), 2)

print(f'slowest subject: {slowest_subject} (grade:{max_variance}), most varied subject: {most_varied_subject} (grade:{slowest_mean_rt})')

slowest subject: 112B (grade:3.62), most varied subject: 112B (grade:1.93)


### 2.2. Mistakes Rate (%) Between Subjects

In [9]:
# filtering only necessary columns
response_success = first_lines_data[['subject', 'step_num', 'trial', 'correct', 'loop_step', 'loop_type_switch']].copy()

# calculating quantiles
g_c_mean = response_success['correct'].mean()
print(f'General success rate: {round(g_c_mean, 2)}')

General success rate: 0.96


In [10]:
success_per_subject = response_success[['subject', 'correct']].groupby('subject').mean()
success_per_subject.rename(columns={'correct': 'success_rate'}, inplace=True)

# manually excluding perfect 0 success rate subjects
success_per_subject = success_per_subject[success_per_subject['success_rate'] > 0]

px.histogram(success_per_subject)
px.histogram(success_per_subject, nbins=10, x='success_rate'
             , title='Success Rate per Subject', labels={'success_rate': 'success rate (%)'})

In [11]:
success_q1, success_q3 = success_per_subject['success_rate'].quantile([0.25, 0.75])
success_iqr = success_q3 - success_q1

success_per_subject['anomaly_grade'] = success_per_subject['success_rate'].apply(process.get_single_anomaly_grade
                                                                                    , args=(success_q1, success_q3, success_iqr))

px.histogram(success_per_subject, nbins=10, x='anomaly_grade'
             , title='Success Rate Anomaly Grade per Subject', labels={'anomaly_grade': 'anomaly grade'})

### 2.3. Trial Mistakes Rate (%)

In [12]:
# calculating response success rate per trial
success_per_trial = response_success[['subject', 'trial', 'correct']].groupby(['subject', 'trial']).mean()
success_per_trial.rename(columns={'correct': 'success_per_trial'}, inplace=True)

# excluding perfect 0 success rate trials
success_per_trial = success_per_trial[success_per_trial['success_per_trial'] > 0]

In [13]:
pr_trials_hist = px.histogram(success_per_trial, nbins=20, x="success_per_trial"
                   , title='Success Rate per Trial (only first steps)', labels={'success_per_trial':'success rate'})
pr_trials_hist.show()

In [14]:
switching_diff_per_trial_success = pd.DataFrame()

# merging success_per_trial with raw_data, and excluding perfect 0 success rate trials
success_per_trial_diff = success_per_trial.reset_index()
success_per_trial_diff = raw_data.merge(success_per_trial_diff, how='left', on=['subject', 'trial'])
success_per_trial_diff = success_per_trial_diff[success_per_trial_diff['success_per_trial'] > 0]
success_per_trial_diff.rename(columns={'correct_x': 'correct', 'loop_type_switch_x': 'loop_type_switch'
                                       , 'rt_x': 'rt', 'step_id_x': 'step_id'}
                              , inplace=True)

# mean response time grouped by loop switching and success rate per trial.
success_per_trial_diff['success_per_trial_bin'] = pd.cut(success_per_trial_diff['success_per_trial'], bins=3).astype(str)
success_per_trial_diff['success_per_trial_bin'].unique()
switching_diff_per_trial_success['mean_response_time'] = success_per_trial_diff[success_per_trial_diff['correct']].groupby(['success_per_trial_bin', 'loop_type_switch'])['rt'].mean()
switching_diff_per_trial_success['size'] = success_per_trial_diff[success_per_trial_diff['correct']].groupby(['success_per_trial_bin', 'loop_type_switch'])['step_id'].count()

switching_diff_per_trial_success.unstack()

Unnamed: 0_level_0,mean_response_time,mean_response_time,size,size
loop_type_switch,False,True,False,True
success_per_trial_bin,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
"(0.571, 0.714]",1828.577778,1835.859813,450,321
"(0.714, 0.857]",1715.530315,1877.884227,2474,1883
"(0.857, 1.0]",1750.322381,1826.712744,8248,6395


In [15]:
px.bar(switching_diff_per_trial_success.reset_index(), x='success_per_trial_bin', y='mean_response_time'
       , color='loop_type_switch', barmode='group', title='Success Rate per Trial and Priming Effect'
       , labels={'success_per_trial_bin': 'success per trial', 'mean_response_time': 'mean response time'})

### 2.3. Single Steps Response-Time Within Subject

In [16]:
# calculating response time quantiles and IQR per subject
quantiles_per_subject = response_times[['rt', 'subject']].groupby('subject').quantile([0.25, 0.75]).unstack()
quantiles_per_subject.columns = ['q1', 'q3']
quantiles_per_subject['iqr'] = quantiles_per_subject['q3'] - quantiles_per_subject['q1']

quantiles_per_subject.head(3)

Unnamed: 0_level_0,q1,q3,iqr
subject,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
101A,2569.0,5233.75,2664.75
101B,1282.0,2930.0,1648.0
102A,2552.5,5526.75,2974.25


In [17]:
# finding if a step is an outlier in terms of response time within subject
def is_step_outlier(step):
    rt = step['rt']
    subject = step['subject']
        
    q3, iqr = quantiles_per_subject.loc[subject, 'q3'], quantiles_per_subject.loc[subject, 'iqr']
    return process.is_positive_outlier(rt, q3, iqr, config.cleaning_config['filter_threshold'])

In [18]:
# finding outliers
subjective_outlier_mask = first_lines_data.apply(is_step_outlier, axis=1)
outlier_steps = first_lines_data.loc[subjective_outlier_mask, ['subject', 'step_num', 'loop_type_switch', 'rt']]

# searching for outliers influence on effect
outliers_and_effect = outlier_steps.groupby('loop_type_switch')['rt'].count().to_frame()
outliers_and_effect.rename(columns={'step_num': 'num_of_steps'}, inplace=True)
outliers_and_effect['mean_rt'] = outlier_steps.groupby('loop_type_switch')['rt'].mean()

outliers_and_effect

Unnamed: 0_level_0,rt,mean_rt
loop_type_switch,Unnamed: 1_level_1,Unnamed: 2_level_1
False,26,12392.307692
True,33,12998.484848


In [19]:
px.bar(outliers_and_effect.reset_index(), x='loop_type_switch', y='mean_rt', color='loop_type_switch'
       , title='Priming Effect Within Outlier Steps'
       , labels={'mean_rt': 'mean response time'
                 , 'loop_type_switch': 'switching loop type'})

In [30]:
def get_step_anomaly_grade(step):
    return abs(process.get_multiple_anomaly_grade(step, 'rt'))

anomaly_grades = raw_data.merge(quantiles_per_subject, how='left', on='subject')
anomaly_grades['anomaly_grade'] = anomaly_grades.apply(get_step_anomaly_grade, axis=1)
anomaly_grades['anomaly_grade_bin'] = pd.cut(anomaly_grades['anomaly_grade'], bins=[-1.5, 0, 1.5, 3, 5, max(anomaly_grades['anomaly_grade'])])

anomaly_grade_mean_rt = anomaly_grades.groupby(['anomaly_grade_bin', 'loop_type_switch'])['rt'].agg(['count', 'mean'])
anomaly_grade_mean_rt.rename(columns={'count': 'n_steps', 'mean': 'mean_rt'}, inplace=True)
anomaly_grade_mean_rt.reset_index(inplace=True)
anomaly_grade_mean_rt['anomaly_grade_bin'] = anomaly_grade_mean_rt['anomaly_grade_bin'].astype(str)

anomaly_grade_mean_rt

Unnamed: 0,anomaly_grade_bin,loop_type_switch,n_steps,mean_rt
0,"(-1.5, 0.0]",False,2731,2969.809227
1,"(-1.5, 0.0]",True,2077,2952.271545
2,"(0.0, 1.5]",False,8697,1339.555479
3,"(0.0, 1.5]",True,6338,1430.292364
4,"(1.5, 3.0]",False,614,1925.278502
5,"(1.5, 3.0]",True,495,2350.937374
6,"(3.0, 5.0]",False,48,11731.625
7,"(3.0, 5.0]",True,46,11489.434783
8,"(5.0, 25.911]",False,18,21360.277778
9,"(5.0, 25.911]",True,10,17328.0


In [31]:
def get_pearson_r(grade_bin):
    inf = float(grade_bin[1:grade_bin.find(',')])
    sup = float(grade_bin[grade_bin.find(' ') + 1:-1])
    grade_bin_mask = anomaly_grades['anomaly_grade'].between(inf, sup)
    return analyze.test_rt_switch_corr(anomaly_grades[grade_bin_mask], alpha=0.05, print_msg=False)
    
anomaly_grade_mean_rt['pearson_r'] = anomaly_grade_mean_rt['anomaly_grade_bin'].apply(get_pearson_r)

In [36]:
px.bar(anomaly_grade_mean_rt, x='anomaly_grade_bin', y='mean_rt', color='loop_type_switch'
       , barmode='group', title='Influence of Step Response Time Anomaly Grade On Priming Effect'
       , labels={'mean_rt': 'mean response time'
               , 'anomaly_grade_bin': 'anomaly grades (IQR)'
               , 'loop_type_switch': 'switching loop type'})

In [33]:
px.bar(anomaly_grade_mean_rt[['anomaly_grade_bin', 'pearson_r']].drop_duplicates(), x='anomaly_grade_bin', y='pearson_r'
       , title='Influence of Step Response Time Anomaly Grade On Priming Effect'
              , labels={'pearson_r': 'pearson correlation (response time and switch)'
               , 'anomaly_grade_bin': 'anomaly grades (IQR)'
               , 'loop_type_switch': 'switching loop type'}))

### 

## **3. Time per Session Analysis**

In [24]:
from os import listdir
from os.path import isfile, join

raw_results_folder = '../../msa-1/scripts/loops1/exp1_data/results/raw'

In [25]:
def get_total_times(path: str):
    # importing the file names from directory
    data_files = [f for f in listdir(path) if isfile(join(path, f)) and not f.startswith('Participant')]

    # creating a list of max time elapsed for every session
    max_times = {}
    for file_name in data_files:
        curr_times = pd.read_csv(path + f'\{file_name}', usecols=['time_elapsed'], dtype={'time_elapsed': int})
        curr_times = curr_times.squeeze()
        start_time = min(curr_times) / 60000
        end_time = max(curr_times) / 60000
        max_times.update({file_name :  (end_time - start_time)})
    
    return max_times

In [26]:
max_times = get_total_times(raw_results_folder)
only_times = list(max_times.values())

In [27]:
time_per_session = px.histogram(only_times, nbins=20, title='Time Per Session')
time_per_session.show()