# **LoopsResultsExploration**

## **1. Importing and First Processing**

In [2]:
import pandas as pd
import plotly.express as px
from DataCleaning import *
from ProcessingConfig import *

# Let pandas display up to 50 columns before it starts eliding the middle of wide frames.
pd.options.display.max_columns = 50

In [3]:
# Load the raw export exactly once (the workbook used to be read twice in a row,
# which doubled the load time without changing the result).
raw_data = pd.read_excel(cleaning_config['raw_data_path'])
print(f'original shape: {raw_data.shape}')

# Single threshold shared by every outlier filter below.
outliers_threshold = cleaning_config['filter_threshold']
print(f"threshold for outliers detection: {outliers_threshold}")

# These two helpers are called for their effect on `raw_data`; their return
# values are not used.
drop_columns(raw_data, cleaning_config['unnecessary_columns'])
convert_types(raw_data, cleaning_config['type_conversions'])

# Each filter returns the frame to keep, so the result is re-bound every time.
raw_data = filter_slow_subjects(raw_data, outliers_threshold)
raw_data = filter_bad_subjects(raw_data, outliers_threshold)
raw_data = drop_first_loop(raw_data)
raw_data = only_first_line(raw_data)

# Optional, currently disabled filters:
# raw_data = filter_bad_trials(raw_data, threshold=0.9)
# raw_data = filter_slow_steps(raw_data, outliers_threshold)

print(f'final shape: {raw_data.shape}')

original shape: (13294, 32)
threshold for outliers detection: 2.25
filter_slow_subjects: No slow subjects detected.
filter_bad_subjects: No bad subjects detected (in terms of low success rate).
drop_first_loop: 1630 rows were filtered out.
only_first_line: 9227 rows were filtered out.
final shape: (2437, 25)


In [None]:
# Print the column names, non-null counts and dtypes of the current `raw_data` frame.
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13294 entries, 0 to 13293
Data columns (total 25 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   subject                       13294 non-null  object 
 1   step_num                      13294 non-null  int64  
 2   uid                           13294 non-null  int64  
 3   trial_start_time              13294 non-null  int64  
 4   rt                            13294 non-null  int32  
 5   response                      12946 non-null  object 
 6   loop_step                     13294 non-null  int32  
 7   trial_order                   13294 non-null  int64  
 8   trial_set                     13294 non-null  int64  
 9   trial                         13294 non-null  int64  
 10  core_program                  13294 non-null  int64  
 11  variant                       13294 non-null  object 
 12  step_id                       13294 non-null  int64  
 13  t

In [None]:
# Peek at three randomly drawn rows; no random_state is passed, so the rows differ between runs.
raw_data.sample(3)

Unnamed: 0,subject,step_num,uid,trial_start_time,rt,response,loop_step,trial_order,trial_set,trial,core_program,variant,step_id,text1,response_needed,expected_response,is_loop,loop_type,loop_type_switch,n_iterations,n_loop_lines,expected_response_whole_loop,step_type,prev_loop_type,correct
1664,108B,292,87,1145428,4549,4,1,8,1,10,5,FFWWFWWF,6,"for i in [1, 2]:\n a *= 3\n a -= 11",True,4.0,True,for,True,2,2,"[15, 4, 12, 1]",loop,,True
2856,101B,99,106,619393,1329,20,3,3,2,12,6,FFWWFWWF,7,while a <= 9:\n a -= 1\n a *= 5,True,20.0,True,while,True,2,2,"[1, 5, 4, 20]",loop,,True
11754,105A,286,86,883183,2007,6,1,8,2,1010,5,WWFFWFFW,5,"for i in [1, 2, 3]:\n a = average(a,4)",True,6.0,True,for,False,3,1,"[8, 6, 5]",loop,,True


In [None]:
# NOTE(review): this only builds a GroupBy object over `subject`; no aggregation is
# applied, so the cell displays the object's repr rather than a table. Confirm which
# per-subject summary of `step_num` was intended here.
raw_data[['subject', 'step_num']].groupby('subject')

## **2. Exploring Outliers**

### 2.1. Response-Time Between Subjects

In [None]:
# keep only the columns the response-time analysis reads, as an independent copy
rt_columns = ['subject', 'step_num', 'rt']
response_times = raw_data.loc[:, rt_columns].copy()

In [None]:
# checking for outliers in terms of mean response time
mean_rt_per_subject = response_times[['rt', 'subject']].groupby('subject').mean()
mean_rt_per_subject.columns = ['mean_rt']

# Quartiles of the per-subject means (one value per subject).
g_rt_q1, g_rt_q3 = mean_rt_per_subject['mean_rt'].quantile([0.25, 0.75])
# IQR is Q3 - Q1. The operands used to be swapped, producing a negative spread,
# unlike the per-subject IQR computed later in this notebook (q3 - q1) and fed
# to the same helper.
g_rt_iqr = g_rt_q3 - g_rt_q1

# `is_negative_outlier` is imported from DataCleaning; it is applied to each
# subject's mean together with the quartiles, the IQR and the configured threshold.
mean_outlier_mask = mean_rt_per_subject['mean_rt'].apply(
    is_negative_outlier,
    args=(g_rt_q1, g_rt_q3, g_rt_iqr, cleaning_config['filter_threshold']),
)
# The index of the grouped frame holds the subject ids.
rt_outlier_subject = mean_rt_per_subject[mean_outlier_mask].index

if rt_outlier_subject.size > 0:
    print(f'Seems that: {list(rt_outlier_subject)} are outliers in terms of mean response time within subject.')
else:
    print("No Outliers detected! at least in terms of mean response time within subject.")

No Outliers detected! at least in terms of mean response time within subject.


### 2.2. Mistakes Rate (%) Between Subjects

In [None]:
# keep only the columns the success-rate analysis reads, as an independent copy
success_columns = ['subject', 'step_num', 'trial', 'correct', 'loop_step']
response_success = raw_data.loc[:, success_columns].copy()

# overall share of correct responses across all remaining rows
g_c_mean = response_success['correct'].mean()
print(f'mean general success rate: {round(g_c_mean, 2)}')

mean general success rate: 0.94


In [None]:
# mean of `correct` per subject, i.e. each subject's success rate
success_per_subject = (
    response_success[['subject', 'correct']]
    .groupby('subject')
    .mean()
    .rename(columns={'correct': 'success_rate'})
)
# show the subjects best-first, transposed into a single wide row
success_per_subject.sort_values(by='success_rate', ascending=False).T

subject,103B,108A,107B,104B,104A,105A,106A,106B,108B,109B,102B,103A,102A,109A,101B
success_rate,0.972588,0.969298,0.958561,0.954098,0.953998,0.953947,0.953005,0.952174,0.949672,0.948634,0.93587,0.932609,0.929039,0.896401,0.858974


In [None]:
# Quartiles of the per-subject success rates (one value per subject).
success_rate_q1, success_rate_q3 = success_per_subject['success_rate'].quantile([0.25, 0.75])
# IQR is Q3 - Q1. This line used to be a chained assignment (`iqr = q1 = q3`),
# which overwrote q1 with q3 and set the "IQR" to q3 itself.
success_rate_iqr = success_rate_q3 - success_rate_q1

# `is_negative_outlier` is imported from DataCleaning; it is applied to each
# subject's success rate together with the quartiles, the IQR and the threshold.
success_outlier_mask = success_per_subject['success_rate'].apply(
    is_negative_outlier,
    args=(success_rate_q1, success_rate_q3, success_rate_iqr, cleaning_config['filter_threshold']),
)

# Rows (indexed by subject id) flagged by the mask.
success_outlier_subjects = success_per_subject[success_outlier_mask]

if success_outlier_subjects.size > 0:
    # Iterating a DataFrame yields its column names, so report the index
    # (the subject ids) instead of `list(frame)`.
    print(f'Seems that: {list(success_outlier_subjects.index)} are outliers in terms of success rate within subject.')
else:
    print("No Outliers detected! at least in terms of success rate within subject.")

No Outliers detected! at least in terms of success rate within subject.


### 2.3. Trial Mistakes Rate (%)

In [None]:
# mean of `correct` for every (subject, trial) pair
success_per_trial = (
    response_success[['subject', 'trial', 'correct']]
    .groupby(['subject', 'trial'])
    .mean()
    .rename(columns={'correct': 'success_per_trial'})
)

success_per_trial.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,success_per_trial
subject,trial,Unnamed: 2_level_1
101B,1,0.9
101B,2,0.651163
101B,3,0.860465


In [None]:
# histogram of the per-(subject, trial) success rates
reg_trials_hist = px.histogram(
    success_per_trial,
    x="success_per_trial",
    nbins=20,
    title='Success Rate per Trial (all data)',
    labels={'success_per_trial': 'success rate'},
)
reg_trials_hist.show()

In [None]:
# Histogram of per-trial success rates read from `pr_success_per_trial`.
# NOTE(review): `pr_success_per_trial` is not assigned in any cell visible above this
# one; unless it comes from elsewhere, this cell raises NameError on a fresh kernel.
# Confirm where it is defined or rebuild it before plotting.
pr_trials_hist = px.histogram(pr_success_per_trial, nbins=20, x="success_per_trial"
                   , title='Success Rate per Trial (only first steps)', labels={'success_per_trial':'success rate'})
pr_trials_hist.show()

In [None]:
# actually finding the trial outliers in terms of success rate within subject
trial_success_q1, trial_success_q3 = success_per_trial['success_per_trial'].quantile([0.25, 0.75])
# IQR is Q3 - Q1. The operands used to be swapped, producing a negative spread,
# unlike the per-subject IQR computed later in this notebook (q3 - q1) and fed
# to the same helper.
trial_success_iqr = trial_success_q3 - trial_success_q1

# `is_negative_outlier` is imported from DataCleaning; it is applied to each
# (subject, trial) success rate together with the quartiles, the IQR and the threshold.
outlier_trails_mask = success_per_trial['success_per_trial'].apply(
    is_negative_outlier,
    args=(trial_success_q1, trial_success_q3, trial_success_iqr, cleaning_config['filter_threshold']),
)
outlier_trails = success_per_trial[outlier_trails_mask]

print("The following trials are outliers in term of trial success rate within subject:")
# lowest success rate first
outlier_trails.sort_values(by='success_per_trial')

The following trials are outliers in term of trial success rate within subject:


Unnamed: 0_level_0,Unnamed: 1_level_0,success_per_trial
subject,trial,Unnamed: 2_level_1
101B,2,0.651163
101B,9,0.666667
101B,1004,0.785714
101B,1001,0.804878
101B,6,0.809524
103A,10,0.810811
109A,1002,0.815789
109A,1005,0.818182
102B,3,0.822222
101B,1002,0.829268


### 2.4. Single Steps Response-Time Within Subject

In [None]:
# per-subject first and third quartiles of `rt`, one row per subject
quantiles_per_subject = (
    response_times[['rt', 'subject']]
    .groupby('subject')
    .quantile([0.25, 0.75])
    .unstack()
)
# flatten the two unstacked quantile columns into plain names
quantiles_per_subject.columns = ['q1', 'q3']
# inter-quartile range of each subject's response times
quantiles_per_subject = quantiles_per_subject.assign(
    iqr=lambda quartiles: quartiles['q3'] - quartiles['q1']
)

quantiles_per_subject.head(3)

Unnamed: 0_level_0,q1,q3,iqr
subject,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
101B,1282.0,2930.0,1648.0
102A,2552.5,5526.75,2974.25
102B,2184.25,4457.0,2272.75


In [None]:
# finding if a step is an outlier in terms of response time within subject
def is_subjective_outlier(step):
    """Check one step's response time against its own subject's quartiles.

    Parameters
    ----------
    step : mapping-like row
        Must expose ``'rt'`` and ``'subject'``; the subject must be a label in
        the index of the notebook-level ``quantiles_per_subject`` frame.

    Returns
    -------
    Whatever ``is_negative_outlier`` returns for the step's ``rt``, given the
    subject's ``q1``, ``q3``, ``iqr`` and ``cleaning_config['filter_threshold']``.
    """
    # quartile row (q1, q3, iqr) belonging to this step's subject
    bounds = quantiles_per_subject.loc[step['subject']]
    return is_negative_outlier(
        step['rt'],
        bounds['q1'],
        bounds['q3'],
        bounds['iqr'],
        cleaning_config['filter_threshold'],
    )

In [None]:
# evaluate every step row-by-row against its own subject's quartiles,
# then keep only the rows that were flagged
subjective_outlier_mask = response_times.apply(is_subjective_outlier, axis='columns')
outlier_steps = response_times.loc[subjective_outlier_mask]

In [None]:
# Attach each flagged step's subject-level q1 / q3 / iqr for context.
# The merge starts again from `response_times[subjective_outlier_mask]` rather than
# from `outlier_steps` itself, so re-running this cell gives the same frame instead
# of merging the quantile columns a second time (which would yield q1_x / q1_y, ...).
outlier_steps = response_times[subjective_outlier_mask].merge(
    quantiles_per_subject, how='left', left_on='subject', right_index=True
)
outlier_steps.head()

Unnamed: 0,subject,step_num,rt,q1,q3,iqr
54,107B,54,7899,1949.25,3412.5,1463.25
465,104B,6,9413,1932.25,3792.0,1859.75
655,104B,196,8954,1932.25,3792.0,1859.75
1383,108B,11,16865,2116.25,4969.0,2852.75
1463,108B,91,12970,2116.25,4969.0,2852.75
