# **LoopsResultsExploration**

## **1. Importing and First Proccesing**

In [3]:
import pandas as pd
from DataCleaning import *
from ProcessingConfig import *

pd.options.display.max_columns = 50

In [4]:
raw_data = pd.read_excel(cleaning_config['raw_data_path'])

# partial cleaning
drop_columns(raw_data, cleaning_config['unnecessary_columns'])
convert_types(raw_data, cleaning_config['type_conversions'])
raw_data = drop_first_loop(raw_data)
raw_data = only_first_line(raw_data)

-- drop_first_loop: 1630 rows were filtered out.
-- drop_first_line: 9227 rows were filtered out.


In [5]:
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2437 entries, 6 to 13287
Data columns (total 25 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   subject                       2437 non-null   object 
 1   step_num                      2437 non-null   int64  
 2   uid                           2437 non-null   int64  
 3   trial_start_time              2437 non-null   int64  
 4   rt                            2437 non-null   int32  
 5   response                      2437 non-null   object 
 6   loop_step                     2437 non-null   int32  
 7   trial_order                   2437 non-null   int64  
 8   trial_set                     2437 non-null   int64  
 9   trial                         2437 non-null   int64  
 10  core_program                  2437 non-null   int64  
 11  variant                       2437 non-null   object 
 12  step_id                       2437 non-null   int64  
 13  te

In [6]:
raw_data.sample(3)

Unnamed: 0,subject,step_num,uid,trial_start_time,rt,response,loop_step,trial_order,trial_set,trial,core_program,variant,step_id,text1,response_needed,expected_response,is_loop,loop_type,loop_type_switch,n_iterations,n_loop_lines,expected_response_whole_loop,step_type,prev_loop_type,correct
11209,106A,198,94,547145,1766,20,0,6,1,1011,6,FWWFFFWW,4,while a >= 10:\n a += 1\n a -= 8,True,20.0,True,while,False,2,2,"[20, 12, 13, 5]",loop,while,True
11967,103B,43,48,270437,3033,10,0,11,1,6,3,FFWFFWFF,3,"for i in [1, 2]:\n a /= 2",True,10.0,True,for,False,2,1,"[10, 5]",loop,for,True
5978,104A,8,76,52913,2882,19,0,1,1,9,5,WFFWWWFF,4,"for i in [1, 2]:\n a += 5\n a -= 6",True,19.0,True,for,False,2,2,"[19, 13, 18, 12]",loop,for,True


## **2. Exploring Outliers**

In [9]:
def is_outlier(x, x_q1, x_q3, x_iqr, threshold):
    if x_iqr == 0:
        return False
    return (x_q1 - x) / x_iqr >= threshold or (x - x_q3) / x_iqr >= threshold

### 2.1. Response-Time Between Subjects

In [10]:
# filtering only necessary columns
response_times = raw_data[['subject', 'step_num', 'rt']].copy()

In [11]:
# checking for outliers in terms of mean response time
mean_rt_per_subject = response_times[['rt', 'subject']].groupby('subject').mean()
mean_rt_per_subject.columns = ['mean_rt']

g_rt_q1, g_rt_q3 = mean_rt_per_subject['mean_rt'].quantile([0.25, 0.75])
g_rt_iqr = g_rt_q1 - g_rt_q3

mean_outlier_mask = mean_rt_per_subject['mean_rt'].apply(is_outlier
                                                      , args=(g_rt_q1, g_rt_q3, g_rt_iqr
                                                              , cleaning_config['filter_threshold']))
rt_outlier_subject = mean_rt_per_subject[mean_outlier_mask].index

if rt_outlier_subject.size > 0:
    print(f'Seems that: {list(rt_outlier_subject)} are outliers in terms of mean response time within subject.')
else:
    print("No Outliers detected! at least in terms of mean response time within subject.")

Seems that: ['103B'] are outliers in terms of mean response time within subject.


### 2.2. Mistakes Rate (%) Between Subjects

In [12]:
# filtering only necessary columns
response_success = raw_data[['subject', 'step_num', 'trial', 'correct']].copy()

# calculating quantiles
g_c_mean = response_success['correct'].mean()
print(f'mean general success rate: {round(g_c_mean, 2)}')

mean general success rate: 0.96


In [13]:
success_per_subject = response_success[['subject', 'correct']].groupby('subject').mean()
success_per_subject.rename(columns={'correct': 'success_rate'}, inplace=True)
success_per_subject.sort_values(by='success_rate', ascending=False).T

subject,103B,107B,108A,103A,105A,106B,104B,109B,104A,108B,102B,106A,102A,109A,101B
success_rate,1.0,1.0,0.994048,0.988095,0.988095,0.988095,0.982143,0.97619,0.964286,0.964286,0.952381,0.952381,0.904762,0.904762,0.87574


In [14]:
success_rate_q1, success_rate_q3 = success_per_subject['success_rate'].quantile([0.25, 0.75])
success_rate_iqr = success_rate_q1 = success_rate_q3

success_outlier_mask = success_per_subject['success_rate'].apply(is_outlier
                                        , args=(success_rate_q1, success_rate_q3, success_rate_iqr
                                            , cleaning_config['filter_threshold']))

success_outlier_subjects = success_per_subject[success_outlier_mask]

if success_outlier_subjects.size > 0:
    print(f'Seems that: {list(success_outlier_subjects)} are outliers in terms of success rate within subject.')
else:
    print("No Outliers detected! at least in terms of success rate within subject.")

No Outliers detected! at least in terms of success rate within subject.


### 2.3. Trial Mistakes Rate (%)

In [15]:
# calculating response success rate per trial
success_per_trial = response_success[['subject', 'trial', 'correct']].groupby(['subject', 'trial']).mean()
success_per_trial.rename(columns={'correct': 'success_per_trial'}, inplace=True)

success_per_trial.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,success_per_trial
subject,trial,Unnamed: 2_level_1
101B,1,0.857143
101B,2,0.714286
101B,3,1.0


In [16]:
# actually finding the trial outliers in terms of success rate within subject
trial_success_q1, trial_success_q3 = success_per_trial['success_per_trial'].quantile([0.25, 0.75])
trial_success_iqr = trial_success_q1 - trial_success_q3
outlier_trails_mask = success_per_trial['success_per_trial'].apply(is_outlier
                                                                   , args=(trial_success_q1, trial_success_q3
                                                                           , trial_success_iqr, cleaning_config['filter_threshold']))
outlier_trails = success_per_trial[outlier_trails_mask]

print("The following trials are outliers in term of trial success rate within subject:")
outlier_trails.sort_values(by='success_per_trial')

The following trials are outliers in term of trial success rate within subject:


Unnamed: 0_level_0,Unnamed: 1_level_0,success_per_trial
subject,trial,Unnamed: 2_level_1


### 2.3. Single Steps Response-Time Within Subject

In [17]:
# calculating response time quantiles and IQR per subject
quantiles_per_subject = response_times[['rt', 'subject']].groupby('subject').quantile([0.25, 0.75]).unstack()
quantiles_per_subject.columns = ['q1', 'q3']
quantiles_per_subject['iqr'] = quantiles_per_subject['q3'] - quantiles_per_subject['q1']

quantiles_per_subject.head(3)

Unnamed: 0_level_0,q1,q3,iqr
subject,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
101B,1282.0,2930.0,1648.0
102A,2552.5,5526.75,2974.25
102B,2184.25,4457.0,2272.75


In [18]:
# finding if a step is an outlier in terms of response time within subject
def is_subjective_outlier(step):
    rt = step['rt']
    subject = step['subject']
        
    subject_quantiles = quantiles_per_subject.loc[subject]
    q1, q3, iqr = subject_quantiles['q1'], subject_quantiles['q3'], subject_quantiles['iqr']
    return is_outlier(rt, q1, q3, iqr, cleaning_config['filter_threshold'])

In [19]:
# filtering only outliers
subjective_outlier_mask = response_times.apply(is_subjective_outlier, axis=1)
outlier_steps = response_times[subjective_outlier_mask]

In [20]:
outlier_steps = outlier_steps.merge(quantiles_per_subject, how='left', left_on='subject', right_index=True)
outlier_steps.head()

Unnamed: 0,subject,step_num,rt,q1,q3,iqr
54,107B,54,7899,1949.25,3412.5,1463.25
465,104B,6,9413,1932.25,3792.0,1859.75
655,104B,196,8954,1932.25,3792.0,1859.75
1383,108B,11,16865,2116.25,4969.0,2852.75
1463,108B,91,12970,2116.25,4969.0,2852.75
