# **LoopsResultsExploration**

## **1. Importing and First Processing**

In [1]:
import pandas as pd
import plotly.express as px
from DataCleaning import *
from ProcessingConfig import *

# show up to 50 columns when a DataFrame is displayed
pd.set_option("display.max_columns", 50)

In [2]:
# Load the raw results a single time; the path comes from the project's cleaning config.
raw_data = pd.read_excel(cleaning_config['raw_data_path'])
print(f'original shape: {raw_data.shape}')
outliers_threshold = cleaning_config['filter_threshold']
print(f"threshold for outliers detection: {outliers_threshold}")

# The return values of these two helpers are not used here; presumably they
# modify `raw_data` in place (the column count drops between the two shape prints).
drop_columns(raw_data, cleaning_config['unnecessary_columns'])
convert_types(raw_data, cleaning_config['type_conversions'])

# Each filter returns the reduced frame, which replaces `raw_data`.
raw_data = filter_slow_subjects(raw_data, outliers_threshold)
raw_data = filter_bad_subjects(raw_data, outliers_threshold)
raw_data = drop_first_loop(raw_data)
raw_data = only_first_line(raw_data)
# Optional filters, currently disabled:
# raw_data = filter_bad_trials(raw_data, threshold=0.9)
# raw_data = filter_slow_steps(raw_data, outliers_threshold)

print(f'final shape: {raw_data.shape}')

original shape: (13294, 32)
threshold for outliers detection: 2.25
filter_slow_subjects: No slow subjects detected.
filter_bad_subjects: No bad subjects detected (in terms of low success rate).
drop_first_loop: 1630 rows were filtered out.
only_first_line: 9227 rows were filtered out.
final shape: (2437, 25)


In [3]:
# Print the column names, non-null counts and dtypes of the cleaned frame.
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2437 entries, 6 to 13287
Data columns (total 25 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   subject                       2437 non-null   object 
 1   step_num                      2437 non-null   int64  
 2   uid                           2437 non-null   int64  
 3   trial_start_time              2437 non-null   int64  
 4   rt                            2437 non-null   int32  
 5   response                      2437 non-null   object 
 6   loop_step                     2437 non-null   int32  
 7   trial_order                   2437 non-null   int64  
 8   trial_set                     2437 non-null   int64  
 9   trial                         2437 non-null   int64  
 10  core_program                  2437 non-null   int64  
 11  variant                       2437 non-null   object 
 12  step_id                       2437 non-null   int64  
 13  te

In [4]:
# Peek at a few random rows; the fixed seed keeps the displayed rows stable
# across "Restart & Run All".
raw_data.sample(3, random_state=0)

Unnamed: 0,subject,step_num,uid,trial_start_time,rt,response,loop_step,trial_order,trial_set,trial,core_program,variant,step_id,text1,response_needed,expected_response,is_loop,loop_type,loop_type_switch,n_iterations,n_loop_lines,expected_response_whole_loop,step_type,prev_loop_type,correct
8901,105A,183,72,694371,6204,12,0,5,1,1008,4,WWFWWFWW,9,while a <= 17:\n a += 4,True,12.0,True,while,False,3,1,"[12, 16, 20]",loop,while,True
6974,109A,91,40,328182,5293,8,0,3,1,5,3,WFFFWWWF,4,"for i in [1, 2, 3]:\n a = round_up(a*1.5)",True,8.0,True,for,False,3,1,"[8, 12, 18]",loop,for,True
11675,105A,207,31,647550,2019,3,0,6,2,4,2,FWWWWFFW,4,while a <= 7:\n a += 2,True,3.0,True,while,False,4,1,"[3, 5, 7, 9]",loop,while,True


In [5]:
# distinct subject ids present in the cleaned data, in sorted order
pd.unique(raw_data['subject'].sort_values())

array(['101B', '102A', '102B', '103A', '103B', '104A', '104B', '105A',
       '106A', '106B', '107B', '108A', '108B', '109A', '109B'],
      dtype=object)

## **2. Exploring Outliers**

### 2.1. Response-Time Between Subjects

In [6]:
# keep just the columns the response-time checks use, as an independent copy
response_times = raw_data.loc[:, ['subject', 'step_num', 'rt']].copy()

In [7]:
# Check whether any subject's mean response time is an outlier relative to
# the distribution of subject means.
mean_rt_per_subject = response_times[['rt', 'subject']].groupby('subject').mean()
mean_rt_per_subject.columns = ['mean_rt']

g_rt_q1, g_rt_q3 = mean_rt_per_subject['mean_rt'].quantile([0.25, 0.75])
# IQR is q3 - q1 (non-negative), matching how `iqr` is computed for the
# per-subject quantiles that feed the same `is_negative_outlier` helper.
g_rt_iqr = g_rt_q3 - g_rt_q1

mean_outlier_mask = mean_rt_per_subject['mean_rt'].apply(
    is_negative_outlier,
    args=(g_rt_q1, g_rt_iqr, cleaning_config['filter_threshold']),
)
rt_outlier_subject = mean_rt_per_subject[mean_outlier_mask].index

if rt_outlier_subject.size > 0:
    print(f'Seems that: {list(rt_outlier_subject)} are outliers in terms of mean response time within subject.')
else:
    print("No Outliers detected! at least in terms of mean response time within subject.")

No Outliers detected! at least in terms of mean response time within subject.


### 2.2. Mistakes Rate (%) Between Subjects

In [8]:
# columns needed for the success-rate checks, taken as an independent copy
success_columns = ['subject', 'step_num', 'trial', 'correct', 'loop_step']
response_success = raw_data[success_columns].copy()

# overall share of correct responses across all rows
g_c_mean = response_success['correct'].mean()
print(f'mean general success rate: {round(g_c_mean, 2)}')

mean general success rate: 0.96


In [9]:
# mean of `correct` per subject, exposed under the column name `success_rate`
success_per_subject = (
    response_success
    .groupby('subject')[['correct']]
    .mean()
    .rename(columns={'correct': 'success_rate'})
)
# best-performing subjects first, transposed so the table reads left to right
success_per_subject.sort_values(by='success_rate', ascending=False).T

subject,103B,107B,108A,103A,105A,106B,104B,109B,104A,108B,102B,106A,102A,109A,101B
success_rate,1.0,1.0,0.994048,0.988095,0.988095,0.988095,0.982143,0.97619,0.964286,0.964286,0.952381,0.952381,0.904762,0.904762,0.87574


In [10]:
# Check whether any subject's success rate is an outlier relative to the
# distribution of per-subject success rates.
success_rate_q1, success_rate_q3 = success_per_subject['success_rate'].quantile([0.25, 0.75])
# IQR must be the difference q3 - q1; a chained assignment here would
# overwrite q1 with q3 and use q3 itself as the "IQR".
success_rate_iqr = success_rate_q3 - success_rate_q1

success_outlier_mask = success_per_subject['success_rate'].apply(
    is_negative_outlier,
    args=(success_rate_q1, success_rate_iqr, cleaning_config['filter_threshold']),
)

success_outlier_subjects = success_per_subject[success_outlier_mask]

if success_outlier_subjects.size > 0:
    # Report the subject ids (the index); list() on the frame itself would
    # yield its column labels instead.
    print(f'Seems that: {list(success_outlier_subjects.index)} are outliers in terms of success rate within subject.')
else:
    print("No Outliers detected! at least in terms of success rate within subject.")

No Outliers detected! at least in terms of success rate within subject.


### 2.3. Trial Mistakes Rate (%)

In [11]:
# mean of `correct` for every (subject, trial) pair
success_per_trial = (
    response_success
    .groupby(['subject', 'trial'])[['correct']]
    .mean()
    .rename(columns={'correct': 'success_per_trial'})
)

success_per_trial.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,success_per_trial
subject,trial,Unnamed: 2_level_1
101B,1,0.857143
101B,2,0.714286
101B,3,1.0


In [12]:
# distribution of the per-(subject, trial) success rates
reg_trials_hist = px.histogram(
    success_per_trial,
    x="success_per_trial",
    nbins=20,
    title='Success Rate per Trial (all data)',
    labels={'success_per_trial': 'success rate'},
)
reg_trials_hist.show()

In [13]:
# FIXME(review): `pr_success_per_trial` is not defined in any cell visible in
# this notebook, so this cell raises NameError on a fresh kernel. Presumably it
# was a per-trial success table restricted to first steps (going by the plot
# title) -- confirm and restore its definition before this cell.
pr_trials_hist = px.histogram(pr_success_per_trial, nbins=20, x="success_per_trial"
                   , title='Success Rate per Trial (only first steps)', labels={'success_per_trial':'success rate'})
pr_trials_hist.show()

NameError: name 'pr_success_per_trial' is not defined

In [14]:
# Find (subject, trial) pairs whose success rate is an outlier relative to
# the distribution of all per-trial success rates.
trial_success_q1, trial_success_q3 = success_per_trial['success_per_trial'].quantile([0.25, 0.75])
# IQR is q3 - q1 (non-negative), matching how `iqr` is computed for the
# per-subject quantiles that feed the same `is_negative_outlier` helper.
trial_success_iqr = trial_success_q3 - trial_success_q1
outlier_trails_mask = success_per_trial['success_per_trial'].apply(
    is_negative_outlier,
    args=(trial_success_q1, trial_success_iqr, cleaning_config['filter_threshold']),
)
outlier_trails = success_per_trial[outlier_trails_mask]

print("The following trials are outliers in term of trial success rate within subject:")
outlier_trails.sort_values(by='success_per_trial')

The following trials are outliers in term of trial success rate within subject:


Unnamed: 0_level_0,Unnamed: 1_level_0,success_per_trial
subject,trial,Unnamed: 2_level_1


### 2.4. Single Steps Response-Time Within Subject

In [15]:
# per-subject 25% / 75% response-time quantiles, one row per subject
rt_by_subject = response_times[['rt', 'subject']].groupby('subject')
quantiles_per_subject = rt_by_subject.quantile([0.25, 0.75]).unstack()
quantiles_per_subject.columns = ['q1', 'q3']

# inter-quartile range of each subject's response times
quantiles_per_subject = quantiles_per_subject.assign(
    iqr=lambda q: q['q3'] - q['q1']
)

quantiles_per_subject.head(3)

Unnamed: 0_level_0,q1,q3,iqr
subject,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
101B,1282.0,2930.0,1648.0
102A,2552.5,5526.75,2974.25
102B,2184.25,4457.0,2272.75


In [16]:
# finding if a step is an outlier in terms of response time within subject
def is_subjective_outlier(step):
    """Tell whether one step's response time is an outlier for its own subject.

    `step` is a row providing 'rt' and 'subject'. The subject's `q1` and `iqr`
    are looked up in `quantiles_per_subject` and handed, together with the
    configured filter threshold, to `is_negative_outlier`, whose result is
    returned unchanged.
    """
    step_rt = step['rt']
    subject_row = quantiles_per_subject.loc[step['subject']]
    return is_negative_outlier(
        step_rt,
        subject_row['q1'],
        subject_row['iqr'],
        cleaning_config['filter_threshold'],
    )

In [17]:
# filtering only outliers
subjective_outlier_mask = response_times.apply(is_subjective_outlier, axis=1)
outlier_steps = response_times[subjective_outlier_mask]

In [18]:
# attach each subject's q1 / q3 / iqr to its outlier steps for context
outlier_steps = outlier_steps.merge(
    quantiles_per_subject,
    left_on='subject',
    right_index=True,
    how='left',
)
outlier_steps.head()

Unnamed: 0,subject,step_num,rt,q1,q3,iqr


## **3. Time per Session Analysis**

In [19]:
from os import listdir
from os.path import isfile, join

# NOTE(review): absolute, machine-specific Windows path -- the session-time
# cells that read from it only work where this folder exists.
raw_results_folder = r'C:\Users\97254\OneDrive\שולחן העבודה\Projects\MathThinkingLabDA\Loops\Results\RawResults'

In [20]:
def get_total_times(path: str):
    """Compute the time span covered by every session file in a directory.

    Parameters
    ----------
    path : str
        Directory to scan. Every regular file directly inside it is read as a
        CSV and must contain an integer `time_elapsed` column; sub-directories
        are skipped.

    Returns
    -------
    dict
        Maps each file name to `max(time_elapsed) / 60000 - min(time_elapsed) / 60000`
        (presumably milliseconds converted to minutes -- confirm the unit of
        `time_elapsed`).
    """
    max_times = {}
    for file_name in listdir(path):
        # Build the path with os.path.join rather than a hard-coded backslash,
        # so the lookup works on any OS.
        file_path = join(path, file_name)
        if not isfile(file_path):
            continue

        # Select the column explicitly: unlike squeeze(), this stays a Series
        # even when the file holds a single row.
        curr_times = pd.read_csv(
            file_path, usecols=['time_elapsed'], dtype={'time_elapsed': int}
        )['time_elapsed']
        start_time = curr_times.min() / 60000
        end_time = curr_times.max() / 60000
        max_times[file_name] = end_time - start_time

    return max_times

In [21]:
# session durations keyed by file name, plus the bare values for plotting
max_times = get_total_times(raw_results_folder)
only_times = [*max_times.values()]

In [22]:
# distribution of the per-session durations
time_per_session = px.histogram(
    only_times,
    title='Time Per Session',
    nbins=20,
)
time_per_session.show()