# Statistical Analysis

## Imports & Initializations 

In [1]:
import os
import scipy.stats as stats
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [2]:
# load model results from /formatted_data
trial_types = ["solo", "solo_con", "pair", "pair_con"]
model_results = {}
TLX_results = {}
eye_tracking_results = {}
gsr_results = {}

for trial_type in trial_types:
    model_results[trial_type] = json.load(open(f"formatted_data/{trial_type}_model_results.json"))
    TLX_results[trial_type] = pd.read_csv(f"formatted_data/{trial_type}_TLX_results.csv").to_dict()
    eye_tracking_results[trial_type] = pd.read_csv(f"formatted_data/{trial_type}_eye_results.csv").to_dict()
    gsr_results[trial_type] = pd.read_csv(f"formatted_data/{trial_type}_gsr_results.csv").to_dict()

def paired_t_test(data1, data2):
    print("Normality Check:")
    print("Data1:", stats.shapiro(data1))
    print("Data2:", stats.shapiro(data2))

    if stats.shapiro(data1)[1] > 0.05 and stats.shapiro(data2)[1] > 0.05:
        print("Data is normally distributed, use paired parametric test")
        t_stat, p_value = stats.ttest_rel(data1, data2)
        print("Paired T-Test:")
        print("T-statistic:", t_stat)
        print("P-value:", p_value)
    else:
        print("Data is not normally distributed, use non-parametric test")
        w_stat, p_value = stats.wilcoxon(data1, data2)
        print("Wilcoxon Signed-Rank Test:")
        print("W-statistic:", w_stat)
        print("P-value:", p_value)

def independent_t_test(data1, data2):
    print("Normality Check:")
    print("Data1:", stats.shapiro(data1))
    print("Data2:", stats.shapiro(data2))

    if stats.shapiro(data1)[1] > 0.05 and stats.shapiro(data2)[1] > 0.05:
        print("Data is normally distributed, use independent parametric test")
        t_stat, p_value = stats.ttest_ind(data1, data2)
        print("Independent T-Test:")
        print("T-statistic:", t_stat)
        print("P-value:", p_value)
    else:
        print("Data is not normally distributed, use non-parametric test")
        u_stat, p_value = stats.mannwhitneyu(data1, data2)
        print("Mann-Whitney U Test:")
        print("U-statistic:", u_stat)
        print("P-value:", p_value)

def mixed_effects_model(data,dependent_var,condition_group):
    model = smf.mixedlm(f'{dependent_var} ~ {condition_group}', data, groups=data['participant_id'])
    result = model.fit()
    print(result.summary())

def safe_float_convert(x):
    try:
        return float(x)
    except ValueError:
        return np.nan  # Convert non-numeric values to NaN


## Research Question 1: How does pair-based collaboration impact cognitive load and model performance in an annotation task in an IML system?

- Dependent Variables: Cognitive load (Measured by: GSR, eye-tracker, NASA-TLX scores), LSTM Model performance (Measured by: Accuracy, Precision, Recall)
- Independent Variable: Collaboration mode (solo vs. pair)



### Pair-based collaboration impact on cognitive load

#### t-test on TLX Scores

In [3]:
validation_category = "Rating Scale Score"
solo_TLX_scores = []
pair_TLX_scores = []

for category in TLX_results["solo"].keys():
    if category.startswith(validation_category):
        solo_TLX_scores.append(TLX_results["solo"][category][0])
for category in TLX_results["solo_con"]:
    if category.startswith(validation_category):
        solo_TLX_scores.append(TLX_results["solo_con"][category][0])

for category in TLX_results["pair"]:
    if category.startswith(validation_category):
        pair_TLX_scores.append(TLX_results["pair"][category][0])
for category in TLX_results["pair_con"]:
    if category.startswith(validation_category):
        pair_TLX_scores.append(TLX_results["pair_con"][category][0])

independent_t_test(solo_TLX_scores, pair_TLX_scores)

data = []
for trial_type in ['solo', 'solo_con', 'pair', 'pair_con']:
    for category in TLX_results[trial_type].keys():
        if category.startswith("Rating Scale Score"):
            participant_id = category.split()[-1]
            trial_type = trial_type
            score = TLX_results[trial_type][category][0]
            data.append((participant_id, trial_type, score))
tlx_df = pd.DataFrame(data, columns=['participant_id', 'Condition', 'TLXScore'])
# combine solo and solo_con, pair and pair_con
tlx_df['Condition'] = tlx_df['Condition'].replace({'solo_con': 'solo', 'pair_con': 'pair'})
#print(tlx_df.head())

mixed_effects_model(tlx_df, 'TLXScore', 'Condition')

Normality Check:
Data1: ShapiroResult(statistic=0.9541932940483093, pvalue=0.5588769912719727)
Data2: ShapiroResult(statistic=0.9060248732566833, pvalue=0.3269185423851013)
Data is normally distributed, use independent parametric test
Independent T-Test:
T-statistic: 1.166151337203741
P-value: 0.2560401450937756
            Mixed Linear Model Regression Results
Model:               MixedLM   Dependent Variable:   TLXScore
No. Observations:    24        Method:               REML    
No. Groups:          8         Scale:                68.1437 
Min. group size:     2         Log-Likelihood:       -86.7501
Max. group size:     4         Converged:            Yes     
Mean group size:     3.0                                     
-------------------------------------------------------------
                   Coef.  Std.Err.   z    P>|z| [0.025 0.975]
-------------------------------------------------------------
Intercept          36.022    5.609  6.422 0.000 25.028 47.015
Condition[T.solo

#### t-test on Average Pupil Diameter

In [4]:
solo_average_diameter_results = []
pair_average_diameter_results = []

for category in eye_tracking_results["solo"].keys():
    if category.startswith("diameter"):
        diameters = list(eye_tracking_results["solo"][category].values())
        diameters = [diameter for diameter in diameters if not np.isnan(diameter)]
        average_diameter = np.mean(diameters)
        solo_average_diameter_results.append(average_diameter)
for category in eye_tracking_results["solo_con"]:
    if category.startswith("diameter"):
        diameters = list(eye_tracking_results["solo_con"][category].values())
        diameters = [diameter for diameter in diameters if not np.isnan(diameter)]
        average_diameter = np.mean(diameters)
        solo_average_diameter_results.append(average_diameter)

for category in eye_tracking_results["pair"]:
    if category.startswith("diameter"):
        diameters = list(eye_tracking_results["pair"][category].values())
        diameters = [diameter for diameter in diameters if not np.isnan(diameter)]
        average_diameter = np.mean(diameters)
        pair_average_diameter_results.append(average_diameter)
for category in eye_tracking_results["pair_con"]:
    if category.startswith("diameter"):
        diameters = list(eye_tracking_results["pair_con"][category].values())
        diameters = [diameter for diameter in diameters if not np.isnan(diameter)]
        average_diameter = np.mean(diameters)
        pair_average_diameter_results.append(average_diameter)

#print(solo_average_diameter_results)
#print(pair_average_diameter_results)

independent_t_test(solo_average_diameter_results, pair_average_diameter_results)

data = []
for trial_type in ['solo', 'solo_con', 'pair', 'pair_con']:
    for category in eye_tracking_results[trial_type].keys():
        if category.startswith("diameter"):
            participant_id = category.split()[-1]
            trial_type = trial_type
            diameter = np.mean([value for value in eye_tracking_results[trial_type][category].values() if not np.isnan(value)])
            data.append((participant_id, trial_type, diameter))
eye_df = pd.DataFrame(data, columns=['participant_id', 'Condition', 'AverageDiameter'])
# combine solo and solo_con, pair and pair_con
eye_df['Condition'] = eye_df['Condition'].replace({'solo_con': 'solo', 'pair_con': 'pair'})
#print(eye_df.head(33))

mixed_effects_model(eye_df, 'AverageDiameter', 'Condition')

Normality Check:
Data1: ShapiroResult(statistic=0.6162278056144714, pvalue=6.019883613817001e-08)
Data2: ShapiroResult(statistic=0.8831117153167725, pvalue=0.043398331850767136)
Data is not normally distributed, use non-parametric test
Mann-Whitney U Test:
U-statistic: 323.0
P-value: 0.1458397251879579
            Mixed Linear Model Regression Results
Model:            MixedLM Dependent Variable: AverageDiameter
No. Observations: 48      Method:             REML           
No. Groups:       16      Scale:              0.0024         
Min. group size:  2       Log-Likelihood:     69.5013        
Max. group size:  4       Converged:          Yes            
Mean group size:  3.0                                        
-------------------------------------------------------------
                   Coef.  Std.Err.   z    P>|z| [0.025 0.975]
-------------------------------------------------------------
Intercept          -0.001    0.015 -0.071 0.943 -0.031  0.029
Condition[T.solo]   0.014 



#### t-test on Average Skin Conductance

In [5]:
solo_average_gsr = []
pair_average_gsr = []

for category in gsr_results["solo"].keys():
    if category.startswith("GSR Conductance CAL (u Siemens)"):
        gsr_values = list(gsr_results["solo"][category].values())
        gsr_values = [safe_float_convert(gsr) for gsr in gsr_values]
        gsr_values = [gsr for gsr in gsr_values if not np.isnan(gsr)]
        average_gsr = np.mean(gsr_values)
        solo_average_gsr.append(average_gsr)
for category in gsr_results["solo_con"].keys():
    if category.startswith("GSR Conductance CAL (u Siemens)"):
        gsr_values = list(gsr_results["solo_con"][category].values())
        gsr_values = [safe_float_convert(gsr) for gsr in gsr_values]
        gsr_values = [gsr for gsr in gsr_values if not np.isnan(gsr)]
        average_gsr = np.mean(gsr_values)
        solo_average_gsr.append(average_gsr)

for category in gsr_results["pair"].keys():
    if category.startswith("GSR Conductance CAL (u Siemens)"):
        gsr_values = list(gsr_results["pair"][category].values())
        gsr_values = [safe_float_convert(gsr) for gsr in gsr_values]
        gsr_values = [gsr for gsr in gsr_values if not np.isnan(gsr)]
        average_gsr = np.mean(gsr_values)
        pair_average_gsr.append(average_gsr)
for category in gsr_results["pair_con"].keys():
    if category.startswith("GSR Conductance CAL (u Siemens)"):
        gsr_values = list(gsr_results["pair_con"][category].values())
        gsr_values = [safe_float_convert(gsr) for gsr in gsr_values]
        gsr_values = [gsr for gsr in gsr_values if not np.isnan(gsr)]
        average_gsr = np.mean(gsr_values)
        pair_average_gsr.append(average_gsr)

#print(solo_average_gsr)
#print(pair_average_gsr)

independent_t_test(solo_average_gsr, pair_average_gsr)

data = []
for trial_type in ['solo', 'solo_con', 'pair', 'pair_con']:
    for category in gsr_results[trial_type].keys():
        if category.startswith("GSR Conductance CAL (u Siemens)"):
            participant_id = category.split()[-1]
            trial_type = trial_type
            gsr_values = list(gsr_results[trial_type][category].values())
            gsr_values = [safe_float_convert(gsr) for gsr in gsr_values]
            gsr_values = [gsr for gsr in gsr_values if not np.isnan(gsr)]
            average_gsr = np.mean(gsr_values)
            data.append((participant_id, trial_type, average_gsr))
gsr_df = pd.DataFrame(data, columns=['participant_id', 'Condition', 'AverageGSR'])
# combine solo and solo_con, pair and pair_con
gsr_df['Condition'] = gsr_df['Condition'].replace({'solo_con': 'solo', 'pair_con': 'pair'})
#print(gsr_df.head(20))

mixed_effects_model(gsr_df, 'AverageGSR', 'Condition')

Normality Check:
Data1: ShapiroResult(statistic=0.669791579246521, pvalue=7.992655446287245e-05)
Data2: ShapiroResult(statistic=0.9123731851577759, pvalue=0.3710785210132599)
Data is not normally distributed, use non-parametric test
Mann-Whitney U Test:
U-statistic: 77.0
P-value: 0.44359774263582685
           Mixed Linear Model Regression Results
Model:              MixedLM  Dependent Variable:  AverageGSR
No. Observations:   24       Method:              REML      
No. Groups:         8        Scale:               0.0000    
Min. group size:    2        Log-Likelihood:      726.4908  
Max. group size:    4        Converged:           Yes       
Mean group size:    3.0                                     
------------------------------------------------------------
                  Coef.  Std.Err.   z    P>|z| [0.025 0.975]
------------------------------------------------------------
Intercept         -0.000    0.000 -0.291 0.771 -0.000  0.000
Condition[T.solo]  0.000    0.000  1.196



### Pair-based collaboration impact on model performance

#### t-test on accuracy with respect to dataset

In [6]:
validation_type = "Updated model accuracy with respect to dataset"
solo_validation_values = []
pair_validation_values = []

for category in model_results["solo"].keys():
    if category.startswith(validation_type):
        solo_validation_values.append(model_results["solo"][category])
for category in model_results["solo_con"]:
    if category.startswith(validation_type):
        solo_validation_values.append(model_results["solo_con"][category])

for category in model_results["pair"]:
    if category.startswith(validation_type):
        pair_validation_values.append(model_results["pair"][category])
for category in model_results["pair_con"]:
    if category.startswith(validation_type):
        pair_validation_values.append(model_results["pair_con"][category])

#print(solo_validation_values)
#print(pair_validation_values)

independent_t_test(solo_validation_values, pair_validation_values)

Normality Check:
Data1: ShapiroResult(statistic=0.6439616680145264, pvalue=4.3395168177085e-05)
Data2: ShapiroResult(statistic=0.728634238243103, pvalue=0.02385682798922062)
Data is not normally distributed, use non-parametric test
Mann-Whitney U Test:
U-statistic: 32.0
P-value: 1.0


## Research Question 2: What is the effect of varying the controllability of the system on cognitive load and model performance in an IML system?

- Dependent Variables: Cognitive load (Measured by: GSR, eye-tracker, NASA-TLX scores), LSTM Model performance (Measured by: Accuracy, Precision, Recall)
- Independent Variable: Controllability (low vs. high)

### Controllability of the system impact on cognitive load

#### t-test on TLX Scores

In [7]:
validation_category = "Rating Scale Score"
con_tlx_scores = []
no_con_tlx_scores = []

for category in TLX_results["solo"].keys():
    if category.startswith(validation_category):
        no_con_tlx_scores.append(TLX_results["solo"][category][0])
for category in TLX_results["pair"]:
    if category.startswith(validation_category):
        no_con_tlx_scores.append(TLX_results["pair"][category][0])

for category in TLX_results["solo_con"]:
    if category.startswith(validation_category):
        con_tlx_scores.append(TLX_results["solo_con"][category][0])
for category in TLX_results["pair_con"]:
    if category.startswith(validation_category):
        con_tlx_scores.append(TLX_results["pair_con"][category][0])

#print(con_tlx_scores)
#print(no_con_tlx_scores)

paired_t_test(con_tlx_scores, no_con_tlx_scores)

Normality Check:
Data1: ShapiroResult(statistic=0.9362173676490784, pvalue=0.4506763219833374)
Data2: ShapiroResult(statistic=0.93773353099823, pvalue=0.4692804515361786)
Data is normally distributed, use paired parametric test
Paired T-Test:
T-statistic: 4.097496026137246
P-value: 0.0017668888816182462


#### t-test on average diameter 

In [8]:
con_validation_values = []
no_con_validation_values = []

for category in eye_tracking_results["solo"].keys():
    if category.startswith("diameter"):
        diameters = list(eye_tracking_results["solo"][category].values())
        diameters = [diameter for diameter in diameters if not np.isnan(diameter)]
        average_diameter = np.mean(diameters)
        no_con_validation_values.append(average_diameter)
for category in eye_tracking_results["pair"]:
    if category.startswith("diameter"):
        diameters = list(eye_tracking_results["pair"][category].values())
        diameters = [diameter for diameter in diameters if not np.isnan(diameter)]
        average_diameter = np.mean(diameters)
        no_con_validation_values.append(average_diameter)

for category in eye_tracking_results["solo_con"]:
    if category.startswith("diameter"):
        diameters = list(eye_tracking_results["solo_con"][category].values())
        diameters = [diameter for diameter in diameters if not np.isnan(diameter)]
        average_diameter = np.mean(diameters)
        con_validation_values.append(average_diameter)
for category in eye_tracking_results["pair_con"]:
    if category.startswith("diameter"):
        diameters = list(eye_tracking_results["pair_con"][category].values())
        diameters = [diameter for diameter in diameters if not np.isnan(diameter)]
        average_diameter = np.mean(diameters)
        con_validation_values.append(average_diameter)

#print(con_validation_values)
#print(no_con_validation_values)

paired_t_test(con_validation_values, no_con_validation_values)

Normality Check:
Data1: ShapiroResult(statistic=0.6962406635284424, pvalue=8.986825378087815e-06)
Data2: ShapiroResult(statistic=0.5993242263793945, pvalue=6.070997073948092e-07)
Data is not normally distributed, use non-parametric test
Wilcoxon Signed-Rank Test:
W-statistic: 82.0
P-value: 0.05264079570770264


#### t-test on skin conductance

In [9]:
con_gsr = []
no_con_gsr = []

for category in gsr_results["solo"].keys():
    if category.startswith("GSR Conductance CAL (u Siemens)"):
        gsr_values = list(gsr_results["solo"][category].values())
        gsr_values = [safe_float_convert(gsr) for gsr in gsr_values]
        gsr_values = [gsr for gsr in gsr_values if not np.isnan(gsr)]
        average_gsr = np.mean(gsr_values)
        no_con_gsr.append(average_gsr)
for category in gsr_results["pair"]:
    if category.startswith("GSR Conductance CAL (u Siemens)"):
        gsr_values = list(gsr_results["pair"][category].values())
        gsr_values = [safe_float_convert(gsr) for gsr in gsr_values]
        gsr_values = [gsr for gsr in gsr_values if not np.isnan(gsr)]
        average_gsr = np.mean(gsr_values)
        no_con_gsr.append(average_gsr)

for category in gsr_results["solo_con"]:
    if category.startswith("GSR Conductance CAL (u Siemens)"):
        gsr_values = list(gsr_results["solo_con"][category].values())
        gsr_values = [safe_float_convert(gsr) for gsr in gsr_values]
        gsr_values = [gsr for gsr in gsr_values if not np.isnan(gsr)]
        average_gsr = np.mean(gsr_values)
        con_gsr.append(average_gsr)
for category in gsr_results["pair_con"]:
    if category.startswith("GSR Conductance CAL (u Siemens)"):
        gsr_values = list(gsr_results["pair_con"][category].values())
        gsr_values = [safe_float_convert(gsr) for gsr in gsr_values]
        gsr_values = [gsr for gsr in gsr_values if not np.isnan(gsr)]
        average_gsr = np.mean(gsr_values)
        con_gsr.append(average_gsr)

#print(con_gsr)
#print(no_con_gsr)

paired_t_test(con_gsr, no_con_gsr)

Normality Check:
Data1: ShapiroResult(statistic=0.54163658618927, pvalue=3.6446555895963684e-05)
Data2: ShapiroResult(statistic=0.9138855338096619, pvalue=0.23920249938964844)
Data is not normally distributed, use non-parametric test
Wilcoxon Signed-Rank Test:
W-statistic: 27.0
P-value: 0.38037109375


In [10]:
con_gsr = []
no_con_gsr = []

for category in gsr_results["solo"].keys():
    if category.startswith("GSR Conductance CAL (u Siemens)"):
        gsr_values = list(gsr_results["solo"][category].values())
        gsr_values = [safe_float_convert(gsr) for gsr in gsr_values]
        gsr_values = [gsr for gsr in gsr_values if not np.isnan(gsr)]
        max_gsr = max(gsr_values)
        no_con_gsr.append(max_gsr)
for category in gsr_results["pair"]:
    if category.startswith("GSR Conductance CAL (u Siemens)"):
        gsr_values = list(gsr_results["pair"][category].values())
        gsr_values = [safe_float_convert(gsr) for gsr in gsr_values]
        gsr_values = [gsr for gsr in gsr_values if not np.isnan(gsr)]
        max_gsr = max(gsr_values)
        no_con_gsr.append(max_gsr)

for category in gsr_results["solo_con"]:
    if category.startswith("GSR Conductance CAL (u Siemens)"):
        gsr_values = list(gsr_results["solo_con"][category].values())
        gsr_values = [safe_float_convert(gsr) for gsr in gsr_values]
        gsr_values = [gsr for gsr in gsr_values if not np.isnan(gsr)]
        max_gsr = max(gsr_values)
        con_gsr.append(max_gsr)
for category in gsr_results["pair_con"]:
    if category.startswith("GSR Conductance CAL (u Siemens)"):
        gsr_values = list(gsr_results["pair_con"][category].values())
        gsr_values = [safe_float_convert(gsr) for gsr in gsr_values]
        gsr_values = [gsr for gsr in gsr_values if not np.isnan(gsr)]
        max_gsr = max(gsr_values)
        con_gsr.append(max_gsr)

#print(con_gsr)
#print(no_con_gsr)

paired_t_test(con_gsr, no_con_gsr)

Normality Check:
Data1: ShapiroResult(statistic=0.8783904314041138, pvalue=0.08358535170555115)
Data2: ShapiroResult(statistic=0.934589684009552, pvalue=0.431304931640625)
Data is normally distributed, use paired parametric test
Paired T-Test:
T-statistic: 0.6559251278498037
P-value: 0.5253544108853087


### Controllability of the system impact on model performance

In [11]:
validation_type = "Updated model accuracy with respect to dataset"
con_validation_values = []
no_con_validation_values = []

for category in model_results["solo"].keys():
    if category.startswith(validation_type):
        print(category)
        no_con_validation_values.append(model_results["solo"][category])
for category in model_results["pair"]:
    if category.startswith(validation_type):
        print(category)
        no_con_validation_values.append(model_results["pair"][category])

for category in model_results["solo_con"]:
    if category.startswith(validation_type):
        print(category)
        con_validation_values.append(model_results["solo_con"][category])
for category in model_results["pair_con"]:
    if category.startswith(validation_type):
        print(category)
        con_validation_values.append(model_results["pair_con"][category])

print(con_validation_values)
print(no_con_validation_values) 

paired_t_test(con_validation_values, no_con_validation_values)

Updated model accuracy with respect to dataset for Participant 001
Updated model accuracy with respect to dataset for Participant 002
Updated model accuracy with respect to dataset for Participant 004
Updated model accuracy with respect to dataset for Participant 003
Updated model accuracy with respect to dataset for Participant 005
Updated model accuracy with respect to dataset for Participant 006
Updated model accuracy with respect to dataset for Participant 007
Updated model accuracy with respect to dataset for Participant 008
Updated model accuracy with respect to dataset for Participant 001,006
Updated model accuracy with respect to dataset for Participant 002,003
Updated model accuracy with respect to dataset for Participant 001
Updated model accuracy with respect to dataset for Participant 002
Updated model accuracy with respect to dataset for Participant 003
Updated model accuracy with respect to dataset for Participant 004
Updated model accuracy with respect to dataset for Par