In [11]:
import pandas as pd
import numpy as np
from scipy.stats import friedmanchisquare, kruskal, mannwhitneyu, wilcoxon
from statsmodels.stats.multitest import multipletests
import os
import warnings

In [12]:
def cohens_d(x, y):
    """Compute Cohen's d for two samples using the pooled standard deviation.

    The effect size is ``(mean(x) - mean(y)) / s_pooled``, where ``s_pooled``
    combines the two sample variances (``ddof=1``), each weighted by its
    degrees of freedom.

    Parameters
    ----------
    x, y : array-like
        Numeric samples; ``len()`` must be supported.

    Returns
    -------
    float
        Cohen's d; positive when ``x`` has the larger mean.  ``nan`` is
        returned when the effect size is undefined: either sample has fewer
        than two observations (no sample variance exists) or the pooled
        standard deviation is zero (all observations constant).
    """
    nx = len(x)
    ny = len(y)

    # A sample variance needs at least two observations per group.  Return
    # NaN explicitly instead of reaching it through NumPy runtime warnings.
    if nx < 2 or ny < 2:
        return np.nan

    dof = nx + ny - 2
    pooled_std = np.sqrt(
        ((nx - 1) * np.std(x, ddof=1) ** 2 + (ny - 1) * np.std(y, ddof=1) ** 2) / dof
    )

    # Zero pooled spread would be a division by zero; d is undefined there.
    if pooled_std == 0:
        return np.nan

    return (np.mean(x) - np.mean(y)) / pooled_std

In [13]:
# Build the CSV locations (relative to the current working directory)
data_path_correctness = os.path.join('..', 'data', 'performance', 'SpringCorrectness.csv')
data_path_time = os.path.join('..', 'data', 'performance', 'SpringTime.csv')

# Read both performance tables into DataFrames
correctness = pd.read_csv(data_path_correctness)
# NOTE: `time` would shadow the stdlib module of the same name if it were imported
time = pd.read_csv(data_path_time)

In [14]:
def analyze_rq1(group_data, metric_name):
    """Run the within-group and between-group non-parametric tests for one metric.

    Parameters
    ----------
    group_data : pandas.DataFrame
        Must provide a ``Group_ID`` column and the session columns
        ``T0`` .. ``T5``.
    metric_name : str
        Label of the metric being analysed.  It is not used by the
        computation; it is kept so existing callers keep working.

    Returns
    -------
    dict
        Keys are inserted in this order:

        * ``'Group {g} Friedman'`` -> ``{'χ²': stat, 'p': p}``
        * ``'Group {g} Posthoc'`` -> ``{'comparisons', 'p_values',
          'effect_sizes'}``; the three sequences are index-aligned and
          ``'p_values'`` holds the Bonferroni-adjusted p-values.
        * ``'Between Groups {session}'`` -> ``{'H': stat, 'p': p}``
        * ``'Between Groups {session} Pairs'`` -> list of dicts with
          ``'groups'``, ``'p'``, ``'cohens_d'`` and ``'adj_p'``.

        Entries are omitted for groups/sessions whose test could not be run;
        a diagnostic line is printed in that case.
    """
    sessions = ['T0', 'T1', 'T2', 'T3', 'T4', 'T5']
    group_ids = [1, 2, 3, 4]
    results = {}

    # Scope the warning filters to this call so the process-wide warning
    # configuration is restored afterwards.
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', message='Exact p-value calculation does not work if there are zeros')
        warnings.filterwarnings('ignore', message='Sample size too small for normal approximation')

        _within_group_analysis(group_data, sessions, group_ids, results)
        _between_group_analysis(group_data, sessions, group_ids, results)

    return results


def _within_group_analysis(group_data, sessions, group_ids, results):
    """Add each group's Friedman test and baseline-vs-session post-hoc tests to ``results``."""
    baseline = sessions[0]

    for group in group_ids:
        group_df = group_data[group_data['Group_ID'] == group]

        # The Friedman test is run only on rows with a value in every session.
        complete_cases = group_df.dropna(subset=sessions)

        if len(complete_cases) < 3:
            print(f"Group {group} has insufficient complete cases ({len(complete_cases)}). Skipping Friedman test.")
            continue

        try:
            stat, friedman_p = friedmanchisquare(*[complete_cases[session] for session in sessions])
        except ValueError as e:
            print(f"Error in Group {group}: {str(e)}")
            continue
        results[f'Group {group} Friedman'] = {'χ²': stat, 'p': friedman_p}

        # Post-hoc pairwise comparisons: baseline session against each later one.
        comparisons = []
        p_values = []
        effect_sizes = []

        for session in sessions[1:]:
            # Record every planned comparison, even a failed one, so the three
            # lists stay index-aligned and the Bonferroni family size is fixed.
            comparisons.append(f'{baseline} vs {session}')
            p_value, effect = np.nan, np.nan
            try:
                # Use only participants with data for both sessions.
                paired = group_df[[baseline, session]].dropna()
                _, raw_p = wilcoxon(paired[baseline], paired[session])
                raw_d = cohens_d(paired[session], paired[baseline])
                p_value, effect = raw_p, raw_d
            except Exception as e:
                print(f"Error in Group {group} {session}: {str(e)}")
            p_values.append(p_value)
            effect_sizes.append(effect)

        # Bonferroni correction over all planned comparisons.
        _, adj_p, _, _ = multipletests(p_values, alpha=0.05, method='bonferroni')

        results[f'Group {group} Posthoc'] = {
            'comparisons': comparisons,
            'p_values': adj_p,
            'effect_sizes': effect_sizes
        }


def _between_group_analysis(group_data, sessions, group_ids, results):
    """Add each session's Kruskal-Wallis test and pairwise Mann-Whitney tests to ``results``."""
    for session in sessions:
        # (group id, non-missing values) for every group that has data.
        samples = []
        for g in group_ids:
            group_vals = group_data[group_data['Group_ID'] == g][session].dropna()
            if len(group_vals) > 0:
                samples.append((g, group_vals))

        if len(samples) < 2:
            print(f"Not enough groups with data for {session}")
            continue

        try:
            h_stat, kruskal_p = kruskal(*[vals for _, vals in samples])
            results[f'Between Groups {session}'] = {'H': h_stat, 'p': kruskal_p}

            # Pairwise comparisons over every unordered pair of groups.
            pair_results = []
            for i, (id_a, vals_a) in enumerate(samples):
                for id_b, vals_b in samples[i + 1:]:
                    try:
                        # Explicit two-sided test: the default alternative
                        # differs between SciPy versions.
                        _, pair_p = mannwhitneyu(vals_a, vals_b, alternative='two-sided')
                        d = cohens_d(vals_a, vals_b)
                    except Exception as e:
                        # Report the failed pair instead of dropping it silently.
                        print(f"Error in {session} G{id_a} vs G{id_b}: {str(e)}")
                        continue
                    pair_results.append({
                        'groups': f'G{id_a} vs G{id_b}',
                        'p': pair_p,
                        'cohens_d': d
                    })

            # Bonferroni correction across the pairs that were computed.
            if pair_results:
                _, adj_p, _, _ = multipletests([pair['p'] for pair in pair_results], method='bonferroni')
                for pair, adjusted in zip(pair_results, adj_p):
                    pair['adj_p'] = adjusted

                results[f'Between Groups {session} Pairs'] = pair_results
        except Exception as e:
            print(f"Error in {session} between-group analysis: {str(e)}")
            continue


In [15]:
# RQ1 on the correctness table
print("Analyzing Correctness Data...")
correctness_results = analyze_rq1(group_data=correctness, metric_name="Correctness")

# RQ1 on the time-to-completion table
print("\nAnalyzing Time Data...")
time_results = analyze_rq1(group_data=time, metric_name="Time")

# Print results
def print_results(results, title):
    """Print a results mapping under a ``"{title} Results:"`` heading.

    Entries whose key contains ``'Pairs'`` are treated as a list of pairwise
    comparison dicts and printed one line per pair (adjusted p-value and
    Cohen's d); a missing ``'adj_p'`` is shown as ``nan``.  Every other entry
    is printed as ``key: value``.
    """
    print(f"\n{title} Results:")
    for key, value in results.items():
        if 'Pairs' not in key:
            print(f"{key}: {value}")
            continue

        print(f"\n{key}:")
        for item in value:
            adj_p = item.get('adj_p', np.nan)
            print(f"{item['groups']}: adj_p={adj_p:.3f}, d={item['cohens_d']:.2f}")

# Report both result sets, correctness first
print_results(results=correctness_results, title="Correctness")
print_results(results=time_results, title="Time")

Analyzing Correctness Data...

Analyzing Time Data...

Correctness Results:
Group 1 Friedman: {'χ²': 4.230769230769238, 'p': 0.5166916593078766}
Group 1 Posthoc: {'comparisons': ['T0 vs T1', 'T0 vs T2', 'T0 vs T3', 'T0 vs T4', 'T0 vs T5'], 'p_values': array([1.        , 1.        , 0.78649604, 0.78649604, 1.        ]), 'effect_sizes': [-0.23570226039551584, 0.43259045634870014, 0.43259045634870014, 0.43259045634870014, 0.0]}
Group 2 Friedman: {'χ²': 3.0851063829787204, 'p': 0.686866708451628}
Group 2 Posthoc: {'comparisons': ['T0 vs T1', 'T0 vs T2', 'T0 vs T3', 'T0 vs T4', 'T0 vs T5'], 'p_values': array([0.78649604, 1.        , 1.        , 1.        , 1.        ]), 'effect_sizes': [-0.4242640687119285, 0.1916629694999819, 0.0, 0.21081851067789203, 0.0]}
Group 3 Friedman: {'χ²': 3.8135593220338913, 'p': 0.5765589657739507}
Group 3 Posthoc: {'comparisons': ['T0 vs T1', 'T0 vs T2', 'T0 vs T3', 'T0 vs T4', 'T0 vs T5'], 'p_values': array([1.        , 1.        , 0.89856247, 1.        , 1.  