In [17]:
import numpy as np
import pandas as pd
import os
from scipy import stats
from typing import List, Tuple, Dict

In [18]:
def load_and_prepare_data(correctness_path: str, time_path: str) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Read the correctness and time CSV files and convert both to long format.

    The test columns are every column of the correctness file whose name
    starts with 'T'; the same column list is used to reshape the time file.

    Args:
        correctness_path: Path to correctness CSV file
        time_path: Path to time CSV file

    Returns:
        Tuple (correctness_long, time_long); each DataFrame has the columns
        Participant_ID, Group_ID, Test, Value
    """
    id_columns = ['Participant_ID', 'Group_ID']

    # Read the two wide tables (one row per participant, one column per test)
    correctness_wide = pd.read_csv(correctness_path)
    time_wide = pd.read_csv(time_path)

    # Test columns are taken from the correctness file only
    test_columns = [name for name in correctness_wide.columns if name.startswith('T')]

    def _to_long(wide: pd.DataFrame) -> pd.DataFrame:
        # One output row per (participant, test) pair
        return wide.melt(
            id_vars=id_columns,
            value_vars=test_columns,
            var_name='Test',
            value_name='Value',
        )

    return _to_long(correctness_wide), _to_long(time_wide)



In [19]:
# Calculate the coefficient of variation (CV); the stability threshold is 15%
def calculate_cv(window_data: np.ndarray) -> float:
    """
    Coefficient of Variation (CV) of a window of sessions, as a percentage.

    The CV is the standard deviation as computed by np.std with its default
    settings (population standard deviation) divided by the mean, times 100.

    Args:
        window_data: Performance values of the sessions in the window

    Returns:
        CV in percent
    """
    spread = np.std(window_data)
    centre = np.mean(window_data)
    relative_spread = spread / centre
    return relative_spread * 100


In [20]:
# Calculate improvement rate; the stability threshold is 10%
def calculate_improvement_rate(current: float, previous: float) -> float:
    """
    Calculate improvement rate between consecutive sessions as percentage.

    The rate is the change relative to the previous session:
    ((current - previous) / previous) * 100.

    Args:
        current: Value of the later session
        previous: Value of the earlier session (the baseline)

    Returns:
        Relative change in percent. When the baseline is zero the rate is
        undefined; plain Python numbers then yield inf, -inf or nan
        (depending on the sign of the change) instead of raising
        ZeroDivisionError, which matches what NumPy scalars already produce.
    """
    delta = current - previous
    try:
        return (delta / previous) * 100
    except ZeroDivisionError:
        # Only plain Python numbers raise here; NumPy scalars return
        # inf/nan on their own and never reach this branch.
        if delta > 0:
            return float('inf')
        if delta < 0:
            return float('-inf')
        return float('nan')


In [21]:
def analyze_sliding_window(performance_data: np.ndarray, 
                         test_names: List[str] = ['T0', 'T1', 'T2', 'T3', 'T4', 'T5'],
                         window_size: int = 3,
                         cv_threshold: float = 15.0,
                         improvement_threshold: float = 10.0) -> Dict:
    """
    Analyze performance stability using sliding windows.
    Handles cases where consecutive tests have identical values.

    For every window of `window_size` consecutive tests the function computes
    the coefficient of variation, the improvement rate between neighbouring
    tests, a Friedman test over the window and a Wilcoxon test for each
    consecutive pair, and then flags the window as stable when all criteria
    are met.

    Args:
        performance_data: Array of performance metrics across tests
        test_names: List of test names (T0-T5); must provide a name for every
            position of performance_data that falls inside a window.
            The default list is only read, never modified.
        window_size: Size of sliding window (default=3)
        cv_threshold: Maximum CV% to indicate stability (default=15%)
        improvement_threshold: Maximum improvement rate to indicate stability (default=10%)

    Returns:
        Dictionary keyed by window label ("<first test>-<last test>"). Each
        value holds: cv, improvement_rates, friedman_p, wilcoxon_results,
        cv_stable, improvement_stable, statistical_stable, overall_stable.
        Empty when performance_data is shorter than window_size.

    Note:
        Every sample handed to the Friedman and Wilcoxon tests contains a
        single value (one number per test), so the p-values cannot reflect
        participant-level variability.
        NOTE(review): consider running the tests on per-participant data.
    """
    n_windows = len(performance_data) - window_size + 1
    results = {}
    
    for i in range(n_windows):
        window = performance_data[i:i+window_size]
        window_label = f"{test_names[i]}-{test_names[i+window_size-1]}"
        
        # Calculate CV
        cv = calculate_cv(window)
        
        # Calculate improvement rates between consecutive tests
        improvements = [
            calculate_improvement_rate(window[j+1], window[j])
            for j in range(len(window)-1)
        ]
        
        # Perform Friedman test
        # A window whose values are all identical shows no difference by
        # definition. Handle it up front (as the Wilcoxon branch below does
        # for identical pairs) instead of relying on SciPy raising: a
        # non-finite p-value would fail the `> 0.05` check and mark a
        # perfectly flat window as unstable.
        all_identical = all(value == window[0] for value in window)
        if all_identical:
            friedman_p = 1.0  # No difference between groups
        else:
            try:
                _, friedman_p = stats.friedmanchisquare(*[
                    [x] for x in window
                ])
            except ValueError:  # SciPy rejected the input (e.g. too few samples)
                friedman_p = 1.0  # Treated as no detectable difference
        
        # Perform Wilcoxon tests for consecutive pairs
        wilcoxon_results = []
        for j in range(len(window)-1):
            if window[j] == window[j+1]:  # If values are identical
                p_value = 1.0  # No difference between pairs
            else:
                try:
                    stat, p_value = stats.wilcoxon([window[j]], [window[j+1]])
                except ValueError:  # Handle case where differences are all zero
                    p_value = 1.0  # No difference between pairs
            
            wilcoxon_results.append({
                'pair': f'{test_names[i+j]}-{test_names[i+j+1]}',
                'p_value': p_value
            })
        
        # Check stability criteria
        cv_stable = cv < cv_threshold
        improvement_stable = all(abs(imp) < improvement_threshold for imp in improvements)
        statistical_stable = friedman_p > 0.05 and all(w['p_value'] > 0.05 for w in wilcoxon_results)
        
        results[window_label] = {
            'cv': cv,
            'improvement_rates': improvements,
            'friedman_p': friedman_p,
            'wilcoxon_results': wilcoxon_results,
            'cv_stable': cv_stable,
            'improvement_stable': improvement_stable,
            'statistical_stable': statistical_stable,
            'overall_stable': cv_stable and improvement_stable and statistical_stable
        }
    
    return results



In [22]:
def find_stability_point(data: pd.DataFrame, group: int) -> Dict:
    """
    Locate the first stable sliding window for one instruction group.

    The per-test mean of Value is computed for the rows of the requested
    group (tests ordered by plain string sorting of their names) and the
    resulting trajectory is handed to analyze_sliding_window.

    Args:
        data: DataFrame in long format with columns: Participant_ID, Group_ID, Test, Value
        group: Group ID to analyze

    Returns:
        Dictionary with:
        - window_analysis: per-window results from analyze_sliding_window
        - stability_point: label of the first window whose 'overall_stable'
          flag is set, or None when there is none
        - performance_trajectory: mean performance values across tests
        - error: present only when the group has no rows
    """
    rows_for_group = data.loc[data['Group_ID'] == group]

    # Nothing to analyse for an unknown or empty group
    if rows_for_group.empty:
        return {
            'window_analysis': {},
            'stability_point': None,
            'performance_trajectory': [],
            'error': f'No data found for group {group}'
        }

    # Mean value per test, arranged in sorted test-name order
    ordered_tests = sorted(rows_for_group['Test'].unique())
    mean_by_test = rows_for_group.groupby('Test')['Value'].mean()
    trajectory = mean_by_test.reindex(ordered_tests).values

    windows = analyze_sliding_window(trajectory, ordered_tests)

    # First window (in insertion order) that satisfies every criterion
    first_stable = next(
        (label for label, metrics in windows.items() if metrics['overall_stable']),
        None,
    )

    return {
        'window_analysis': windows,
        'stability_point': first_stable,
        'performance_trajectory': trajectory.tolist()
    }



In [23]:
def run_rq4_analysis(correctness_path: str, time_path: str,
                     groups: Tuple[int, ...] = (1, 2, 3, 4)) -> Dict:
    """
    Run complete RQ4 analysis for both performance metrics.

    Args:
        correctness_path: Path to correctness CSV file
        time_path: Path to time CSV file
        groups: Group IDs to analyze (default: groups 1-4). Each ID is looked
            up in the Group_ID column by find_stability_point.

    Returns:
        Dictionary with the keys 'time_stability' and 'correctness_stability'.
        Each maps 'G<group id>' to the result of find_stability_point for
        that group and metric.
    """
    # Load and prepare data
    correctness_long, time_long = load_and_prepare_data(correctness_path, time_path)

    # Initialize results dictionaries
    time_stability = {}
    correctness_stability = {}

    # Analyze each requested group
    for group in groups:
        label = f'G{group}'

        # Analyze time-to-completion
        time_stability[label] = find_stability_point(time_long, group)

        # Analyze correctness
        correctness_stability[label] = find_stability_point(correctness_long, group)

    return {
        'time_stability': time_stability,
        'correctness_stability': correctness_stability
    }


In [24]:
# Both input files live in ../data/performance relative to the notebook.
data_path_correctness, data_path_time = (
    os.path.join('..', 'data', 'performance', csv_name)
    for csv_name in ('SpringCorrectness.csv', 'SpringTime.csv')
)

In [25]:
# Run the stability analysis for both metrics; the result is reused by the cells below.
results = run_rq4_analysis(
    correctness_path=data_path_correctness,
    time_path=data_path_time,
)

In [27]:
# Show the full nested results dictionary as the cell output.
results

{'time_stability': {'G1': {'window_analysis': {'T0-T2': {'cv': 26.830678472898423,
     'improvement_rates': [-35.30288700745244, -16.174313103314596],
     'friedman_p': 0.36787944117144245,
     'wilcoxon_results': [{'pair': 'T0-T1', 'p_value': 1.0},
      {'pair': 'T1-T2', 'p_value': 1.0}],
     'cv_stable': False,
     'improvement_stable': False,
     'statistical_stable': True,
     'overall_stable': False},
    'T1-T3': {'cv': 8.099991741779515,
     'improvement_rates': [-16.174313103314596, 1.9465056237703164],
     'friedman_p': 0.36787944117144245,
     'wilcoxon_results': [{'pair': 'T1-T2', 'p_value': 1.0},
      {'pair': 'T2-T3', 'p_value': 1.0}],
     'cv_stable': True,
     'improvement_stable': False,
     'statistical_stable': True,
     'overall_stable': False},
    'T2-T4': {'cv': 1.0080435014137707,
     'improvement_rates': [1.9465056237703164, 0.38402979899081946],
     'friedman_p': 0.36787944117144245,
     'wilcoxon_results': [{'pair': 'T2-T3', 'p_value': 1.0},

In [26]:
# Report, for every metric and group, the stability point, the mean
# trajectory, and the details of each window that met all stability criteria.
for metric in ('time_stability', 'correctness_stability'):
    print(f"\n{metric.upper()} RESULTS:")
    for group, analysis in results[metric].items():
        print(f"\n{group}:")
        print(f"Stability point: {analysis['stability_point']}")
        print(f"Performance trajectory: {analysis['performance_trajectory']}")

        # Only stable windows are listed under this heading
        print("\nWindow Analysis:")
        for window, window_results in analysis['window_analysis'].items():
            if not window_results['overall_stable']:
                continue
            print(f"{window}: STABLE")
            print(f"  CV: {window_results['cv']:.2f}%")
            print(f"  Improvement rates: {[f'{rate:.2f}%' for rate in window_results['improvement_rates']]}")
            print(f"  Friedman p-value: {window_results['friedman_p']:.3f}")


TIME_STABILITY RESULTS:

G1:
Stability point: T2-T4
Performance trajectory: [41.09022222222222, 26.5841875, 22.284377777777777, 22.718144444444444, 22.80538888888889, 31.2696]

Window Analysis:
T2-T4: STABLE
  CV: 1.01%
  Improvement rates: ['1.95%', '0.38%']
  Friedman p-value: 0.368

G2:
Stability point: None
Performance trajectory: [34.20996, 31.168022222222223, 23.34271, 24.590844444444443, 21.787655555555556, 21.859255555555556]

Window Analysis:

G3:
Stability point: None
Performance trajectory: [37.76844444444444, 27.220212500000002, 24.041166666666665, 27.15052222222222, 27.35968888888889, 22.44666666666667]

Window Analysis:

G4:
Stability point: T1-T3
Performance trajectory: [39.29661, 13.4802875, 14.77286, 14.08295, 12.826069999999998, 13.585669999999999]

Window Analysis:
T1-T3: STABLE
  CV: 3.74%
  Improvement rates: ['9.59%', '-4.67%']
  Friedman p-value: 0.368
T2-T4: STABLE
  CV: 5.80%
  Improvement rates: ['-4.67%', '-8.92%']
  Friedman p-value: 0.368
T3-T5: STABLE
  C