In [1]:
import pandas as pd
import numpy as np
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler
from scipy import stats
import warnings
import os

warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)

STRATEGIES_ORDER = ["Q-RL", "R-Item", "R-Cat", "LSC", "WSLS", "EV", "RA", "RS", "PD"]
CLUSTERING_K = 2

def load_and_preprocess(filepath="strategy_scores.csv", mock=False):
    """Load per-participant strategy scores, or simulate them.

    Args:
        filepath: CSV file with a ``participant_id`` column (used as the
            index) and one score column per strategy.
        mock: When True, skip the file and return simulated data.

    Returns:
        A DataFrame indexed by participant, or ``None`` if the file exists
        but could not be read or parsed.

    When the file is read, every column is rescaled to a fraction:
    non-numeric columns are treated as percent strings (``"50%"`` -> 0.5),
    and numeric columns whose maximum exceeds 1.0 are divided by 100.
    When the file is missing or ``mock`` is True, 50 participants
    (``P_0`` .. ``P_49``) are simulated with a fixed seed.
    """
    if not mock and os.path.exists(filepath):
        try:
            df = pd.read_csv(filepath, index_col="participant_id")
            for col in df.columns:
                # Test "not numeric" rather than "== object": text columns
                # are not guaranteed to carry the object dtype (pandas'
                # dedicated string dtype does not compare equal to it).
                if not pd.api.types.is_numeric_dtype(df[col]):
                    df[col] = df[col].astype(str).str.replace("%", "", regex=False).astype(float) / 100.0
                elif df[col].max() > 1.0:
                    # Heuristic: values above 1 mean the column is on a
                    # 0-100 scale.
                    df[col] = df[col] / 100.0
            print(f"Loaded file: '{filepath}'")
            return df
        except Exception as e:
            # Best-effort loader: report the problem and let the caller
            # decide whether to fall back to simulated data.
            print(f"Error loading file '{filepath}': {e}")
            return None
    else:
        print(f"Warning: File '{filepath}' not found or simulation enabled. Generating simulated strategy score data (N=50).")
        # A private RandomState(42) yields the same stream as
        # np.random.seed(42) without resetting the process-wide RNG.
        rng = np.random.RandomState(42)
        data = {}
        for s in STRATEGIES_ORDER:
            # Uniform in [0, 0.5); LSC and WSLS are shifted up by 0.5.
            data[f"{s}_all"] = rng.rand(50) * 0.5 + (0.5 * (s in ["LSC", "WSLS"]))
        df = pd.DataFrame(data, index=[f"P_{i}" for i in range(50)])
        return df

def load_scores(filepath="decision.csv", participant_ids=None, mock=False):
    """Load trial-level decision scores, or simulate them.

    Args:
        filepath: Header-less CSV whose four columns are read as
            ``participant_id``, ``trial``, ``choice`` and ``score``.
        participant_ids: Optional iterable of participant IDs to simulate
            for. Used only on the simulation path; when omitted, 50
            participants named ``P_0`` .. ``P_49`` are generated.
        mock: When True, skip the file and return simulated data.

    Returns:
        A long-format DataFrame with one row per participant and trial, or
        ``None`` if the file exists but could not be read.

    Simulated data has 60 trials per participant, a 0/1 score drawn with
    P(1) = 0.7 and a choice drawn uniformly from 1..4, using a fixed seed.
    """
    if not mock and os.path.exists(filepath):
        try:
            df = pd.read_csv(filepath, header=None, names=['participant_id', 'trial', 'choice', 'score'])
            print(f"Loaded file: '{filepath}'")
            return df
        except Exception as e:
            # Best-effort loader: report the problem and let the caller
            # decide whether to fall back to simulated data.
            print(f"Error loading file '{filepath}': {e}")
            return None
    else:
        print(f"Warning: File '{filepath}' not found or simulation enabled. Generating simulated decision score data (60 trials).")
        # A private RandomState(42) yields the same stream as
        # np.random.seed(42) without resetting the process-wide RNG.
        rng = np.random.RandomState(42)

        # Simulate for the IDs the caller actually has, so the rows can be
        # joined back to the caller's participants.
        if participant_ids is not None:
            ids = list(participant_ids)
        else:
            ids = [f"P_{i}" for i in range(50)]

        columns = ['participant_id', 'trial', 'choice', 'score']
        if not ids:
            # pd.concat raises on an empty list; return an empty frame.
            return pd.DataFrame(columns=columns)

        df_list = []
        for pid in ids:
            # Draw order (scores, then choices) is kept fixed so the
            # simulated values are reproducible.
            scores = rng.choice([0, 1], size=60, p=[0.3, 0.7])
            df_part = pd.DataFrame({
                'participant_id': pid,
                'trial': range(1, 61),
                'choice': rng.randint(1, 5, 60),
                'score': scores
            })
            df_list.append(df_part)
        return pd.concat(df_list, ignore_index=True)

def perform_gmm_clustering(df, k):
    """Cluster participants with a Gaussian mixture on standardized scores.

    Args:
        df: DataFrame holding one ``<strategy>_all`` column for every entry
            of ``STRATEGIES_ORDER``.
        k: Number of mixture components.

    Returns:
        A Series of component labels sharing ``df``'s index.
    """
    feature_names = [strategy + "_all" for strategy in STRATEGIES_ORDER]
    # Standardize the features before fitting the mixture.
    standardized = StandardScaler().fit_transform(df[feature_names])
    # Fixed random_state with 10 initializations, as configured here.
    model = GaussianMixture(n_components=k, n_init=10, random_state=42)
    return pd.Series(model.fit_predict(standardized), index=df.index)

def compare_cluster_total_scores(df_scores, cluster_labels):
    """Compare per-participant total scores between cluster 0 and cluster 1.

    Args:
        df_scores: Long-format DataFrame with ``participant_id`` and
            ``score`` columns (one row per trial). It is not modified.
        cluster_labels: Series mapping participant ID -> cluster label.

    Returns:
        None. Results are printed.

    Scores are summed per participant. Participants without a cluster
    label are reported and then dropped. If either cluster has fewer than
    three participants the analysis is aborted. Otherwise the test is
    chosen from the data: Shapiro-Wilk on each cluster, then Levene's test
    to pick Student's or Welch's t-test when both look normal, and the
    Mann-Whitney U test when they do not.
    """
    # Attach labels on a copy so the caller's DataFrame is left untouched.
    labelled = df_scores.assign(
        cluster_label=df_scores['participant_id'].map(cluster_labels)
    )
    participant_scores = labelled.groupby('participant_id').agg({
        'score': 'sum',
        'cluster_label': 'first'
    })

    # Check for unmatched participants BEFORE dropping them; after dropna()
    # no missing label can remain, so the check would never fire.
    if participant_scores['cluster_label'].isnull().any():
        print("Error: Participant ID mismatch between GMM clusters and decision score data. Skipping unmatched participants.")

    participant_scores = participant_scores.dropna().reset_index()

    cluster_0_scores = participant_scores[participant_scores['cluster_label'] == 0]['score'].values
    cluster_1_scores = participant_scores[participant_scores['cluster_label'] == 1]['score'].values

    n0, n1 = len(cluster_0_scores), len(cluster_1_scores)

    if n0 < 3 or n1 < 3:
        print(f"Error: Cluster sample size too small (C0 N={n0}, C1 N={n1}). Statistical analysis aborted.")
        return

    mean0, mean1 = np.mean(cluster_0_scores), np.mean(cluster_1_scores)
    # ddof=1 gives the sample standard deviation.
    std0, std1 = np.std(cluster_0_scores, ddof=1), np.std(cluster_1_scores, ddof=1)

    _, p_norm0 = stats.shapiro(cluster_0_scores)
    _, p_norm1 = stats.shapiro(cluster_1_scores)
    both_normal = (p_norm0 > 0.05) and (p_norm1 > 0.05)

    if both_normal:
        # Levene's test decides between pooled-variance and Welch's t-test.
        _, p_levene = stats.levene(cluster_0_scores, cluster_1_scores)
        if p_levene > 0.05:
            t_stat, p_value = stats.ttest_ind(cluster_0_scores, cluster_1_scores)
            test_name = "Independent samples t-test"
        else:
            t_stat, p_value = stats.ttest_ind(cluster_0_scores, cluster_1_scores, equal_var=False)
            test_name = "Welch's t-test"
        statistic = t_stat
    else:
        # Non-parametric fallback when normality is rejected for a cluster.
        u_stat, p_value = stats.mannwhitneyu(cluster_0_scores, cluster_1_scores, alternative='two-sided')
        test_name = "Mann-Whitney U test"
        statistic = u_stat

    print("\n" + "="*50)
    print("Total Score Difference Comparison After GMM Clustering")
    print(f"Clustering K={CLUSTERING_K}, Statistical Test: {test_name}")
    print("="*50)
    print(f"Cluster 0 (N={n0}): Mean total score = {mean0:.2f}, SD = {std0:.2f}")
    print(f"Cluster 1 (N={n1}): Mean total score = {mean1:.2f}, SD = {std1:.2f}")

    if mean0 > mean1:
        direction = "Cluster 0 > Cluster 1"
    elif mean0 < mean1:
        direction = "Cluster 0 < Cluster 1"
    else:
        direction = "No difference"

    if p_value < 0.001:
        sig = "*** (p < 0.001)"
    elif p_value < 0.01:
        sig = "** (p < 0.01)"
    elif p_value < 0.05:
        sig = "* (p < 0.05)"
    else:
        sig = "ns (not significant)"

    print("-" * 50)
    print(f"Statistic ({test_name} Stat) = {statistic:.4f}")
    print(f"p-value = {p_value:.6f}")
    print(f"Difference direction: {direction}")
    print(f"Statistical significance: {sig}")
    print("-" * 50)

# Script entry point: load strategy scores, cluster participants, load
# decision scores, then compare total scores between the two clusters.
if __name__ == "__main__":
    # A None result means the file exists but could not be read; a missing
    # file already yields simulated data inside the loader.
    df_strategy = load_and_preprocess(mock=False)
    if df_strategy is None:
        print("Note: Real strategy score file failed to load. Falling back to simulated data for demo.")
        df_strategy = load_and_preprocess(mock=True)

    # One cluster label per participant, indexed like df_strategy.
    cluster_labels = perform_gmm_clustering(df_strategy, CLUSTERING_K)

    # The strategy-table index is handed over as the participant ID list.
    df_scores = load_scores(participant_ids=df_strategy.index, mock=False)
    
    if df_scores is None:
        print("Note: Real decision score file failed to load. Falling back to simulated data for demo.")
        df_scores = load_scores(participant_ids=df_strategy.index, mock=True)

    # Prints the comparison report; no value is used from the call.
    compare_cluster_total_scores(df_scores, cluster_labels)

Loaded file: 'strategy_scores.csv'
Loaded file: 'decision.csv'

Total Score Difference Comparison After GMM Clustering
Clustering K=2, Statistical Test: Mann-Whitney U test
Cluster 0 (N=40): Mean total score = 111.80, SD = 6.14
Cluster 1 (N=13): Mean total score = 102.08, SD = 8.51
--------------------------------------------------
Statistic (Mann-Whitney U test Stat) = 448.0000
p-value = 0.000105
Difference direction: Cluster 0 > Cluster 1
Statistical significance: *** (p < 0.001)
--------------------------------------------------
