# In [None]:
# === Required Libraries ===
# Run this once if needed:
# pip install pingouin statsmodels scipy pandas

import pandas as pd
from statsmodels.stats.anova import AnovaRM
from scipy.stats import ttest_rel, shapiro
from statsmodels.stats.multitest import multipletests
import pingouin as pg
from itertools import combinations

# === Step 1: Load and combine data from all token levels ===
# Each run log is read from its per-level results directory and tagged with
# the matching token count, so the three files can be stacked into one table.
df_500k = pd.read_csv("results/500000_tokens/run_log-edit.csv").assign(token_count=500000)
df_1m = pd.read_csv("results/1000000_tokens/run_log-edit.csv").assign(token_count=1000000)
df_2m = pd.read_csv("results/2000000_tokens/run_log-edit.csv").assign(token_count=2000000)

# Stack the three levels (fresh 0..n-1 index) and keep only the columns the
# analysis below needs: subject id, within-factor level, and the response.
df_all = pd.concat(
    [df_500k, df_1m, df_2m],
    ignore_index=True,
)[["trial_number", "token_count", "parameter_efficiency"]]

# === Step 2: Run Repeated Measures ANOVA ===
# Single within-subject factor (token_count); trial_number identifies the
# repeated unit and parameter_efficiency is the dependent variable.
rm_model = AnovaRM(
    data=df_all,
    depvar="parameter_efficiency",
    subject="trial_number",
    within=["token_count"],
)
anova = rm_model.fit()

print("\n=== Repeated Measures ANOVA ===")
print(anova)

# === Step 3: Prepare data for pairwise testing and assumptions ===
# Wide layout: one row per trial_number, one column per token_count level,
# cells holding parameter_efficiency.
df_wide = df_all.pivot(
    index="trial_number",
    columns="token_count",
    values="parameter_efficiency",
)

# Every unordered pair of token-count levels, in column order.
token_levels = df_wide.columns
pairs = list(combinations(token_levels, 2))

# === Step 4: Pairwise Bonferroni-Corrected Dependent T-Tests ===
print("\n=== Pairwise Comparisons (Bonferroni-Corrected) ===")

# One paired t-test per pair of levels; each entry is
# (level_a, level_b, t statistic, uncorrected p-value).
results = [
    (level_a, level_b, *ttest_rel(df_wide[level_a], df_wide[level_b]))
    for level_a, level_b in pairs
]
p_values = [raw_p for *_, raw_p in results]

# Bonferroni adjustment across all pairwise comparisons at alpha = 0.05.
reject, pvals_corrected, _, _ = multipletests(p_values, alpha=0.05, method='bonferroni')

# Report each comparison alongside its corrected p-value and decision.
for (level_a, level_b, t_stat, raw_p), corrected_p, is_rejected in zip(results, pvals_corrected, reject):
    print(f"{level_a} vs {level_b} → t = {t_stat:.4f}, raw p = {raw_p:.4f}, corrected p = {corrected_p:.4f}, reject H₀: {is_rejected}")

# === Step 5: Normality Check (Shapiro-Wilk) ===
print("\n=== Shapiro-Wilk Normality Test ===")
# Test each token-count column separately; p > 0.05 is reported as "Normal".
for token in df_wide.columns:
    stat, p = shapiro(df_wide[token])
    verdict = 'Normal' if p > 0.05 else 'Non-normal'
    print(f"{token}: W = {stat:.4f}, p = {p:.4f} → {verdict}")

# === Step 6: Mauchly’s Test for Sphericity ===
print("\n=== Mauchly’s Test for Sphericity ===")

# Back to long format (one row per trial/level) with the response column
# renamed to 'efficiency'. The level labels are cast to strings —
# presumably so the within-factor is handled as categorical labels; confirm.
df_long = (
    df_wide.reset_index()
    .melt(id_vars='trial_number', var_name='token_count', value_name='efficiency')
    .astype({'token_count': str})
)

sphericity_test = pg.sphericity(
    data=df_long,
    dv='efficiency',
    subject='trial_number',
    within='token_count',
)
print(sphericity_test)

# === Step 7: Detailed ANOVA with Pingouin (Optional) ===
print("\n=== Repeated Measures ANOVA (Pingouin) ===")
# Same one-way repeated-measures design as above, run on the long-format
# table with pingouin's detailed output enabled.
aov_pg = pg.rm_anova(
    data=df_long,
    dv='efficiency',
    within='token_count',
    subject='trial_number',
    detailed=True,
)
print(aov_pg)