# **Statistical tests for comparing federated learning (FL) and personalized federated learning (PFL)**

## **Get the last output of each json file**

In [None]:
import numpy as np
from scipy.stats import wilcoxon

def run_wilcoxon_test(data_method_a, data_method_b, metric_name, client_name, method_a_name, method_b_name):
    """Compare two paired samples with a two-sided Wilcoxon signed-rank test.

    Both inputs are converted to NumPy arrays and handed to
    ``scipy.stats.wilcoxon`` with ``zero_method='wilcox'``. A one-line summary
    containing the statistic and the p-value is always printed, even when the
    test itself could not be computed.

    Args:
        data_method_a: Scores of the first method.
        data_method_b: Scores of the second method, paired with the first.
        metric_name: Metric label; only used in the error message.
        client_name: Client label; only used in the error message.
        method_a_name: Display name of the first method.
        method_b_name: Display name of the second method.

    Returns:
        The p-value of the test, or ``np.nan`` if SciPy raised ``ValueError``.
    """
    sample_a, sample_b = (np.array(values) for values in (data_method_a, data_method_b))

    # Start from NaN so the summary line below stays printable when the test
    # fails and the assignment inside the ``try`` never happens.
    statistic = p_value = np.nan
    try:
        statistic, p_value = wilcoxon(sample_a, sample_b, alternative='two-sided', zero_method='wilcox')
    except ValueError as e:
        print(f"  Error performing Wilcoxon test for {metric_name} on {client_name} ({method_a_name} vs {method_b_name}): {e}")

    print(f"  {method_a_name} vs {method_b_name}: Statistic={statistic:.4f}, p-value={p_value:.4f}")
    return p_value

# Recorded scores per method, client and metric (three values each).

# --- FedAvg ---
fedavg_mimic_roc_auc = [0.648, 0.634, 0.661]
fedavg_mimic_pr_auc = [0.957, 0.955, 0.959]

fedavg_eicu_roc_auc = [0.609, 0.611, 0.609]
fedavg_eicu_pr_auc = [0.950, 0.950, 0.950]

# --- FedProx ---
fedprox_mimic_roc_auc = [0.655, 0.665, 0.652]
fedprox_mimic_pr_auc = [0.959, 0.961, 0.958]

fedprox_eicu_roc_auc = [0.605, 0.606, 0.604]
fedprox_eicu_pr_auc = [0.950, 0.950, 0.949]

# --- FO Per-FedAvg ---
# The same value was recorded for every seed, so each list repeats one number.
# Client 0 is MIMIC.
perfedavg_mimic_roc_auc = [0.7194308341604817] * 3
perfedavg_mimic_pr_auc = [0.9707830697281877] * 3

# Client 1 is eICU.
perfedavg_eicu_roc_auc = [0.6448071572632778] * 3
perfedavg_eicu_pr_auc = [0.9558383462961518] * 3

# --- MOCHA-inspired ---
# Client 0 is MIMIC.
mocha_mimic_roc_auc = [0.5411, 0.7741, 0.5428]
mocha_mimic_pr_auc = [0.9367, 0.9736, 0.9369]

# Client 1 is eICU.
mocha_eicu_roc_auc = [0.6505, 0.6157, 0.6498]
mocha_eicu_pr_auc = [0.9567, 0.9471, 0.9549]

# Lookup table: client_data[client][method][metric] -> list of scores.
# Built from (method, ROC-AUC list, PR-AUC list) rows so each client reads as a
# small table; insertion order matches the order of the rows.
client_data = {
    client: {
        method: {"ROC-AUC": roc_auc, "PR-AUC": pr_auc}
        for method, roc_auc, pr_auc in rows
    }
    for client, rows in (
        (
            "MIMIC",
            (
                ("FedAvg", fedavg_mimic_roc_auc, fedavg_mimic_pr_auc),
                ("FedProx", fedprox_mimic_roc_auc, fedprox_mimic_pr_auc),
                ("Per-FedAvg", perfedavg_mimic_roc_auc, perfedavg_mimic_pr_auc),
                ("MOCHA", mocha_mimic_roc_auc, mocha_mimic_pr_auc),
            ),
        ),
        (
            "eICU",
            (
                ("FedAvg", fedavg_eicu_roc_auc, fedavg_eicu_pr_auc),
                ("FedProx", fedprox_eicu_roc_auc, fedprox_eicu_pr_auc),
                ("Per-FedAvg", perfedavg_eicu_roc_auc, perfedavg_eicu_pr_auc),
                ("MOCHA", mocha_eicu_roc_auc, mocha_eicu_pr_auc),
            ),
        ),
    )
}

print("--- Starting Wilcoxon Signed-Rank Tests ---")

metrics_to_compare = ["ROC-AUC", "PR-AUC"]
pfl_methods = ["Per-FedAvg", "MOCHA"]
baseline_methods = ["FedAvg", "FedProx"]
client_names = ["MIMIC", "eICU"]

# Each personalized method is tested against each baseline, in listed order.
comparison_pairs = [
    (pfl_name, baseline_name)
    for pfl_name in pfl_methods
    for baseline_name in baseline_methods
]

# NOTE(review): every score list in client_data holds three values; with only
# three pairs an exact two-sided signed-rank test cannot report p below 0.25.
# Confirm the number of runs is adequate before interpreting the p-values.
for client_name in client_names:
    print(f"\n--- Results for {client_name} Client ---")

    for metric in metrics_to_compare:
        print(f"  Metric: {metric}")

        for pfl_method_name, baseline_method_name in comparison_pairs:
            pfl_vals = client_data[client_name][pfl_method_name][metric]
            baseline_vals = client_data[client_name][baseline_method_name][metric]

            # The returned p-value is not stored; the helper prints the result.
            run_wilcoxon_test(
                data_method_a=pfl_vals,
                data_method_b=baseline_vals,
                metric_name=metric,
                client_name=client_name,
                method_a_name=pfl_method_name,
                method_b_name=baseline_method_name,
            )

print("\n--- End of Wilcoxon Tests ---")

--- Starting Wilcoxon Signed-Rank Tests ---

--- Results for MIMIC Client ---
  Metric: ROC-AUC
  Per-FedAvg vs FedAvg: Statistic=0.0000, p-value=0.2500
  Per-FedAvg vs FedProx: Statistic=0.0000, p-value=0.2500
  MOCHA vs FedAvg: Statistic=3.0000, p-value=1.0000
  MOCHA vs FedProx: Statistic=1.0000, p-value=0.5000
  Metric: PR-AUC
  Per-FedAvg vs FedAvg: Statistic=0.0000, p-value=0.2500
  Per-FedAvg vs FedProx: Statistic=0.0000, p-value=0.2500
  MOCHA vs FedAvg: Statistic=1.0000, p-value=0.5000
  MOCHA vs FedProx: Statistic=1.0000, p-value=0.5000

--- Results for eICU Client ---
  Metric: ROC-AUC
  Per-FedAvg vs FedAvg: Statistic=0.0000, p-value=0.2500
  Per-FedAvg vs FedProx: Statistic=0.0000, p-value=0.2500
  MOCHA vs FedAvg: Statistic=0.0000, p-value=0.2500
  MOCHA vs FedProx: Statistic=0.0000, p-value=0.2500
  Metric: PR-AUC
  Per-FedAvg vs FedAvg: Statistic=0.0000, p-value=0.2500
  Per-FedAvg vs FedProx: Statistic=0.0000, p-value=0.2500
  MOCHA vs FedAvg: Statistic=1.0000, p-value