In [10]:
import numpy as np
from scipy.stats import rankdata, wilcoxon

In [11]:
# === Improvised Evaluation Data (Sample-wise F1) ===
# Hard-coded F1 scores, one list per model. Every list below has 19 entries and
# the lists are compared pairwise by position, so entry i must describe the same
# item in every list.
# NOTE(review): the header says "sample-wise", yet each list has 19 entries --
# the same count as the per-class baseline lists further down. Confirm whether
# these are really per-class averages.

llama3_f1_dbpedia = [
    0.62, 0.40, 0.44, 0.57, 0.62, 0.60, 0.61, 0.80, 0.67, 0.50,
    0.49, 0.38, 0.65, 0.78, 0.69, 0.19, 0.38, 0.71, 0.43
]

mistral_f1_dbpedia = [
    0.65, 0.34, 0.50, 0.61, 0.55, 0.59, 0.54, 0.77, 0.68, 0.55,
    0.44, 0.34, 0.66, 0.71, 0.68, 0.19, 0.34, 0.74, 0.42
]

alpaca_f1_dbpedia = [
    0.20, 0.15, 0.20, 0.29, 0.32, 0.30, 0.25, 0.47, 0.25, 0.45,
    0.10, 0.08, 0.31, 0.36, 0.31, 0.09, 0.26, 0.34, 0.15
]

vicuna_f1_dbpedia = [
    0.23, 0.18, 0.27, 0.38, 0.29, 0.32, 0.41, 0.46, 0.32, 0.40,
    0.18, 0.05, 0.39, 0.36, 0.42, 0.12, 0.23, 0.46, 0.20
]

# === Baseline Evaluation Data (Per-class avg_f1 for 19 classes) ===
# NOTE(review): alpaca_f1_baseline and vicuna_f1_baseline are value-for-value
# identical to alpaca_f1_dbpedia and vicuna_f1_dbpedia above. This looks like a
# copy-paste placeholder -- confirm against the real baseline results before
# relying on any "Baseline" comparison that involves Alpaca or Vicuna.

llama3_f1_baseline = [
    0.46, 0.28, 0.41, 0.46, 0.52, 0.53, 0.57, 0.61, 0.51, 0.47,
    0.35, 0.19, 0.59, 0.57, 0.56, 0.16, 0.37, 0.67, 0.40
]

mistral_f1_baseline = [
    0.51, 0.29, 0.46, 0.47, 0.47, 0.51, 0.52, 0.62, 0.49, 0.53,
    0.33, 0.25, 0.58, 0.51, 0.51, 0.15, 0.40, 0.72, 0.40
]

alpaca_f1_baseline = [
    0.20, 0.15, 0.20, 0.29, 0.32, 0.30, 0.25, 0.47, 0.25, 0.45,
    0.10, 0.08, 0.31, 0.36, 0.31, 0.09, 0.26, 0.34, 0.15
]

vicuna_f1_baseline = [
    0.23, 0.18, 0.27, 0.38, 0.29, 0.32, 0.41, 0.46, 0.32, 0.40,
    0.18, 0.05, 0.39, 0.36, 0.42, 0.12, 0.23, 0.46, 0.20
]


# === Wilcoxon Test Function ===

def run_wilcoxon(model1_scores, model2_scores, name1, name2, dataset):
    """Run a two-sided Wilcoxon signed-rank test on paired scores and print the verdict.

    The test itself is two-sided, so a small p-value only says the two models
    differ. The direction of the verdict is therefore taken from the signed-rank
    sums: whichever model collects the larger rank sum of favourable differences
    is reported as the one that outperforms.

    Parameters
    ----------
    model1_scores, model2_scores : sequence of float
        Paired scores; entry ``i`` of both sequences must describe the same item.
    name1, name2 : str
        Display names for the first and second model.
    dataset : str
        Label printed in square brackets in front of the comparison.

    Returns
    -------
    None
        Results are printed, not returned.

    Raises
    ------
    ValueError
        If the two score sequences do not have the same shape.
    """
    first = np.asarray(model1_scores, dtype=float)
    second = np.asarray(model2_scores, dtype=float)
    if first.shape != second.shape:
        raise ValueError(
            f"Paired test needs equally sized samples, got shapes "
            f"{first.shape} ({name1}) and {second.shape} ({name2})"
        )

    diffs = first - second
    nonzero = diffs[diffs != 0]

    # With no non-zero difference the signed-rank test is undefined; depending on
    # the SciPy version it raises or yields NaN, so report the case explicitly.
    if nonzero.size == 0:
        print(f"\n[{dataset}] {name1} vs {name2}")
        print("All paired differences are zero (or no samples were given); "
              "the Wilcoxon test is undefined")
        print(f"→ No difference between {name1} and {name2}: the paired scores are identical")
        return

    stat, p = wilcoxon(first, second)
    print(f"\n[{dataset}] {name1} vs {name2}")
    print(f"Wilcoxon statistic = {stat:.4f}, p-value = {p:.4f}")
    if p < 0.05:
        # Rank the absolute non-zero differences and compare the rank sums of
        # the positive (model 1 ahead) and negative (model 2 ahead) differences.
        ranks = rankdata(np.abs(nonzero))
        rank_sum_first = ranks[nonzero > 0].sum()
        rank_sum_second = ranks[nonzero < 0].sum()
        if rank_sum_first >= rank_sum_second:
            winner, loser = name1, name2
        else:
            winner, loser = name2, name1
        print(f"→ {winner} significantly outperforms {loser} (p < 0.05)")
    else:
        print(f"→ No significant difference between {name1} and {name2} (p ≥ 0.05)")


# === Run Comparisons ===

# Each entry: banner to print, dataset label, the two candidate models and the
# two reference models they are compared against (candidate-major order).
_comparison_plan = (
    (
        "=== Wilcoxon Signed-Rank Test on Improvised Evaluation Scores (DBpedia) ===",
        "Improvised DBpedia",
        (("LLama 3-8B", llama3_f1_dbpedia), ("Mistral", mistral_f1_dbpedia)),
        (("Alpaca-LoRA-13B", alpaca_f1_dbpedia), ("Vicuna", vicuna_f1_dbpedia)),
    ),
    (
        "\n=== Wilcoxon Signed-Rank Test on Baseline Evaluation Scores (DBpedia) ===",
        "Baseline",
        (("LLama 3-8B", llama3_f1_baseline), ("Mistral", mistral_f1_baseline)),
        (("Alpaca-LoRA-13B", alpaca_f1_baseline), ("Vicuna", vicuna_f1_baseline)),
    ),
)

for _banner, _dataset_label, _candidates, _references in _comparison_plan:
    print(_banner)
    for _candidate_name, _candidate_scores in _candidates:
        for _reference_name, _reference_scores in _references:
            run_wilcoxon(
                _candidate_scores,
                _reference_scores,
                _candidate_name,
                _reference_name,
                _dataset_label,
            )


=== Wilcoxon Signed-Rank Test on Improvised Evaluation Scores (DBpedia) ===

[Improvised DBpedia] LLama 3-8B vs Alpaca-LoRA-13B
Wilcoxon statistic = 0.0000, p-value = 0.0001
→ LLama 3-8B significantly outperforms Alpaca-LoRA-13B (p < 0.05)

[Improvised DBpedia] LLama 3-8B vs Vicuna
Wilcoxon statistic = 0.0000, p-value = 0.0001
→ LLama 3-8B significantly outperforms Vicuna (p < 0.05)

[Improvised DBpedia] Mistral vs Alpaca-LoRA-13B
Wilcoxon statistic = 0.0000, p-value = 0.0000
→ Mistral significantly outperforms Alpaca-LoRA-13B (p < 0.05)

[Improvised DBpedia] Mistral vs Vicuna
Wilcoxon statistic = 0.0000, p-value = 0.0001
→ Mistral significantly outperforms Vicuna (p < 0.05)

=== Wilcoxon Signed-Rank Test on Baseline Evaluation Scores (DBpedia) ===

[Baseline] LLama 3-8B vs Alpaca-LoRA-13B
Wilcoxon statistic = 0.0000, p-value = 0.0001
→ LLama 3-8B significantly outperforms Alpaca-LoRA-13B (p < 0.05)

[Baseline] LLama 3-8B vs Vicuna
Wilcoxon statistic = 0.0000, p-value = 0.0001
→ LLama 