In [5]:
import numpy as np
from scipy.stats import wilcoxon

In [6]:
# === Improvised Evaluation (Sample-wise F1 Scores for Wikidata) ===
# Ten F1 values per model. The signed-rank test below pairs the lists element
# by element, so index i must refer to the same evaluation item in every list.
# NOTE(review): alpaca_f1_wikidata and vicuna_f1_wikidata are value-for-value
# identical to alpaca_f1_baseline and vicuna_f1_baseline further down — looks
# like a copy-paste; confirm these really are the Wikidata results.
llama3_f1_wikidata = [0.32, 0.36, 0.53, 0.35, 0.41, 0.40, 0.74, 0.51, 0.49, 0.58]
mistral_f1_wikidata = [0.30, 0.37, 0.44, 0.33, 0.40, 0.36, 0.72, 0.44, 0.41, 0.53]
alpaca_f1_wikidata = [0.17, 0.22, 0.45, 0.19, 0.21, 0.29, 0.55, 0.21, 0.27, 0.15]
vicuna_f1_wikidata = [0.25, 0.32, 0.52, 0.26, 0.24, 0.35, 0.66, 0.33, 0.25, 0.31]

# === Baseline Evaluation (avg_f1 Scores Per Domain) ===
# Same layout as above: ten values per model, compared pairwise by position.
# NOTE(review): the heading says "per domain" while the section above says
# "sample-wise" — presumably each position is one domain here; verify that the
# ordering is the same for all four models, otherwise the pairing is invalid.
llama3_f1_baseline = [0.27, 0.32, 0.60, 0.34, 0.37, 0.37, 0.70, 0.53, 0.47, 0.63]
mistral_f1_baseline = [0.29, 0.34, 0.47, 0.32, 0.40, 0.33, 0.70, 0.40, 0.39, 0.45]
alpaca_f1_baseline = [0.17, 0.22, 0.45, 0.19, 0.21, 0.29, 0.55, 0.21, 0.27, 0.15]
vicuna_f1_baseline = [0.25, 0.32, 0.52, 0.26, 0.24, 0.35, 0.66, 0.33, 0.25, 0.31]

# === Wilcoxon Test Function ===
def run_wilcoxon(model1_scores, model2_scores, name1, name2, dataset):
    """Compare two models with a two-sided Wilcoxon signed-rank test and print a verdict.

    Args:
        model1_scores: Sequence of scores for the first model.
        model2_scores: Sequence of scores for the second model; element i is
            paired with element i of ``model1_scores``.
        name1: Display name of the first model.
        name2: Display name of the second model.
        dataset: Label printed in square brackets in front of the comparison.

    Returns:
        None. The statistic, p-value and verdict are printed to stdout.

    Raises:
        ValueError: If the two score sequences do not have the same shape.

    Notes:
        The test is two-sided, so a small p-value only says that the models
        differ. The winner is therefore taken from the signed-rank sums rather
        than assumed to be ``name1``.
    """
    scores1 = np.asarray(model1_scores, dtype=float)
    scores2 = np.asarray(model2_scores, dtype=float)
    if scores1.shape != scores2.shape:
        raise ValueError(
            f"Paired test needs equally sized samples, got shapes "
            f"{scores1.shape} and {scores2.shape}"
        )

    if np.any(scores1 - scores2):
        stat, p = wilcoxon(scores1, scores2)
    else:
        # Every paired difference is zero (or there is no data): the signed-rank
        # statistic is undefined, so report NaN instead of calling SciPy.
        stat, p = float("nan"), float("nan")

    print(f"\n[{dataset}] {name1} vs {name2}")
    print(f"Wilcoxon statistic = {stat:.4f}, p-value = {p:.4f}")
    if p < 0.05:
        # Per the SciPy docs the one-sided statistic is the rank sum of the
        # positive differences, while the two-sided one is the smaller of the
        # positive/negative rank sums. If the positive sum exceeds that minimum,
        # model 1 holds the larger rank sum and is the better model.
        positive_rank_sum = wilcoxon(scores1, scores2, alternative="greater").statistic
        if positive_rank_sum > stat:
            winner, loser = name1, name2
        else:
            winner, loser = name2, name1
        print(f"→ {winner} significantly outperforms {loser} (p < 0.05)")
    else:
        # NaN compares False above, so undefined tests also land here.
        print(f"→ No significant difference between {name1} and {name2} (p ≥ 0.05)")

# === Run Comparisons ===
# Each section is (banner line, dataset tag, scores keyed by model). Within a
# section, every reference model is tested against every comparison model, in
# the order Llama→Alpaca, Llama→Vicuna, Mistral→Alpaca, Mistral→Vicuna.
reference_models = (("llama3", "LLama 3-8B"), ("mistral", "Mistral"))
comparison_models = (("alpaca", "Alpaca-LoRA-13B"), ("vicuna", "Vicuna"))

evaluation_sections = (
    (
        "=== Wilcoxon Signed-Rank Test on Improvised Evaluation (Wikidata) ===",
        "Wikidata",
        {
            "llama3": llama3_f1_wikidata,
            "mistral": mistral_f1_wikidata,
            "alpaca": alpaca_f1_wikidata,
            "vicuna": vicuna_f1_wikidata,
        },
    ),
    (
        "\n=== Wilcoxon Signed-Rank Test on Baseline Evaluation (avg_f1 across domains) ===",
        "Baseline",
        {
            "llama3": llama3_f1_baseline,
            "mistral": mistral_f1_baseline,
            "alpaca": alpaca_f1_baseline,
            "vicuna": vicuna_f1_baseline,
        },
    ),
)

for section_banner, dataset_tag, section_scores in evaluation_sections:
    print(section_banner)
    for reference_key, reference_label in reference_models:
        for comparison_key, comparison_label in comparison_models:
            run_wilcoxon(
                section_scores[reference_key],
                section_scores[comparison_key],
                reference_label,
                comparison_label,
                dataset_tag,
            )


=== Wilcoxon Signed-Rank Test on Improvised Evaluation (Wikidata) ===

[Wikidata] LLama 3-8B vs Alpaca-LoRA-13B
Wilcoxon statistic = 0.0000, p-value = 0.0020
→ LLama 3-8B significantly outperforms Alpaca-LoRA-13B (p < 0.05)

[Wikidata] LLama 3-8B vs Vicuna
Wilcoxon statistic = 0.0000, p-value = 0.0020
→ LLama 3-8B significantly outperforms Vicuna (p < 0.05)

[Wikidata] Mistral vs Alpaca-LoRA-13B
Wilcoxon statistic = 1.0000, p-value = 0.0039
→ Mistral significantly outperforms Alpaca-LoRA-13B (p < 0.05)

[Wikidata] Mistral vs Vicuna
Wilcoxon statistic = 6.0000, p-value = 0.0254
→ Mistral significantly outperforms Vicuna (p < 0.05)

=== Wilcoxon Signed-Rank Test on Baseline Evaluation (avg_f1 across domains) ===

[Baseline] LLama 3-8B vs Alpaca-LoRA-13B
Wilcoxon statistic = 0.0000, p-value = 0.0020
→ LLama 3-8B significantly outperforms Alpaca-LoRA-13B (p < 0.05)

[Baseline] LLama 3-8B vs Vicuna
Wilcoxon statistic = 0.0000, p-value = 0.0039
→ LLama 3-8B significantly outperforms Vicuna (