In [1]:
import numpy as np
import pandas as pd
from scipy import stats
from scipy.stats import wilcoxon

In [2]:
# Load the evaluation table from a CSV file given by a relative path.
df = pd.read_csv('final_with_indobert_jw.csv')
# Print a summary of the columns (non-null counts and dtypes) as a quick sanity check.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 31 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   name                        100 non-null    int64  
 1   baseline_wer                100 non-null    float64
 2   baseline_cer                100 non-null    float64
 3   baseline_pmr                100 non-null    float64
 4   baseline_bleu               100 non-null    float64
 5   baseline_cosine             100 non-null    float64
 6   baseline_indobert           100 non-null    object 
 7   final_symspell_wer          100 non-null    float64
 8   final_symspell_cer          100 non-null    float64
 9   final_symspell_pmr          100 non-null    float64
 10  final_symspell_bleu         100 non-null    float64
 11  final_symspell_cosine       100 non-null    float64
 12  final_symspell_indobert     100 non-null    object 
 13  final_llm_wer               100 non-

In [3]:
def shapiro_wilk(data):
    """Return only the p-value of the Shapiro-Wilk test run on ``data``.

    ``stats.shapiro`` yields a ``(statistic, p-value)`` pair; the statistic is
    discarded here.
    """
    return stats.shapiro(data)[1]

In [4]:
# Metrics for which a smaller value is the better one.
lower_better = [
    "wer",
    "cer",
]
# Metrics for which a larger value is the better one.
higher_better = [
    "pmr",
    "bleu",
    "cosine",
    "jw",
    "indobert_P",
    "indobert_R",
    "indobert_F1",
]
# Every metric to evaluate: lower-is-better first, then higher-is-better.
metrics = [*lower_better, *higher_better]

In [5]:
def evaluate(df, metric, method):
    """Test whether ``method`` changed ``metric`` relative to the baseline.

    Reads the paired columns ``baseline_{metric}`` and
    ``final_{method}_{metric}`` from ``df`` and orients their difference so
    that a positive value always means the refined output is better (the
    module-level ``lower_better`` list decides the direction). A Shapiro-Wilk
    check on those differences then selects the test: a paired t-test when
    its p-value is at least 0.05, otherwise the Wilcoxon signed-rank test.
    Both tests are called with SciPy's default arguments.

    Returns:
        dict with the keys ``metric``, ``comparison``, ``test``,
        ``shapiro_p``, ``p_value``, ``mean_baseline``, ``mean_refined`` and
        ``mean_improvement`` (the mean of the oriented differences).
    """
    before = df[f"baseline_{metric}"].values
    after = df[f"final_{method}_{metric}"].values

    # Positive gain = improvement, whichever direction is "better" for this metric.
    gain = before - after if metric in lower_better else after - before

    normality_p = shapiro_wilk(gain)
    if normality_p >= 0.05:
        # Differences look normal enough for the parametric paired test.
        test_name = "Paired t-test"
        significance = stats.ttest_rel(after, before).pvalue
    else:
        # Otherwise fall back to the rank-based paired test on the differences.
        test_name = "Wilcoxon signed-rank"
        significance = wilcoxon(gain).pvalue

    summary = {}
    summary["metric"] = metric
    summary["comparison"] = f"baseline vs {method}"
    summary["test"] = test_name
    summary["shapiro_p"] = normality_p
    summary["p_value"] = significance
    summary["mean_baseline"] = np.mean(before)
    summary["mean_refined"] = np.mean(after)
    summary["mean_improvement"] = np.mean(gain)
    return summary

In [6]:
# Evaluate every metric against both refinement methods; for each metric the
# "symspell" row comes first, followed by the "llm" row.
results = []

for metric in metrics:
    results.extend(
        evaluate(df, metric, refiner) for refiner in ("symspell", "llm")
    )

# One row per (metric, method) comparison.
results_df = pd.DataFrame(results)
results_df

Unnamed: 0,metric,comparison,test,shapiro_p,p_value,mean_baseline,mean_refined,mean_improvement
0,wer,baseline vs symspell,Wilcoxon signed-rank,1.830843e-21,1.189345e-08,0.432745,0.172637,0.260108
1,wer,baseline vs llm,Wilcoxon signed-rank,1.805606e-21,6.946896e-11,0.432745,0.168569,0.264176
2,cer,baseline vs symspell,Wilcoxon signed-rank,2.971047e-21,7.169704e-09,0.319161,0.121424,0.197737
3,cer,baseline vs llm,Wilcoxon signed-rank,2.958096e-21,9.756623e-10,0.319161,0.120471,0.19869
4,pmr,baseline vs symspell,Wilcoxon signed-rank,2.205307e-12,1.591306e-06,0.172997,0.391989,0.218992
5,pmr,baseline vs llm,Wilcoxon signed-rank,4.471238e-11,5.129928e-07,0.172997,0.398404,0.225407
6,bleu,baseline vs symspell,Wilcoxon signed-rank,1.098987e-15,0.0004389483,0.796237,0.804493,0.008257
7,bleu,baseline vs llm,Wilcoxon signed-rank,5.554253e-16,2.271705e-07,0.796237,0.813179,0.016942
8,cosine,baseline vs symspell,Wilcoxon signed-rank,1.0651120000000001e-17,0.005249423,0.883098,0.863786,-0.019312
9,cosine,baseline vs llm,Wilcoxon signed-rank,6.374799e-18,1.088094e-07,0.883098,0.876732,-0.006367


In [14]:
# Show only the rows that compare the baseline with the SymSpell output.
results_df.loc[results_df["comparison"].eq("baseline vs symspell")]

Unnamed: 0,metric,comparison,test,shapiro_p,p_value,mean_baseline,mean_refined,mean_improvement,p_corrected,significant
0,wer,baseline vs symspell,Wilcoxon signed-rank,1.830843e-21,1.189345e-08,0.432745,0.172637,0.260108,1.665083e-07,True
2,cer,baseline vs symspell,Wilcoxon signed-rank,2.971047e-21,7.169704e-09,0.319161,0.121424,0.197737,1.075456e-07,True
4,pmr,baseline vs symspell,Wilcoxon signed-rank,2.205307e-12,1.591306e-06,0.172997,0.391989,0.218992,9.547836e-06,True
6,bleu,baseline vs symspell,Wilcoxon signed-rank,1.098987e-15,0.0004389483,0.796237,0.804493,0.008257,0.001316845,True
8,cosine,baseline vs symspell,Wilcoxon signed-rank,1.0651120000000001e-17,0.005249423,0.883098,0.863786,-0.019312,0.005249423,True
10,jw,baseline vs symspell,Wilcoxon signed-rank,1.181378e-09,8.345072e-07,0.846417,0.879365,0.032948,5.84155e-06,True
12,indobert_P,baseline vs symspell,Wilcoxon signed-rank,2.554037e-14,7.958557e-08,0.918677,0.935379,0.016702,9.550269e-07,True
14,indobert_R,baseline vs symspell,Wilcoxon signed-rank,1.570122e-16,0.001806552,0.937881,0.931343,-0.006538,0.003613105,True
16,indobert_F1,baseline vs symspell,Wilcoxon signed-rank,9.618179e-16,6.827915e-06,0.927779,0.932791,0.005012,2.731166e-05,True
