In [1]:
import numpy as np
import pandas as pd
from scipy import stats
from scipy.stats import wilcoxon, friedmanchisquare
import pingouin as pg

In [2]:
bert_df = pd.read_csv('result/bert_bleurt_result.csv')
llm_df = pd.read_csv('result/LLM_judge_result.csv')
sas_df = pd.read_csv('result/sas_result.csv')

In [3]:
merged_df = (
    bert_df
    .merge(llm_df, on='No', how='left', suffixes=('', '_dup1'))
    .merge(sas_df, on='No', how='left', suffixes=('', '_dup2'))
)
merged_df = merged_df.loc[:, ~merged_df.columns.str.contains('_dup')]
merged_df = merged_df.loc[:, ~merged_df.columns.str.contains('_reasoning')]
merged_df = merged_df.loc[:, ~merged_df.columns.str.contains('_category')]
merged_df = merged_df.loc[:, ~merged_df.columns.str.contains('Claude')]
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45 entries, 0 to 44
Data columns (total 36 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Unnamed: 0                       45 non-null     int64  
 1   No                               45 non-null     float64
 2   Question                         45 non-null     object 
 3   Qwen_P                           45 non-null     float64
 4   Qwen_R                           45 non-null     float64
 5   Qwen_F1                          45 non-null     float64
 6   Qwen_BLEURT                      45 non-null     float64
 7   GPT_P                            45 non-null     float64
 8   GPT_R                            45 non-null     float64
 9   GPT_F1                           45 non-null     float64
 10  GPT_BLEURT                       45 non-null     float64
 11  Deepseek RAG_P                   45 non-null     float64
 12  Deepseek RAG_R          

In [4]:
merged_df = merged_df.rename(
    columns=lambda c: (
        f"{c.split('_')[2]}{c.split('_')[3]}_LLM_JUDGE" if c.startswith("LLM_JUDGE_") else c
    )
)
merged_df = merged_df.rename(
    columns=lambda c: (
        f"{c.split('_')[1]}{c.split('_')[2]}_SAS" if c.startswith("SAS") else c
    )
)

merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45 entries, 0 to 44
Data columns (total 36 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Unnamed: 0                    45 non-null     int64  
 1   No                            45 non-null     float64
 2   Question                      45 non-null     object 
 3   Qwen_P                        45 non-null     float64
 4   Qwen_R                        45 non-null     float64
 5   Qwen_F1                       45 non-null     float64
 6   Qwen_BLEURT                   45 non-null     float64
 7   GPT_P                         45 non-null     float64
 8   GPT_R                         45 non-null     float64
 9   GPT_F1                        45 non-null     float64
 10  GPT_BLEURT                    45 non-null     float64
 11  Deepseek RAG_P                45 non-null     float64
 12  Deepseek RAG_R                45 non-null     float64
 13  Deepsee

In [5]:
def shapiro_wilk(data):
    stat, p = stats.shapiro(data)
    # print(f"Shapiro-Wilk Statistic = {stat:.4f}")
    # print(f"p-value = {p:.4f}")
    return p

In [6]:
def ttest_onetail(data_rag, data_non_rag):
    t_stat, p_two_tailed = stats.ttest_rel(data_rag, data_non_rag)
    if t_stat > 0:
        p_one_tailed = p_two_tailed / 2
    else:
        p_one_tailed = 1 - (p_two_tailed / 2)

    # print(f"t-statistic = {t_stat:.4f}")
    # print(f"One-tailed p-value = {p_one_tailed:.4f}")
    # print(f"Two-tailed p-value = {p_two_tailed:.4f}")
    return p_one_tailed

In [7]:
def anova(data_gpt, data_deepseek, data_qwen):
    anova_df = pd.DataFrame({
        'GPT': data_gpt,
        'Deepseek': data_deepseek,
        'Qwen': data_qwen
    }).melt(var_name='model', value_name='score')

    anova_df['subject'] = np.tile(np.arange(len(data_gpt)), 3)
    anova = pg.rm_anova(dv='score', within='model', subject='subject', data=anova_df, detailed=True)
    return anova

In [8]:
def process1(str):
    filtered_df = merged_df.loc[:, merged_df.columns.str.endswith(str)]
    filtered_df.head()
    cols = {name: filtered_df[name] for name in filtered_df.columns}

    for c in cols.keys():
        if c.__contains__('Deepseek'):
            if c.__contains__('non') and c.__contains__('RAG'): nonrag = cols[c] 
            else: rag = cols[c]
    
    sw_rag, sw_nonrag = shapiro_wilk(rag), shapiro_wilk(nonrag)
    print(f'Shapiro Wilk RAG ({str}): {sw_rag:.2e}')
    print(f'Shapiro Wilk nonRAG ({str}): {sw_nonrag:.2e}')

    if sw_rag > 0.05 and sw_nonrag > 0.05:
        print(f't-test ({str}): {ttest_onetail(rag, nonrag):.2e}\n')
    else: 
        stat, p = wilcoxon(rag, nonrag)
        print(f'Wilcoxon ({str}): {p:.2e}\n')


In [9]:
def process2(str):
    filtered_df = merged_df.loc[:, merged_df.columns.str.endswith(str)]
    cols = {name: filtered_df[name] for name in filtered_df.columns}

    for c in cols.keys():
        if c.__contains__('Deepseek') and not c.__contains__('non'): deepseek = cols[c]
        elif c.__contains__('GPT'): gpt = cols[c]
        elif c.__contains__('Qwen'): qwen = cols[c]
    
    sw_deep, sw_gpt, sw_qwen = shapiro_wilk(deepseek), shapiro_wilk(gpt), shapiro_wilk(qwen)
    print(f'Shapiro Wilk GPT ({str}): {sw_gpt:.2e}')
    print(f'Shapiro Wilk Deepseek ({str}): {sw_deep:.2e}')
    print(f'Shapiro Wilk Qwen ({str}): {sw_qwen:.2e}')

    if sw_deep > 0.05 and sw_gpt > 0.05 and sw_qwen > 0.05:
        print('Result ANOVA of', str)
        print(anova(gpt, deepseek, qwen))
        print()
    else:
        stat, p = friedmanchisquare(gpt, deepseek, qwen)
        print(f"Friedman test p-value for {str} = {p:.2e}\n")

In [10]:
process1('_P')
process1('_R')
process1('_F1')
process1('_LLM_JUDGE')
process1('_SAS')

Shapiro Wilk RAG (_P): 3.77e-01
Shapiro Wilk nonRAG (_P): 2.09e-01
t-test (_P): 4.37e-03

Shapiro Wilk RAG (_R): 8.70e-01
Shapiro Wilk nonRAG (_R): 5.35e-01
t-test (_R): 3.81e-05

Shapiro Wilk RAG (_F1): 3.89e-01
Shapiro Wilk nonRAG (_F1): 5.42e-02
t-test (_F1): 1.34e-04

Shapiro Wilk RAG (_LLM_JUDGE): 4.17e-09
Shapiro Wilk nonRAG (_LLM_JUDGE): 1.20e-07
Wilcoxon (_LLM_JUDGE): 1.15e-01

Shapiro Wilk RAG (_SAS): 8.14e-03
Shapiro Wilk nonRAG (_SAS): 2.23e-01
Wilcoxon (_SAS): 2.84e-06



In [11]:
process2('_P')
process2('_R')
process2('_F1')
process2('_LLM_JUDGE')
process2('_SAS')

Shapiro Wilk GPT (_P): 6.18e-01
Shapiro Wilk Deepseek (_P): 3.77e-01
Shapiro Wilk Qwen (_P): 1.52e-01
Result ANOVA of _P
  Source        SS  DF        MS         F    p-unc       ng2       eps
0  model  0.001836   2  0.000918  1.312396  0.27439  0.009024  0.932064
1  Error  0.061553  88  0.000699       NaN      NaN       NaN       NaN

Shapiro Wilk GPT (_R): 7.56e-02
Shapiro Wilk Deepseek (_R): 8.70e-01
Shapiro Wilk Qwen (_R): 8.38e-02
Result ANOVA of _R
  Source        SS  DF        MS         F     p-unc       ng2       eps
0  model  0.018330   2  0.009165  5.292827  0.006752  0.049374  0.931031
1  Error  0.152382  88  0.001732       NaN       NaN       NaN       NaN

Shapiro Wilk GPT (_F1): 5.50e-02
Shapiro Wilk Deepseek (_F1): 3.89e-01
Shapiro Wilk Qwen (_F1): 5.15e-01
Result ANOVA of _F1
  Source        SS  DF        MS         F     p-unc       ng2       eps
0  model  0.005568   2  0.002784  2.985077  0.055678  0.025831  0.921532
1  Error  0.082067  88  0.000933       NaN       N

  W = np.prod(eig) / (eig.sum() / d) ** d


In [13]:
merged_df.describe().to_csv('desc_stat.csv')

In [17]:
def process_shapiro(str):
    filtered_df = merged_df.loc[:, merged_df.columns.str.endswith(str)]
    cols = {name: filtered_df[name] for name in filtered_df.columns}

    for c in cols.keys():
        if c.__contains__('Deepseek') and not c.__contains__('non'): deepseek = cols[c]
        if c.__contains__('Deepseek') and c.__contains__('non'): deepseek_non = cols[c]
        elif c.__contains__('GPT'): gpt = cols[c]
        elif c.__contains__('Qwen'): qwen = cols[c]
    
    sw_deep, sw_gpt, sw_qwen = shapiro_wilk(deepseek), shapiro_wilk(gpt), shapiro_wilk(qwen)
    sw_non = shapiro_wilk(deepseek_non)

    print(f'Shapiro Wilk GPT ({str}): {sw_gpt:.2e}')
    print(f'Shapiro Wilk Deepseek ({str}): {sw_deep:.2e}')
    print(f'Shapiro Wilk Deepseek non RAG({str}): {sw_non:.2e}')
    print(f'Shapiro Wilk Qwen ({str}): {sw_qwen:.2e}')
    print()

In [18]:
process_shapiro('_P')
process_shapiro('_R')
process_shapiro('_F1')
process_shapiro('_LLM_JUDGE')
process_shapiro('_SAS')

Shapiro Wilk GPT (_P): 6.18e-01
Shapiro Wilk Deepseek (_P): 3.77e-01
Shapiro Wilk Deepseek non RAG(_P): 2.09e-01
Shapiro Wilk Qwen (_P): 1.52e-01

Shapiro Wilk GPT (_R): 7.56e-02
Shapiro Wilk Deepseek (_R): 8.70e-01
Shapiro Wilk Deepseek non RAG(_R): 5.35e-01
Shapiro Wilk Qwen (_R): 8.38e-02

Shapiro Wilk GPT (_F1): 5.50e-02
Shapiro Wilk Deepseek (_F1): 3.89e-01
Shapiro Wilk Deepseek non RAG(_F1): 5.42e-02
Shapiro Wilk Qwen (_F1): 5.15e-01

Shapiro Wilk GPT (_LLM_JUDGE): 3.16e-08
Shapiro Wilk Deepseek (_LLM_JUDGE): 4.17e-09
Shapiro Wilk Deepseek non RAG(_LLM_JUDGE): 1.20e-07
Shapiro Wilk Qwen (_LLM_JUDGE): 8.13e-07

Shapiro Wilk GPT (_SAS): 1.51e-04
Shapiro Wilk Deepseek (_SAS): 8.14e-03
Shapiro Wilk Deepseek non RAG(_SAS): 2.23e-01
Shapiro Wilk Qwen (_SAS): 2.76e-02

