In [1]:
import numpy as np
import pandas as pd
from scipy import stats
import pingouin as pg

In [2]:
# Read the three per-metric result tables in one pass; each path is relative
# to the notebook's working directory.
bert_df, llm_df, sas_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'result/bert_bleurt_result.csv',
        'result/LLM_judge_result.csv',
        'result/sas_result.csv',
    )
)

In [7]:
# Left-join the LLM-judge and SAS tables onto the BERT/BLEURT table by the
# 'No' key. Overlapping column names coming from the right-hand tables are
# tagged with a '_dup*' suffix so they can be discarded below.
merged_df = bert_df.merge(llm_df, on='No', how='left', suffixes=('', '_dup1'))
merged_df = merged_df.merge(sas_df, on='No', how='left', suffixes=('', '_dup2'))

# Drop every column whose name contains one of these markers.
for marker in ('_dup', '_reasoning', '_category'):
    merged_df = merged_df.loc[:, ~merged_df.columns.str.contains(marker)]

merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45 entries, 0 to 44
Data columns (total 42 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Unnamed: 0                       45 non-null     int64  
 1   No                               45 non-null     float64
 2   Question                         45 non-null     object 
 3   Claude_P                         45 non-null     float64
 4   Claude_R                         45 non-null     float64
 5   Claude_F1                        45 non-null     float64
 6   Claude_BLEURT                    45 non-null     float64
 7   Qwen_P                           45 non-null     float64
 8   Qwen_R                           45 non-null     float64
 9   Qwen_F1                          45 non-null     float64
 10  Qwen_BLEURT                      45 non-null     float64
 11  GPT_P                            45 non-null     float64
 12  GPT_R                   

In [None]:
def shapiro_wilk(data):
    """Run the Shapiro-Wilk normality test on `data` and return its p-value.

    The test statistic is computed by `scipy.stats.shapiro` but discarded;
    only the second element of the result (the p-value) is returned.
    """
    result = stats.shapiro(data)
    return result[1]

# Deepseek "_P" scores with and without RAG, one value per row of merged_df.
data_deepseek_rag = merged_df['Deepseek RAG_P']
data_deepseek_non_rag = merged_df['Deepseek non RAG_P']

# The paired t-test and ANOVA cells refer to these series by the shorter
# names, so bind both spellings to the same data.
data_rag = data_deepseek_rag
data_non_rag = data_deepseek_non_rag

# Print every p-value explicitly: a bare expression in the middle of a cell
# is evaluated but never displayed.
print(f"Shapiro-Wilk p-value (Deepseek RAG)     = {shapiro_wilk(data_rag):.4f}")
print(f"Shapiro-Wilk p-value (Deepseek non-RAG) = {shapiro_wilk(data_non_rag):.4f}")

# A paired t-test assumes the per-item differences are normally distributed,
# so check those as well as the two marginal samples.
print(f"Shapiro-Wilk p-value (RAG - non-RAG)    = {shapiro_wilk(data_rag - data_non_rag):.4f}")

In [None]:
# Paired (dependent-samples) t-test of RAG against non-RAG scores.
# The statistic's sign follows data_rag - data_non_rag.
t_stat, p_two_tailed = stats.ttest_rel(data_rag, data_non_rag)

# Convert to a one-sided p-value for the alternative "RAG > non-RAG":
# half the two-sided value when the effect points the expected way,
# otherwise its complement.
half_p = p_two_tailed / 2
p_one_tailed = half_p if t_stat > 0 else 1 - half_p

print(f"t-statistic = {t_stat:.4f}")
print(f"One-tailed p-value = {p_one_tailed:.4f}")
print(f"Two-tailed p-value = {p_two_tailed:.4f}")

In [None]:
# GPT and Qwen "_P" scores are columns of merged_df. Reading them from the
# same frame as the Deepseek series keeps all samples row-aligned for the
# repeated-measures ANOVA.
data_gpt = merged_df['GPT_P']
data_qwen = merged_df['Qwen_P']

# Print both p-values explicitly: only a cell's last bare expression is
# displayed, so the first result would otherwise be lost.
print(f"Shapiro-Wilk p-value (GPT)  = {shapiro_wilk(data_gpt):.4f}")
print(f"Shapiro-Wilk p-value (Qwen) = {shapiro_wilk(data_qwen):.4f}")

In [None]:
# Wide table: one column per model, one row per test item.
scores_wide = pd.DataFrame({
    'GPT': data_gpt,
    'Deepseek': data_rag,
    'Qwen': data_qwen
})

# Reshape to long format. melt() stacks the three model columns one after
# another, so the item ids 0..n-1 are repeated once per model to label which
# test item each score belongs to.
anova_df = (
    scores_wide
    .melt(var_name='model', value_name='score')
    .assign(subject=np.tile(np.arange(len(data_gpt)), 3))
)

# One-way repeated-measures ANOVA with model as the within-subject factor.
anova = pg.rm_anova(
    dv='score',
    within='model',
    subject='subject',
    data=anova_df,
    detailed=True,
)
print(anova)