In [12]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from statsmodels.stats.diagnostic import het_breuschpagan, het_white

def heteroskedasticity_hypothesis_testing(X,y):
        """Run the Breusch-Pagan and White heteroskedasticity tests on an OLS fit.

        An ordinary least squares model of ``y`` on ``X`` (passed through
        ``sm.add_constant`` first) is fitted, and both tests are then applied
        to that fit's residuals and design matrix.

        Parameters
        ----------
        X : regressors, in any form accepted by ``sm.add_constant`` / ``sm.OLS``.
        y : response values aligned with the rows of ``X``.

        Returns
        -------
        pandas.DataFrame
            Two rows indexed ``'Breusch-Pagan'`` and ``'White'``; the columns
            are ``'LM Statistic'``, ``'LM p-value'``, ``'F Statistic'`` and
            ``'F p-value'``.
        """
        # Both tests return their four values in this same order.
        stat_names = ['LM Statistic', 'LM p-value', 'F Statistic', 'F p-value']

        fitted = sm.OLS(y, sm.add_constant(X)).fit()
        residuals, design = fitted.resid, fitted.model.exog

        # Dict literals evaluate in order, so Breusch-Pagan runs before White.
        outcomes = {
                'Breusch-Pagan': het_breuschpagan(residuals, design),
                'White': het_white(residuals, design),
        }

        records = [dict(zip(stat_names, values)) for values in outcomes.values()]
        return pd.DataFrame(records, index=list(outcomes))


In [16]:
import os
import pandas as pd

# Evaluation dimensions available for each dataset. A dict (insertion-ordered)
# is used instead of a set so the reports are printed in the same order on
# every run; iteration order over a set of strings varies between interpreter
# sessions because of string hash randomization.
DIMENSIONS_BY_DATASET = {
        'Summeval': ['consistency', 'coherence', 'fluency', 'relevance'],
        'Dialsumm': ['consistency', 'coherence', 'fluency', 'relevance'],
        'SocREval': ['cosmos', 'drop', 'esnli', 'gsm8k'],
        'GEval': ['cosmos', 'drop', 'esnli', 'gsm8k'],
}

# Models whose logits are stored under ./model_logits/<model>/ (ordered tuple,
# again for a reproducible report order).
MODEL_NAMES = ('4omini', 'dsr1', 'qwen')

for dataset, dimensions in DIMENSIONS_BY_DATASET.items():
        for model in MODEL_NAMES:
                folder_path = f'./model_logits/{model}/'
                for dimension in dimensions:
                        file_path = os.path.join(folder_path, f"{dataset}_{dimension}_logits.csv")
                        df = pd.read_csv(file_path)
                        # Every column but the last is used as a regressor; the last
                        # column is used as the response.
                        X = df.iloc[:, :-1]
                        y = df.iloc[:, -1]
                        ht_result = heteroskedasticity_hypothesis_testing(X,y)
                        print(f"The hypothesis testing results of {dataset}_{dimension} from {model}: ")
                        print(ht_result)
                



The hypothesis testing results of GEval_cosmos from qwen: 
               LM Statistic  LM p-value  F Statistic  F p-value
Breusch-Pagan      7.883014    0.162802     1.592469   0.164083
White             25.904197    0.168999     1.332774   0.163982
The hypothesis testing results of GEval_drop from qwen: 
               LM Statistic  LM p-value  F Statistic  F p-value
Breusch-Pagan     22.042249    0.000514     4.784712   0.000371
White             31.325569    0.051036     1.656793   0.043809
The hypothesis testing results of GEval_esnli from qwen: 
               LM Statistic  LM p-value  F Statistic  F p-value
Breusch-Pagan     22.553662    0.000411     5.092058   0.000248
White             49.770427    0.000239     3.195783   0.000036
The hypothesis testing results of GEval_gsm8k from qwen: 
               LM Statistic  LM p-value  F Statistic  F p-value
Breusch-Pagan     27.782255    0.000040     6.259236   0.000021
White             56.738744    0.000022     3.544655   0.000003
