In [1]:
import glob
import os
import pandas as pd
from scipy.stats import binomtest

# --- Configuration ----------------------------------------------------------
# Everything a reader might want to tune lives here instead of being buried
# inside the aggregation loop.
TEMPERATURES = ['tmp0', 'tmp1']
MODELS = ['qwen', 'dsr1', '4omini']
DIMENSIONS = ['consistency', 'coherence', 'fluency', 'relevance']
SEEDS = range(31)
TARGET_COVERAGE = 0.9   # threshold a run must reach AND null proportion of the binomial test
ROUND_DECIMALS = 2      # precision applied to targets and interval bounds before comparing


def interval_metrics(df, decimals=ROUND_DECIMALS):
    """Compute empirical coverage and mean interval width for one result table.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``y_test`` plus the interval bounds, named either
        ``low``/``up`` or ``y_qlow``/``y_qup``. The frame is not modified.
    decimals : int
        Number of decimals all three columns are rounded to before the
        comparison.

    Returns
    -------
    (coverage, width)
        ``coverage`` is the fraction of rows with
        ``y_qlow <= y_test <= y_qup`` (both bounds inclusive); ``width`` is
        the mean of ``y_qup - y_qlow``.
    """
    # ``rename`` returns a new frame, so the caller's data stays untouched.
    renamed = df.rename(columns={'low': 'y_qlow', 'up': 'y_qup'})
    y_test = renamed['y_test'].round(decimals)
    y_qlow = renamed['y_qlow'].round(decimals)
    y_qup = renamed['y_qup'].round(decimals)

    coverage = ((y_test >= y_qlow) & (y_test <= y_qup)).mean()
    width = (y_qup - y_qlow).mean()
    return coverage, width


def collect_run_metrics(model, tmp, dimension, seeds=SEEDS, base_dir='.'):
    """Gather per-seed coverage and width for one model/temperature/dimension.

    Files are looked up at
    ``<base_dir>/<model>_<tmp>/R2CCP_Summeval_<dimension>_<seed>.csv``.
    Seeds whose file does not exist are skipped, so the returned lists may be
    shorter than ``seeds``.

    Returns
    -------
    (coverages, widths)
        Two equally long lists, one entry per seed file that was found.
    """
    coverages = []
    widths = []
    for seed in seeds:
        csv_path = os.path.join(
            base_dir, f'{model}_{tmp}', f'R2CCP_Summeval_{dimension}_{seed}.csv'
        )
        if not os.path.exists(csv_path):
            continue
        coverage, width = interval_metrics(pd.read_csv(csv_path))
        coverages.append(coverage)
        widths.append(width)
    return coverages, widths


def summarize_runs(coverages, widths, target=TARGET_COVERAGE):
    """Aggregate per-seed metrics into one summary record.

    Parameters
    ----------
    coverages, widths : list of float
        Per-seed values as returned by ``collect_run_metrics``; must be
        non-empty.
    target : float
        Coverage a run has to reach to count as a success; also used as the
        null proportion of the binomial test.

    Returns
    -------
    dict
        Mean and sample standard deviation (pandas default, ``ddof=1``; NaN
        for a single run) of width and coverage, plus ``significant_test``:
        the two-sided binomial-test p-value for the number of runs with
        ``coverage >= target`` out of all runs against proportion ``target``.
    """
    n_runs = len(coverages)
    # Cast to a plain int: summing numpy booleans yields a numpy integer.
    n_hits = int(sum(c >= target for c in coverages))
    # NOTE(review): this tests whether the *share of seeds* reaching the target
    # equals the target itself. Confirm that this is the intended null
    # hypothesis rather than, e.g., testing the pooled coverage against it.
    p_value = binomtest(n_hits, n_runs, target, alternative='two-sided').pvalue
    return {
        'interval_width_mean': sum(widths) / len(widths),
        'interval_width_std': pd.Series(widths).std(),
        'coverage_rate_mean': sum(coverages) / n_runs,
        'coverage_rate_std': pd.Series(coverages).std(),
        'significant_test': p_value,
    }


# --- Aggregation ------------------------------------------------------------
# One summary row per (temperature, model, dimension) that has at least one
# seed file on disk.
results = []

for tmp in TEMPERATURES:
    for model in MODELS:
        for dimension in DIMENSIONS:
            coverages, widths = collect_run_metrics(model, tmp, dimension)
            if not coverages:
                # No result files for this configuration: emit no row.
                continue
            results.append({
                'tmp': tmp,
                'model': model,
                'dimension': dimension,
                **summarize_runs(coverages, widths),
            })

results_df = pd.DataFrame(results)
print(results_df)

     tmp   model    dimension  interval_width_mean  interval_width_std  \
0   tmp0    qwen  consistency             0.612234            0.134094   
1   tmp0    qwen    coherence             2.436686            0.142597   
2   tmp0    qwen      fluency             0.952665            0.121825   
3   tmp0    qwen    relevance             1.978909            0.123691   
4   tmp0    dsr1  consistency             0.694085            0.134258   
5   tmp0    dsr1    coherence             2.304218            0.117156   
6   tmp0    dsr1      fluency             0.892561            0.151155   
7   tmp0    dsr1    relevance             1.998439            0.148224   
8   tmp0  4omini  consistency             0.661817            0.172211   
9   tmp0  4omini    coherence             2.617296            0.158834   
10  tmp0  4omini      fluency             0.978438            0.158437   
11  tmp0  4omini    relevance             2.002413            0.109331   
12  tmp1    qwen  consistency         

In [2]:
# Show the summary DataFrame built in the previous cell as rich notebook output.
results_df

Unnamed: 0,tmp,model,dimension,interval_width_mean,interval_width_std,coverage_rate_mean,coverage_rate_std,significant_test
0,tmp0,qwen,consistency,0.612234,0.134094,0.907284,0.020217,0.000332596
1,tmp0,qwen,coherence,2.436686,0.142597,0.895431,0.024837,1.581656e-10
2,tmp0,qwen,fluency,0.952665,0.121825,0.90175,0.019207,3.559479e-08
3,tmp0,qwen,relevance,1.978909,0.123691,0.904542,0.024919,8.907787e-05
4,tmp0,dsr1,consistency,0.694085,0.134258,0.90444,0.020931,6.20203e-05
5,tmp0,dsr1,coherence,2.304218,0.117156,0.901164,0.021281,1.962613e-08
6,tmp0,dsr1,fluency,0.892561,0.151155,0.900875,0.020772,0.0004543551
7,tmp0,dsr1,relevance,1.998439,0.148224,0.898417,0.028982,3.559479e-08
8,tmp0,4omini,consistency,0.661817,0.172211,0.90575,0.024746,8.907787e-05
9,tmp0,4omini,coherence,2.617296,0.158834,0.892813,0.030345,7.224555e-11
