In [1]:
import os
import pandas as pd
import numpy as np

# Root directory; later cells join "interval_results/<model>" onto it.
base_dir = os.getcwd()  
# Model identifiers; they double as sub-folder names of the result folders.
models = ['4omini', 'dsr1', 'qwen'] 
# Dataset names that appear in the result file names.
datasets = ['Summeval', 'Dialsumm', 'GEval', 'SocREval']

In [4]:
import os
import pandas as pd
import numpy as np
from scipy import stats

# Folder with one sub-folder per model containing `<dataset>_<dimension>_logits.csv` files.
logits_folder = 'model_logits'
# Folder with one sub-folder per model containing `<dataset>_<dimension>_raw.csv` files.
raw_folder = 'raw_scores'

def softmax(x):
    """Apply a numerically stable softmax along axis 1 of a 2-D array.

    Each row is shifted by its own maximum before exponentiation so that
    large logits do not overflow; the shift does not change the result.

    Args:
        x: 2-D array of logits, one row per sample.

    Returns:
        Array of the same shape whose rows each sum to 1.
    """
    row_max = np.max(x, axis=1, keepdims=True)
    exponentials = np.exp(x - row_max)
    row_totals = exponentials.sum(axis=1, keepdims=True)
    return exponentials / row_totals

def calculate_performance(y_pred, y_test):
    """Compute error and correlation metrics between predictions and labels.

    Both inputs are coerced to 1-D float arrays first, so lists, pandas
    Series and integer/object-dtype columns are all accepted. Pairs in which
    either value is NaN are dropped before any metric is computed.

    Args:
        y_pred: Array-like of predicted scores.
        y_test: Array-like of reference scores, same length as ``y_pred``.

    Returns:
        Tuple ``(mse, rmse, mae, rho, tau, pcc)`` where ``rho`` is Spearman's
        rank correlation, ``tau`` is Kendall's tau and ``pcc`` is Pearson's
        correlation coefficient.
    """
    y_pred = np.asarray(y_pred, dtype=float)
    y_test = np.asarray(y_test, dtype=float)

    # Keep only the positions where both the prediction and the label are present.
    valid = ~(np.isnan(y_pred) | np.isnan(y_test))
    if not valid.all():
        y_pred = y_pred[valid]
        y_test = y_test[valid]

    errors = y_pred - y_test
    mse = np.mean(errors ** 2)
    rmse = np.sqrt(mse)
    mae = np.mean(np.abs(errors))

    rho = stats.spearmanr(y_pred, y_test)[0]
    tau = stats.kendalltau(y_pred, y_test)[0]
    pcc = stats.pearsonr(y_pred, y_test)[0]
    return mse, rmse, mae, rho, tau, pcc


### Overall judgments performance: raw scores and weighted sum

In [5]:
# Score every (model, dataset, dimension) on the full data set, once with the
# softmax-weighted expected score and once with the raw score.
columns = ['Model', 'Dataset', 'Dimension', 'Type', 'MSE', 'RMSE', 'MAE', 'Spearman', 'Kendall', 'Pearson']
score_values = np.array([1, 2, 3, 4, 5])  # score attached to each logit column

# Rows are collected in a list and turned into a DataFrame once; growing a
# DataFrame with pd.concat per row is quadratic and emits a FutureWarning.
records = []

for model in ['4omini', 'dsr1', 'qwen']:
    logits_model_path = os.path.join(logits_folder, model)
    raw_model_path = os.path.join(raw_folder, model)

    # os.listdir returns entries in arbitrary order; sort for a reproducible row order.
    for file_name in sorted(os.listdir(logits_model_path)):
        if not file_name.endswith('_logits.csv'):
            continue

        dataset, dimension = file_name.split('_')[:2]
        raw_file_path = os.path.join(raw_model_path, f"{dataset}_{dimension}_raw.csv")
        if not os.path.exists(raw_file_path):
            # No matching raw-score file: skip this logits file entirely.
            continue

        logits_df = pd.read_csv(os.path.join(logits_model_path, file_name))
        # All columns but the last are logits; the last column is the reference label.
        y = logits_df.iloc[:, -1].values
        weighted_sum = softmax(logits_df.iloc[:, :-1].values) @ score_values
        raw_scores = pd.read_csv(raw_file_path).iloc[:, 0].values

        for score_type, scores in (('Weighted Sum', weighted_sum), ('Raw', raw_scores)):
            mse, rmse, mae, rho, tau, pcc = calculate_performance(scores, y)
            records.append({
                'Model': model,
                'Dataset': dataset,
                'Dimension': dimension,
                'Type': score_type,
                'MSE': mse,
                'RMSE': rmse,
                'MAE': mae,
                'Spearman': rho,
                'Kendall': tau,
                'Pearson': pcc
            })

results_df = pd.DataFrame(records, columns=columns)

print(results_df)
results_df.to_csv('judgments_performance_overall.csv', index=False)

  results_df = pd.concat([


     Model   Dataset    Dimension          Type       MSE      RMSE       MAE  \
0   4omini  Dialsumm    coherence  Weighted Sum  3.701241  1.923861  1.699435   
1   4omini  Dialsumm    coherence           Raw  3.793651  1.947730  1.712381   
2   4omini  Dialsumm  consistency  Weighted Sum  0.828361  0.910144  0.705311   
3   4omini  Dialsumm  consistency           Raw  0.993721  0.996856  0.769909   
4   4omini  Dialsumm      fluency  Weighted Sum  1.681136  1.296586  1.063949   
..     ...       ...          ...           ...       ...       ...       ...   
91    qwen  Summeval  consistency           Raw  2.059444  1.435076  1.231667   
92    qwen  Summeval      fluency  Weighted Sum  4.214038  2.052812  1.921801   
93    qwen  Summeval      fluency           Raw  4.451389  2.109831  1.951667   
94    qwen  Summeval    relevance  Weighted Sum  1.097095  1.047423  0.887091   
95    qwen  Summeval    relevance           Raw  1.195903  1.093573  0.906458   

    Spearman   Kendall   Pe

### Continuous CP methods: before and after boundary adjustment

In [6]:
def boundary_adjustment(value, label_set, threshold=0.0):
    """Snap ``value`` to a label when it lies strictly within ``threshold`` of it.

    The effective threshold is capped at half of
    ``(label_set[-1] - label_set[0]) / (len(label_set) - 1)``. Labels are
    checked in order and the first one closer than the effective threshold is
    returned; if none qualifies, ``value`` is returned unchanged. With the
    default threshold of 0 no label can qualify.

    Args:
        value: Interval endpoint to adjust.
        label_set: Sequence of admissible labels (at least two entries).
        threshold: Maximum distance at which snapping is allowed.

    Returns:
        The matching label, or ``value`` itself.
    """
    half_step = (label_set[-1] - label_set[0]) / (len(label_set) - 1) / 2
    threshold = min(half_step, threshold)

    for label in label_set:
        if abs(label - value) < threshold:
            return label
    return value

# Identifier columns followed by the metric columns reported for each result row.
columns = ['Model', 'Dataset', 'Dimension', 'Type', 'MSE', 'RMSE', 'MAE', 'Spearman', 'Kendall', 'Pearson']

def midpoint_extraction(directory, method, dataset, dimension, label_set, model=None):
    """Score interval midpoints before and after boundary adjustment.

    For every seed in 1..30 the file
    ``<directory>/<method>/<method>_<dataset>_<dimension>_<seed>.csv`` is read
    if it exists (missing seeds are skipped). All values are rounded to two
    decimals, then the midpoint of the ``low``/``up`` columns is compared with
    ``y_test`` twice: once with the endpoints as they are (threshold 0, which
    never snaps) and once after snapping them to ``label_set`` with
    threshold 0.5.

    Args:
        directory: Folder that contains the ``<method>`` sub-folder.
        method: Method name; used in the path, the file name and the ``Type`` label.
        dataset: Dataset name used in the file name.
        dimension: Dimension name used in the file name.
        label_set: Admissible labels passed to ``boundary_adjustment``.
        model: Value written to the ``Model`` column. When omitted, the last
            path component of ``directory`` is used, so the function no longer
            depends on a global ``model`` variable.

    Returns:
        ``(summary_before, summary_after)``: two DataFrames with the mean and
        standard deviation of each metric over the seeds, grouped by
        Model/Dataset/Dimension/Type.
    """
    if model is None:
        model = os.path.basename(os.path.normpath(directory))

    group_keys = ['Model', 'Dataset', 'Dimension', 'Type']
    metric_names = ['MSE', 'RMSE', 'MAE', 'Spearman', 'Kendall', 'Pearson']

    # (snapping threshold, Type prefix, per-seed rows) for the two variants.
    before_records, after_records = [], []
    variants = (
        (0, 'continuous', before_records),
        (0.5, 'discrete', after_records),
    )

    for seed in range(1, 31):
        file_name = f"{method}_{dataset}_{dimension}_{seed}.csv"
        file_path = os.path.join(directory, method, file_name)
        if not os.path.exists(file_path):
            continue

        data = pd.read_csv(file_path).round(2)
        y_test = data['y_test'].values

        for threshold, prefix, records in variants:
            low = data['low'].apply(boundary_adjustment, args=(label_set, threshold))
            up = data['up'].apply(boundary_adjustment, args=(label_set, threshold))
            metrics = calculate_performance((low + up) / 2, y_test)
            records.append({
                'Model': model,
                'Dataset': dataset,
                'Dimension': dimension,
                'Type': f"{prefix}_{method}",
                **dict(zip(metric_names, metrics)),
            })

    def _summarize(records):
        # Build the per-seed table once, then aggregate mean/std per metric.
        per_seed = pd.DataFrame(records, columns=group_keys + metric_names)
        aggregations = {name: ['mean', 'std'] for name in metric_names}
        return per_seed.groupby(group_keys).agg(aggregations).reset_index()

    return _summarize(before_records), _summarize(after_records)

base_dir = os.getcwd()
models = ['4omini', 'dsr1', 'qwen']
datasets = ['Summeval', 'Dialsumm', 'GEval', 'SocREval']

method = "R2CCP"

# Dimensions and admissible labels per dataset. An unknown dataset now fails
# with a KeyError instead of silently reusing the previous dataset's settings.
summarization_dimensions = ['consistency', 'coherence', 'fluency', 'relevance']
summarization_labels = np.array([1, 1.33, 1.67, 2, 2.33, 2.67, 3, 3.33, 3.67, 4, 4.33, 4.67, 5])
reasoning_dimensions = ['cosmos', 'drop', 'esnli', 'gsm8k']
reasoning_labels = np.array([1, 2, 3, 4, 5])
dataset_settings = {
    'Summeval': (summarization_dimensions, summarization_labels),
    'Dialsumm': (summarization_dimensions, summarization_labels),
    'GEval': (reasoning_dimensions, reasoning_labels),
    'SocREval': (reasoning_dimensions, reasoning_labels),
}

# Summaries are gathered in a list and concatenated once at the end instead of
# re-copying the growing frame on every iteration.
summaries = []
# The loop variable must stay named `model`: midpoint_extraction may read it as a global.
for model in models:
    directory = os.path.join(base_dir, "interval_results", model)
    for dataset in datasets:
        dimensions, label_set = dataset_settings[dataset]
        for dimension in dimensions:
            summary_before, summary_after = midpoint_extraction(directory, method, dataset, dimension, label_set)
            summaries.extend([summary_before, summary_after])

results_df = pd.concat(summaries, ignore_index=True) if summaries else pd.DataFrame()

print(results_df)
results_df.to_csv('midpoints_performance_r2ccp.csv', index=False)


  midpoint_before_df = pd.concat([
  midpoint_after_df = pd.concat([
  midpoint_before_df = pd.concat([
  midpoint_after_df = pd.concat([
  midpoint_before_df = pd.concat([
  midpoint_after_df = pd.concat([
  midpoint_before_df = pd.concat([
  midpoint_after_df = pd.concat([
  midpoint_before_df = pd.concat([
  midpoint_after_df = pd.concat([
  midpoint_before_df = pd.concat([
  midpoint_after_df = pd.concat([
  midpoint_before_df = pd.concat([
  midpoint_after_df = pd.concat([
  midpoint_before_df = pd.concat([
  midpoint_after_df = pd.concat([
  midpoint_before_df = pd.concat([
  midpoint_after_df = pd.concat([
  midpoint_before_df = pd.concat([
  midpoint_after_df = pd.concat([
  midpoint_before_df = pd.concat([
  midpoint_after_df = pd.concat([
  midpoint_before_df = pd.concat([
  midpoint_after_df = pd.concat([
  midpoint_before_df = pd.concat([
  midpoint_after_df = pd.concat([
  midpoint_before_df = pd.concat([
  midpoint_after_df = pd.concat([
  midpoint_before_df = pd.concat([

     Model   Dataset    Dimension              Type       MSE            \
                                                         mean       std   
0   4omini  Summeval  consistency  continuous_R2CCP  0.510480  0.056919   
1   4omini  Summeval  consistency    discrete_R2CCP  0.512428  0.057503   
2   4omini  Summeval    coherence  continuous_R2CCP  0.791054  0.042025   
3   4omini  Summeval    coherence    discrete_R2CCP  0.794462  0.042103   
4   4omini  Summeval      fluency  continuous_R2CCP  0.442466  0.036568   
..     ...       ...          ...               ...       ...       ...   
91    qwen  SocREval         drop    discrete_R2CCP  1.402639  0.226779   
92    qwen  SocREval        esnli  continuous_R2CCP  0.608586  0.167008   
93    qwen  SocREval        esnli    discrete_R2CCP  0.631609  0.172731   
94    qwen  SocREval        gsm8k  continuous_R2CCP  1.823383  0.464182   
95    qwen  SocREval        gsm8k    discrete_R2CCP  1.849229  0.464243   

        RMSE            

### Ordinal CP methods: only discrete intervals

In [7]:
base_dir = os.getcwd()
models = ['4omini', 'dsr1', 'qwen']
datasets = ['Summeval', 'Dialsumm', 'GEval', 'SocREval']
method = "OrdinalAPS"
columns = ['Model', 'Dataset', 'Dimension', 'Type', 'MSE', 'RMSE', 'MAE', 'Spearman', 'Kendall', 'Pearson']
group_keys = columns[:4]
metric_aggregations = {metric: ['mean', 'std'] for metric in columns[4:]}

# One summary frame per (model, dataset, dimension); concatenated once at the end.
summaries = []
for model in models:
    directory = os.path.join(base_dir, "interval_results", model)
    for dataset in datasets:
        if dataset in ('Summeval', 'Dialsumm'):
            dimensions = ['consistency', 'coherence', 'fluency', 'relevance']
        elif dataset in ('GEval', 'SocREval'):
            dimensions = ['cosmos', 'drop', 'esnli', 'gsm8k']

        for dimension in dimensions:
            records = []
            for seed in range(1, 31):
                file_name = f"{method}_{dataset}_{dimension}_{seed}.csv"
                file_path = os.path.join(directory, method, file_name)
                if not os.path.exists(file_path):
                    continue

                data = pd.read_csv(file_path).round(2)
                y_test = data['y_test'].values

                # The endpoints are used as they are. The previous code passed them
                # through boundary_adjustment with threshold 0, which never changes
                # a value, and relied on a `label_set` left over from another cell.
                midpoint = (data['low'] + data['up']) / 2
                mse, rmse, mae, rho, tau, pcc = calculate_performance(midpoint, y_test)
                records.append({
                    'Model': model,
                    'Dataset': dataset,
                    'Dimension': dimension,
                    'Type': method,
                    'MSE': mse,
                    'RMSE': rmse,
                    'MAE': mae,
                    'Spearman': rho,
                    'Kendall': tau,
                    'Pearson': pcc
                })

            if not records:
                # No result files for this combination: nothing to summarise.
                continue

            per_seed = pd.DataFrame(records, columns=columns)
            summary = per_seed.groupby(group_keys).agg(metric_aggregations).reset_index()
            summaries.append(summary)

results_df = pd.concat(summaries, ignore_index=True) if summaries else pd.DataFrame()

print(results_df)
results_df.to_csv('midpoint_performance_ordinalaps.csv', index=False)

  midpoint_before_df = pd.concat([
  midpoint_before_df = pd.concat([
  midpoint_before_df = pd.concat([
  midpoint_before_df = pd.concat([
  midpoint_before_df = pd.concat([
  midpoint_before_df = pd.concat([
  midpoint_before_df = pd.concat([
  midpoint_before_df = pd.concat([
  midpoint_before_df = pd.concat([
  midpoint_before_df = pd.concat([
  midpoint_before_df = pd.concat([
  midpoint_before_df = pd.concat([
  midpoint_before_df = pd.concat([
  midpoint_before_df = pd.concat([
  midpoint_before_df = pd.concat([
  midpoint_before_df = pd.concat([
  midpoint_before_df = pd.concat([
  midpoint_before_df = pd.concat([
  midpoint_before_df = pd.concat([
  midpoint_before_df = pd.concat([
  midpoint_before_df = pd.concat([
  midpoint_before_df = pd.concat([
  midpoint_before_df = pd.concat([
  midpoint_before_df = pd.concat([
  midpoint_before_df = pd.concat([
  midpoint_before_df = pd.concat([
  midpoint_before_df = pd.concat([
  midpoint_before_df = pd.concat([
  midpoint_before_df

     Model   Dataset    Dimension        Type       MSE                RMSE  \
                                                   mean       std      mean   
0   4omini  Summeval  consistency  OrdinalAPS  1.775499  0.035414  1.332415   
1   4omini  Summeval    coherence  OrdinalAPS  1.498127  0.053057  1.223795   
2   4omini  Summeval      fluency  OrdinalAPS  3.451555  0.051725  1.857786   
3   4omini  Summeval    relevance  OrdinalAPS  0.814759  0.034453  0.902444   
4   4omini  Dialsumm  consistency  OrdinalAPS  0.698577  0.021271  0.835716   
5   4omini  Dialsumm    coherence  OrdinalAPS  3.062263  0.062681  1.749844   
6   4omini  Dialsumm      fluency  OrdinalAPS  1.620314  0.044393  1.272800   
7   4omini  Dialsumm    relevance  OrdinalAPS  0.929960  0.036151  0.964165   
8   4omini     GEval       cosmos  OrdinalAPS  1.719388  0.171676  1.309641   
9   4omini     GEval         drop  OrdinalAPS  1.374048  0.109392  1.171297   
10  4omini     GEval        esnli  OrdinalAPS  2.141

### Split the datasets like the CP methods (30 seeded 50/50 splits)

In [8]:
# Imported once here rather than on every iteration of the innermost loop.
from sklearn.model_selection import train_test_split

columns = ['Model', 'Dataset', 'Dimension', 'Type', 'MSE', 'RMSE', 'MAE', 'Spearman', 'Kendall', 'Pearson']
score_values = np.array([1, 2, 3, 4, 5])  # score attached to each logit column

# Per-seed rows are collected in a list; the DataFrame is built once afterwards.
records = []

for model in ['4omini', 'dsr1', 'qwen']:
    logits_model_path = os.path.join(logits_folder, model)
    raw_model_path = os.path.join(raw_folder, model)

    for file_name in sorted(os.listdir(logits_model_path)):
        if not file_name.endswith('_logits.csv'):
            continue

        dataset, dimension = file_name.split('_')[:2]
        raw_file_path = os.path.join(raw_model_path, f"{dataset}_{dimension}_raw.csv")
        if not os.path.exists(raw_file_path):
            continue

        logits_df = pd.read_csv(os.path.join(logits_model_path, file_name))
        # All columns but the last are logits; the last column is the reference label.
        y = logits_df.iloc[:, -1].values
        weighted_sum = softmax(logits_df.iloc[:, :-1].values) @ score_values
        raw_scores = pd.read_csv(raw_file_path).iloc[:, 0].values

        for seed in range(1, 31):
            # A single call splits all three arrays with the same shuffled indices,
            # which is what two separate calls with the same random_state produced.
            _, weighted_sum_test, _, raw_scores_test, _, y_test = train_test_split(
                weighted_sum, raw_scores, y, test_size=0.5, random_state=seed
            )
            for score_type, scores_test in (('Weighted Sum', weighted_sum_test), ('Raw', raw_scores_test)):
                mse, rmse, mae, rho, tau, pcc = calculate_performance(scores_test, y_test)
                records.append({
                    'Model': model,
                    'Dataset': dataset,
                    'Dimension': dimension,
                    'Type': score_type,
                    'Seed': seed,
                    'MSE': mse,
                    'RMSE': rmse,
                    'MAE': mae,
                    'Spearman': rho,
                    'Kendall': tau,
                    'Pearson': pcc
                })

if not records:
    # Previously this case surfaced as a NameError on `summary` below.
    raise FileNotFoundError("no matching logits/raw score files were found")

results_df = pd.DataFrame(records, columns=columns + ['Seed'])

# Aggregate once over all seeds instead of recomputing the summary after every file.
summary = results_df.groupby(['Model', 'Dataset', 'Dimension', 'Type']).agg({
    'MSE': ['mean', 'std'],
    'RMSE': ['mean', 'std'],
    'MAE': ['mean', 'std'],
    'Spearman': ['mean', 'std'],
    'Kendall': ['mean', 'std'],
    'Pearson': ['mean', 'std']
}).reset_index()

print(summary)
summary.to_csv('judgments_performance_split.csv', index=False)

  results_df = pd.concat([


     Model   Dataset    Dimension          Type       MSE                RMSE  \
                                                     mean       std      mean   
0   4omini  Dialsumm    coherence           Raw  3.787201  0.086450  1.945951   
1   4omini  Dialsumm    coherence  Weighted Sum  3.701023  0.082147  1.923690   
2   4omini  Dialsumm  consistency           Raw  0.999777  0.035452  0.999737   
3   4omini  Dialsumm  consistency  Weighted Sum  0.824807  0.028545  0.908058   
4   4omini  Dialsumm      fluency           Raw  1.780238  0.050028  1.334128   
..     ...       ...          ...           ...       ...       ...       ...   
91    qwen  Summeval  consistency  Weighted Sum  1.847105  0.045109  1.358984   
92    qwen  Summeval      fluency           Raw  4.476472  0.080026  2.115686   
93    qwen  Summeval      fluency  Weighted Sum  4.236327  0.062387  2.058180   
94    qwen  Summeval    relevance           Raw  1.187870  0.035375  1.089778   
95    qwen  Summeval    rele

### Extract and compare

In [9]:
import pandas as pd

# Columns carried into the comparison table.
comparison_columns = ['Model', 'Dataset', 'Dimension', 'Type', 'MSE', 'RMSE', 'MAE', 'Spearman', 'Kendall', 'Pearson']


def _load_scores(csv_path):
    """Read a summary CSV, keep the comparison columns and drop the first data row.

    NOTE(review): the first row is presumably the mean/std sub-header that the
    two-level column header of the summary files produces when read with a
    single header row; confirm against the written files.
    """
    table = pd.read_csv(csv_path)
    return table[comparison_columns].iloc[1:]


judgments = _load_scores("judgments_performance_split.csv")
ordinalaps_scores = _load_scores("midpoint_performance_ordinalaps.csv")
r2ccp_scores = _load_scores("midpoints_performance_r2ccp.csv")

In [10]:
# Stack the three score tables in a fixed order under a fresh 0..n-1 index
# and write the combined table to disk.
score_tables = [judgments, ordinalaps_scores, r2ccp_scores]
results_df = pd.concat(score_tables, ignore_index=True)
results_df.to_csv("all_scores_comparison.csv", index=False)