In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd

import torch
torch.set_float32_matmul_precision('medium')

alpha = 0.10

from matplotlib import pyplot as plt
import os, sys

# !wget https://files.pythonhosted.org/packages/py3/R/R2CCP/R2CCP-0.0.8-py3-none-any.whl
# !pip install R2CCP-0.0.8-py3-none-any.whl --no-deps
import os
os.makedirs('model_paths', exist_ok=True)

# !pip install configargparse pytorch_lightning torchvision
from R2CCP.main import R2CCP

def merge_intervals(sample_intervals):
    if not sample_intervals:
        return (1,5)
    lows = [low for low, high in sample_intervals]
    highs = [high for low, high in sample_intervals]
    return (min(lows), max(highs))

def boundary_adjustment(value, label_set, threshold=0):
    threshold_max = (label_set[-1] - label_set[0]) / (len(label_set) - 1) / 2
    threshold = min(threshold_max, threshold)
    adjusted_value = next((num for num in label_set if abs(num - value) <= threshold+0.01), value)
    
    return adjusted_value

def coverage_and_width(low, up, y_test):
    width = up - low
    coverage = np.mean((low <= y_test) & (y_test <= up))
    return width.mean(), coverage.mean()

In [None]:
results = []
for model_name in {'4omini', 'dsr1', 'qwen'}:
    for dimension in {'consistency', 'coherence', 'fluency', 'relevance'}:
        cal_data = pd.read_csv(f'../model_logits/{model_name}/Summeval_{dimension}_logits.csv')
        X_cal = cal_data.iloc[:, :-1].to_numpy().astype(np.float32)
        y_cal = cal_data.iloc[:, -1].to_numpy().astype(np.float32)
        test_data = pd.read_csv(f'../model_logits/{model_name}/Dialsumm_{dimension}_logits.csv')
        X_test = test_data.iloc[:, :-1].to_numpy().astype(np.float32)
        y_test = test_data.iloc[:, -1].to_numpy().astype(np.float32)

        if os.path.exists('model_paths/model_save_destination.pth'):
            os.remove('model_paths/model_save_destination.pth')

        model = R2CCP({'model_path': 'model_paths/model_save_destination.pth', 'max_epochs': 100, 'alpha': 0.1})
        model.fit(X_cal, y_cal.flatten())
        intervals = model.get_intervals(X_test)
        intervals = [merge_intervals(sample_intervals) for sample_intervals in intervals]

        df = pd.DataFrame({
            'low':    [iv[0] for iv in intervals],
            'up':     [iv[1] for iv in intervals],
            'y_test': y_test
            })

        df.to_csv(f'R2CCP_{dimension}_{model_name}_summeval2dialsumm.csv', index=False)

        in_interval = [
            (low <= y_true <= high)
            for (low, high), y_true in zip(intervals, y_test)
            ]
        coverage_rate  = np.mean(in_interval)
        average_width = np.mean([high - low for low, high in intervals])

        del model
        torch.cuda.empty_cache()

        print(f"Width: {average_width:.4f}, Coverage: {coverage_rate:.4f}")
        results.append({
            'model': model_name,
            'dimension': dimension,
            'coverage_rate': coverage_rate,
            'average_width': average_width
        })


In [3]:
results

[{'model': 'dsr1',
  'dimension': 'fluency',
  'coverage_rate': 0.4657142857142857,
  'average_width': 0.6924895087310247},
 {'model': 'dsr1',
  'dimension': 'consistency',
  'coverage_rate': 0.24,
  'average_width': 0.5956422196967261},
 {'model': 'dsr1',
  'dimension': 'coherence',
  'coverage_rate': 0.9271428571428572,
  'average_width': 1.9237895865951267},
 {'model': 'dsr1',
  'dimension': 'relevance',
  'coverage_rate': 0.8185714285714286,
  'average_width': 1.7927325912032808},
 {'model': 'qwen',
  'dimension': 'fluency',
  'coverage_rate': 0.4742857142857143,
  'average_width': 0.8462530732154846},
 {'model': 'qwen',
  'dimension': 'consistency',
  'coverage_rate': 0.20357142857142857,
  'average_width': 0.5863682071651731},
 {'model': 'qwen',
  'dimension': 'coherence',
  'coverage_rate': 0.72,
  'average_width': 2.067312071578843},
 {'model': 'qwen',
  'dimension': 'relevance',
  'coverage_rate': 0.8742857142857143,
  'average_width': 1.9778050071001052},
 {'model': '4omini',

In [4]:
label_set = np.array([1, 1.33, 1.67, 2, 2.33, 2.67, 3, 3.33, 3.67, 4, 4.33, 4.67, 5])
adjustment = 0.17

results = []
for model_name in {'4omini', 'dsr1', 'qwen'}:
    for dimension in {'consistency', 'coherence', 'fluency', 'relevance'}:
        data = pd.read_csv(f'R2CCP_{dimension}_{model_name}_summeval2dialsumm.csv')
        data = data.round(2)
        data['low'] = data['low'].apply(lambda x: boundary_adjustment(x, label_set, adjustment))
        data['up'] = data['up'].apply(lambda x: boundary_adjustment(x, label_set, adjustment))
        width, coverage = coverage_and_width(data['low'], data['up'], data['y_test'])
        results.append({
            'model': model_name,
            'dimension': dimension,
            'coverage_rate': coverage,
            'average_width': width
        })

results

[{'model': 'dsr1',
  'dimension': 'fluency',
  'coverage_rate': 0.57,
  'average_width': 0.6531071428571429},
 {'model': 'dsr1',
  'dimension': 'consistency',
  'coverage_rate': 0.30714285714285716,
  'average_width': 0.5562642857142858},
 {'model': 'dsr1',
  'dimension': 'coherence',
  'coverage_rate': 0.9435714285714286,
  'average_width': 1.921557142857143},
 {'model': 'dsr1',
  'dimension': 'relevance',
  'coverage_rate': 0.8707142857142857,
  'average_width': 1.7732285714285714},
 {'model': 'qwen',
  'dimension': 'fluency',
  'coverage_rate': 0.6292857142857143,
  'average_width': 0.8376357142857144},
 {'model': 'qwen',
  'dimension': 'consistency',
  'coverage_rate': 0.30642857142857144,
  'average_width': 0.5536857142857143},
 {'model': 'qwen',
  'dimension': 'coherence',
  'coverage_rate': 0.7657142857142857,
  'average_width': 2.0643214285714286},
 {'model': 'qwen',
  'dimension': 'relevance',
  'coverage_rate': 0.9028571428571428,
  'average_width': 1.9797285714285713},
 {'mo

In [None]:
results = []
for model_name in {'4omini', 'dsr1', 'qwen'}:
    for dimension in {'consistency', 'coherence', 'fluency', 'relevance'}:
        cal_data = pd.read_csv(f'../model_logits/{model_name}/Dialsumm_{dimension}_logits.csv')
        X_cal = cal_data.iloc[:, :-1].to_numpy().astype(np.float32)
        y_cal = cal_data.iloc[:, -1].to_numpy().astype(np.float32)
        test_data = pd.read_csv(f'../model_logits/{model_name}/Summeval_{dimension}_logits.csv')
        X_test = test_data.iloc[:, :-1].to_numpy().astype(np.float32)
        y_test = test_data.iloc[:, -1].to_numpy().astype(np.float32)

        if os.path.exists('model_paths/model_save_destination.pth'):
            os.remove('model_paths/model_save_destination.pth')

        model = R2CCP({'model_path': 'model_paths/model_save_destination.pth', 'max_epochs': 100, 'alpha': 0.1})
        model.fit(X_cal, y_cal.flatten())
        intervals = model.get_intervals(X_test)
        intervals = [merge_intervals(sample_intervals) for sample_intervals in intervals]

        df = pd.DataFrame({
            'low':    [iv[0] for iv in intervals],
            'up':     [iv[1] for iv in intervals],
            'y_test': y_test
            })

        df.to_csv(f'R2CCP_{dimension}_{model_name}_dialsumm2summeval.csv', index=False)

        in_interval = [
            (low <= y_true <= high)
            for (low, high), y_true in zip(intervals, y_test)
            ]
        coverage_rate  = np.mean(in_interval)
        average_width = np.mean([high - low for low, high in intervals])

        del model
        torch.cuda.empty_cache()

        print(f"Width: {average_width:.4f}, Coverage: {coverage_rate:.4f}")
        results.append({
            'model': model_name,
            'dimension': dimension,
            'coverage_rate': coverage_rate,
            'average_width': average_width
        })


In [6]:
results

[{'model': 'dsr1',
  'dimension': 'fluency',
  'coverage_rate': 0.359375,
  'average_width': 1.1209421215951443},
 {'model': 'dsr1',
  'dimension': 'consistency',
  'coverage_rate': 0.50375,
  'average_width': 1.879784333333373},
 {'model': 'dsr1',
  'dimension': 'coherence',
  'coverage_rate': 0.52375,
  'average_width': 1.34417592972517},
 {'model': 'dsr1',
  'dimension': 'relevance',
  'coverage_rate': 0.801875,
  'average_width': 1.726801937893033},
 {'model': 'qwen',
  'dimension': 'fluency',
  'coverage_rate': 0.271875,
  'average_width': 1.1638392074406148},
 {'model': 'qwen',
  'dimension': 'consistency',
  'coverage_rate': 0.3775,
  'average_width': 1.5958447203040123},
 {'model': 'qwen',
  'dimension': 'coherence',
  'coverage_rate': 0.483125,
  'average_width': 1.4909820595383645},
 {'model': 'qwen',
  'dimension': 'relevance',
  'coverage_rate': 0.77625,
  'average_width': 1.5879724637418986},
 {'model': '4omini',
  'dimension': 'fluency',
  'coverage_rate': 0.536875,
  'av

In [7]:
label_set = np.array([1, 1.33, 1.67, 2, 2.33, 2.67, 3, 3.33, 3.67, 4, 4.33, 4.67, 5])
adjustment = 0.17

results = []
for model_name in {'4omini', 'dsr1', 'qwen'}:
    for dimension in {'consistency', 'coherence', 'fluency', 'relevance'}:
        data = pd.read_csv(f'R2CCP_{dimension}_{model_name}_dialsumm2summeval.csv')
        data = data.round(2)
        data['low'] = data['low'].apply(lambda x: boundary_adjustment(x, label_set, adjustment))
        data['up'] = data['up'].apply(lambda x: boundary_adjustment(x, label_set, adjustment))
        width, coverage = coverage_and_width(data['low'], data['up'], data['y_test'])
        results.append({
            'model': model_name,
            'dimension': dimension,
            'coverage_rate': coverage,
            'average_width': width
        })

results

[{'model': 'dsr1',
  'dimension': 'fluency',
  'coverage_rate': 0.82375,
  'average_width': 1.1595875},
 {'model': 'dsr1',
  'dimension': 'consistency',
  'coverage_rate': 0.699375,
  'average_width': 1.8781875000000001},
 {'model': 'dsr1',
  'dimension': 'coherence',
  'coverage_rate': 0.6025,
  'average_width': 1.3318125},
 {'model': 'dsr1',
  'dimension': 'relevance',
  'coverage_rate': 0.8625,
  'average_width': 1.7228937500000001},
 {'model': 'qwen',
  'dimension': 'fluency',
  'coverage_rate': 0.9,
  'average_width': 1.21904375},
 {'model': 'qwen',
  'dimension': 'consistency',
  'coverage_rate': 0.5925,
  'average_width': 1.60364375},
 {'model': 'qwen',
  'dimension': 'coherence',
  'coverage_rate': 0.57375,
  'average_width': 1.5208812500000002},
 {'model': 'qwen',
  'dimension': 'relevance',
  'coverage_rate': 0.845,
  'average_width': 1.5615312499999998},
 {'model': '4omini',
  'dimension': 'fluency',
  'coverage_rate': 0.920625,
  'average_width': 1.3340062499999998},
 {'mod

In [8]:
for dimension in {'consistency', 'coherence', 'fluency', 'relevance'}:
        data = pd.read_csv(f'R2CCP_{dimension}_dsr1_dialsumm2summeval.csv')
        print(f"Distribution of labels for {dimension}:")
        label = data['y_test'].round(2)
        value_counts = label.value_counts()
        print(value_counts)

Distribution of labels for fluency:
y_test
5.00    1150
4.67     141
4.33     112
4.00      39
2.67      34
3.00      30
3.67      27
3.33      18
1.67      17
2.33      11
2.00      11
1.33       5
1.00       5
Name: count, dtype: int64
Distribution of labels for consistency:
y_test
5.00    1306
4.67      81
4.33      44
1.67      39
2.67      30
2.00      24
1.33      21
2.33      19
1.00      14
4.00       8
3.00       7
3.67       7
Name: count, dtype: int64
Distribution of labels for coherence:
y_test
4.33    270
4.00    174
2.67    149
3.00    141
3.67    135
3.33    132
4.67    124
2.33    123
5.00    111
1.67    103
2.00    100
1.33     25
1.00     13
Name: count, dtype: int64
Distribution of labels for relevance:
y_test
4.33    318
4.00    302
3.67    217
4.67    187
3.33    155
3.00    120
2.67     87
5.00     83
2.33     62
2.00     33
1.67     25
1.33      6
1.00      5
Name: count, dtype: int64


In [9]:
for dimension in {'consistency', 'coherence', 'fluency', 'relevance'}:
        data = pd.read_csv(f'R2CCP_{dimension}_dsr1_summeval2dialsumm.csv')
        print(f"Distribution of labels for {dimension}:")
        label = data['y_test'].round(2)
        value_counts = label.value_counts()
        print(value_counts)

Distribution of labels for fluency:
y_test
4.67    588
4.33    400
4.00    204
3.67     92
3.33     41
2.67     29
3.00     21
2.33     10
5.00      9
2.00      4
1.67      1
1.33      1
Name: count, dtype: int64
Distribution of labels for consistency:
y_test
4.00    235
3.67    212
4.33    209
4.67    190
3.33    121
3.00     99
1.00     61
2.67     61
1.67     51
2.33     48
2.00     43
5.00     40
1.33     30
Name: count, dtype: int64
Distribution of labels for coherence:
y_test
4.67    413
4.33    350
4.00    191
3.67    133
3.33     91
3.00     76
5.00     71
2.67     46
2.33     20
2.00      5
1.67      3
1.33      1
Name: count, dtype: int64
Distribution of labels for relevance:
y_test
3.67    281
4.00    236
3.33    220
4.33    154
3.00     89
4.67     80
5.00     75
2.67     70
1.67     47
2.33     41
1.00     40
2.00     35
1.33     32
Name: count, dtype: int64
