In [None]:
# Numerical / data-handling libraries used throughout the notebook.
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd

import torch
# Select PyTorch's 'medium' float32 matmul precision mode (see the
# torch.set_float32_matmul_precision documentation for the trade-off).
torch.set_float32_matmul_precision('medium')

# NOTE(review): presumably the target miscoverage level, but none of the
# visible cells read this variable — confirm whether it is still needed.
alpha = 0.10

from matplotlib import pyplot as plt
import os, sys

# One-time installation of the R2CCP wheel (left commented out).
# !wget https://files.pythonhosted.org/packages/py3/R/R2CCP/R2CCP-0.0.8-py3-none-any.whl
# !pip install R2CCP-0.0.8-py3-none-any.whl --no-deps
# NOTE(review): `os` is already imported a few lines above; this repeat is redundant.
import os
# Create the directory that holds the 'model_paths/model_save_destination.pth' checkpoint.
os.makedirs('model_paths', exist_ok=True)

# Extra packages needed because the wheel above is installed with --no-deps.
# !pip install configargparse pytorch_lightning torchvision
from R2CCP.main import R2CCP

In [None]:
def merge_intervals(sample_intervals, default=(1, 5)):
    """Collapse one sample's prediction intervals into a single enclosing interval.

    Parameters
    ----------
    sample_intervals : iterable of (low, high) pairs, or None
        Intervals predicted for one sample. Lists, tuples, NumPy arrays of
        shape (k, 2) and other iterables of pairs are accepted.
    default : tuple, optional
        Interval returned when no intervals are supplied. Defaults to
        ``(1, 5)``, the fallback this function has always used.

    Returns
    -------
    tuple
        ``(smallest low, largest high)`` over all pairs, or ``default`` when
        the input is ``None`` or empty.
    """
    if sample_intervals is None:
        return default
    # Materialise before testing for emptiness: `not arr` raises ValueError
    # for a NumPy array with more than one element, whereas an empty list of
    # pairs is unambiguous.
    pairs = list(sample_intervals)
    if not pairs:
        return default
    lows = [low for low, _ in pairs]
    highs = [high for _, high in pairs]
    return (min(lows), max(highs))

def range_modification(y_qlow, y_qup, range_low,  range_up):
    """Clamp both interval bounds into ``[range_low, range_up]``.

    Returns the clipped lower bound and the clipped upper bound, in that order.
    """
    clipped_low, clipped_up = (
        np.clip(bound, range_low, range_up) for bound in (y_qlow, y_qup)
    )
    return clipped_low, clipped_up

def run_experiment(X, y, dimension, cot, model_name, seed,
                   alpha=0.1, max_epochs=100,
                   model_path='model_paths/model_save_destination.pth'):
    """Fit R2CCP on one random half of the data and score it on the other half.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature rows; converted to a float32 array.
    y : pandas.Series or pandas.DataFrame
        Targets, one per row of ``X``; converted to a flat float32 array.
    dimension, cot, model_name : str or int
        Only used to build the name of the per-run CSV file.
    seed : int
        ``random_state`` for the 50/50 split; also part of the CSV file name.
    alpha : float, optional
        Passed to R2CCP as ``'alpha'`` (presumably the target miscoverage
        level — confirm against the R2CCP documentation). Default 0.1.
    max_epochs : int, optional
        Passed to R2CCP as ``'max_epochs'``. Default 100.
    model_path : str, optional
        Checkpoint location handed to R2CCP; any existing file there is
        deleted first.

    Returns
    -------
    tuple
        ``(average_width, coverage_rate)`` on the held-out half.

    Side effects
    ------------
    Writes ``R2CCP_{model_name}_{dimension}_prompt{cot}_sensitivity_{seed}.csv``
    with columns ``low``, ``up`` and ``y_test`` to the working directory and
    prints the two metrics.
    """
    X = X.to_numpy().astype(np.float32)
    # Flatten once so that fitting, scoring and the saved CSV all use the
    # same 1-D targets (a single-column DataFrame would otherwise stay 2-D).
    y = y.to_numpy().astype(np.float32).ravel()

    X_cal, X_test, y_cal, y_test = train_test_split(X, y, test_size=0.5, random_state=seed)

    # Remove the checkpoint left by a previous run; presumably this keeps
    # R2CCP from reusing an old model — confirm against the library.
    if os.path.exists(model_path):
        os.remove(model_path)

    model = None
    try:
        model = R2CCP({'model_path': model_path, 'max_epochs': max_epochs, 'alpha': alpha})
        model.fit(X_cal, y_cal)
        intervals = model.get_intervals(X_test)
    finally:
        # Run the cleanup even when fitting fails: the caller catches
        # IndexError and moves on to the next seed.
        del model
        torch.cuda.empty_cache()

    # One enclosing (low, high) pair per test sample.
    intervals = [merge_intervals(sample_intervals) for sample_intervals in intervals]
    lows = [low for low, _ in intervals]
    highs = [high for _, high in intervals]

    df = pd.DataFrame({
        'low':    lows,
        'up':     highs,
        'y_test': y_test
    })

    df.to_csv(f'R2CCP_{model_name}_{dimension}_prompt{cot}_sensitivity_{seed}.csv', index=False)

    in_interval = [
        (low <= y_true <= high)
        for low, high, y_true in zip(lows, highs, y_test)
    ]
    coverage_rate = np.mean(in_interval)
    average_width = np.mean([high - low for low, high in zip(lows, highs)])

    print(f"Width: {average_width:.4f}, Coverage: {coverage_rate:.4f}")

    return average_width, coverage_rate


def calculate_statistics(X, y, num_runs, seed_start, dimension, cot, model_name):
    """Repeat ``run_experiment`` over consecutive seeds and summarise the results.

    Parameters
    ----------
    X, y
        Features and targets, forwarded unchanged to ``run_experiment``.
    num_runs : int
        Number of seeds to try.
    seed_start : int
        First seed; run ``i`` uses ``seed_start + i``.
    dimension, cot, model_name
        Forwarded unchanged to ``run_experiment``.

    Returns
    -------
    tuple
        ``(mean_width, std_width, mean_coverage, std_coverage)`` over the runs
        that completed. All four values are NaN when no run completed.

    Notes
    -----
    ``np.std`` is used with its default ``ddof=0`` (population standard
    deviation).
    """
    from tqdm import tqdm
    width = []
    coverage = []
    for i in tqdm(range(num_runs), desc="Running experiments"):
        seed = seed_start + i
        try:
            average_width, coverage_rate = run_experiment(X, y, dimension, cot, model_name, seed)
        except IndexError as e:
            # Only IndexError is treated as a skippable failure; anything
            # else still propagates. NOTE(review): presumably R2CCP raises
            # this on some splits — confirm the root cause.
            print(f"Skipping seed {seed} due to error: {e}")
            continue
        width.append(average_width)
        coverage.append(coverage_rate)

    if not width:
        # np.mean([]) would return NaN with a RuntimeWarning and no
        # explanation; make the "nothing succeeded" case explicit instead.
        print(f"\nSummary of R2CCP: all {num_runs} runs were skipped; no statistics available.")
        return np.nan, np.nan, np.nan, np.nan

    mean_width = np.mean(width)
    std_width = np.std(width)
    mean_coverage = np.mean(coverage)
    std_coverage = np.std(coverage)

    print("\nSummary of R2CCP:")
    # Skipped seeds shrink the sample silently otherwise.
    print(f"Successful runs: {len(width)}/{num_runs}")
    print(f"Width: {mean_width:.4f}, {std_width:.4f}")
    print(f"Coverage: {mean_coverage:.4f}, {std_coverage:.4f}")

    return  mean_width, std_width, mean_coverage, std_coverage

In [None]:
# R2CCP sensitivity runs for model '4o' on the 'consistency' and 'coherence' dimensions.
results = []
label = pd.read_csv('label_summeval.csv')
# Tuples instead of set literals: the iteration order of a set of strings can
# differ between interpreter sessions, which made the row order of `results`
# non-reproducible.
for dimension in ('consistency', 'coherence'):
    print(f"Processing dimension: {dimension}")
    y = label[dimension]
    for model_name in ('4o',):
        all_logits = pd.read_csv(f'logits/{model_name}_{dimension}.csv')
        for cot in range(5):
            # Every 5th row starting at offset `cot`.
            # NOTE(review): presumably the logits file interleaves the five
            # prompt variants row by row and lines up with `label` — confirm.
            X = all_logits.iloc[cot::5]
            # 30 runs, seeds 1..30.
            mean_width, std_width, mean_coverage, std_coverage = calculate_statistics(X, y, 30, 1, dimension, cot, model_name)

            results.append({
                'model_name': model_name,
                'cot': cot,
                'dimension': dimension,
                'mean_width': mean_width,
                'std_width': std_width,
                'mean_coverage': mean_coverage,
                'std_coverage': std_coverage
            })

results_df = pd.DataFrame(results)



In [None]:
# Display the summary table; `results_df` holds whichever experiment cell ran most recently.
results_df

In [None]:
# R2CCP sensitivity runs for model '4o' on the 'fluency' and 'relevance' dimensions.
results = []
label = pd.read_csv('label_summeval.csv')
# Tuples instead of set literals: the iteration order of a set of strings can
# differ between interpreter sessions, which made the row order of `results`
# non-reproducible.
for dimension in ('fluency', 'relevance'):
    print(f"Processing dimension: {dimension}")
    y = label[dimension]
    for model_name in ('4o',):
        all_logits = pd.read_csv(f'logits/{model_name}_{dimension}.csv')
        for cot in range(5):
            # Every 5th row starting at offset `cot`.
            # NOTE(review): presumably the logits file interleaves the five
            # prompt variants row by row and lines up with `label` — confirm.
            X = all_logits.iloc[cot::5]
            # 30 runs, seeds 1..30.
            mean_width, std_width, mean_coverage, std_coverage = calculate_statistics(X, y, 30, 1, dimension, cot, model_name)

            results.append({
                'model_name': model_name,
                'cot': cot,
                'dimension': dimension,
                'mean_width': mean_width,
                'std_width': std_width,
                'mean_coverage': mean_coverage,
                'std_coverage': std_coverage
            })

results_df = pd.DataFrame(results)



In [None]:
# Display the summary table; `results_df` holds whichever experiment cell ran most recently.
results_df

In [None]:
# R2CCP sensitivity runs for model '4omini' on the 'consistency' and 'coherence' dimensions.
results = []
label = pd.read_csv('label_summeval.csv')
# Tuples instead of set literals: the iteration order of a set of strings can
# differ between interpreter sessions, which made the row order of `results`
# non-reproducible.
for dimension in ('consistency', 'coherence'):
    print(f"Processing dimension: {dimension}")
    y = label[dimension]
    for model_name in ('4omini',):
        all_logits = pd.read_csv(f'logits/{model_name}_{dimension}.csv')
        for cot in range(5):
            # Every 5th row starting at offset `cot`.
            # NOTE(review): presumably the logits file interleaves the five
            # prompt variants row by row and lines up with `label` — confirm.
            X = all_logits.iloc[cot::5]
            # 30 runs, seeds 1..30.
            mean_width, std_width, mean_coverage, std_coverage = calculate_statistics(X, y, 30, 1, dimension, cot, model_name)

            results.append({
                'model_name': model_name,
                'cot': cot,
                'dimension': dimension,
                'mean_width': mean_width,
                'std_width': std_width,
                'mean_coverage': mean_coverage,
                'std_coverage': std_coverage
            })

results_df = pd.DataFrame(results)



In [None]:
# Display the summary table; `results_df` holds whichever experiment cell ran most recently.
results_df

In [None]:
# R2CCP sensitivity runs for model '4omini' on the 'fluency' and 'relevance' dimensions.
results = []
label = pd.read_csv('label_summeval.csv')
# Tuples instead of set literals: the iteration order of a set of strings can
# differ between interpreter sessions, which made the row order of `results`
# non-reproducible.
for dimension in ('fluency', 'relevance'):
    print(f"Processing dimension: {dimension}")
    y = label[dimension]
    for model_name in ('4omini',):
        all_logits = pd.read_csv(f'logits/{model_name}_{dimension}.csv')
        for cot in range(5):
            # Every 5th row starting at offset `cot`.
            # NOTE(review): presumably the logits file interleaves the five
            # prompt variants row by row and lines up with `label` — confirm.
            X = all_logits.iloc[cot::5]
            # 30 runs, seeds 1..30.
            mean_width, std_width, mean_coverage, std_coverage = calculate_statistics(X, y, 30, 1, dimension, cot, model_name)

            results.append({
                'model_name': model_name,
                'cot': cot,
                'dimension': dimension,
                'mean_width': mean_width,
                'std_width': std_width,
                'mean_coverage': mean_coverage,
                'std_coverage': std_coverage
            })

results_df = pd.DataFrame(results)



In [None]:
# Display the summary table; `results_df` holds whichever experiment cell ran most recently.
results_df

In [5]:
import glob
import os
import pandas as pd
from scipy.stats import binomtest

# Rebuild the summary from the per-seed CSV files on disk: one row per
# (cot, model, dimension) combination that has at least one saved run.
results = []

for cot in ['0', '1', '2', '3', '4']:
    for model in ['4o', '4omini']:
        for dimension in ['consistency', 'coherence', 'fluency', 'relevance']:
            coverages = []
            widths = []
            for seed in range(31):
                csv_path = f'./R2CCP_{model}_{dimension}_prompt{cot}_sensitivity_{seed}.csv'
                # Seeds without a saved CSV are left out of the summary.
                if not os.path.exists(csv_path):
                    continue
                df = pd.read_csv(csv_path).rename(columns={'low': 'y_qlow', 'up': 'y_qup'})
                # Targets and bounds are rounded to two decimals before the
                # (inclusive) coverage comparison.
                df['y_test'] = df['y_test'].round(2)
                df['y_qlow'] = df['y_qlow'].round(2)
                df['y_qup'] = df['y_qup'].round(2)
                coverage = df['y_test'].between(df['y_qlow'], df['y_qup']).mean()
                width = (df['y_qup'] - df['y_qlow']).mean()
                coverages.append(coverage)
                widths.append(width)
            if not (coverages and widths):
                continue
            results.append({
                'cot': cot,
                'model': model,
                'dimension': dimension,
                'interval_width_mean': sum(widths) / len(widths),
                'interval_width_std': pd.Series(widths).std(),
                'coverage_rate_mean': sum(coverages) / len(coverages),
                'coverage_rate_std': pd.Series(coverages).std(),
                # Two-sided binomial test on the number of runs whose
                # coverage reached 0.9, against a success probability of 0.9.
                'significant_test': binomtest(sum(c >= 0.9 for c in coverages), len(coverages), 0.9, alternative='two-sided').pvalue
            })

results_df = pd.DataFrame(results)
print(results_df)

   cot   model    dimension  interval_width_mean  interval_width_std  \
0    0      4o  consistency             0.841784            0.215207   
1    0      4o    coherence             3.094161            0.148751   
2    0      4o      fluency             1.086002            0.221702   
3    0      4o    relevance             2.409371            0.150009   
4    0  4omini  consistency             0.827695            0.262389   
5    0  4omini    coherence             2.968898            0.141206   
6    0  4omini      fluency             1.062315            0.244633   
7    0  4omini    relevance             2.359884            0.130038   
8    1      4o  consistency             0.802447            0.245293   
9    1      4o    coherence             2.976335            0.156964   
10   1      4o      fluency             1.052982            0.229471   
11   1      4o    relevance             2.371231            0.108815   
12   1  4omini  consistency             0.810368            0.21

In [7]:
# Print the current `results_df` again as plain text.
print(results_df)

   cot   model    dimension  interval_width_mean  interval_width_std  \
0    0      4o  consistency             0.841784            0.215207   
1    0      4o    coherence             3.094161            0.148751   
2    0      4o      fluency             1.086002            0.221702   
3    0      4o    relevance             2.409371            0.150009   
4    0  4omini  consistency             0.827695            0.262389   
5    0  4omini    coherence             2.968898            0.141206   
6    0  4omini      fluency             1.062315            0.244633   
7    0  4omini    relevance             2.359884            0.130038   
8    1      4o  consistency             0.802447            0.245293   
9    1      4o    coherence             2.976335            0.156964   
10   1      4o      fluency             1.052982            0.229471   
11   1      4o    relevance             2.371231            0.108815   
12   1  4omini  consistency             0.810368            0.21