In [2]:
word_to_number = {
    'one': 1,
    'two': 2,
    'three': 3,
    'four': 4,
    'five': 5
}

import numpy as np
import random
import torch

alpha = 0.10

import json
import math
import pandas as pd
from matplotlib import pyplot as plt
import os, sys



In [3]:
def get_qhat_ordinal_aps(prediction_function, cal_scores, cal_labels, alpha):
    n = cal_scores.shape[0]
    grid_size = 10000
    for q in np.linspace(1e-3, 1 - 1e-3, grid_size)[::-1]:
        coverage, _, _ = evaluate_sets(prediction_function, np.copy(cal_scores), np.copy(cal_labels), q, alpha)
        if coverage <= (np.ceil((n + 1)*(1 - alpha))/n):
            # return q + 1/(grid_size - 1)
            return np.minimum(q + 1/(grid_size - 1), 1.0 - 1e-6)  # Clip q to be less than 1.0
    return q

def evaluate_sets(prediction_function, val_scores, val_labels, qhat, alpha, print_bool=False):
    sets = prediction_function(val_scores, qhat)
    # Check
    sizes = sets.sum(axis=1)
    sizes_distribution = np.array([(sizes == i).mean() for i in range(5)])
    # Evaluate coverage
    covered = sets[np.arange(val_labels.shape[0]), val_labels]
    coverage = covered.mean()
    label_stratified_coverage = [
        covered[val_labels == j].mean() for j in range(np.unique(val_labels).max() + 1)
    ]
    label_distribution = [
        (val_labels == j).mean() for j in range(np.unique(val_labels).max() + 1)
    ]
    if(print_bool):
        print(r'$\alpha$' + f":{alpha}  |  coverage: {coverage}  |  average size: {sizes.mean()}  |  qhat: {qhat}  |  set size distribution: {sizes_distribution} ")
        print(f"label stratified coverage: {label_stratified_coverage}  \nlabel distribution: {label_distribution}")
    return coverage, label_stratified_coverage, sizes_distribution

def ordinal_aps_prediction(val_scores, qhat):
    import numpy as np

    n_samples, n_classes = val_scores.shape
    P = val_scores == val_scores.max(axis=1)[:, None]

    idx_construction_incomplete = (val_scores * P.astype(float)).sum(axis=1) <= qhat

    max_iter = n_classes  
    iter_count = 0

    while idx_construction_incomplete.sum() > 0:
        iter_count += 1
        if iter_count > max_iter:
            P[idx_construction_incomplete] = True
            break

        P_inc = P[idx_construction_incomplete]
        scores_inc = val_scores[idx_construction_incomplete]

        set_cumsum = P_inc.cumsum(axis=1)
        lower_edge_idx = (P_inc > 0).argmax(axis=1)
        upper_edge_idx = set_cumsum.argmax(axis=1)

        left_valid = (lower_edge_idx - 1) >= 0
        right_valid = (upper_edge_idx + 1) < scores_inc.shape[1]

        lower_edge_wins = np.zeros(scores_inc.shape[0], dtype=bool)

        lower_edge_wins[~right_valid & left_valid] = True

        both_valid = left_valid & right_valid
        lower_scores = scores_inc[np.arange(scores_inc.shape[0])[both_valid], lower_edge_idx[both_valid] - 1]
        upper_scores = scores_inc[np.arange(scores_inc.shape[0])[both_valid], upper_edge_idx[both_valid] + 1]
        lower_edge_wins[both_valid] = lower_scores > upper_scores

        valid_left = lower_edge_wins & ((lower_edge_idx - 1) >= 0)
        P_inc[valid_left, lower_edge_idx[valid_left] - 1] = True

        valid_right = (~lower_edge_wins) & ((upper_edge_idx + 1) < scores_inc.shape[1])
        P_inc[valid_right, upper_edge_idx[valid_right] + 1] = True

        P[idx_construction_incomplete] = P_inc

        idx_construction_incomplete = (val_scores * P.astype(float)).sum(axis=1) <= qhat

    return P


In [17]:
def softmax(logits):
    exp_logits = np.exp(logits - np.max(logits, axis=1, keepdims=True))
    return exp_logits / np.sum(exp_logits, axis=1, keepdims=True)

def run_experiment(X, y, seed, dataset='summeval', dimension='consistency'):
    random.seed(seed)
    np.random.seed(seed)

    X.columns = list(range(len(X.columns)))
    X = X.to_numpy().astype(np.float32)
    y = y.to_numpy().astype(np.float32)*3-3

    y = y.astype(int)
    x_arr = X
    from scipy.interpolate import interp1d
    n = x_arr.shape[0]
    m = 13

    new_x = np.zeros((n, m))
    orig_idx = np.linspace(3, 15, 5)-3
    target_idx = np.linspace(3, 15, m)-3

    for i in range(n):
        f = interp1d(orig_idx, x_arr[i, :], kind='linear')
        new_x[i, :] = f(target_idx)
    
    new_x = softmax(new_x)
    # new_x = softmax(x_arr)

    from sklearn.model_selection import train_test_split

    fyxs_cal, fyxs_test, y_cal, y_test = train_test_split(new_x, y, test_size=0.5, random_state=seed)
    y_cal = y_cal.ravel()
    y_test = y_test.ravel()

    cal_scores = fyxs_cal
    cal_labels = y_cal
    test_scores = fyxs_test
    test_labels = y_test

    qhat = get_qhat_ordinal_aps(ordinal_aps_prediction, np.copy(cal_scores), np.copy(cal_labels), alpha)
    test_pred_sets = ordinal_aps_prediction(np.copy(test_scores), qhat)
    prediction_intervals = []
    for pred_set in test_pred_sets:
        indices = np.where(pred_set)[0]
        if len(indices) > 0:
            interval = (indices.min(), indices.max())
        else:
            interval = None 
        prediction_intervals.append(interval)

    y_qlow, y_qup = zip(*prediction_intervals)
    y_qlow = (np.array(y_qlow)+3)/3
    y_qup = (np.array(y_qup)+3)/3

    y_test_real = test_labels/3+1
    # y_qlow = np.array(y_qlow)+1
    # y_qup = np.array(y_qup)+1

    # y_test_real = test_labels+1

    df = pd.DataFrame({
        'low':    y_qlow.ravel(),
        'up':     y_qup.ravel(),
        'y_test': y_test_real.ravel(),
    })

    df.to_csv(f'OrdinalAPS_{dataset}_{dimension}_{seed}.csv', index=False)

    in_interval = (y_test_real >= y_qlow) & (y_test_real <= y_qup)

    average_width = np.mean(y_qup-y_qlow)
    coverage_rate = np.mean(in_interval)

    print(f"Seed: {seed}, Width: {average_width:.4f}, Coverage: {coverage_rate:.4f}")

    return average_width, coverage_rate

def calculate_statistics(X, y, num_runs=100, seed_start=1, dataset='summeval', dimension='consistency'):
    from tqdm import tqdm
    width = []
    coverage = []
    for i in tqdm(range(num_runs), desc="Running experiments"):
        seed = seed_start + i
        try:
            average_width, coverage_rate = run_experiment(X, y, seed, dataset, dimension)
            width.append(average_width)
            coverage.append(coverage_rate)
        except IndexError as e:
            print(f"Skipping seed {seed} due to error: {e}")
            continue
    
    mean_width = np.mean(width)
    std_width = np.std(width)
    mean_coverage = np.mean(coverage)
    std_coverage = np.std(coverage)

    print("\nSummary of Ordinal APS:")
    print(f"Width: {mean_width:.4f}, {std_width:.4f}")
    print(f"Coverage: {mean_coverage:.4f}, {std_coverage:.4f}")

    return  width, coverage

In [28]:
import os
import pandas as pd

folder_path = f'./model_logits/qwen/'
dataset = 'Dialsumm'
data = {}
for dimension in ["consistency", "coherence", "fluency", "relevance"]:
        file_path = os.path.join(folder_path, f"{dataset}_{dimension}_logits.csv")
        df = pd.read_csv(file_path)
        X = df.iloc[:, :-1]
        y = df.iloc[:, -1]
        width, coverage = calculate_statistics(X, y, num_runs=30, seed_start=1, dimension=dimension, dataset=dataset)



Running experiments:   3%|▎         | 1/30 [00:00<00:11,  2.43it/s]

Seed: 1, Width: 2.4981, Coverage: 0.8600


Running experiments:   7%|▋         | 2/30 [00:00<00:11,  2.44it/s]

Seed: 2, Width: 2.4919, Coverage: 0.8614


Running experiments:  10%|█         | 3/30 [00:01<00:10,  2.59it/s]

Seed: 3, Width: 2.5514, Coverage: 0.8843


Running experiments:  13%|█▎        | 4/30 [00:01<00:09,  2.80it/s]

Seed: 4, Width: 2.6267, Coverage: 0.8986


Running experiments:  17%|█▋        | 5/30 [00:01<00:08,  2.80it/s]

Seed: 5, Width: 2.5719, Coverage: 0.8814


Running experiments:  20%|██        | 6/30 [00:02<00:08,  2.68it/s]

Seed: 6, Width: 2.5029, Coverage: 0.8700


Running experiments:  23%|██▎       | 7/30 [00:02<00:07,  2.89it/s]

Seed: 7, Width: 2.6281, Coverage: 0.9100


Running experiments:  27%|██▋       | 8/30 [00:02<00:07,  2.96it/s]

Seed: 8, Width: 2.6038, Coverage: 0.8829


Running experiments:  30%|███       | 9/30 [00:03<00:06,  3.05it/s]

Seed: 9, Width: 2.6324, Coverage: 0.8986


Running experiments:  33%|███▎      | 10/30 [00:03<00:06,  3.06it/s]

Seed: 10, Width: 2.6386, Coverage: 0.8929


Running experiments:  37%|███▋      | 11/30 [00:03<00:06,  2.96it/s]

Seed: 11, Width: 2.5505, Coverage: 0.8800


Running experiments:  40%|████      | 12/30 [00:04<00:06,  2.80it/s]

Seed: 12, Width: 2.4781, Coverage: 0.8614


Running experiments:  43%|████▎     | 13/30 [00:04<00:05,  2.89it/s]

Seed: 13, Width: 2.6362, Coverage: 0.8914


Running experiments:  47%|████▋     | 14/30 [00:04<00:05,  2.91it/s]

Seed: 14, Width: 2.5838, Coverage: 0.8900


Running experiments:  50%|█████     | 15/30 [00:05<00:05,  2.92it/s]

Seed: 15, Width: 2.6048, Coverage: 0.8943


Running experiments:  53%|█████▎    | 16/30 [00:05<00:04,  2.99it/s]

Seed: 16, Width: 2.6295, Coverage: 0.8929


Running experiments:  57%|█████▋    | 17/30 [00:05<00:04,  2.89it/s]

Seed: 17, Width: 2.5481, Coverage: 0.8714


Running experiments:  60%|██████    | 18/30 [00:06<00:04,  2.73it/s]

Seed: 18, Width: 2.4967, Coverage: 0.8629


Running experiments:  63%|██████▎   | 19/30 [00:06<00:03,  2.88it/s]

Seed: 19, Width: 2.6443, Coverage: 0.8914


Running experiments:  67%|██████▋   | 20/30 [00:07<00:03,  2.75it/s]

Seed: 20, Width: 2.5086, Coverage: 0.8643


Running experiments:  70%|███████   | 21/30 [00:07<00:03,  2.61it/s]

Seed: 21, Width: 2.4371, Coverage: 0.8486


Running experiments:  73%|███████▎  | 22/30 [00:07<00:03,  2.61it/s]

Seed: 22, Width: 2.5605, Coverage: 0.8629


Running experiments:  77%|███████▋  | 23/30 [00:08<00:02,  2.53it/s]

Seed: 23, Width: 2.4986, Coverage: 0.8571


Running experiments:  80%|████████  | 24/30 [00:08<00:02,  2.67it/s]

Seed: 24, Width: 2.6229, Coverage: 0.8943


Running experiments:  83%|████████▎ | 25/30 [00:08<00:01,  2.74it/s]

Seed: 25, Width: 2.5633, Coverage: 0.8814


Running experiments:  87%|████████▋ | 26/30 [00:09<00:01,  2.78it/s]

Seed: 26, Width: 2.5786, Coverage: 0.8800


Running experiments:  90%|█████████ | 27/30 [00:09<00:01,  2.81it/s]

Seed: 27, Width: 2.5652, Coverage: 0.8800


Running experiments:  93%|█████████▎| 28/30 [00:09<00:00,  2.89it/s]

Seed: 28, Width: 2.5757, Coverage: 0.8843


Running experiments:  97%|█████████▋| 29/30 [00:10<00:00,  2.95it/s]

Seed: 29, Width: 2.5810, Coverage: 0.8886


Running experiments: 100%|██████████| 30/30 [00:10<00:00,  2.83it/s]


Seed: 30, Width: 2.6200, Coverage: 0.9014

Summary of Ordinal APS:
Width: 2.5676, 0.0560
Coverage: 0.8806, 0.0151


  covered[val_labels == j].mean() for j in range(np.unique(val_labels).max() + 1)
  ret = ret.dtype.type(ret / rcount)
Running experiments:  10%|█         | 3/30 [00:00<00:01, 24.76it/s]

Seed: 1, Width: 2.8805, Coverage: 0.5986
Seed: 2, Width: 2.8633, Coverage: 0.5957
Seed: 3, Width: 2.8729, Coverage: 0.5757
Seed: 4, Width: 2.8967, Coverage: 0.6100
Seed: 5, Width: 2.8976, Coverage: 0.6129


Running experiments:  30%|███       | 9/30 [00:00<00:00, 24.22it/s]

Seed: 6, Width: 2.8414, Coverage: 0.5771
Seed: 7, Width: 2.8424, Coverage: 0.5914
Seed: 8, Width: 2.8800, Coverage: 0.5857
Seed: 9, Width: 2.8871, Coverage: 0.6057
Seed: 10, Width: 2.8876, Coverage: 0.6071
Seed: 11, Width: 2.8605, Coverage: 0.5871


Running experiments:  40%|████      | 12/30 [00:00<00:00, 24.36it/s]

Seed: 12, Width: 2.8914, Coverage: 0.5929
Seed: 13, Width: 2.8886, Coverage: 0.6186
Seed: 14, Width: 2.8795, Coverage: 0.5814


Running experiments:  50%|█████     | 15/30 [00:00<00:00, 24.52it/s]

Seed: 15, Width: 2.8690, Coverage: 0.5729
Seed: 16, Width: 2.8919, Coverage: 0.5971
Seed: 17, Width: 2.8505, Coverage: 0.5857


Running experiments:  60%|██████    | 18/30 [00:00<00:00, 24.47it/s]

Seed: 18, Width: 2.8671, Coverage: 0.5857
Seed: 19, Width: 2.8624, Coverage: 0.5957
Seed: 20, Width: 2.8786, Coverage: 0.5900


Running experiments:  70%|███████   | 21/30 [00:00<00:00, 24.38it/s]

Seed: 21, Width: 2.8724, Coverage: 0.6186
Seed: 22, Width: 2.8976, Coverage: 0.6257
Seed: 23, Width: 2.8648, Coverage: 0.5886


Running experiments:  80%|████████  | 24/30 [00:00<00:00, 24.70it/s]

Seed: 24, Width: 2.8971, Coverage: 0.6257
Seed: 25, Width: 2.8771, Coverage: 0.5971
Seed: 26, Width: 2.8557, Coverage: 0.5714


Running experiments:  90%|█████████ | 27/30 [00:01<00:00, 24.67it/s]

Seed: 27, Width: 2.8757, Coverage: 0.6043
Seed: 28, Width: 2.8614, Coverage: 0.5843
Seed: 29, Width: 2.8695, Coverage: 0.5914


Running experiments: 100%|██████████| 30/30 [00:01<00:00, 24.49it/s]


Seed: 30, Width: 2.8919, Coverage: 0.6343

Summary of Ordinal APS:
Width: 2.8751, 0.0158
Coverage: 0.5970, 0.0161


  covered[val_labels == j].mean() for j in range(np.unique(val_labels).max() + 1)
  ret = ret.dtype.type(ret / rcount)
Running experiments:  10%|█         | 3/30 [00:00<00:01, 25.42it/s]

Seed: 1, Width: 2.9495, Coverage: 0.7400
Seed: 2, Width: 2.9524, Coverage: 0.7286
Seed: 3, Width: 2.9838, Coverage: 0.7429
Seed: 4, Width: 2.9371, Coverage: 0.7300
Seed: 5, Width: 2.9005, Coverage: 0.7214


Running experiments:  30%|███       | 9/30 [00:00<00:00, 23.98it/s]

Seed: 6, Width: 2.9438, Coverage: 0.7114
Seed: 7, Width: 2.9500, Coverage: 0.7271
Seed: 8, Width: 2.9571, Coverage: 0.7443
Seed: 9, Width: 2.9152, Coverage: 0.7329
Seed: 10, Width: 2.9462, Coverage: 0.7329
Seed: 11, Width: 2.9614, Coverage: 0.7329


Running experiments:  40%|████      | 12/30 [00:00<00:00, 23.66it/s]

Seed: 12, Width: 2.9110, Coverage: 0.7143
Seed: 13, Width: 2.9181, Coverage: 0.7171
Seed: 14, Width: 2.9690, Coverage: 0.7214


Running experiments:  50%|█████     | 15/30 [00:00<00:00, 23.69it/s]

Seed: 15, Width: 2.9586, Coverage: 0.7200
Seed: 16, Width: 2.9495, Coverage: 0.7357
Seed: 17, Width: 2.9771, Coverage: 0.7143


Running experiments:  60%|██████    | 18/30 [00:00<00:00, 23.86it/s]

Seed: 18, Width: 2.9714, Coverage: 0.7443
Seed: 19, Width: 2.9324, Coverage: 0.7400


Running experiments:  70%|███████   | 21/30 [00:00<00:00, 23.53it/s]

Seed: 20, Width: 2.9533, Coverage: 0.7314
Seed: 21, Width: 2.9652, Coverage: 0.7386
Seed: 22, Width: 2.9314, Coverage: 0.7171
Seed: 23, Width: 2.9624, Coverage: 0.7257


Running experiments:  80%|████████  | 24/30 [00:01<00:00, 23.61it/s]

Seed: 24, Width: 2.9405, Coverage: 0.7229


Running experiments:  90%|█████████ | 27/30 [00:01<00:00, 23.79it/s]

Seed: 25, Width: 2.9390, Coverage: 0.7400
Seed: 26, Width: 2.9548, Coverage: 0.7243
Seed: 27, Width: 2.9290, Coverage: 0.7086
Seed: 28, Width: 2.9433, Coverage: 0.7271
Seed: 29, Width: 2.9276, Coverage: 0.7357


Running experiments: 100%|██████████| 30/30 [00:01<00:00, 23.72it/s]


Seed: 30, Width: 2.9519, Coverage: 0.7386

Summary of Ordinal APS:
Width: 2.9461, 0.0193
Coverage: 0.7287, 0.0101


Running experiments:   7%|▋         | 2/30 [00:00<00:01, 16.49it/s]

Seed: 1, Width: 2.8819, Coverage: 0.9014
Seed: 2, Width: 2.7705, Coverage: 0.8743
Seed: 3, Width: 2.8438, Coverage: 0.9014


Running experiments:  13%|█▎        | 4/30 [00:00<00:01, 17.36it/s]

Seed: 4, Width: 2.8957, Coverage: 0.9243


Running experiments:  20%|██        | 6/30 [00:00<00:01, 17.07it/s]

Seed: 5, Width: 2.9119, Coverage: 0.9086
Seed: 6, Width: 2.7838, Coverage: 0.8786
Seed: 7, Width: 2.7948, Coverage: 0.8929


Running experiments:  33%|███▎      | 10/30 [00:00<00:01, 16.55it/s]

Seed: 8, Width: 2.8419, Coverage: 0.8943
Seed: 9, Width: 2.8376, Coverage: 0.8900
Seed: 10, Width: 2.8257, Coverage: 0.8900
Seed: 11, Width: 2.8029, Coverage: 0.8843


Running experiments:  47%|████▋     | 14/30 [00:00<00:00, 17.25it/s]

Seed: 12, Width: 2.8386, Coverage: 0.8929
Seed: 13, Width: 2.9100, Coverage: 0.9200
Seed: 14, Width: 2.8643, Coverage: 0.9029
Seed: 15, Width: 2.8067, Coverage: 0.8886


Running experiments:  60%|██████    | 18/30 [00:01<00:00, 16.30it/s]

Seed: 16, Width: 2.8133, Coverage: 0.8786
Seed: 17, Width: 2.7076, Coverage: 0.8529
Seed: 18, Width: 2.8586, Coverage: 0.9071
Seed: 19, Width: 2.8710, Coverage: 0.9114


Running experiments:  73%|███████▎  | 22/30 [00:01<00:00, 16.48it/s]

Seed: 20, Width: 2.8971, Coverage: 0.9086
Seed: 21, Width: 2.7762, Coverage: 0.8771
Seed: 22, Width: 2.8400, Coverage: 0.8814


Running experiments:  80%|████████  | 24/30 [00:01<00:00, 16.09it/s]

Seed: 23, Width: 2.7381, Coverage: 0.8614
Seed: 24, Width: 2.8481, Coverage: 0.8857
Seed: 25, Width: 2.7676, Coverage: 0.8671
Seed: 26, Width: 2.8519, Coverage: 0.9000


Running experiments: 100%|██████████| 30/30 [00:01<00:00, 16.57it/s]

Seed: 27, Width: 2.8333, Coverage: 0.8943
Seed: 28, Width: 2.8067, Coverage: 0.8843
Seed: 29, Width: 2.8195, Coverage: 0.8986
Seed: 30, Width: 2.9071, Coverage: 0.9271

Summary of Ordinal APS:
Width: 2.8315, 0.0500
Coverage: 0.8927, 0.0172





In [None]:
# import os
# import pandas as pd

# folder_path = f'./model_logits/qwen/'

# data = {}
# for dimension in ["cosmos", "drop", "esnli", "gsm8k"]:
#         file_path = os.path.join(folder_path, f"SocREval_{dimension}_logits.csv")
#         df = pd.read_csv(file_path)
#         X = df.iloc[:, :-1]
#         y = df.iloc[:, -1]
#         width, coverage = calculate_statistics(X, y, num_runs=30, seed_start=1, dimension=dimension, dataset='SocREval')

