In [1]:
word_to_number = {
    'one': 1,
    'two': 2,
    'three': 3,
    'four': 4,
    'five': 5
}

import numpy as np
import random
import torch

alpha = 0.10

import json
import math
import pandas as pd
from matplotlib import pyplot as plt
import os, sys



In [2]:
def get_qhat_ordinal_aps(prediction_function, cal_scores, cal_labels, alpha):
    n = cal_scores.shape[0]
    grid_size = 10000
    for q in np.linspace(1e-3, 1 - 1e-3, grid_size)[::-1]:
        coverage, _, _ = evaluate_sets(prediction_function, np.copy(cal_scores), np.copy(cal_labels), q, alpha)
        if coverage <= (np.ceil((n + 1)*(1 - alpha))/n):
            # return q + 1/(grid_size - 1)
            return np.minimum(q + 1/(grid_size - 1), 1.0 - 1e-6)  # Clip q to be less than 1.0
    return q

def evaluate_sets(prediction_function, val_scores, val_labels, qhat, alpha, print_bool=False):
    sets = prediction_function(val_scores, qhat)
    # Check
    sizes = sets.sum(axis=1)
    sizes_distribution = np.array([(sizes == i).mean() for i in range(5)])
    # Evaluate coverage
    covered = sets[np.arange(val_labels.shape[0]), val_labels]
    coverage = covered.mean()
    label_stratified_coverage = [
        covered[val_labels == j].mean() for j in range(np.unique(val_labels).max() + 1)
    ]
    label_distribution = [
        (val_labels == j).mean() for j in range(np.unique(val_labels).max() + 1)
    ]
    if(print_bool):
        print(r'$\alpha$' + f":{alpha}  |  coverage: {coverage}  |  average size: {sizes.mean()}  |  qhat: {qhat}  |  set size distribution: {sizes_distribution} ")
        print(f"label stratified coverage: {label_stratified_coverage}  \nlabel distribution: {label_distribution}")
    return coverage, label_stratified_coverage, sizes_distribution

def ordinal_aps_prediction(val_scores, qhat):
    import numpy as np

    n_samples, n_classes = val_scores.shape
    P = val_scores == val_scores.max(axis=1)[:, None]

    idx_construction_incomplete = (val_scores * P.astype(float)).sum(axis=1) <= qhat

    max_iter = n_classes  
    iter_count = 0

    while idx_construction_incomplete.sum() > 0:
        iter_count += 1
        if iter_count > max_iter:
            P[idx_construction_incomplete] = True
            break

        P_inc = P[idx_construction_incomplete]
        scores_inc = val_scores[idx_construction_incomplete]

        set_cumsum = P_inc.cumsum(axis=1)
        lower_edge_idx = (P_inc > 0).argmax(axis=1)
        upper_edge_idx = set_cumsum.argmax(axis=1)

        left_valid = (lower_edge_idx - 1) >= 0
        right_valid = (upper_edge_idx + 1) < scores_inc.shape[1]

        lower_edge_wins = np.zeros(scores_inc.shape[0], dtype=bool)

        lower_edge_wins[~right_valid & left_valid] = True

        both_valid = left_valid & right_valid
        lower_scores = scores_inc[np.arange(scores_inc.shape[0])[both_valid], lower_edge_idx[both_valid] - 1]
        upper_scores = scores_inc[np.arange(scores_inc.shape[0])[both_valid], upper_edge_idx[both_valid] + 1]
        lower_edge_wins[both_valid] = lower_scores > upper_scores

        valid_left = lower_edge_wins & ((lower_edge_idx - 1) >= 0)
        P_inc[valid_left, lower_edge_idx[valid_left] - 1] = True

        valid_right = (~lower_edge_wins) & ((upper_edge_idx + 1) < scores_inc.shape[1])
        P_inc[valid_right, upper_edge_idx[valid_right] + 1] = True

        P[idx_construction_incomplete] = P_inc

        idx_construction_incomplete = (val_scores * P.astype(float)).sum(axis=1) <= qhat

    return P


In [3]:
def softmax(logits):
    exp_logits = np.exp(logits - np.max(logits, axis=1, keepdims=True))
    return exp_logits / np.sum(exp_logits, axis=1, keepdims=True)

def run_experiment(X, y, seed, dataset='summeval', dimension='consistency'):
    random.seed(seed)
    np.random.seed(seed)

    X.columns = list(range(len(X.columns)))
    X = X.to_numpy().astype(np.float32)
    y = y.to_numpy().astype(np.float32)-1

    y = y.astype(int)
    x_arr = X
    from scipy.interpolate import interp1d
    n = x_arr.shape[0]

    new_x = softmax(x_arr)

    from sklearn.model_selection import train_test_split

    fyxs_cal, fyxs_test, y_cal, y_test = train_test_split(new_x, y, test_size=0.5, random_state=seed)
    y_cal = y_cal.ravel()
    y_test = y_test.ravel()

    cal_scores = fyxs_cal
    cal_labels = y_cal
    test_scores = fyxs_test
    test_labels = y_test

    qhat = get_qhat_ordinal_aps(ordinal_aps_prediction, np.copy(cal_scores), np.copy(cal_labels), alpha)
    test_pred_sets = ordinal_aps_prediction(np.copy(test_scores), qhat)
    prediction_intervals = []
    for pred_set in test_pred_sets:
        indices = np.where(pred_set)[0]
        if len(indices) > 0:
            interval = (indices.min(), indices.max())
        else:
            interval = None 
        prediction_intervals.append(interval)

    y_qlow, y_qup = zip(*prediction_intervals)
    y_qlow = np.array(y_qlow)+1
    y_qup = np.array(y_qup)+1

    y_test_real = test_labels/3+1
    # y_qlow = np.array(y_qlow)+1
    # y_qup = np.array(y_qup)+1

    # y_test_real = test_labels+1

    df = pd.DataFrame({
        'low':    y_qlow.ravel(),
        'up':     y_qup.ravel(),
        'y_test': y_test_real.ravel(),
    })

    df.to_csv(f'OrdinalAPS_{dataset}_{dimension}_{seed}.csv', index=False)

    in_interval = (y_test_real >= y_qlow) & (y_test_real <= y_qup)

    average_width = np.mean(y_qup-y_qlow)
    coverage_rate = np.mean(in_interval)

    print(f"Seed: {seed}, Width: {average_width:.4f}, Coverage: {coverage_rate:.4f}")

    return average_width, coverage_rate


import time
import tracemalloc
def calculate_statistics(X, y, num_runs=100, seed_start=1, dataset='Summeval', dimension='consistency'):
    from tqdm import tqdm
    timecost = []
    memory = []
    for i in tqdm(range(num_runs), desc="Running experiments"):
        seed = seed_start + i
        tracemalloc.start()
        start = time.perf_counter()
        average_width, coverage_rate = run_experiment(X, y, seed, dataset, dimension)
        end = time.perf_counter()
        current, peak = tracemalloc.get_traced_memory()
        tracemalloc.stop()
        timecost.append(end - start)
        memory.append(peak)

    mean_time = np.mean(timecost)
    std_time = np.std(timecost)
    mean_memory = np.mean(memory)
    std_memory = np.std(memory)

    return  mean_time, std_time, mean_memory, std_memory

In [4]:
import os
import pandas as pd

folder_path = f'../model_logits/qwen/'
for dimension in ["cosmos", "drop", "esnli", "gsm8k"]:
        file_path = os.path.join(folder_path, f"SocREval_{dimension}_logits.csv")
        df = pd.read_csv(file_path)
        X = df.iloc[:, :-1]
        y = df.iloc[:, -1]
        mean_time, std_time, mean_memory, std_memory =  calculate_statistics(X, y, num_runs=10, seed_start=1, dimension=dimension, dataset='SocREval')



Running experiments: 100%|██████████| 10/10 [00:01<00:00,  7.09it/s]


Seed: 1, Width: 0.7449, Coverage: 0.4184
Seed: 2, Width: 0.7449, Coverage: 0.3571
Seed: 3, Width: 0.7143, Coverage: 0.3980
Seed: 4, Width: 0.7041, Coverage: 0.3571
Seed: 5, Width: 0.7143, Coverage: 0.3776
Seed: 6, Width: 0.7143, Coverage: 0.3469
Seed: 7, Width: 0.7347, Coverage: 0.3673
Seed: 8, Width: 0.6531, Coverage: 0.2959
Seed: 9, Width: 0.7245, Coverage: 0.3367
Seed: 10, Width: 0.6837, Coverage: 0.2857


Running experiments: 100%|██████████| 10/10 [00:00<00:00, 130.50it/s]


Seed: 1, Width: 0.2857, Coverage: 0.1238
Seed: 2, Width: 0.2667, Coverage: 0.1143
Seed: 3, Width: 0.2857, Coverage: 0.1238
Seed: 4, Width: 0.2667, Coverage: 0.1048
Seed: 5, Width: 0.2857, Coverage: 0.0952
Seed: 6, Width: 0.2667, Coverage: 0.1048
Seed: 7, Width: 0.2286, Coverage: 0.0667
Seed: 8, Width: 0.2762, Coverage: 0.1143
Seed: 9, Width: 0.2381, Coverage: 0.0762
Seed: 10, Width: 0.2667, Coverage: 0.1143


Running experiments:   0%|          | 0/10 [00:00<?, ?it/s]

Seed: 1, Width: 0.6974, Coverage: 0.0526
Seed: 2, Width: 0.5921, Coverage: 0.0526
Seed: 3, Width: 0.6447, Coverage: 0.0263
Seed: 4, Width: 0.7632, Coverage: 0.0658


  covered[val_labels == j].mean() for j in range(np.unique(val_labels).max() + 1)
  ret = ret.dtype.type(ret / rcount)
Running experiments: 100%|██████████| 10/10 [00:00<00:00, 121.09it/s]


Seed: 5, Width: 0.6184, Coverage: 0.0395
Seed: 6, Width: 0.6711, Coverage: 0.0789
Seed: 7, Width: 0.6447, Coverage: 0.0658
Seed: 8, Width: 0.6974, Coverage: 0.0658
Seed: 9, Width: 0.6579, Coverage: 0.0526
Seed: 10, Width: 0.6447, Coverage: 0.0658


Running experiments:   0%|          | 0/10 [00:00<?, ?it/s]

Seed: 1, Width: 0.4300, Coverage: 0.1900
Seed: 2, Width: 0.4000, Coverage: 0.2000
Seed: 3, Width: 0.4200, Coverage: 0.1400
Seed: 4, Width: 0.4700, Coverage: 0.1800
Seed: 5, Width: 0.3900, Coverage: 0.1700
Seed: 6, Width: 0.3500, Coverage: 0.1600


  covered[val_labels == j].mean() for j in range(np.unique(val_labels).max() + 1)
  ret = ret.dtype.type(ret / rcount)
Running experiments: 100%|██████████| 10/10 [00:00<00:00, 133.30it/s]

Seed: 7, Width: 0.5100, Coverage: 0.2200
Seed: 8, Width: 0.4400, Coverage: 0.2100
Seed: 9, Width: 0.5000, Coverage: 0.2000
Seed: 10, Width: 0.4900, Coverage: 0.1900





In [6]:
mean_time, std_time, mean_memory/1024/1024, std_memory/1024/1024

(0.007449290005024522,
 0.0005216576775678698,
 0.19980039596557617,
 0.0013422520293023142)