In [3]:
import os
import random
import time
import numpy as np

import json
import pandas as pd
import math
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from mapie.metrics import regression_coverage_score
from mapie.quantile_regression import MapieQuantileRegressor

def range_modification(y_qlow, y_qup, range_low,  range_up):
    """Clamp a pair of interval bounds into ``[range_low, range_up]``.

    Both ``y_qlow`` and ``y_qup`` are passed through ``np.clip`` with the
    same limits; the inputs are not modified in place.

    Returns
    -------
    tuple
        ``(clipped_lower, clipped_upper)``.
    """
    clipped_lower, clipped_upper = (
        np.clip(bound, range_low, range_up) for bound in (y_qlow, y_qup)
    )
    return clipped_lower, clipped_upper

def run_experiment(X, y, seed, dimension, dataset, type = 'sym', range_low=1, range_up=5):
    """Run one conformalized quantile regression trial and save its intervals.

    The data is split 50/50 with ``train_test_split``. A MAPIE
    ``MapieQuantileRegressor`` wrapping a quantile-loss
    ``GradientBoostingRegressor`` (``alpha=0.1``) is fitted on the first half,
    and prediction intervals are produced for the second half. The interval
    bounds are clipped to ``[range_low, range_up]`` *before* coverage and mean
    width are computed. The clipped bounds and the true targets are written to
    ``CQR_<sym|asym>_<dataset>_<dimension>_<seed>.csv`` in the current working
    directory.

    Parameters
    ----------
    X, y
        Objects exposing ``to_numpy()`` (e.g. a pandas DataFrame / Series).
        Both are converted to ``float32`` arrays.
    seed
        Seeds ``random``, ``numpy.random``, the data split, the gradient
        boosting model and the MAPIE fit.
    dimension, dataset
        Used only to build the output CSV filename.
    type
        ``'sym'`` requests ``predict(..., symmetry=True)``; any other value
        requests ``symmetry=False`` and the file is tagged ``asym``.
        (The name shadows the ``type`` builtin but is kept for callers.)
    range_low, range_up
        Clipping limits for the interval bounds. The defaults (1 and 5) are
        the values that were previously hard-coded.

    Returns
    -------
    tuple
        ``(coverage, width)``: the value returned by
        ``regression_coverage_score`` and the mean of ``upper - lower``, both
        computed on the clipped bounds.
    """
    random.seed(seed)
    np.random.seed(seed)

    X = X.to_numpy().astype(np.float32)
    y = y.to_numpy().astype(np.float32)

    # The "cal" half is everything MAPIE gets to fit on; the other half is
    # held out purely for evaluation.
    X_cal, X_test, y_cal, y_test = train_test_split(X, y, test_size=0.5, random_state=seed)

    gb_reg = GradientBoostingRegressor(loss="quantile", random_state=seed)
    mapie_qr = MapieQuantileRegressor(estimator=gb_reg, alpha=0.1)
    mapie_qr.fit(X_cal, y_cal, random_state=seed)

    # The two interval flavours differ only in the ``symmetry`` flag and in
    # the tag used for the output filename.
    symmetric = type == 'sym'
    tag = 'sym' if symmetric else 'asym'

    _, y_pis = mapie_qr.predict(X_test, symmetry=symmetric)
    y_qlow, y_qup = range_modification(y_pis[:, 0], y_pis[:, 1], range_low, range_up)

    coverage = regression_coverage_score(y_test, y_qlow, y_qup)
    width = (y_qup - y_qlow).mean()

    intervals = pd.DataFrame({
        'low':    y_qlow.ravel(),
        'up':     y_qup.ravel(),
        'y_test': y_test.ravel()
    })
    intervals.to_csv(f'CQR_{tag}_{dataset}_{dimension}_{seed}.csv', index=False)

    return coverage, width

def calculate_statistics(X, y, num_runs=100, seed_start=1, dimension = 'consistency', dataset='summeval'):
    """Summarise interval width and coverage over repeated CQR runs.

    For each seed in ``seed_start .. seed_start + num_runs - 1`` this calls
    ``run_experiment`` once with ``type='sym'`` and once with ``type='asym'``,
    collects the returned ``(coverage, width)`` pairs, prints mean ± std for
    both variants and returns the four means.

    NOTE(review): a second function with the same name is defined later in
    this file and replaces this one once that definition has executed.

    Parameters
    ----------
    X, y
        Forwarded unchanged to ``run_experiment``.
    num_runs
        Number of consecutive seeds to evaluate.
    seed_start
        First seed.
    dimension, dataset
        Forwarded to ``run_experiment``.

    Returns
    -------
    tuple
        ``(mean_width_sym, mean_coverage_sym, mean_width_asym,
        mean_coverage_asym)``.
    """
    from tqdm import tqdm
    width_sym_list = []
    coverage_sym_list = []
    width_asym_list = []
    coverage_asym_list = []
    for i in tqdm(range(num_runs), desc="Running experiments"):
        seed = seed_start + i
        try:
            # run_experiment returns a single (coverage, width) pair per call,
            # so each interval type needs its own call.
            coverage_sym, width_sym = run_experiment(X, y, seed, dimension, dataset, 'sym')
            coverage_asym, width_asym = run_experiment(X, y, seed, dimension, dataset, 'asym')
        except IndexError as e:
            print(f"Skipping seed {seed} due to error: {e}")
            continue

        # Append only after both runs succeeded so the four lists stay aligned.
        width_sym_list.append(width_sym)
        coverage_sym_list.append(coverage_sym)
        width_asym_list.append(width_asym)
        coverage_asym_list.append(coverage_asym)

    mean_width_sym = np.mean(width_sym_list)
    std_width_sym = np.std(width_sym_list)
    mean_coverage_sym = np.mean(coverage_sym_list)
    std_coverage_sym = np.std(coverage_sym_list)
    mean_width_asym = np.mean(width_asym_list)
    std_width_asym = np.std(width_asym_list)
    mean_coverage_asym = np.mean(coverage_asym_list)
    std_coverage_asym = np.std(coverage_asym_list)

    print("\nSummary of CQR sym:")
    print(f"Width: {mean_width_sym:.4f} ± {std_width_sym:.4f}")
    print(f"Coverage: {mean_coverage_sym:.4f} ± {std_coverage_sym:.4f}")
    print("\nSummary of CQR asym:")
    print(f"Width: {mean_width_asym:.4f} ± {std_width_asym:.4f}")
    print(f"Coverage: {mean_coverage_asym:.4f} ± {std_coverage_asym:.4f}")

    return  mean_width_sym, mean_coverage_sym, mean_width_asym, mean_coverage_asym


import time
import tracemalloc
def calculate_statistics(X, y, num_runs=30, seed_start=1, dataset='summeval', dimension='consistency'):
    """Profile wall-clock time and peak traced memory of ``run_experiment``.

    For each seed in ``seed_start .. seed_start + num_runs - 1`` the
    experiment is run once with ``type='sym'`` and once with ``type='asym'``.
    Each run is wrapped in its own ``tracemalloc`` session and timed with
    ``time.perf_counter``. The measured span covers the whole
    ``run_experiment`` call, including the CSV it writes.

    Parameters
    ----------
    X, y
        Forwarded unchanged to ``run_experiment``.
    num_runs
        Number of consecutive seeds to profile.
    seed_start
        First seed.
    dataset, dimension
        Forwarded to ``run_experiment`` by keyword. The positional order of
        these two differs between this function and ``run_experiment``, so
        passing them positionally swaps them in the output filename.

    Returns
    -------
    tuple
        ``(mean_memory_sym, std_memory_sym, mean_time_sym, std_time_sym,
        mean_memory_asym, std_memory_asym, mean_time_asym, std_time_asym)``.
        Memory is the ``tracemalloc`` peak divided by 1024*1024; time is the
        ``perf_counter`` difference.
    """
    from tqdm import tqdm

    def _profile_once(seed, interval_type):
        """Return (elapsed, peak / 1024 / 1024) for a single run_experiment call."""
        tracemalloc.start()
        try:
            start = time.perf_counter()
            run_experiment(X, y, seed, dimension=dimension, dataset=dataset, type=interval_type)
            end = time.perf_counter()
            _, peak = tracemalloc.get_traced_memory()
        finally:
            # Always stop tracing so a failed run does not leave tracemalloc
            # enabled for the rest of the kernel session.
            tracemalloc.stop()
        return end - start, peak / 1024 / 1024

    time_sym = []
    memory_sym = []
    time_asym = []
    memory_asym = []
    for i in tqdm(range(num_runs), desc="Running experiments"):
        seed = seed_start + i

        elapsed, peak_mb = _profile_once(seed, 'sym')
        time_sym.append(elapsed)
        memory_sym.append(peak_mb)

        elapsed, peak_mb = _profile_once(seed, 'asym')
        time_asym.append(elapsed)
        memory_asym.append(peak_mb)

    mean_time_sym = np.mean(time_sym)
    std_time_sym = np.std(time_sym)
    mean_memory_sym = np.mean(memory_sym)
    std_memory_sym = np.std(memory_sym)
    mean_time_asym = np.mean(time_asym)
    std_time_asym = np.std(time_asym)
    mean_memory_asym = np.mean(memory_asym)
    std_memory_asym = np.std(memory_asym)

    return mean_memory_sym, std_memory_sym, mean_time_sym, std_time_sym, mean_memory_asym, std_memory_asym, mean_time_asym, std_time_asym

In [4]:
import os
import pandas as pd

folder_path = '../model_logits/qwen/'

# One entry per dimension, so every dimension's profile is still available
# after the loop instead of only the last one. Each value is the 8-tuple
# returned by calculate_statistics.
profiling_stats = {}
for dimension in ["cosmos", "drop", "esnli", "gsm8k"]:
    file_path = os.path.join(folder_path, f"SocREval_{dimension}_logits.csv")
    df = pd.read_csv(file_path)
    # Every column except the last is used as a feature; the last is the target.
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]
    profiling_stats[dimension] = calculate_statistics(
        X, y, num_runs=10, seed_start=1, dimension=dimension, dataset='SocREval'
    )
    # Keep the flat names bound as before (they end up holding the last
    # dimension's values) for any later cell that reads them directly.
    (mean_memory_sym, std_memory_sym, mean_time_sym, std_time_sym,
     mean_memory_asym, std_memory_asym, mean_time_asym, std_time_asym) = profiling_stats[dimension]



INFO:root:The predictions are ill-sorted.
INFO:root:The predictions are ill-sorted.
INFO:root:The predictions are ill-sorted.
INFO:root:The predictions are ill-sorted.
INFO:root:The predictions are ill-sorted.
INFO:root:The predictions are ill-sorted.
INFO:root:The predictions are ill-sorted.
INFO:root:The predictions are ill-sorted.
INFO:root:The predictions are ill-sorted.
INFO:root:The predictions are ill-sorted.
INFO:root:The predictions are ill-sorted.
INFO:root:The predictions are ill-sorted.
INFO:root:The predictions are ill-sorted.
INFO:root:The predictions are ill-sorted.
INFO:root:The predictions are ill-sorted.
INFO:root:The predictions are ill-sorted.
INFO:root:The predictions are ill-sorted.
INFO:root:The predictions are ill-sorted.
INFO:root:The predictions are ill-sorted.
INFO:root:The predictions are ill-sorted.
INFO:root:The predictions are ill-sorted.
INFO:root:The predictions are ill-sorted.
Running experiments: 100%|██████████| 10/10 [00:16<00:00,  1.63s/it]
INFO:ro

In [5]:
# Display the memory/time summary. These names are rebound on every iteration
# of the loop in the previous cell, so the tuple shown here reflects only the
# last dimension processed ("gsm8k").
mean_memory_sym, std_memory_sym, mean_time_sym, std_time_sym, mean_memory_asym, std_memory_asym, mean_time_asym, std_time_asym

(0.3490950584411621,
 0.003427822416011128,
 0.8254816199885682,
 0.03127477569849191,
 0.34865474700927734,
 0.002112444711116759,
 0.8244668800150976,
 0.030993821622973625)