In [2]:
import pickle
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pearsonr

In [3]:
# Load the pickled GREEN uncertainty estimates and GREEN text scores.
# NOTE(review): pickle.load can execute arbitrary code; only open files from a
# trusted source. The absolute cluster paths also tie this cell to one machine.
with open('/restricted/projectnb/batmanlab/chyuwang/RaDialog/exp/uncertainty/green_uncertainty-3858.pkl', 'rb') as file:
    green_uncertainty = pickle.load(file)
with open('/restricted/projectnb/batmanlab/chyuwang/RaDialog/exp/text_score/green_scores-3858.pkl', 'rb') as file:
    green_score = pickle.load(file)
# us: correctness scores, one entry per element of green_score['greens'].
# Each element must expose .numpy() (presumably torch tensors - TODO confirm).
us = np.array([t.numpy() for t in green_score['greens']])


# ug: uncertainty values, one entry per element of green_uncertainty['uncertainty'].
ug = np.array([t.numpy() for t in green_uncertainty['uncertainty']])
# Pearson correlation between score and uncertainty; the second return value
# (p-value) is discarded. pearson_coeff is stored but not displayed by this cell.
pearson_coeff, _ = pearsonr(us, ug)

In [4]:
def calculate_empirical_rce(uncertainty_values, correctness_values, num_bins=20):
    """
    Bin samples by uncertainty quantile and average correctness/uncertainty per bin.

    The per-bin means returned here are the inputs to the Empirical Rank
    Calibration Error (RCE); the RCE itself is computed by the caller from
    the two returned arrays.

    Parameters:
    - uncertainty_values: array-like of uncertainty values (e.g. ug); flattened to 1-D
    - correctness_values: array-like of correctness values (e.g. us), aligned
      element-wise with uncertainty_values. Entries beyond
      len(uncertainty_values) are ignored (same as the original element-wise loop).
    - num_bins: Number of quantile bins over the uncertainty values (default is 20)

    Returns:
    - expected_correctness: array of length num_bins, mean correctness per bin
    - average_uncertainty: array of length num_bins, mean uncertainty per bin
      Bins that receive no sample (possible when many uncertainty values are
      tied) are reported as 0 in both arrays.

    Raises:
    - ValueError: if num_bins < 1, or correctness_values has fewer entries
      than uncertainty_values
    """
    if num_bins < 1:
        raise ValueError("num_bins must be a positive integer")

    uncertainty = np.asarray(uncertainty_values, dtype=float).ravel()
    correctness = np.asarray(correctness_values, dtype=float).ravel()
    num_samples = uncertainty.size
    if correctness.size < num_samples:
        raise ValueError(
            "correctness_values must have at least as many entries as uncertainty_values"
        )
    correctness = correctness[:num_samples]

    # Nothing to bin: report all-empty bins instead of failing inside np.quantile.
    if num_samples == 0:
        return np.zeros(num_bins), np.zeros(num_bins)

    quantiles = np.linspace(0, 1, num_bins + 1)
    bin_edges = np.quantile(uncertainty, quantiles)

    # digitize(right=True) returns 0 for values <= bin_edges[0], so after the
    # "- 1" shift the minimum sample(s) land on -1. Clip them into the first
    # bin (and any value above the last edge into the last bin) so that no
    # sample is silently dropped.
    bin_indices = np.digitize(uncertainty, bin_edges, right=True) - 1
    bin_indices = np.clip(bin_indices, 0, num_bins - 1)

    bin_counts = np.bincount(bin_indices, minlength=num_bins)
    safe_counts = np.maximum(bin_counts, 1)  # avoid 0/0 for empty bins

    expected_correctness = np.bincount(bin_indices, weights=correctness, minlength=num_bins) / safe_counts
    average_uncertainty = np.bincount(bin_indices, weights=uncertainty, minlength=num_bins) / safe_counts

    return expected_correctness,average_uncertainty

def cal_rce(uq,score,num_bins):
    """
    Empirical Rank Calibration Error (RCE) between uncertainty and correctness.

    Samples are grouped into num_bins uncertainty-quantile bins by
    calculate_empirical_rce. For each bin, the fraction of bins with strictly
    higher mean correctness is compared with the fraction of bins with strictly
    lower mean uncertainty; the RCE is the mean absolute difference over bins.
    It is 0 when the bin ordering by uncertainty is exactly the reverse of the
    ordering by correctness.

    Parameters:
    - uq: array-like of per-sample uncertainty values
    - score: array-like of per-sample correctness values (higher = better)
    - num_bins: number of quantile bins; forwarded to calculate_empirical_rce

    Returns:
    - rce: float

    Raises:
    - ValueError: if num_bins < 1
    """
    if num_bins < 1:
        raise ValueError("num_bins must be a positive integer")

    # Forward num_bins: binning with a hard-coded 20 made any other value
    # either index out of range (> 20) or silently rank only the first bins (< 20).
    bin_acc, bin_uq = calculate_empirical_rce(uq, score, num_bins=num_bins)
    bin_acc = np.asarray(bin_acc)
    bin_uq = np.asarray(bin_uq)

    # Row i counts, over all bins j, how many have acc[j] > acc[i] / uq[j] < uq[i].
    higher_acc = (bin_acc[None, :] > bin_acc[:, None]).sum(axis=1)
    lower_uq = (bin_uq[None, :] < bin_uq[:, None]).sum(axis=1)

    return float(np.abs(higher_acc - lower_uq).sum() / (num_bins * num_bins))
# Usage: cal_rce(uncertainty, correctness, num_bins) returns the empirical RCE.

In [135]:
# RaDialog report-level score table; the 'RadCliQ-v0' column is read from it below.
# NOTE(review): absolute cluster path - not portable to other machines.
res = pd.read_csv('/restricted/projectnb/batmanlab/chyuwang/rrg_factual_uncertainty/exp_result/RaDialog/scores/report_scores_-1.csv')

In [136]:
# RadCliQ-v0 values as a NumPy array. They are negated when passed to cal_rce
# (presumably because lower RadCliQ means a better report - TODO confirm).
RadCliQ = res['RadCliQ-v0'].values

In [137]:
# Baseline uncertainty estimates for RaDialog, one CSV per estimator.
pe = pd.read_csv('/restricted/projectnb/batmanlab/chyuwang/rrg_factual_uncertainty/exp/uncertainty/u_normnll.csv')
pe1 = pd.read_csv('/restricted/projectnb/batmanlab/chyuwang/rrg_factual_uncertainty/exp/uncertainty/u_nll.csv')

# 'u_normnll' is kept as a Python list, 'u_nll' as a NumPy array.
pes = list(pe['u_normnll'].values)
pes1 = np.array(pe1['u_nll'].values)

# Lexical uncertainty is the complement of the stored ROUGE-L value (1 - value).
u_lexicalsim = [
    1 - rouge_l
    for rouge_l in pd.read_csv(
        '/restricted/projectnb/batmanlab/chyuwang/rrg_factual_uncertainty/exp_result/RaDialog/UQ/lexicalUQ.csv'
    )['ROUGE_L_UQ'].values
]

In [138]:
# Estimators in evaluation order:
#   pes -> Normalized Entropy, pes1 -> Predictive Entropy,
#   u_lexicalsim -> Lexical Similarity, ug -> VRO-GREEN
uncertainty_list = [pes,pes1,u_lexicalsim,ug] # Normalized Entropy - Predictive Entropy  - Lexical Similarity - VRO-GREEN

In [148]:
# One output row per estimator in uncertainty_list:
#   column 1 = RCE against the GREEN score (us), column 2 = RCE against -RadCliQ.
# NOTE(review): i[:-1] drops the last uncertainty value before pairing with RadCliQ,
# presumably because the RadCliQ table has one row fewer - confirm the rows align.
# idx is not used inside the loop.
for idx,i in enumerate(uncertainty_list):
    print(cal_rce(i,us,num_bins=20),cal_rce(i[:-1],-RadCliQ,num_bins=20))


0.045 0.145
0.045000000000000005 0.09
0.045 0.04
0.015 0.019999999999999997


#

#  CheXpertPlus_mimiccxr

In [5]:
# CheXpertPlus report-level score table.
# NOTE: rebinds `res`, replacing the RaDialog table loaded earlier in the notebook.
res = pd.read_csv('/restricted/projectnb/batmanlab/chyuwang/rrg_factual_uncertainty/exp_result/ChexpertPlus/cxr_benchmark/chexpertPlus_report_scores.csv')

In [6]:
# GREEN scores and GREEN-based uncertainty for CheXpertPlus, read as headerless
# single-column CSVs (column 0 holds the values).
score = pd.read_csv('/restricted/projectnb/batmanlab/chyuwang/rrg_factual_uncertainty/exp_result/ChexpertPlus/green_scores-chexpert-plus-3858.csv',header=None)
ugreen = pd.read_csv('/restricted/projectnb/batmanlab/chyuwang/rrg_factual_uncertainty/exp_result/ChexpertPlus/chexpert-plus-green_uncertainty-3858.csv',header=None)
# Each cell is a string such as "tensor(0.5)"; strip the wrapper and parse the float.
# After these two lines `score` and `ugreen` are NumPy arrays, no longer DataFrames.
score = np.array([float(t.replace("tensor(", "").replace(")", "")) for t in score[0].values])
ugreen = np.array([float(t.replace("tensor(", "").replace(")", "")) for t in ugreen[0].values])

In [26]:
# Lexical uncertainty for CheXpertPlus: the complement (1 - value) of the
# stored ROUGE-L column, kept as a plain Python list.
u_lexicalsim = [
    1 - rouge_l
    for rouge_l in pd.read_csv(
        '/restricted/projectnb/batmanlab/chyuwang/rrg_factual_uncertainty/exp_result/ChexpertPlus/chexpert-plus_lexicalUQ.csv'
    )['ROUGE_L_UQ'].values
]

In [27]:
# Only two estimators are evaluated for CheXpertPlus, in this order.
uncertainty_list = [u_lexicalsim,ugreen] # Lexical Similarity - VRO-GREEN

In [28]:
# RadCliQ-v0 values from the table currently bound to `res`.
# NOTE: rebinds `RadCliQ`; run the cell that loads `res` first.
RadCliQ = res['RadCliQ-v0'].values

In [29]:
# RCE of each estimator in uncertainty_list against negated RadCliQ.
# idx is not used inside the loop.
for idx,i in enumerate(uncertainty_list):
    print(cal_rce(i,-RadCliQ,num_bins=20)) # one line per estimator: lexical, then ugreen


0.030000000000000006
0.025


In [30]:
# RCE of each estimator in uncertainty_list against the GREEN score array `score`.
# idx is not used inside the loop.
for idx,i in enumerate(uncertainty_list):
    print(cal_rce(i,score,num_bins=20))

0.030000000000000006
0.019999999999999997
