In [1]:
import json
import os
import numpy as np
from sklearn import metrics
import matplotlib.pyplot as plt
import torch
import sys
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), "../../utils/reliability_utils")))
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), "../../utils")))
from transformers import AutoModelForSequenceClassification, AutoTokenizer
# Index of the CUDA device to use when a GPU is available; falls back to CPU otherwise.
GPU_INDEX = 1
device = torch.device(f"cuda:{GPU_INDEX}" if torch.cuda.is_available() else "cpu")
question_file = '/vault/erum/generations/llava7b_aokvqa_generation.json' #update the filename
# Load the generations inside a context manager so the file handle is closed
# as soon as the JSON has been parsed (the bare open() call never closed it).
with open(os.path.expanduser(question_file), "r") as questions_file:
    data_dict = json.load(questions_file)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def AUROC(confidence_scores, correctness):
    """Return the ROC-AUC of `confidence_scores` against `correctness`.

    Both inputs are flipped before scoring: the labels passed to
    `roc_auc_score` are `1 - correctness` and the ranking scores are
    `1 - confidence_scores`.

    Args:
        confidence_scores: array-like of per-sample scores.
        correctness: array-like of per-sample labels, same length.

    Returns:
        The value returned by `sklearn.metrics.roc_auc_score`.
    """
    error_labels = 1 - np.array(correctness)
    flipped_scores = 1 - np.array(confidence_scores)
    return metrics.roc_auc_score(error_labels, flipped_scores)

In [3]:
#ref: https://github.com/jlko/semantic_uncertainty
class BaseEntailment:
    """Base class for entailment predictors."""

    def save_prediction_cache(self):
        """No-op hook; this base implementation does nothing."""
        pass


class EntailmentDeberta(BaseEntailment):
    """Entailment predictor backed by `microsoft/deberta-v2-xlarge-mnli`."""

    def __init__(self, DEVICE="cpu"):
        """Load the tokenizer and the classification model onto `DEVICE`."""
        self.Device = DEVICE
        self.tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v2-xlarge-mnli")
        self.model = AutoModelForSequenceClassification.from_pretrained(
            "microsoft/deberta-v2-xlarge-mnli").to(self.Device)

    def check_implication(self, text1, text2, *args, **kwargs):
        """Classify the relation of a single (text1, text2) pair.

        Args:
            text1: first text of the pair handed to the tokenizer.
            text2: second text of the pair handed to the tokenizer.
            *args, **kwargs: accepted and ignored (callers pass `example=`).

        Returns:
            int: index of the highest-scoring class in the model's logits.
        """
        inputs = self.tokenizer(text1, text2, return_tensors="pt").to(self.Device)
        # Pure inference: disable autograd so no graph is retained per call.
        with torch.no_grad():
            logits = self.model(**inputs).logits
        # Deberta-mnli returns `neutral` and `entailment` classes at indices 1 and 2.
        # Softmax is monotonic, so the argmax of the raw logits is the same class;
        # this also removes the dependency on `F`, which is only imported in a later cell.
        prediction = torch.argmax(logits).cpu().item()
        return prediction

In [4]:
#ref: https://github.com/jlko/semantic_uncertainty
def get_semantic_ids(strings_list, model, strict_entailment=False, example=None):
    """Assign a semantic-cluster id to every entry of `strings_list`.

    Two strings are compared by querying `model.check_implication` in both
    directions. Ids are handed out in order of first appearance, starting at 0.

    Args:
        strings_list: sequence of strings to cluster.
        model: object exposing `check_implication(text1, text2, example=...)`
            that returns 0, 1 or 2.
        strict_entailment: when true, both directions must return 2.
            Otherwise neither direction may return 0 and they must not both
            return 1.
        example: forwarded unchanged to `model.check_implication`.

    Returns:
        list[int]: one cluster id per input string.
    """

    def are_equivalent(text1, text2):
        forward = model.check_implication(text1, text2, example=example)
        backward = model.check_implication(text2, text1, example=example)  # pylint: disable=arguments-out-of-order
        assert (forward in [0, 1, 2]) and (backward in [0, 1, 2])

        if strict_entailment:
            return (forward == 2) and (backward == 2)

        # Non-strict mode: reject any contradiction (0) and the case where
        # both directions are merely neutral (1).
        no_contradiction = (forward != 0) and (backward != 0)
        both_neutral = (forward == 1) and (backward == 1)
        return no_contradiction and not both_neutral

    total = len(strings_list)
    # -1 marks "not yet assigned".
    cluster_of = [-1] * total
    cluster_count = 0
    for anchor_pos in range(total):
        if cluster_of[anchor_pos] != -1:
            # Already absorbed into an earlier cluster.
            continue
        cluster_of[anchor_pos] = cluster_count
        anchor = strings_list[anchor_pos]
        # Every later string is compared against the anchor, including strings
        # that already carry an id; a match overwrites that id.
        for other_pos in range(anchor_pos + 1, total):
            if are_equivalent(anchor, strings_list[other_pos]):
                cluster_of[other_pos] = cluster_count
        cluster_count += 1

    assert -1 not in cluster_of

    return cluster_of

In [5]:
def cluster_assignment_entropy(semantic_ids):
    """Estimate semantic uncertainty from how often different clusters get assigned.

    We estimate the categorical distribution over cluster assignments from the
    semantic ids. The uncertainty is then given by the entropy of that
    distribution. This estimate does not use token likelihoods; it relies solely
    on the cluster assignments. If probability mass is spread out between many
    clusters, entropy is larger. If probability mass is concentrated on a few
    clusters, entropy is small.

    Input:
        semantic_ids: List of non-negative integer semantic ids, e.g. [0, 1, 2, 1].
    Output:
        cluster_entropy: Entropy, e.g. (-p log p).sum() for p = [1/4, 2/4, 1/4].
    """

    n_generations = len(semantic_ids)
    counts = np.bincount(semantic_ids)
    # `bincount` emits a zero for every id below the maximum that never occurs
    # (e.g. ids [0, 2]). Such bins would contribute 0 * log(0) = nan, so drop them.
    counts = counts[counts > 0]
    probabilities = counts/n_generations
    assert np.isclose(probabilities.sum(), 1)
    entropy = - (probabilities * np.log(probabilities)).sum()
    return entropy

def predictive_entropy_rao(log_probs):
    """Return -sum(exp(log_probs) * log_probs) for the given log-probabilities."""
    weights = np.exp(log_probs)
    weighted_terms = weights * log_probs
    return -np.sum(weighted_terms)


def logsumexp_by_id(semantic_ids, log_likelihoods, agg='sum_normalized'):
    """Sum probabilities with the same semantic id.

    Log-Sum-Exp because input and output probabilities are in log space. The
    log-likelihoods are first normalised over *all* entries of
    `log_likelihoods`, then aggregated per semantic id.

    Args:
        semantic_ids: list of cluster ids; the distinct values must be exactly
            0..k-1 (asserted).
        log_likelihoods: sequence of log-likelihoods; entry i belongs to
            `semantic_ids[i]`.
        agg: aggregation mode; only 'sum_normalized' is supported.

    Returns:
        list: one log-probability per semantic id, ordered by id.

    Raises:
        ValueError: if `agg` is not 'sum_normalized' (and there is at least one id).
    """

    def _logsumexp(values):
        # Shift by the maximum before exponentiating so that very negative
        # log-likelihoods do not underflow to exp(.) == 0 and yield log(0).
        if values.size == 0:
            return -np.inf
        peak = np.max(values)
        if not np.isfinite(peak):
            return peak
        return peak + np.log(np.sum(np.exp(values - peak)))

    unique_ids = sorted(list(set(semantic_ids)))
    assert unique_ids == list(range(len(unique_ids)))
    log_likelihood_per_semantic_id = []
    if not unique_ids:
        return log_likelihood_per_semantic_id

    all_log_liks = np.asarray(log_likelihoods, dtype=float)
    # The normaliser does not depend on the id, so compute it once.
    log_normalizer = _logsumexp(all_log_liks)

    for uid in unique_ids:
        # Find positions in `semantic_ids` which belong to the active `uid`.
        id_indices = [pos for pos, x in enumerate(semantic_ids) if x == uid]
        if agg == 'sum_normalized':
            # Gather log likelihoods at these indices and normalise them.
            log_lik_norm = all_log_liks[id_indices] - log_normalizer
            logsumexp_value = _logsumexp(log_lik_norm)
        else:
            raise ValueError(f"Unsupported agg: {agg!r}")
        log_likelihood_per_semantic_id.append(logsumexp_value)

    return log_likelihood_per_semantic_id

In [6]:
# Instantiate the DeBERTa entailment model once, on the module-level `device`;
# later cells pass this `model` to get_semantic_ids.
model = EntailmentDeberta(device)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [7]:
import numpy as np
try:
    from scipy.integrate import simps
except ImportError:  # SciPy >= 1.14 removed `simps`; `simpson` is its replacement
    from scipy.integrate import simpson as simps

def compute_aurac(correctness, reliability):
    """Integrate running accuracy over cumulative reliability mass.

    Samples are sorted by `reliability` in descending order. The y-values are
    the running mean of `correctness` over the top-k samples; the x-values are
    the cumulative sum of the sorted reliabilities divided by their total.
    The area is computed with Simpson's rule.

    Args:
        correctness: array-like of per-sample correctness values.
        reliability: array-like of per-sample reliability scores, same length.

    Returns:
        float: the Simpson's-rule integral of running accuracy over
        cumulative reliability.
    """
    # `scipy.integrate.simps` was removed in SciPy 1.14; prefer `simpson` and
    # only fall back to the old name on SciPy versions that predate it.
    try:
        from scipy.integrate import simpson
    except ImportError:
        from scipy.integrate import simps as simpson

    # Ensure inputs are numpy arrays
    correctness = np.array(correctness)
    reliability = np.array(reliability)

    # Sort by reliability in descending order
    sorted_indices = np.argsort(-reliability)
    correctness = correctness[sorted_indices]
    reliability = reliability[sorted_indices]

    # Cumulative sums for reliability and correctness
    cumulative_reliability = np.cumsum(reliability) / np.sum(reliability)
    cumulative_accuracy = np.cumsum(correctness) / np.arange(1, len(correctness) + 1)

    # Compute the area under the curve; `x` must be passed by keyword on recent SciPy.
    aurac = simpson(cumulative_accuracy, x=cumulative_reliability)

    return aurac

def compute_aurac_with_penalty(correctness, reliability):
    """Like `compute_aurac`, but incorrect answers subtract from the running score.

    Samples are sorted by `reliability` in descending order. For the top-k
    samples the y-value is (sum of correctness - sum of (1 - correctness)) / k,
    clipped to [0, 1]. The x-values are the cumulative sum of the sorted
    reliabilities divided by their total. The area is computed with
    Simpson's rule.

    Args:
        correctness: array-like of per-sample correctness values.
        reliability: array-like of per-sample reliability scores, same length.

    Returns:
        float: the Simpson's-rule integral of the penalised running accuracy
        over cumulative reliability.
    """
    # `scipy.integrate.simps` was removed in SciPy 1.14; prefer `simpson` and
    # only fall back to the old name on SciPy versions that predate it.
    try:
        from scipy.integrate import simpson
    except ImportError:
        from scipy.integrate import simps as simpson

    # Ensure inputs are numpy arrays
    correctness = np.array(correctness)
    reliability = np.array(reliability)

    # Sort by reliability in descending order
    sorted_indices = np.argsort(-reliability)
    correctness = correctness[sorted_indices]
    reliability = reliability[sorted_indices]

    # Compute cumulative metrics with penalty
    cumulative_correct = np.cumsum(correctness)  # Correct answers cumulative sum
    cumulative_penalty = np.cumsum(1 - correctness)  # Incorrect answers cumulative sum
    cumulative_accuracy = (cumulative_correct - cumulative_penalty) / np.arange(1, len(correctness) + 1)

    # Ensure cumulative accuracy remains within valid bounds
    cumulative_accuracy = np.clip(cumulative_accuracy, 0, 1)

    # Cumulative reliability
    cumulative_reliability = np.cumsum(reliability) / np.sum(reliability)

    # Compute the area under the curve; `x` must be passed by keyword on recent SciPy.
    aurac = simpson(cumulative_accuracy, x=cumulative_reliability)

    return aurac

In [8]:
#ref: https://github.com/Ybakman/TruthTorchLM
def prediction_rejection_curve(estimator, target):
    """
    Calculates the prediction rejection curve score.

    Samples are ranked by `estimator` from highest to lowest. For every k the
    mean of the normalized `target` over the k highest-ranked samples is
    taken, and the score is the average of those running means over all k.

    Args:
        estimator (array-like): Per-sample scores used for ranking; samples
            with larger values are kept first.
        target (array-like): Per-sample quality values; rescaled with
            `normalize` before use.

    Returns:
        float: Prediction rejection curve score
    """
    quality = normalize(target)  # rescaled so that higher means more correct
    ranking_scores = np.array(estimator)
    total = len(ranking_scores)
    # Descending order of the estimator: highest-scored samples come first.
    order = np.argsort(ranking_scores)[::-1]
    ranked_quality = np.array(quality)[order]
    # Running sum over the ranked samples (the slice keeps all `total` entries).
    running_sum = np.cumsum(ranked_quality)[-total:]
    running_mean = running_sum / np.arange(1, total + 1)
    flipped_means = running_mean[::-1]
    return np.sum(flipped_means) / total

def normalize(target):
    """
    Rescales an array of values linearly using its minimum and maximum.

    When minimum and maximum are (numerically) equal, the bounds are widened
    by one on each side so that the division is well defined; every value
    then maps to 0.5.

    Args:
        target (array-like): Array of values to normalize

    Returns:
        array: (target - min) / (max - min)
    """
    lower = np.min(target)
    upper = np.max(target)
    if np.isclose(lower, upper):
        # Degenerate range: widen it symmetrically to avoid dividing by zero.
        lower = lower - 1
        upper = upper + 1
    span = upper - lower
    shifted = np.array(target) - lower
    return shifted / span

In [9]:
from scipy.special import rel_entr
from scipy.stats import entropy
import torch
import torch.nn.functional as F

# NOTE(review): `model` was created earlier with the previous value of `device`;
# this rebinds the global to GPU 0 and the new value is not used in this cell.
# Confirm the rebinding is intended.
device = torch.device("cuda:" + str(0) if torch.cuda.is_available() else "cpu")

# Number of entries of rollout['beam_logprobs'] that are scored per question.
NUM_BEAMS = 5

qids = list(data_dict.keys())
abstained_qids = []
confidence = []
accQA = []
N = len(qids)
accuracy = 0

entropy_scores = []
semantic_entropy = []
cluster_entropy_ = []
self_eval_ = []
first_token_ = []
difference_ = []
exact_match = []
for qid in qids:
    rollout = data_dict[qid]
    question = rollout['vqa_question']
    answer_logprobs = rollout['answer_logprobs_dict']
    answer_tokens = answer_logprobs['answer_tokens']
    token_probs = answer_logprobs['token_probs']
    lave_score = rollout['lave_score']

    # When the first answer token is the empty string, the second token's
    # probability is used as the "first token" score instead.
    if answer_tokens[0] == '':
        reg_first_token = token_probs[1]
    else:
        reg_first_token = answer_logprobs["first_token_prob"]
    first_token_.append(reg_first_token)

    # Geometric mean of the answer's token probabilities (a probability, not a log-prob).
    geo_mean_token_prob = np.prod(token_probs)**(1/len(token_probs))

    aug_first_token = rollout['blackimage_logprobs_dict']["first_token_prob"]
    self_eval_.append(answer_logprobs["yn_logits_reg"])
    # Absolute gap between the first-token probability of the regular answer
    # and the one stored under 'blackimage_logprobs_dict'.
    difference_.append(abs(answer_logprobs["first_token_prob"] - aug_first_token))

    beam_answers = rollout['beam_answers']
    # Sequence log-likelihood of each beam = sum of log token probabilities.
    log_likelihoods = [
        np.sum(np.log(rollout['beam_logprobs'][beam]['token_probs']))
        for beam in range(NUM_BEAMS)
    ]

    # A question counts as correct when its LAVE score is positive.
    accQA.append(1 if lave_score > 0 else 0)

    # NOTE(review): clustering skips beam_answers[0] while log_likelihoods covers
    # beam_logprobs[0..NUM_BEAMS-1]; this presumably relies on beam_answers holding
    # one extra leading entry so both line up index by index. Verify against the
    # generation script.
    strings_list = [f'{question} {r}' for r in beam_answers[1:]]
    # Entailment checks are pure inference; no autograd graph is needed.
    with torch.no_grad():
        semantic_ids = get_semantic_ids(strings_list, model, strict_entailment=False, example=None)

    cluster_entropy = cluster_assignment_entropy(semantic_ids)
    confidence.append(geo_mean_token_prob)
    # Entropies are negated so that, like the other scores, larger means more confident.
    entropy_scores.append(-predictive_entropy_rao(log_likelihoods))
    log_probs = logsumexp_by_id(semantic_ids, log_likelihoods, agg='sum_normalized')
    semantic_entropy.append(-predictive_entropy_rao(log_probs))
    cluster_entropy_.append(-cluster_entropy)

metrics_ = ["confidence", "Entropy", "SE", "Cluster Entropy", "Self eval", "First Token", "Visual UE"]
scores = [confidence, entropy_scores, semantic_entropy, cluster_entropy_, self_eval_, first_token_, difference_]
flattened_label = accQA
RANDOM_SEED = 42
# Report AUROC and prediction-rejection score for every confidence measure.
for metric_name, flattened_scores in zip(metrics_, scores):
    auroc = AUROC(flattened_scores, flattened_label)
    prc = prediction_rejection_curve(flattened_scores, flattened_label)
    print(f"Metric: {metric_name} AUROC: {auroc}, PRC: {prc}")



Metric: confidence AUROC: 0.7491123572393192, PRC: 0.8759513950357724
Metric: Entropy AUROC: 0.5960085142156994, PRC: 0.807482394072488
Metric: SE AUROC: 0.7614119109584754, PRC: 0.8733835496477144
Metric: Cluster Entropy AUROC: 0.6977788808099237, PRC: 0.8487278697389765
Metric: Self eval AUROC: 0.7206787225099051, PRC: 0.8604434414039752
Metric: First Token AUROC: 0.6939309382629928, PRC: 0.8122382986460082
Metric: Visual UE AUROC: 0.7527434198170159, PRC: 0.8764201029903292
