# Get the answers

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import sys
import os
# Work from the project root so that the relative paths used in this notebook
# (e.g. "src" below and "configs/..." in a later cell) resolve.
os.chdir("/home/yw699/codes/LLM-halu")
# Put the absolute path of src/ on the import path.
sys.path.append(os.path.abspath("src"))
# Expose only GPUs 6 and 7 to CUDA; this is set before torch is imported in a later cell.
os.environ["CUDA_VISIBLE_DEVICES"] = "6,7"

In [None]:
import yaml
import math
from dataset import Dataset
from prompt_engineer import PromptGenerator
from models import HuggingfaceModel
from utils import *
from metrics import *
import logging
import wandb
from tqdm import tqdm
import gc
import torch
import numpy as np
import random

In [None]:
# Configure logging through the project helper (presumably provided by one of
# the star imports above — TODO confirm).
setup_logger()

In [None]:
# Load the experiment configuration. The encoding is given explicitly so that
# parsing does not depend on the locale's default encoding.
with open("configs/experiment_config1.yaml", "r", encoding="utf-8") as file:
    config = yaml.safe_load(file)

# Convenience handles on two config sections.
wandb_config = config["wandb"]
metrics_config = config["metrics"]
# Accumulates run details; later cells add the prompt and its indices.
experiment_details = {'config': config}

In [None]:
# Run metadata taken from the environment. USER is required (KeyError if it is
# unset); the other variables fall back to a default.
user = os.environ['USER']
slurm_jobid = os.getenv('SLURM_JOB_ID')
scratch_dir = os.getenv('SCRATCH_DIR', '.')
entity = os.getenv('WANDB_SEM_UNC_ENTITY')

# Directory handed to wandb. Named `wandb_dir` so the builtin `dir` is not
# shadowed; `exist_ok=True` replaces the check-then-create sequence.
wandb_dir = f"{scratch_dir}/{user}/{entity}"
os.makedirs(wandb_dir, exist_ok=True)

project = config["wandb"]["project"]
if config["wandb"]["debug"]:
    # Debug runs are logged to a separate project.
    project = f"{project}_debug"

experiment_lot = config["wandb"]['experiment_lot']
notes = f'slurm_id: {slurm_jobid}, experiment_lot: {experiment_lot}'

wandb.init(
    entity=entity,
    project=project,
    dir=wandb_dir,
    config=config,
    notes=notes,
)

logging.info('Finished wandb init.')

In [None]:
# Load the train/validation splits described by the config.
dataset_loader = Dataset(config)
train_dataset, validation_dataset = dataset_loader.load_data()

# Plain lists are not logged; any other dataset object is.
if not isinstance(train_dataset, list):
    logging.info('Train dataset: %s', train_dataset)

answerable_indices, unanswerable_indices = split_dataset(train_dataset)

if config["dataset"]['answerable_only']:
    # Restrict both splits to the indices reported as answerable.
    unanswerable_indices = []
    train_dataset = [train_dataset[idx] for idx in answerable_indices]

    val_answerable, val_unanswerable = split_dataset(validation_dataset)
    del val_unanswerable
    validation_dataset = [validation_dataset[idx] for idx in val_answerable]


In [None]:
# Build the few-shot prompt once; it is reused by every sampling step.
promptgenerator = PromptGenerator(config, train_dataset)
few_shot_prompt, prompt_indices = promptgenerator.construct_fewshot_prompt_by_nums(2)

# Record the prompt, the indices it was built from and the BRIEF attribute.
experiment_details.update({
    'prompt_indices': prompt_indices,
    'prompt': few_shot_prompt,
    'BRIEF': promptgenerator.BRIEF,
})
logging.info('Prompt is: %s', few_shot_prompt)

In [None]:
# Instantiate the model wrapper from the loaded config.
huggingface_model = HuggingfaceModel(config)


## P_True Measure


In [None]:
# Metric object and a prompt generator over the validation split, both handed
# to the p_true evaluator together with the training prompt generator.
metric = get_metric('squad')
validation_promptgenerator = PromptGenerator(config, validation_dataset)
p_true_evaluator = PTrueEvaluator(
    config,
    huggingface_model,
    promptgenerator,
    validation_promptgenerator,
    metric,
    experiment_details,
)

In [None]:
# Build the few-shot prompt for the p_true measure from the base few-shot prompt.
# NOTE(review): the meaning of the positional arguments 5 and 3 is defined by
# PTrueEvaluator.construct_few_shot_prompt_for_p_true — confirm there.
p_true_few_shot_prompt = p_true_evaluator.construct_few_shot_prompt_for_p_true(few_shot_prompt,5,3)
#wandb.config.update({'p_true_num_fewshot': len_p_true}, allow_val_change=True)
#wandb.log(dict(len_p_true=len_p_true))

In [None]:
# Run the evaluator with the base prompt and the p_true few-shot prompt.
# NOTE(review): the positional values 1.0, 2 and 3 are not named here — confirm
# their meaning against PTrueEvaluator.all_evaluate.
p_true_evaluator.all_evaluate(few_shot_prompt,1.0,2,p_true_few_shot_prompt,3)

In [48]:
print(experiment_details)

{'config': {'wandb': {'debug': False, 'project': 'test', 'experiment_lot': 'MyExperiment'}, 'dataset': {'name': 'squad', 'seed': 42, 'answerable_only': True}, 'prompt': {'few-shot': False, 'shot_num': 3, 'brief_always': True, 'use_context': True, 'add_tag': True, 'prompt_template_path': './data/prompt_templates/ask_templates/test2.txt'}, 'model': {'model_name': 'meta-llama/Llama-2-7b-hf', 'stop_sequences': 'default', 'max_new_tokens': 50}, 'sample': {'temperature': 1.0, 'sample_count': 5, 'sampling_method': 'simple_sample'}, 'metrics': [{'name': 'p_true', 'p_true_num_fewshot': 2}, {'name': 'accuracy'}, {'name': 'diversity'}], 'p_true': {'compute_p_true': True, 'get_training_set_generations': True, 'get_training_set_generations_most_likely_only': True, 'compute_accuracy_at_all_temps': True, 'p_true_hint': False}}, 'prompt_indices': [18303, 24501], 'prompt': 'Answer the following question as briefly as possible.\nContext: Jean-Jacques Rousseau was the first of many to present the Alps as

In [None]:
# Per-measure lists of values; the entropy cell appends to entries of this mapping.
entropies = defaultdict(list)
# Empty accumulators for validation-side data and p_true values.
validation_embeddings, validation_is_true, validation_answerable = [], [], []
p_trues = []

## P_ik

In [None]:
if args.compute_p_ik or args.compute_p_ik_answerable:
    # Gather, per training example, the embedding and accuracy of its most
    # likely answer plus whether the example is answerable.
    train_is_true, train_embeddings, train_answerable = [], [], []
    # `tid` and `most_likely_answer` are deliberately left bound at cell level.
    for tid, generation in train_generations.items():
        most_likely_answer = generation['most_likely_answer']
        train_embeddings.append(most_likely_answer['embedding'])
        train_is_true.append(most_likely_answer['accuracy'])
        train_answerable.append(is_answerable(generation))
    # Flip the labels: 1.0 marks a false answer / an unanswerable example.
    train_is_false = [float(not is_t) for is_t in train_is_true]
    train_unanswerable = [float(not is_a) for is_a in train_answerable]
    logging.info('Unanswerable prop on p_ik training: %f', np.mean(train_unanswerable))

if args.compute_p_ik:
    # Classifier of correct/incorrect from embeddings.
    logging.info('Starting training p_ik on train embeddings.')
    p_ik_predictions = get_p_ik(
        train_embeddings=train_embeddings,
        is_false=train_is_false,
        eval_embeddings=validation_embeddings,
        eval_is_false=validation_is_false,
    )
    result_dict['uncertainty_measures']['p_ik'] = p_ik_predictions
    logging.info('Finished training p_ik on train embeddings.')

if args.compute_p_ik_answerable:
    # Classifier of answerable/unanswerable from the same embeddings.
    p_ik_predictions = get_p_ik(
        train_embeddings=train_embeddings,
        is_false=train_unanswerable,
        eval_embeddings=validation_embeddings,
        eval_is_false=validation_unanswerable,
    )
    result_dict['uncertainty_measures']['p_ik_unanswerable'] = p_ik_predictions

## Entropy

In [None]:
# Entailment model passed to the context-entailment baseline and to the
# semantic clustering below.
entailment_model = EntailmentDeberta()


# compute_context_entails_response
# NOTE(review): this append runs unconditionally, and the same call is repeated
# further down when `args.compute_context_entails_response` is set, so a single
# run of this cell can append two values — confirm which one is intended.
entropies['context_entails_response'].append(context_entails_response(context, responses, entailment_model))

# condition_on_question
# NOTE(review): `responses` is prefixed with the question here and again below
# when `args.condition_on_question` is set with the deberta model, which would
# prepend the question twice — confirm which one is intended.
responses = [f'{question} {r}' for r in responses]


if args.compute_predictive_entropy:
    # Token log likelihoods. Shape = (n_sample, n_tokens)
    # Element 1 of each full response is taken as its log-likelihood entry.
    if not args.use_all_generations:
        log_liks = [r[1] for r in full_responses[:args.use_num_generations]]
    else:
        log_liks = [r[1] for r in full_responses]

    # Every sample must carry a non-empty (truthy) log-likelihood entry.
    for i in log_liks:
        assert i

    if args.compute_context_entails_response:
        # Compute context entails answer baseline.
        entropies['context_entails_response'].append(context_entails_response(context, responses, entailment_model))

    if args.condition_on_question and args.entailment_model == 'deberta':
        responses = [f'{question} {r}' for r in responses]

    # Compute semantic ids.
    semantic_ids = get_semantic_ids(
        responses, model=entailment_model,
        strict_entailment=args.strict_entailment, example=example)

    result_dict['semantic_ids'].append(semantic_ids)

    # Compute entropy from frequencies of cluster assignments.
    entropies['cluster_assignment_entropy'].append(cluster_assignment_entropy(semantic_ids))

    # Length normalization of generation probabilities.
    # Each sample is reduced to the mean of its log-likelihood entries.
    log_liks_agg = [np.mean(log_lik) for log_lik in log_liks]

    # Compute naive entropy.
    entropies['regular_entropy'].append(predictive_entropy(log_liks_agg))

    # Compute semantic entropy.
    log_likelihood_per_semantic_id = logsumexp_by_id(semantic_ids, log_liks_agg, agg='sum_normalized')
    pe = predictive_entropy_rao(log_likelihood_per_semantic_id)
    entropies['semantic_entropy'].append(pe)

    # pylint: disable=invalid-name
    log_str = 'semantic_ids: %s, avg_token_log_likelihoods: %s, entropies: %s'
    # Latest value of every entropy measure, formatted as `name:value`.
    entropies_fmt = ', '.join([f'{i}:{j[-1]:.2f}' for i, j in entropies.items()])
    # pylint: enable=invalid-name
    logging.info(80*'#')
    logging.info('NEW ITEM %d at id=`%s`.', idx, tid)
    logging.info('Context:')
    logging.info(example['context'])
    logging.info('Question:')
    logging.info(question)
    logging.info('True Answers:')
    logging.info(example['reference'])
    logging.info('Low Temperature Generation:')
    logging.info(most_likely_answer['response'])
    logging.info('Low Temperature Generation Accuracy:')
    logging.info(most_likely_answer['accuracy'])
    logging.info('High Temp Generation:')
    logging.info([r[0] for r in full_responses])
    # NOTE(review): this label repeats the previous one, although the line after
    # it logs the semantic ids, averaged log likelihoods and entropies.
    logging.info('High Temp Generation:')
    logging.info(log_str, semantic_ids, log_liks_agg, entropies_fmt)





## Semantic_uncertainty

## Semantic_uncertainty2

In [None]:
# Input: results_old

'''
results_old['validation_is_false']
results_old['uncertainty_measures']
results_old['uncertainty_measures']['p_false']
results_old['uncertainty_measures']['p_false_fixed']
results_old['validation_unanswerable']


#### for measure_name, measure_values in rum.items():
'''

In [None]:
for measure_name, measure_values in rum.items():
    logging.info('Computing for uncertainty measure `%s`.', measure_name)

    # Label sets each measure is compared against, with the suffix that is
    # appended to the measure name for each of them.
    validation_is_falses = [
        results_old['validation_is_false'],
        results_old['validation_unanswerable'],
    ]
    logging_names = ['', '_UNANSWERABLE']

    # Iterate over predictions of 'falseness' or 'answerability'.
    for validation_is_false, logging_name in zip(validation_is_falses, logging_names):
        name = measure_name + logging_name
        result_dict['uncertainty'][name] = {}

        validation_is_false = np.array(validation_is_false)
        validation_accuracy = 1 - validation_is_false

        # Nothing to truncate when there are no surplus measure values.
        if len(measure_values) <= len(validation_is_false):
            continue

        # Surplus values can happen, but only for p_false.
        if 'p_false' not in measure_name:
            raise ValueError
        logging.warning(
            'More measure values for %s than in validation_is_false. Len(measure values): %d, Len(validation_is_false): %d',
            measure_name, len(measure_values), len(validation_is_false))
        measure_values = measure_values[:len(validation_is_false)]

        # No metric is computed in this cell; only the entries in
        # result_dict['uncertainty'] are created and the values truncated.

In [None]:
## All results are stored in result_dict.
result_dict = {'performance': {}, 'uncertainty': {}}

## Measure performance.
all_accuracies['accuracy'] = 1 - np.array(results_old['validation_is_false'])

# One entry per accuracy series: its mean and the bootstrap of that mean.
for name, target in all_accuracies.items():
    result_dict['performance'][name] = {
        'mean': np.mean(target),
        'bootstrap': bootstrap(np.mean, rng)(target),
    }

In [None]:
## Adjust p_true: convert the stored values into probabilities.
rum = results_old['uncertainty_measures']
has_raw_p_false = 'p_false' in rum
if has_raw_p_false and 'p_false_fixed' not in rum:
    # Per the original note, each stored value y is 1 - log p(true):
    #   log p(true) = 1 - y
    #   p(true)     = np.exp(1 - y)
    #   p(false)    = 1 - np.exp(1 - y)
    rum['p_false_fixed'] = [1 - np.exp(1 - stored) for stored in rum['p_false']]

In [None]:
'''
val_metrics 是一个字典，键是指标名称（如 'AUROC'），值是一个元组 (function, bs_function)
function 用于计算该指标的函数。
bs_function：用于对该指标进行 bootstrap 分析 的函数



eval_metrics = {
    'AUROC': (auroc, compatible_bootstrap),
    'area_under_thresholded_accuracy': (area_under_thresholded_accuracy, compatible_bootstrap),
    'mean_uncertainty': (np.mean, bootstrap)
    }

'''

 eval_metrics = dict(zip(
        ['AUROC', 'area_under_thresholded_accuracy', 'mean_uncertainty'],
        list(zip([auroc, area_under_thresholded_accuracy, np.mean],[compatible_bootstrap, compatible_bootstrap, bootstrap])),
    ))

for answer_fraction in answer_fractions:
        key = f'accuracy_at_{answer_fraction}_answer_fraction'
        eval_metrics[key] = [functools.partial(accuracy_at_quantile, quantile=answer_fraction),compatible_bootstrap]


In [None]:
for measure_name, measure_values in rum.items():
    logging.info('Computing for uncertainty measure `%s`.', measure_name)

    # Label sets each measure is scored against, with the suffix appended to
    # the measure name for each of them.
    validation_is_falses = [
        results_old['validation_is_false'],
        results_old['validation_unanswerable'],
    ]
    logging_names = ['', '_UNANSWERABLE']

    # Iterate over predictions of 'falseness' or 'answerability'.
    for validation_is_false, logging_name in zip(validation_is_falses, logging_names):
        name = measure_name + logging_name
        result_dict['uncertainty'][name] = {}

        validation_is_false = np.array(validation_is_false)
        validation_accuracy = 1 - validation_is_false
        if len(measure_values) > len(validation_is_false):
            # This can happen, but only for p_false.
            if 'p_false' not in measure_name:
                raise ValueError(
                    f'More measure values than labels for `{measure_name}`: '
                    f'{len(measure_values)} > {len(validation_is_false)}')
            logging.warning(
                'More measure values for %s than in validation_is_false. Len(measure values): %d, Len(validation_is_false): %d',
                measure_name, len(measure_values), len(validation_is_false))
            measure_values = measure_values[:len(validation_is_false)]

        # The metric computation runs for EVERY (measure, label set) pair. It
        # used to sit inside the truncation branch above, so metrics were only
        # produced when a measure had surplus values.
        #
        # fargs maps each metric name in eval_metrics to the positional
        # arguments its function needs, e.g.
        #   'AUROC' -> [validation_is_false, measure_values]
        #   'accuracy_at_0.5_answer_fraction' -> [validation_accuracy, measure_values]
        fargs = {
            'AUROC': [validation_is_false, measure_values],
            'area_under_thresholded_accuracy': [validation_accuracy, measure_values],
            'mean_uncertainty': [measure_values],
        }

        # Arguments for the per-answer-fraction accuracy metrics.
        for answer_fraction in answer_fractions:
            fargs[f'accuracy_at_{answer_fraction}_answer_fraction'] = [validation_accuracy, measure_values]

        # Store the point estimate and the bootstrap result of every metric.
        for fname, (function, bs_function) in eval_metrics.items():
            metric_i = function(*fargs[fname])
            result_dict['uncertainty'][name][fname] = {}
            result_dict['uncertainty'][name][fname]['mean'] = metric_i
            logging.info("%s for measure name `%s`: %f", fname, name, metric_i)
            result_dict['uncertainty'][name][fname]['bootstrap'] = bs_function(
                function, rng)(*fargs[fname])

In [None]:
## The final result_dict

'''
result_dict = {
    'performance': {},  # 存储性能指标的结果
    'uncertainty': {}   # 存储不确定性测量的结果
}

'performance': {
    'accuracy': {  # 性能指标名称
        'mean': 0.95,  # 准确率的均值
        'bootstrap': [0.94, 0.96]  # 使用 bootstrap 计算的置信区间
    }
}

'uncertainty': {
    'p_false': {  # 不确定性测量名称（如 p_false）
        'AUROC': {
            'mean': 0.85,  # AUROC 的均值
            'bootstrap': [0.83, 0.87]  # AUROC 的置信区间
        },
        'mean_uncertainty': {
            'mean': 0.3,  # 不确定性测量的均值
            'bootstrap': [0.28, 0.32]  # 不确定性测量的置信区间
        },
        'accuracy_at_0.8_answer_fraction': {
            'mean': 0.88,  
            'bootstrap': [0.86, 0.89]
        },
        ...
    },
    'p_false_UNANSWERABLE': {  # 对应另一个验证指标的结果
        'AUROC': {
            'mean': 0.75,
            'bootstrap': [0.73, 0.77]
        },
        ...
    }
}


'''

# Compare: performance of the uncertainty metrics
We compare the methods by their AUROC values.

In [None]:
import os
import json
import wandb
import pandas as pd
from matplotlib import pyplot as plt

In [None]:
# Hand-written sample values in the nested layout
# metrics['uncertainty'][method][metric]['mean'].
example_metrics = {
    "uncertainty": {
        "semantic_entropy": {
            "AUROC": {"mean": 0.75},
            "Accuracy": {"mean": 0.85}
        },
        "cluster_assignment_entropy": {
            "AUROC": {"mean": 0.72},
            "Accuracy": {"mean": 0.83}
        },
        "regular_entropy": {
            "AUROC": {"mean": 0.70},
            "Accuracy": {"mean": 0.80}
        },
        "p_false": {
            "AUROC": {"mean": 0.68},
            "Accuracy": {"mean": 0.78}
        },
        "p_ik": {
            "AUROC": {"mean": 0.74},
            "Accuracy": {"mean": 0.82}
        }
    }
}

In [None]:
# wandb_id = 'h1scz5qz'
# if wandb_id == 'YOUR_ID':
#     raise ValueError('Need to provide wandb_id of demo run!')
# def restore_file(wandb_id, filename='wandb-summary.json'):
#     files_dir = 'notebooks/restored_files'    
#     os.system(f'mkdir -p {files_dir}')

#     api = wandb.Api()
#     run = api.run(f'semantic_uncertainty/{wandb_id}')

#     path = f'{files_dir}/{filename}'
#     os.system(f'rm -rf {path}')
#     run.file(filename).download(root=files_dir, replace=True, exist_ok=False)
#     with open(path, 'r') as f:
#         out = json.load(f)
#     return out

In [None]:
def get_uncertainty_df(metrics):
    """Flatten the nested uncertainty metrics into a long-format DataFrame.

    Args:
        metrics: Mapping with an ``'uncertainty'`` entry laid out as
            ``{method: {metric: {'mean': value}}}``.

    Returns:
        DataFrame with columns ``method``, ``metric`` and ``means``. Only the
        five main methods are kept, in a fixed order, and their keys are
        replaced by display names.

    Raises:
        KeyError: If one of the five main methods is absent from ``metrics``.
    """
    rows = [
        [method, metric, stats['mean']]
        for method, per_metric in metrics['uncertainty'].items()
        for metric, stats in per_metric.items()
    ]
    df = pd.DataFrame(rows, columns=['method', 'metric', 'means'])

    # Insertion order fixes both the row order and the key -> label mapping.
    display_names = {
        'semantic_entropy': 'Semantic entropy',
        'cluster_assignment_entropy': 'Discrete Semantic Entropy',
        'regular_entropy': 'Naive Entropy',
        'p_false': 'p(True)',
        'p_ik': 'Embedding Regression',
    }
    main_methods = list(display_names)

    # Select the main methods by label, then restore `method` as a column.
    df = df.set_index('method').loc[main_methods].reset_index()
    df['method'] = df['method'].map(lambda key: display_names[key])
    return df

In [None]:
# Long-format table (method, metric, means) built from the sample metrics.
unc_df = get_uncertainty_df(example_metrics)

In [None]:
# Bar chart of the AUROC mean per method.
metric = 'AUROC'
metric_rows = unc_df.set_index('metric').loc[metric]
ax = metric_rows.plot.bar(x='method', y='means')
ax.set_ylabel(metric)
ax.grid(axis='y')
ax.set_ylim(0.6, 0.8)

In [None]:
# Bar chart of the Accuracy mean per method.
metric = 'Accuracy'
metric_rows = unc_df.set_index('metric').loc[metric]
ax = metric_rows.plot.bar(x='method', y='means')
ax.set_ylabel(metric)
ax.grid(axis='y')
ax.set_ylim(0.6, 1.0)