In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and edits to the project's source files are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import sys
import os

# Root of the project checkout; the relative paths used by this notebook
# ("src", "configs/...") are resolved against it. Set the LLM_HALLU_ROOT
# environment variable to run from a different checkout; the default keeps the
# original hard-coded location.
PROJECT_ROOT = os.environ.get("LLM_HALLU_ROOT", "/home/yw699/codes/LLM-Hallu")
os.chdir(PROJECT_ROOT)

# Make the modules under `src/` importable. The membership test keeps
# `sys.path` from growing by one duplicate entry every time this cell is re-run.
SRC_DIR = os.path.abspath("src")
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

# Restrict this process to GPUs 6 and 7.
os.environ["CUDA_VISIBLE_DEVICES"] = "6,7"

In [None]:
import yaml
import math
from dataset import Dataset
from prompt_engineer import PromptGenerator
from models import HuggingfaceModel
from utils import *
import logging

In [None]:
# `setup_logger` arrives through `from utils import *`; presumably it installs
# the timestamped log format seen in the cell outputs -- confirm in utils.
setup_logger()


In [None]:
# Load the experiment configuration (path is relative to the working directory
# chosen at the top of the notebook). The encoding is given explicitly so the
# file is decoded the same way regardless of the machine's locale.
with open("configs/experiment_config1.yaml", "r", encoding="utf-8") as file:
    config = yaml.safe_load(file)

# Shortcuts to the config sub-sections.
wandb_config = config["wandb"]
metrics_config = config["metrics"]

# Run metadata; more entries are added by later cells and the whole dict is
# saved to 'experiment_details.pkl' at the end of the run.
experiment_details = {'config': config}

In [None]:
import wandb

# Run identity and output location, all taken from the environment.
user = os.environ['USER']
slurm_jobid = os.getenv('SLURM_JOB_ID', None)
scratch_dir = os.getenv('SCRATCH_DIR', '.')
entity = os.getenv('WANDB_SEM_UNC_ENTITY', None)

# Directory handed to wandb for its local run files.
# Named `wandb_dir` rather than `dir` so the builtin `dir()` stays usable in
# the rest of the kernel session.
# NOTE(review): if WANDB_SEM_UNC_ENTITY is unset, `entity` is None and the last
# path component becomes the literal string "None" -- confirm this is intended.
wandb_dir = f"{scratch_dir}/{user}/{entity}"
# `exist_ok=True` replaces the separate exists() check, which could race with
# another job creating the same directory.
os.makedirs(wandb_dir, exist_ok=True)

# Debug runs are logged to a separate "<project>_debug" project.
project = wandb_config["project"]
if wandb_config["debug"]:
    project = f"{project}_debug"

experiment_lot = wandb_config['experiment_lot']
notes = f'slurm_id: {slurm_jobid}, experiment_lot: {experiment_lot}'

wandb.init(
    entity=entity,
    project=project,
    dir=wandb_dir,
    config=config,
    notes=notes,
)

logging.info('Finished wandb init.')

In [None]:
# Load the train and validation splits through the project's Dataset wrapper.
dataset_loader = Dataset(config)
train_dataset, validation_dataset = dataset_loader.load_data()

# Lists are not logged; any other dataset object is.
if not isinstance(train_dataset, list):
    logging.info('Train dataset: %s', train_dataset)

# Index lists into the train split, partitioned by `split_dataset`.
answerable_indices, unanswerable_indices = split_dataset(train_dataset)

# Optionally keep only the answerable examples of both splits.
if config["dataset"]['answerable_only']:
    unanswerable_indices = []

    # NOTE(review): after this filtering `answerable_indices` still holds
    # positions in the unfiltered train split.
    train_dataset = [train_dataset[idx] for idx in answerable_indices]

    val_answerable, val_unanswerable = split_dataset(validation_dataset)
    del val_unanswerable
    validation_dataset = [validation_dataset[idx] for idx in val_answerable]


In [None]:
from prompt_engineer import PromptGenerator

# Build the few-shot prompt from the training split; the generation cell
# prepends it to every query.
# Disabled bookkeeping, kept for reference:
#     experiment_details['prompt_indices'] = prompt_indices
promptgenerator = PromptGenerator(config, train_dataset)
few_shot_prompt = promptgenerator.construct_fewshot_prompt_from_indices()
logging.info('Prompt is: %s', few_shot_prompt)

In [None]:
from models import HuggingfaceModel
# Build the project's model wrapper from the experiment config. The generation
# cell calls its `predict(prompt, temperature)` method and also passes the
# wrapper to the accuracy metric and to the p_true helpers.
huggingface_model = HuggingfaceModel(config)

In [None]:
from utils import *
from metrics import *

# Metric looked up under the name 'squad'; the generation cell calls it to
# score each predicted answer.
metric = get_metric('squad')

# The few-shot prompt built earlier is used on every sampling call; what is
# built here is the separate few-shot prompt for the p_true metric.
BRIEF = "Answer the following question as briefly as possible.\n"
for m in metrics_config:
    metric_name = m.get("name")
    # Only a configured "p_true" metric needs its own few-shot prompt.
    if metric_name != "p_true":
        continue

    logging.info(80 * '#')
    logging.info('Constructing few-shot prompt for p_true.')
    p_true_few_shot_prompt, p_true_responses, len_p_true = construct_few_shot_prompt_from_indices(
        model=huggingface_model,
        p_true_num_fewshot=3,
        prompt_generator=promptgenerator,
        prompt=few_shot_prompt,
        brief=BRIEF,
        brief_always=True,
        num_generations=5,
        metric=metric,
    )

    # Record the few-shot count returned by the helper with the run.
    wandb.config.update({'p_true_num_fewshot': len_p_true}, allow_val_change=True)
    wandb.log({'len_p_true': len_p_true})

    # experiment_details['p_true_indices'] = p_true_indices
    experiment_details['p_true_responses'] = p_true_responses
    experiment_details['p_true_few_shot_prompt'] = p_true_few_shot_prompt

    logging.info('Finished constructing few-shot prompt for p_true.')
    logging.info(80 * '#')
    logging.info('p_true_few_shot_prompt: %s', p_true_few_shot_prompt)
    logging.info(80 * '#')
        

In [45]:
# ---- Sampling budget -------------------------------------------------------
# Number of examples processed per dataset split.
num_samples = 3
# Number of extra samples drawn per example after the first generation.
a_num_generations = 2
# Temperature meant for the samples drawn after the first generation.
temperature = 1

# ---- What to generate and score --------------------------------------------
# Whether the train split is processed at all.
get_training_set_generations = True
# If True, the train split gets only the first generation per example.
get_training_set_generations_most_likely_only = False
# If False, accuracy is computed for the first generation only.
compute_accuracy_at_all_temps = False

# ---- Prompt construction ---------------------------------------------------
# Instruction text and flag handed to the prompt generator's `_make_prompt`.
brief = "Answer the following question as briefly as possible.\n"
brief_always = True
# In the generation cell this only controls whether the context is logged.
use_context = True

# ---- p_true ----------------------------------------------------------------
# Whether p_true is computed for the validation split, and the `hint` flag
# forwarded to `calculate_p_true`.
compute_p_true = True
p_true_hint = False

In [52]:

from tqdm import tqdm
import gc
import torch
import numpy as np

# Generate answers for the train and validation splits.
#
# For every processed example the cell
#   * builds the query prompt and prepends the shared few-shot prompt,
#   * draws one low-temperature answer (scored for accuracy) followed by
#     `a_num_generations` answers at `temperature`,
#   * on the validation split, optionally computes p_true,
# then saves the generations per split, logs the split accuracy to wandb, and
# finally saves the uncertainty measures and the experiment details.

# Temperature of the first ("most likely") generation of every example.
LOW_TEMPERATURE = 0.1

# Start answer generation.
logging.info(80 * '=')
logging.info('Generating answers: ')
logging.info(80 * '=')

for dataset_split in ['train', 'validation']:
    logging.info(80 * 'x')
    logging.info('Starting with dataset_split %s.', dataset_split)
    logging.info(80 * 'x')

    # Pick the prompt generator for this split under a loop-local name.
    # Rebinding the notebook-level `promptgenerator` to the validation
    # generator (as this cell used to) made a re-run of the cell process
    # validation data under the 'train' label.
    if dataset_split == 'train':
        if not get_training_set_generations:
            logging.info('Skip training data.')
            continue
        split_prompt_generator = promptgenerator
    else:
        split_prompt_generator = PromptGenerator(config, validation_dataset)

    unused_indices = split_prompt_generator.get_unused_indices()
    logging.info('Unused items in dataset_split %s: %d', dataset_split, len(unused_indices))

    # Honour the warning below: never iterate past the available examples.
    n_examples = num_samples
    if num_samples > len(unused_indices):
        logging.warning('Not enough samples in dataset. Using all %d samples.', len(unused_indices))
        n_examples = len(unused_indices)

    # This will store all input data and model predictions.
    accuracies, generations, results_dict, p_trues = [], {}, {}, []

    # NOTE(review): examples are taken by position (0 .. n_examples - 1), not
    # from `unused_indices`; confirm the leading examples are not also the ones
    # used in the few-shot prompt.
    for index in tqdm(range(n_examples)):
        # Periodically release Python garbage and cached CUDA memory.
        if (index + 1) % 10 == 0:
            gc.collect()
            torch.cuda.empty_cache()

        # Grab example at index.
        example = split_prompt_generator.dataset_item[index]
        question, context = example["question"], example['context']
        generations[example['id']] = {'question': question, 'context': context}
        correct_answer = example['answers']['text']

        current_input = split_prompt_generator._make_prompt(context, question, None, brief, brief_always)
        local_prompt = few_shot_prompt + current_input

        full_responses = []

        # We sample one low temperature answer on which we will compute the
        # accuracy and `a_num_generations` higher temperature answers which
        # will be used to estimate the entropy variants.
        if dataset_split == 'train' and get_training_set_generations_most_likely_only:
            num_generations = 1
        else:
            num_generations = a_num_generations + 1

        for i in range(num_generations):
            # The first generation always uses LOW_TEMPERATURE, the rest use
            # the configured `temperature`. The value is kept in a loop-local
            # name: assigning it back to `temperature` would overwrite the
            # configured value and pin every later sample to 0.1.
            generation_temperature = LOW_TEMPERATURE if i == 0 else temperature

            predicted_answer, token_log_likelihoods = huggingface_model.predict(
                local_prompt, generation_temperature)

            # Only compute accuracy if question is answerable.
            compute_acc = compute_accuracy_at_all_temps or (i == 0)
            if correct_answer and compute_acc:
                acc = metric(predicted_answer, example, huggingface_model)
            else:
                acc = 0.0

            if i == 0:
                logging.info('Iteration ' + str(index) + ':  ' + 80*'#')

                logging.info('Current input: '.ljust(15) + current_input)
                if use_context:
                    logging.info('context: '.ljust(15) + str(context))
                logging.info('question: '.ljust(15) + question)
                logging.info('low-t prediction: '.ljust(15) + predicted_answer)
                logging.info('correct answer: '.ljust(15) + str(correct_answer))
                logging.info('accuracy: '.ljust(15) + str(acc))

                accuracies.append(acc)
                most_likely_answer_dict = {
                    'response': predicted_answer,
                    'token_log_likelihoods': token_log_likelihoods,
                    #'embedding': embedding,
                    'accuracy': acc}
                generations[example['id']].update({
                    'most_likely_answer': most_likely_answer_dict,
                    'reference': get_reference(example)})
            else:
                logging.info('high-t prediction '.ljust(15) + str(i) + ' : ' + predicted_answer)
                # Aggregate predictions over num_generations.
                full_responses.append(
                    (predicted_answer, token_log_likelihoods, acc))

        # Append all predictions for this example to `generations`.
        generations[example['id']]['responses'] = full_responses

        if compute_p_true and dataset_split == 'validation':
            # Already compute p_true here. Avoid cost of generations in compute_uncertainty script.
            p_true = calculate_p_true(
                huggingface_model, question, most_likely_answer_dict['response'],
                [r[0] for r in full_responses], p_true_few_shot_prompt,
                hint=p_true_hint)
            p_trues.append(p_true)
            logging.info('p_true: %s', p_true)

    # Save generations for that split.
    save(generations, f'{dataset_split}_generations.pkl')

    # Log overall accuracy. An empty split yields NaN explicitly rather than
    # through numpy's "mean of empty slice" warning.
    accuracy = np.mean(accuracies) if accuracies else float('nan')
    print(f"Overall {dataset_split} split accuracy: {accuracy}")
    wandb.log({f"{dataset_split}_accuracy": accuracy})

    if dataset_split == 'validation':
        if compute_p_true:
            # NOTE(review): 'p_false_fixed' exponentiates p_true, which
            # presumably means calculate_p_true returns a log-probability --
            # confirm against its implementation.
            results_dict['uncertainty_measures'] = {
                'p_false':  [1 - p for p in p_trues],
                'p_false_fixed':  [1 - np.exp(p) for p in p_trues],
            }
        save(results_dict, 'uncertainty_measures.pkl')

save(experiment_details, 'experiment_details.pkl')
logging.info('Run complete.')







                    


            

2024-11-18 17:52:40 INFO     Generating answers: 
2024-11-18 17:52:40 INFO     xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
2024-11-18 17:52:40 INFO     Starting with dataset_split train.
2024-11-18 17:52:40 INFO     xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
2024-11-18 17:52:40 INFO     Unused items in dataset_split train: 5928
  0%|          | 0/3 [00:00<?, ?it/s]2024-11-18 17:52:40 INFO     Iteration 0:  ################################################################################
2024-11-18 17:52:40 INFO     Current input: Answer the following question as briefly as possible.
Context: The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse ("Norman" comes from "Norseman") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to sw

File saved locally at: /home/yw699/codes/LLM-Hallu/results/train_generations.pkl
Overall train split accuracy: 1.0


  0%|          | 0/3 [00:00<?, ?it/s]2024-11-18 17:52:44 INFO     Iteration 0:  ################################################################################
2024-11-18 17:52:44 INFO     Current input: Answer the following question as briefly as possible.
Context: The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse ("Norman" comes from "Norseman") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding 

Question: What was the importance of the Dutch East Indies to Japan?
Brainstormed Answers: oil reserves 
of considerable importance 
the oil reserves 
oil reserves 
Oil reserves 
oil reserves 
Possible answer: oil reserves
Is the possible answer:
A) True
B) False
The possible answer is: A
Question: How does this help the system?
Brainstormed Answers: to cancel the first call if the passenger decides to travel to another destination 
To prevent empty calls 
knows every user call 
To prevent empty calls. 
It knows every user call 
To prevent this problem, in one implementation of destination control, every user gets an RFID card to identify himself, so the system knows every user call and can cancel the first call if the passenger decides to travel to another destination to prevent empty 
Possible answer: to cancel the first call if the passenger decides to travel to another destination
Is the possible answer:
A) True
B) False
The possible answer is: A
Question: How did the writer refer 

2024-11-18 17:52:45 INFO     Iteration 1:  ################################################################################
2024-11-18 17:52:45 INFO     Current input: Answer the following question as briefly as possible.
Context: The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse ("Norman" comes from "Norseman") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries.
Question: When were the No

Question: What was the importance of the Dutch East Indies to Japan?
Brainstormed Answers: oil reserves 
of considerable importance 
the oil reserves 
oil reserves 
Oil reserves 
oil reserves 
Possible answer: oil reserves
Is the possible answer:
A) True
B) False
The possible answer is: A
Question: How does this help the system?
Brainstormed Answers: to cancel the first call if the passenger decides to travel to another destination 
To prevent empty calls 
knows every user call 
To prevent empty calls. 
It knows every user call 
To prevent this problem, in one implementation of destination control, every user gets an RFID card to identify himself, so the system knows every user call and can cancel the first call if the passenger decides to travel to another destination to prevent empty 
Possible answer: to cancel the first call if the passenger decides to travel to another destination
Is the possible answer:
A) True
B) False
The possible answer is: A
Question: How did the writer refer 

2024-11-18 17:52:47 INFO     Iteration 2:  ################################################################################
2024-11-18 17:52:47 INFO     Current input: Answer the following question as briefly as possible.
Context: The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse ("Norman" comes from "Norseman") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries.
Question: From which count

Question: What was the importance of the Dutch East Indies to Japan?
Brainstormed Answers: oil reserves 
of considerable importance 
the oil reserves 
oil reserves 
Oil reserves 
oil reserves 
Possible answer: oil reserves
Is the possible answer:
A) True
B) False
The possible answer is: A
Question: How does this help the system?
Brainstormed Answers: to cancel the first call if the passenger decides to travel to another destination 
To prevent empty calls 
knows every user call 
To prevent empty calls. 
It knows every user call 
To prevent this problem, in one implementation of destination control, every user gets an RFID card to identify himself, so the system knows every user call and can cancel the first call if the passenger decides to travel to another destination to prevent empty 
Possible answer: to cancel the first call if the passenger decides to travel to another destination
Is the possible answer:
A) True
B) False
The possible answer is: A
Question: How did the writer refer 