-   The above code iterates through a subset of responses containing questions and answers, tokenizes them, and computes the negative log likelihoods using a pre-trained model. 
-   It then calculates the Area Under the Receiver Operating Characteristic curve (AUROC) for the probabilities of true answers and saves this value to a file.
-   The code is part of an evaluation process to assess the model's ability to determine the correctness of generated answers.

In [4]:
import os
import random
import numpy as np
import torch

# This notebook assumes a CUDA-capable GPU; tensors created later are placed on this device
device = 'cuda'

# A single fixed seed shared by every random number generator used in the notebook,
# so that repeated runs produce the same results
seed_val = 10

# NOTE: PYTHONHASHSEED is read when a Python interpreter starts, so setting it here
# only affects processes launched after this point, not the already-running kernel
os.environ['PYTHONHASHSEED'] = str(seed_val)

# Seed NumPy, Python's `random` module and PyTorch (the last call's return value is the cell output)
np.random.seed(seed_val)
random.seed(seed_val)
torch.manual_seed(seed_val)

<torch._C.Generator at 0x1135b6454d0>

In [5]:
# Run configuration (also passed to wandb as the run config)
params = dict(
    # Which OPT checkpoint to load (expanded to "facebook/<generation_version>")
    generation_version='opt-125m',
    # wandb run ids
    experiment_id_for_few_shot='run_1',
    experiment_id='run_1',
)

In [6]:
import wandb

# Experiment tracking is done with wandb: attach to (or create) the run whose id is given in `params`
wandb.init(
    project='nlg_uncertainty',
    id=params['experiment_id_for_few_shot'],
    config=params,
    resume='allow',
)

# Read the checkpoint name back from the run config rather than from `params` directly
model_name = wandb.config.generation_version

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33m4data692[0m ([33mnlp53113[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [7]:
# Run a full garbage-collection pass so that unreachable Python objects are released before
# the model is loaded (the original intent is to help free up CUDA memory they may hold).
# The value displayed by the cell is the number of unreachable objects the collector found.
import gc
gc.collect()

34

In [8]:
# Release cached GPU memory that PyTorch's allocator is holding but not currently using
torch.cuda.empty_cache()

In [9]:
# Please load the models below only if you have some free memory in your GPU
# If you don't have enough memory, then it can lead to a crash (will require restart of the kernel/system)
# The displayed tuple is (free bytes, total bytes) for the current CUDA device
torch.cuda.mem_get_info()

(5384437760, 6441926656)

In [10]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the tokenizer and the model for the checkpoint selected in `params` (`model_name`),
# so that both always refer to the same checkpoint instead of a hard-coded tokenizer name.
# Since we will be using the same model for other notebooks, the files are cached in ./cache_dir
generation_tokenizer = AutoTokenizer.from_pretrained(f"facebook/{model_name}", use_fast=False, cache_dir='./cache_dir')

# Load the weights in half precision and move the model to the device configured at the top of the notebook
model = AutoModelForCausalLM.from_pretrained(f"facebook/{model_name}", torch_dtype=torch.float16, cache_dir='./cache_dir').to(device)

In [11]:
# Name of the active wandb run; it is used below to build the ./sequences/... and ./uncertainity/... paths
run_version = wandb.run.name

In [12]:
import pickle

# Prerequisite: the cleaner notebook must have been run first, because it produces the file read here
# The cleaned generations are stored per run (`run_version`) and per model (`model_name`)
cleaned_generations_path = f'./sequences/{run_version}/{model_name}_cleaned_generations.pkl'
with open(cleaned_generations_path, 'rb') as infile:
    responses_for_few_shot_prompt = pickle.load(infile)

In [14]:
# Close the wandb run; `run_version` and `model_name` were already copied into plain variables in earlier cells
wandb.finish()

In [15]:
# Selecting a subset of sequences for few-shot prompts: the last 10 of the loaded responses
# (the list loaded from the pickle file is `responses_for_few_shot_prompt`)
# NOTE(review): the evaluation loop scores the first `n_samples_to_use` responses, so these
# examples overlap with the evaluated ones if fewer than n_samples_to_use + 10 responses were loaded
subset_of_sequences_for_few_shot_prompt = responses_for_few_shot_prompt[-10:]

# Despite its name, this is the number of generated answers per question that are listed
# as "brainstormed ideas" inside each prompt, not the number of few-shot examples
number_of_few_shot_samples = 5

In [16]:
# Template for one self-evaluation example. The placeholders are, in order:
# the question, the brainstormed (sampled) answers, and the candidate answer to be judged.
prompt_template = 'Question: {} \n Here are some ideas that were brainstormed:{}\n Possible answer:{}\n Is the possible answer:\n (A) True\n (B) False\n The possible answer is:'

# Accumulator for the few-shot prefix that is prepended to every evaluated prompt
few_shot_prompt = ''

# Iterating through the subset of sequences to create the few-shot prompt
for sequence in subset_of_sequences_for_few_shot_prompt:
    # Strip the prompt scaffolding around the question; both the 'Question:/Answer:' and the
    # 'Q:/A:' formats are handled, matching the extraction used in the evaluation loop
    question_text = sequence['question']
    if 'Question: ' in question_text:
        question_text = question_text.split('Question: ')[-1].split('Answer: ')[0]
    else:
        question_text = question_text.split('Q: ')[-1].split('A: ')[0]

    generated_ideas = '\n'.join(sequence['cleaned_generated_texts'][:number_of_few_shot_samples])
    most_probable_answer = sequence['most_likely_generation']

    # An answer counts as correct when its Rouge-L score against the target exceeds 0.3
    is_correct = ' True' if sequence['rougeL_to_target'] > 0.3 else ' False'

    # Appending the formatted example, followed by its verdict, to the few-shot prompt
    few_shot_prompt += prompt_template.format(question_text, generated_ideas, most_probable_answer) + is_correct + '\n'


In [17]:
# Accumulators that collect correctness labels and p(True) scores across datasets
labels_across_datasets, p_trues_across_datasets = [], []

# Upper bound on how many responses are scored in the evaluation loop
n_samples_to_use = 2000

In [19]:
from sklearn import metrics
from tqdm import tqdm

# Score every (question, most likely answer) pair with the few-shot self-evaluation prompt and
# measure how well that score separates correct from incorrect answers (AUROC).
# Gradients are not needed for evaluation, so the forward passes run under no_grad for efficiency.
with torch.no_grad():

    # AUROC values, per-sample scores and per-sample correctness flags collected by this cell.
    # NOTE: `probabilities_true` holds the negative log likelihood (loss) of the ' True'
    # continuation, i.e. a lower value means the model finds ' True' more likely.
    auroc_values = []
    probabilities_true = []
    correctness_flags = []

    # Loop through the responses to compute the negative log likelihoods
    for response in tqdm(responses_for_few_shot_prompt[:n_samples_to_use]):

        # Extract and clean the question text from the response
        question_text = response['question']
        if 'Question: ' in question_text:
            question_text = question_text.split('Question: ')[-1].split('Answer: ')[0]
        else:
            question_text = question_text.split('Q: ')[-1].split('A: ')[0]

        # Extract generated texts and the most likely answer
        generated_ideas = '\n'.join(response['cleaned_generated_texts'][:number_of_few_shot_samples])
        most_probable_answer = response['most_likely_generation']

        # Determine if the answer is correct based on the rougeL_to_target metric
        is_correct = 1.0 if response['rougeL_to_target'] > 0.3 else 0.0

        # The base prompt is everything the model is conditioned on (few-shot prefix included);
        # the scored prompt is the same text followed by the verdict ' True'.
        # The few-shot prefix must be part of the base prompt, otherwise the masked length below
        # is too short and the loss would also cover few-shot tokens.
        base_prompt_text = few_shot_prompt + prompt_template.format(question_text, generated_ideas, most_probable_answer)
        prompt_with_true_answer = base_prompt_text + ' True'

        # This computation of the negative log likelihoods follows this tutorial: https://huggingface.co/docs/transformers/perplexity
        # Tokenize the prompts for the model
        tokenized_base_prompt = generation_tokenizer(base_prompt_text)['input_ids']
        tokenized_prompt_with_true_answer = torch.tensor(generation_tokenizer(prompt_with_true_answer)['input_ids'], device=device)

        # Labels set to -100 are ignored by the loss, so masking the whole conditioning prefix
        # leaves only the token(s) of ' True' contributing to the loss
        target_ids_with_true_answer = tokenized_prompt_with_true_answer.clone()
        target_ids_with_true_answer[:len(tokenized_base_prompt)] = -100

        # Compute the model's output and loss for the true prompt; inputs and labels are both
        # given a leading batch dimension of 1 so their shapes agree
        model_output_with_true_answer = model(
            tokenized_prompt_with_true_answer.reshape(1, -1),
            labels=target_ids_with_true_answer.reshape(1, -1),
        )
        loss_with_true_answer = model_output_with_true_answer.loss

        # Append the computed values to the lists
        probabilities_true.append(loss_with_true_answer.item())
        correctness_flags.append(is_correct)

# Add this run's results to the cross-dataset accumulators exactly once, after the loop
# (doing it inside the loop would re-append the whole running list on every iteration).
# NOTE: re-running this cell appends the results again; re-run the cell that initialises them first.
labels_across_datasets += correctness_flags
p_trues_across_datasets += probabilities_true

# Compute the AUROC: the positive class is "answer is incorrect" (1 - correctness) and the score is
# the negative log likelihood of ' True', so a high score should indicate an incorrect answer.
# roc_auc_score raises a ValueError if the flags contain only one class.
auroc_for_true = metrics.roc_auc_score(1 - torch.tensor(correctness_flags), torch.tensor(probabilities_true))
auroc_values.append(auroc_for_true)

# Store p_true aurocs in a pickle file, creating the output directory first so the write
# does not depend on another cell having been run
os.makedirs(f'./uncertainity/{run_version}', exist_ok=True)
with open(f'./uncertainity/{run_version}/{model_name}_p_true_aurocs.pkl', 'wb') as outfile:
    pickle.dump(auroc_for_true, outfile)

100%|██████████| 2000/2000 [02:00<00:00, 16.64it/s]


In [21]:
import pathlib

# Make sure the per-run output directory exists (missing parents are created; an existing directory is fine)
uncertainty_dir = pathlib.Path('./uncertainity/' + run_version)
uncertainty_dir.mkdir(parents=True, exist_ok=True)