-   This notebook configures and loads a pre-trained language model, then loads and preprocesses a dataset (either 'coqa' or 'trivia_qa').
-   It then generates responses to the prompts: one reference response using a deterministic decoding method (beam search or greedy decoding), plus several sampled responses per prompt.
-   The reference responses are evaluated against the reference answers using metrics such as ROUGE and exact match, and the results are organized, returned, and saved for further analysis or use.

In [14]:
import tqdm
import datasets
import evaluate

In [15]:
# Experiment configuration: every tunable setting for this run lives in this one dict
params = dict(
    # Question related arguments
    question_type=None,
    generations_per_prompt=5,

    # Data related arguments
    data_fraction=0.9,

    # Model related arguments
    model_version='opt-350m',
    experiment_id='run_1',
    temperature=1.0,
    beam_count=5,
    decoding='beam_search',
    nucleus_top_p=1.0,
    data_source='coqa',
)

In [18]:
import wandb

# We are using wandb to track our experiments.
# The run id is taken from params['experiment_id'] and the whole params dict is passed as the run config.
# NOTE(review): with a fixed id and resume='allow', re-running this cell presumably continues the
# same wandb run instead of starting a new one — confirm against the wandb documentation.
wandb.init(project='nlg_uncertainty', id=params['experiment_id'], config=params, resume='allow')

# The run name is used later in the notebook as the output sub-directory under ./sequences/
run_version = wandb.run.name

In [19]:
import random
import os
import numpy as np
import torch

# Please make sure you are using a CUDA-enabled GPU for this project:
# inputs are moved to this device before generation.
device = 'cuda'

# Setting the seed value makes the results reproducible across different runs
# (it is also used as the seed for the dataset split further down)
seed_val = 10

# Seed Python's `random`, NumPy, and PyTorch with the same value.
# NOTE(review): PYTHONHASHSEED is normally read when the interpreter starts, so assigning it here
# presumably only affects child processes, not hashing in this already-running kernel — confirm.
os.environ['PYTHONHASHSEED'] = str(seed_val)
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)

<torch._C.Generator at 0x168951e14d0>

In [26]:
# Run Python's garbage collector so unreferenced objects (including any tensors they hold)
# are released before the CUDA cache is emptied
import gc
gc.collect()

9641

In [27]:
# Ask PyTorch to release its cached, currently unused CUDA memory before the model is loaded
torch.cuda.empty_cache()

In [28]:
# Only load the models below if you have enough free memory on your GPU.
# If you don't have enough memory, loading can crash (and will require a restart of the kernel/system).
# The recorded output is a pair of byte counts — presumably (free, total); confirm in the PyTorch docs.
torch.cuda.mem_get_info()

(2481979392, 6441926656)

In [29]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Getting the model from params and loading it onto the configured device.
# We are using Facebook's OPT: Open Pre-trained Transformer Language Models (if you want to use a
# different model, change the model_version in params and the repository prefix below accordingly).
model_repo = f"facebook/{params['model_version']}"

# The weights are loaded in float16; `.to(device)` follows the notebook-wide `device` setting
# instead of hard-coding `.cuda()`, so the model and the inputs always end up on the same device.
model = AutoModelForCausalLM.from_pretrained(model_repo, torch_dtype=torch.float16, cache_dir='./cache_dir').to(device)

# The tokenizer is loaded from the same repository, with the slow (non-fast) implementation
tokenizer = AutoTokenizer.from_pretrained(model_repo, use_fast=False, cache_dir='./cache_dir')

In [30]:
# Show which dataset / model combination this run uses
print(params['data_source'])
print(params['model_version'])

# Load the pre-saved dataset that matches params['data_source'] from local disk
if params['data_source'] == 'coqa':
    data = datasets.load_from_disk('./coqa_dataset')
    # Since we have multiple questions for each context, we need to map each question to its context
    question_mapping = dict(zip(data['id'], data['question']))

elif params['data_source'] == 'trivia_qa':
    data = datasets.load_from_disk('./trivia_qa')

else:
    # Fail here with a clear message rather than with a NameError on `data` in a later cell
    raise ValueError(
        f"Unsupported data_source {params['data_source']!r}; expected 'coqa' or 'trivia_qa'"
    )

coqa
opt-350m


In [31]:
# params['data_fraction'] is the share of the loaded dataset that is kept for this run.
# Below 1.0 the data is split with a seeded train/test split and only the 'train' part is used;
# otherwise the dataset is used as loaded.
train_dataset = (
    data.train_test_split(test_size=(1 - params['data_fraction']), seed=seed_val)['train']
    if params['data_fraction'] < 1.0
    else data
)



In [32]:
# Build the "<story> Q: <question> A:" prompt for one sample and tokenize it
def tokenize_story_and_question(samples):
    """Tokenize a single sample as a question-answering prompt.

    The prompt is the sample's story, followed by ' Q: ', its question and the
    answer prefix ' A:'. It is passed to the notebook's `tokenizer` with
    truncation and padding both disabled, and the tokenizer's result is returned.
    """
    story_text = samples['story']
    question_text = samples['question']
    prompt_text = story_text + ' Q: ' + question_text + ' A:'
    return tokenizer(prompt_text, truncation=False, padding=False)



# Tokenize every sample of a dataset and expose the token columns as PyTorch tensors
def prepare_dataset_for_training(input_dataset):
    """Return a tokenized copy of `input_dataset` formatted for PyTorch.

    Each sample is mapped one at a time (not batched) through
    `tokenize_story_and_question`, bypassing the datasets cache file. On the
    result, 'input_ids' and 'attention_mask' are set to the 'torch' format
    with `output_all_columns=True`, and that dataset is returned.
    """
    tokenized = input_dataset.map(
        tokenize_story_and_question,
        batched=False,
        load_from_cache_file=False,
    )

    # set_format is applied on the mapped dataset itself; its return value is not used
    tokenized.set_format(
        type='torch',
        columns=['input_ids', 'attention_mask'],
        output_all_columns=True,
    )
    return tokenized

In [33]:
# Only the coqa data is tokenized here; for any other data source the dataset is used as it was loaded.
processed_questions = (
    prepare_dataset_for_training(train_dataset)
    if params['data_source'] == 'coqa'
    else train_dataset
)

# One prompt per batch
data_loader = torch.utils.data.DataLoader(processed_questions, batch_size=1)

# Token ids used to stop generation at a period and to ban question/answer framing words.
# Position 1 of each encoding is taken — presumably position 0 is a special token the
# tokenizer prepends (TODO confirm for the chosen tokenizer).
period_tokens = tokenizer('. ')['input_ids'][1]
framing_tokens = ['Ques:', ' Question:', '\n', 'Ans:', ' Answer:', 'Q:']
question_framing_ids = [
    [tokenizer(phrase)['input_ids'][1]]
    for phrase in framing_tokens
]

# Evaluation metrics
rouge_metric = evaluate.load('rouge')
exact_match = evaluate.load("exact_match")

Map:   0%|          | 0/7184 [00:00<?, ? examples/s]

In [34]:
def generate_responses(model, data_loader, number_of_generations):
    """Generate reference and sampled responses for every prompt in `data_loader`.

    For each batch (the loop assumes one prompt per batch) this:
      1. produces a deterministic "most likely" generation using the decoding
         method in params['decoding'] ('greedy' or 'beam_search');
      2. samples `number_of_generations` further generations;
      3. decodes everything after the prompt into text;
      4. scores the most likely generation against each reference answer with
         exact match and ROUGE, keeping the best score per metric.

    Reads the notebook globals `params`, `device`, `tokenizer`, `period_tokens`,
    `question_framing_ids`, `question_mapping` (coqa only), `exact_match` and
    `rouge_metric`.

    Args:
        model: object exposing a `generate(inputs, **kwargs)` method that returns
            token ids starting with the prompt tokens.
        data_loader: iterable of batches.
        number_of_generations: number of sampled generations per prompt.

    Returns:
        A list with one dict per prompt holding the prompt, the generations
        (ids and text), the reference answers and the metric scores. With greedy
        decoding only one reference sequence exists, so the
        'second_most_likely_generation*' entries are None.

    Raises:
        ValueError: if params['decoding'] is neither 'greedy' nor 'beam_search'.
    """
    decoding = params['decoding']
    if decoding not in ('greedy', 'beam_search'):
        # Previously this surfaced as an unbound `most_likely_generation` inside the loop
        raise ValueError(f"Unsupported decoding method {decoding!r}; expected 'greedy' or 'beam_search'")

    is_coqa = params['data_source'] == 'coqa'
    is_trivia_qa = params['data_source'] == 'trivia_qa'

    sequence_max_length = 256
    rouge_types = ['rouge1', 'rouge2', 'rougeL']
    responses = []

    with torch.no_grad():
        for batch in tqdm.tqdm(data_loader):

            # For trivia_qa the ids arrive as a sequence of tensors, so join and reshape them to (1, seq_len)
            if is_trivia_qa:
                inputs = torch.cat(batch['input_ids']).to(device).reshape(1, -1)
            else:
                inputs = batch['input_ids'].to(device)

            # Generated sequences start with the prompt, so this is where the answer begins
            input_length = inputs.shape[1]
            max_length = input_length + sequence_max_length

            # Getting the reference response for the given input ids based on the decoding strategy
            if decoding == 'greedy':
                most_likely_generation = model.generate(inputs, num_beams=1, do_sample=False, max_length=max_length,
                                                        eos_token_id=period_tokens, bad_words_ids=question_framing_ids)
            else:  # 'beam_search'
                most_likely_generation = model.generate(inputs, num_beams=5, num_return_sequences=2, do_sample=False, max_length=max_length,
                                                        eos_token_id=period_tokens, bad_words_ids=question_framing_ids)

            # Generating multiple sampled responses for the given input ids.
            # The buffer is pre-filled with 1 — presumably the tokenizer's padding id (TODO confirm).
            generations = torch.ones((number_of_generations, max_length), dtype=torch.long, device=device)

            for i in range(number_of_generations):
                generation = model.generate(inputs, do_sample=True, num_return_sequences=1, num_beams=params['beam_count'], max_length=max_length,
                                            eos_token_id=period_tokens, temperature=params['temperature'], bad_words_ids=question_framing_ids, top_p=params['nucleus_top_p'])
                # num_return_sequences=1, so row 0 is the single sampled sequence
                generations[i, :generation.shape[1]] = generation[0]

            generations = torch.reshape(generations, (-1, number_of_generations, generations.shape[-1]))

            # Decoding the generated responses
            for i in range(generations.shape[0]):

                # Creating response dictionary based on the data source
                if is_coqa:
                    response_dict = {
                        'question': question_mapping[batch['id'][0]],
                        'generations': generations[i].to('cpu'),
                        'prompt': batch['input_ids'][i].to('cpu'),
                        'id': batch['id'],
                    }
                else:
                    few_shot_question = tokenizer.decode(inputs[0])
                    question = few_shot_question.split('Question: ')[-1].split('Answer: ')[0]
                    # Tensors are moved to the CPU, as in the coqa branch, so stored results
                    # do not keep GPU memory alive across the whole run
                    response_dict = {
                        'few_shot_question': few_shot_question,
                        'question': question,
                        'generations': generations[i].to('cpu'),
                        'prompt': inputs[0].to('cpu'),
                        'id': batch['question_id'],
                    }

                # Everything after the prompt is the generated answer
                response_dict['generated_texts'] = [
                    tokenizer.decode(generation[input_length:], skip_special_tokens=True)
                    for generation in generations[i]
                ]

                response_dict['most_likely_generation_ids'] = most_likely_generation[0].to('cpu')
                response_dict['most_likely_generation'] = tokenizer.decode(most_likely_generation[0][input_length:], skip_special_tokens=True)

                # A second reference sequence only exists when more than one was returned (beam search)
                if most_likely_generation.shape[0] > 1:
                    response_dict['second_most_likely_generation_ids'] = most_likely_generation[1].to('cpu')
                    response_dict['second_most_likely_generation'] = tokenizer.decode(most_likely_generation[1][input_length:], skip_special_tokens=True)
                else:
                    response_dict['second_most_likely_generation_ids'] = None
                    response_dict['second_most_likely_generation'] = None

                response_dict['semantic_variability_reference_answers'] = batch['semantic_variability'] if 'semantic_variability' in batch else None

                # Carry over any per-sample rouge references and initialise the scores
                for rouge_type in rouge_types:
                    response_dict[rouge_type + '_reference_answers'] = batch[rouge_type] if rouge_type in batch else None
                    response_dict[rouge_type + '_to_target'] = 0.0

                # Collect the reference answers for the data source
                if is_coqa:
                    additional_answers = [x[0] for x in batch['additional_answers']]
                    response_dict['answer'] = batch['answer']['text']
                    response_dict['additional_answers'] = additional_answers
                    reference_answers = batch['answer']['text'] + additional_answers
                else:
                    response_dict['answer'] = batch['answer']
                    response_dict['additional_answers'] = None
                    reference_answers = batch['answer']

                response_dict['exact_match'] = 0.0

                # Evaluating the most likely response against every reference answer,
                # keeping the best exact-match and rouge score seen
                predictions = [response_dict['most_likely_generation'].lstrip()]
                for answer in reference_answers:
                    references = [answer]
                    results = exact_match.compute(predictions=predictions,
                                                  references=references,
                                                  ignore_case=True,
                                                  ignore_punctuation=True)
                    response_dict['exact_match'] = max(results['exact_match'], response_dict['exact_match'])
                    rouge_results = rouge_metric.compute(predictions=predictions, references=references)
                    for rouge_type in rouge_types:
                        response_dict[rouge_type + '_to_target'] = max(rouge_results[rouge_type],
                                                                       response_dict[rouge_type + '_to_target'])

                responses.append(response_dict)

    return responses

In [35]:
# Run generation over the whole data loader. This is the long-running step:
# the recorded run below took a little over five hours for 7184 prompts.
responses = generate_responses(model, data_loader, params['generations_per_prompt'])

100%|██████████| 7184/7184 [5:09:47<00:00,  2.59s/it]   


In [36]:
import pathlib

# Create the output directory ./sequences/<run_version>/ (with any missing parents);
# nothing happens if it already exists.
pathlib.Path(f'./sequences/' + run_version).mkdir(parents=True, exist_ok=True)

In [37]:
import pickle

# Use a separate name for the checkpoint string: rebinding `model` here would replace the loaded
# network with a str, so re-running the generation cell afterwards would fail.
model_name = params['model_version']

# Persist all response dicts for this run as ./sequences/<run_version>/<model_name>_generations.pkl
with open(f'./sequences/{run_version}/{model_name}_generations.pkl', 'wb') as outfile:
    pickle.dump(responses, outfile)