In [3]:
'''
A notebook to evaluate In-Context Learning Question Answering capabilities of base models on UnifiedQA
'''
import os
import gc
import json
import torch
import logging
import pandas as pd

from collections import defaultdict
from datasets import Dataset
import bitsandbytes

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, HfArgumentParser, TrainingArguments, pipeline
from trl import SFTTrainer

In [None]:
# Quantization presets (bitsandbytes), passed as `quantization_config` when loading a model.

# 4-bit weights with bfloat16 compute -- for QLoRA
CONFIG_4BITS = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# 4-bit NF4 weights with float16 compute -- for QLoRA and Gemma
CONFIG_4BITS_NORM = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# Same as CONFIG_4BITS_NORM plus nested (double) quantization -- for QLoRA and Gemma
CONFIG_4BITS_NORM_NESTED = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

# 4-bit weights with nested (double) quantization, library-default compute dtype -- for QLoRA
CONFIG_4BITS_NESTED = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
)

# 8-bit weights
CONFIG_8BITS = BitsAndBytesConfig(load_in_8bit=True)

In [4]:
# utils
def load_datasets_from_directory(directory_path: str, type='tokenized') -> tuple:
    """Load the train/dev/test splits stored as JSON files in one directory.

    Args:
        directory_path: Folder that must contain exactly ``train.json``,
            ``dev.json`` and ``test.json`` (any other entry is rejected).
        type: ``'tokenized'`` reads each file with ``load_tokenized_dataset``;
            any other value reads it with ``load_processed_dataset``.

    Returns:
        A ``(train, dev, test)`` tuple of datasets.

    Raises:
        ValueError: If the directory listing differs from the three expected files.
    """
    required = {"train.json", "dev.json", "test.json"}
    if set(os.listdir(directory_path)) != required:
        raise ValueError(f"Directory must contain exactly these files: {required}")

    # Pick the reader once, then apply it to the splits in train/dev/test order.
    read_split = load_tokenized_dataset if type == 'tokenized' else load_processed_dataset
    return tuple(
        read_split(os.path.join(directory_path, split_file))
        for split_file in ("train.json", "dev.json", "test.json")
    )

def load_processed_dataset(file_path:str) -> Dataset:
    """Read a processed split from JSON and wrap it in a ``Dataset``.

    The JSON document must unpack into exactly three values, taken in order
    as the ``id``, ``questions`` and ``answers`` columns.
    """
    with open(file_path, 'r') as handle:
        ids, questions, answers = json.load(handle)

    columns = {'id': ids, 'questions': questions, 'answers': answers}
    return Dataset.from_dict(columns)

def load_tokenized_dataset(file_path:str) -> Dataset:
    """Read a tokenized split from JSON and wrap it in a ``Dataset``.

    The JSON document must unpack into exactly five values, taken in order
    as the ``id``, ``questions``, ``answers``, ``text`` and ``input_ids`` columns.
    """
    with open(file_path, 'r') as handle:
        ids, questions, answers, text, input_ids = json.load(handle)

    columns = {
        'id': ids,
        'questions': questions,
        'answers': answers,
        'text': text,
        'input_ids': input_ids,
    }
    return Dataset.from_dict(columns)


def load_model(base_model: str, bnb_config:BitsAndBytesConfig=None, on_gpu:bool=False, use_cache:bool=False, pretraining_tp:int=1) -> tuple:
    """Load a causal language model together with its tokenizer.

    Args:
        base_model: Hugging Face model id or local path.
        bnb_config: Quantization config. Only applied when ``on_gpu`` is True.
        on_gpu: When True, load with ``bnb_config`` and place the whole model
            on device 0. When False, load with library defaults.
        use_cache: Value written to ``model.config.use_cache``.
        pretraining_tp: Value written to ``model.config.pretraining_tp``.

    Returns:
        ``(model, tokenizer)``. The tokenizer's pad token is set to its EOS
        token and its padding side to ``"right"``.
    """
    if on_gpu:
        logging.info("Loading %s on GPU 0", base_model)
        base_model_loaded = AutoModelForCausalLM.from_pretrained(base_model, quantization_config=bnb_config, device_map={"": 0})
    else:
        if bnb_config is not None:
            # The quantization config is only forwarded on the GPU path; say so
            # instead of silently handing back an unquantized model.
            logging.warning("bnb_config is ignored because on_gpu=False; loading %s unquantized", base_model)
        base_model_loaded = AutoModelForCausalLM.from_pretrained(base_model)

    base_model_loaded.config.use_cache = use_cache
    base_model_loaded.config.pretraining_tp = pretraining_tp

    tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
    # Reuse EOS as the padding token and pad on the right.
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

    return base_model_loaded, tokenizer

def del_model_off_gpu(model_on_cuda):
    '''
    Drop this function's reference to the model, run the garbage collector
    and empty the CUDA cache.

    Only the local reference is released here: the caller must also discard
    its own references (e.g. ``del model``) for the memory to be reclaimed.
    '''
    model_on_cuda = None  # release the local reference before collecting
    gc.collect()
    torch.cuda.empty_cache()


In [11]:
# Load dataset for testing
# Root folder holding one sub-directory of tokenized splits per model family.
# Set the UNIFIEDQA_TOKENIZED_DIR environment variable to run on another machine;
# the default is the original author's local path.
TOKENIZED_DATA_ROOT = os.environ.get(
    "UNIFIEDQA_TOKENIZED_DIR",
    "/home/andrusha/Desktop/DL Research/Efficient-LLM-Benchmark/UnifiedQA Data Curation/tokenized",
)

# Only the test split of each model family is used in this notebook.
_, _, gemma_test = load_datasets_from_directory(os.path.join(TOKENIZED_DATA_ROOT, "Gemma"), type='tokenized')
_, _, llama_test = load_datasets_from_directory(os.path.join(TOKENIZED_DATA_ROOT, "Llama"), type='tokenized')
_, _, mistral_test = load_datasets_from_directory(os.path.join(TOKENIZED_DATA_ROOT, "Mistral"), type='tokenized')

In [12]:
# Report the number of rows in each model's test split (Gemma, Llama, Mistral).
for test_split in (gemma_test, llama_test, mistral_test):
    print(len(test_split))

31317
31317
31317


In [23]:
# Show the first Gemma test example: id, prompt text, then reference answer,
# each followed by a horizontal rule (a longer one closes the listing).
short_rule = '--------------------------------------------------------------'
long_rule = '----------------------------------------------------------------------------'
for column, rule in (('id', short_rule), ('questions', short_rule), ('answers', long_rule)):
    print(gemma_test[column][0])
    print(rule)
# Scratch prototype of the instruction-insertion step (kept disabled):
# prompt_insert = "Answer this question truthfully:"
# text = gemma_test['questions'][0]
# insertion_point = text.find("user") + len("user")
# new_text = text[:insertion_point] + " " + prompt_insert + text[insertion_point:]
# new_text

narrativeqa-test-0
--------------------------------------------------------------
<bos><start_of_turn>user
who is mark hunter? \n  mark hunter (slater), a high school student in a sleepy suburb of phoenix, arizona, starts an fm pirate radio station that broadcasts from the basement of his parents' house. mark is a loner, an outsider, whose only outlet for his teenage angst and aggression is his unauthorized radio station. his pirate station's theme song is "everybody knows" by leonard cohen and there are glimpses of cassettes by such alternative musicians as the jesus and mary chain, camper van beethoven, primal scream, soundgarden, ice-t, bad brains, concrete blonde, henry rollins, and the pixies. by day, mark is seen as a loner, hardly talking to anyone around him; by night, he expresses his outsider views about what is wrong with american society. when he speaks his mind about what is going on at his school and in the community, more and more of his fellow students tune in to hear h

'<bos><start_of_turn>user Answer this question truthfully:\nwho is mark hunter? \\n  mark hunter (slater), a high school student in a sleepy suburb of phoenix, arizona, starts an fm pirate radio station that broadcasts from the basement of his parents\' house. mark is a loner, an outsider, whose only outlet for his teenage angst and aggression is his unauthorized radio station. his pirate station\'s theme song is "everybody knows" by leonard cohen and there are glimpses of cassettes by such alternative musicians as the jesus and mary chain, camper van beethoven, primal scream, soundgarden, ice-t, bad brains, concrete blonde, henry rollins, and the pixies. by day, mark is seen as a loner, hardly talking to anyone around him; by night, he expresses his outsider views about what is wrong with american society. when he speaks his mind about what is going on at his school and in the community, more and more of his fellow students tune in to hear his show.nobody knows the true identity of "h

In [27]:
# Show the first Llama test example: id, prompt text, then reference answer,
# each followed by a horizontal rule (a longer one closes the listing).
short_rule = '--------------------------------------------------------------'
long_rule = '----------------------------------------------------------------------------'
for column, rule in (('id', short_rule), ('questions', short_rule), ('answers', long_rule)):
    print(llama_test[column][0])
    print(rule)

# Scratch prototype of the instruction-insertion step (kept disabled):
# prompt_insert = "Answer this question truthfully:"
# text = llama_test['questions'][0]
# insertion_point = text.find("<s>") + len("<s>")
# new_text = text[:insertion_point] + " " + prompt_insert + " " +text[insertion_point:]
# new_text

narrativeqa-test-0
--------------------------------------------------------------
<s>who is mark hunter? \n  mark hunter (slater), a high school student in a sleepy suburb of phoenix, arizona, starts an fm pirate radio station that broadcasts from the basement of his parents' house. mark is a loner, an outsider, whose only outlet for his teenage angst and aggression is his unauthorized radio station. his pirate station's theme song is "everybody knows" by leonard cohen and there are glimpses of cassettes by such alternative musicians as the jesus and mary chain, camper van beethoven, primal scream, soundgarden, ice-t, bad brains, concrete blonde, henry rollins, and the pixies. by day, mark is seen as a loner, hardly talking to anyone around him; by night, he expresses his outsider views about what is wrong with american society. when he speaks his mind about what is going on at his school and in the community, more and more of his fellow students tune in to hear his show.nobody knows t

'<s> Answer this question truthfully:who is mark hunter? \\n  mark hunter (slater), a high school student in a sleepy suburb of phoenix, arizona, starts an fm pirate radio station that broadcasts from the basement of his parents\' house. mark is a loner, an outsider, whose only outlet for his teenage angst and aggression is his unauthorized radio station. his pirate station\'s theme song is "everybody knows" by leonard cohen and there are glimpses of cassettes by such alternative musicians as the jesus and mary chain, camper van beethoven, primal scream, soundgarden, ice-t, bad brains, concrete blonde, henry rollins, and the pixies. by day, mark is seen as a loner, hardly talking to anyone around him; by night, he expresses his outsider views about what is wrong with american society. when he speaks his mind about what is going on at his school and in the community, more and more of his fellow students tune in to hear his show.nobody knows the true identity of "hard harry" or "happy ha

In [10]:
# Show Mistral test example 31000: id, prompt text, then reference answer,
# each followed by a horizontal rule (a longer one closes the listing).
short_rule = '--------------------------------------------------------------'
long_rule = '----------------------------------------------------------------------------'
for column, rule in (('id', short_rule), ('questions', short_rule), ('answers', long_rule)):
    print(mistral_test[column][31000])
    print(rule)

# Scratch prototype of the instruction-insertion step (kept disabled):
# prompt_insert = "Answer this question truthfully:"
# text = mistral_test['questions'][0]
# insertion_point = text.find("[INST]") + len("[INST]")
# new_text = text[:insertion_point] + " " + prompt_insert + " " +text[insertion_point:]
# new_text

openbookqa-test-183
--------------------------------------------------------------
<s>[INST] what type of useful product can be made from the moving winds? \n (a) wood (b) bananas (c) electricity (d) metal [/INST]


--------------------------------------------------------------
electricity
----------------------------------------------------------------------------


In [None]:
# Load Quantized models 
# Testing all quantization variants for each HF model
# NOTE: For unquantized, you need an A100 GPU - 7b models take up roughly 26.8 GB of memory.

# https://towardsdatascience.com/in-context-learning-approaches-in-large-language-models-9c0c53b116a1
# https://rahulrajpvr7d.medium.com/zero-shot-one-shot-and-few-shot-learning-with-examples-8a3efdcbb158
# https://huggingface.co/docs/transformers/en/tasks/language_modeling

# Gemma context length = 8192
# Llama 2 context length = 4096
# mistral 7b context length = 8192 

def _build_k_shot_prompt(k_shot_examples, prompt_insert: str) -> str:
    """Prefix the instruction with question/answer demonstrations, one per example."""
    # NOTE(review): each demonstration reuses the stored 'questions' text as-is, so it
    # still carries that model's chat-template markers -- confirm this is the desired
    # few-shot format before reporting k-shot numbers.
    demonstrations = [
        f"{example['questions'].strip()} {example['answers']}"
        for example in k_shot_examples
    ]
    return "\n\n".join(demonstrations + [prompt_insert])


def preprocess(hf_model: str, quant_type: str, quant_config: "BitsAndBytesConfig", ds: Dataset, experiment, k_shot: int,
               max_k_shot_token_length=200, seed=42, sample: int=1000):
    """Run a zero-shot or k-shot evaluation of one quantized model on a sample of ``ds``.

    Args:
        hf_model: Hugging Face model id; also selects the insertion marker in
            ``process_samples``.
        quant_type: Human-readable quantization label, used only in the log line.
        quant_config: Quantization config passed to ``from_pretrained``.
        ds: Dataset with ``id``, ``questions`` and ``answers`` columns.
        experiment: ``'zero_shot'`` or ``'k_shot'``.
        k_shot: Number of demonstration rows (used only for ``'k_shot'``). They are
            taken from the shuffled rows right after the evaluation sample, so they
            never overlap with it.
        max_k_shot_token_length: Forwarded to ``process_samples`` as ``max_length``;
            prompts with more tokens than this are not generated for.
        seed: Seed for the dataset shuffle and for torch's sampling RNG.
        sample: Number of rows to evaluate.

    Returns:
        The evaluation sample with three extra columns: ``prompt_tokenization``,
        ``qa_dataset`` and ``predicted``.

    Raises:
        ValueError: If ``experiment`` is unknown or ``ds`` has too few rows.
    """
    # Validate cheap things first so a typo does not cost a multi-GB model load.
    if experiment not in ('zero_shot', 'k_shot'):
        raise ValueError(f"Unknown experiment {experiment!r}; expected 'zero_shot' or 'k_shot'")
    rows_needed = sample + (k_shot if experiment == 'k_shot' else 0)
    if rows_needed > len(ds):
        raise ValueError(f"Dataset has {len(ds)} rows but {rows_needed} are required")

    ds = ds.shuffle(seed=seed)
    eval_sample = ds.select(range(sample))

    print(f'Evaluating {hf_model} in {quant_type} quantization')
    # from_pretrained returns only the model; the tokenizer is loaded separately.
    loaded_model = AutoModelForCausalLM.from_pretrained(hf_model, quantization_config=quant_config, device_map={"": 0})
    loaded_tokenizer = AutoTokenizer.from_pretrained(hf_model)

    print(f'Running: {experiment}')
    prompt_insert = "Answer this question truthfully:"

    try:
        # Generation samples tokens, so seed torch to make runs repeatable.
        torch.manual_seed(seed)

        if experiment == 'k_shot':
            k_shot_examples = ds.select(range(sample, sample + k_shot))
            prompt_insert = _build_k_shot_prompt(k_shot_examples, prompt_insert)

        # Both experiments score the same evaluation rows, so the result lists
        # always line up with eval_sample when the columns are added below.
        results = process_samples(eval_sample, hf_model, prompt_insert, loaded_model, loaded_tokenizer, max_k_shot_token_length)
    finally:
        # Release GPU memory so the next model/quantization variant can be loaded.
        del loaded_model
        gc.collect()
        torch.cuda.empty_cache()

    eval_sample = eval_sample.add_column('prompt_tokenization', results['new_tokenizations'])
    eval_sample = eval_sample.add_column('qa_dataset', results['original_dataset'])
    eval_sample = eval_sample.add_column('predicted', results['predicted'])

    return eval_sample

def process_samples(sample_data, model_name, prompt_insert, model, tokenizer, max_length):
    """Insert an instruction into each prompt and sample a completion from the model.

    Args:
        sample_data: Iterable of rows with ``'id'`` and ``'questions'`` string fields.
        model_name: Key selecting the marker after which ``prompt_insert`` is placed
            (one of the ids in ``model_to_insert_point``).
        prompt_insert: Text inserted, surrounded by single spaces, right after the marker.
        model: Causal LM exposing ``generate`` and ``device``.
        tokenizer: Tokenizer matching ``model``.
        max_length: Prompts with more tokens than this are skipped.

    Returns:
        Dict of three lists aligned with ``sample_data``:
        ``'new_tokenizations'`` (prompt token ids as a list, or None when skipped),
        ``'original_dataset'`` (the part of ``id`` before the first ``-``) and
        ``'predicted'`` (decoded newly generated text only, or None when skipped).

    Raises:
        KeyError: If ``model_name`` has no configured marker.
        ValueError: If a prompt does not contain the marker.
    """
    model_to_insert_point = {
        'google/gemma-7b': "user",
        'meta-llama/Llama-2-7b-hf': "<s>",
        'mistralai/Mistral-7B-v0.1': "[INST]"
    }
    # Resolve once, before any generation, so an unsupported model fails fast.
    marker = model_to_insert_point[model_name]
    bos_token = getattr(tokenizer, "bos_token", None)

    predicted = []
    original_dataset = []
    new_tokenizations = []

    for example in sample_data:
        text = example['questions']
        marker_start = text.find(marker)
        if marker_start == -1:
            # find() returns -1 when absent; using it would insert at a wrong offset.
            raise ValueError(f"Marker {marker!r} not found in prompt for example {example['id']!r}")
        insertion_point = marker_start + len(marker)
        new_text = text[:insertion_point] + " " + prompt_insert + " " + text[insertion_point:]

        # The stored prompt may already start with the BOS token; do not add a second one.
        add_special_tokens = not (bos_token and new_text.startswith(bos_token))
        inputs = tokenizer(new_text, return_tensors="pt", add_special_tokens=add_special_tokens)
        prompt_length = inputs["input_ids"].size(1)

        original_dataset.append(example['id'].split('-')[0])
        if prompt_length > max_length:
            predicted.append(None)
            new_tokenizations.append(None)
            continue

        # Inputs must live on the same device as the model.
        inputs = inputs.to(model.device)
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=100, do_sample=True, top_k=10, top_p=.1,
        )
        # generate() returns prompt + continuation; keep only the continuation.
        generated_tokens = outputs[0][prompt_length:]
        predicted.append(tokenizer.decode(generated_tokens, skip_special_tokens=True))
        # Store plain Python ints so the list can become a dataset column.
        new_tokenizations.append(inputs["input_ids"][0].tolist())

    return {'new_tokenizations': new_tokenizations, 'original_dataset': original_dataset, 'predicted': predicted}

