In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install datasets
!pip install accelerate
!pip install bitsandbytes
!pip install peft
!pip install trl
!pip install transformers
!pip install evaluate
!pip install rouge_score
!pip install bert_score

Collecting datasets
  Downloading datasets-2.19.0-py3-none-any.whl (542 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/542.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━[0m [32m256.0/542.0 kB[0m [31m7.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.

In [3]:
import os
import time
import gc
import json
import torch
import logging
import pandas as pd

from collections import defaultdict
from datasets import Dataset
import datasets
import bitsandbytes
import evaluate
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, HfArgumentParser, TrainingArguments, pipeline
from trl import SFTTrainer
rouge = evaluate.load("rouge")
cosine_similarity = evaluate.load("bertscore")
perplexity = evaluate.load("perplexity", module_type="metric")

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/8.46k [00:00<?, ?B/s]

In [4]:
# Quantization
CONFIG_4BITS = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16) # For QLORA
CONFIG_4BITS_NORM = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=getattr(torch, "float16")) # For QLORA and GEMMA
CONFIG_4BITS_NORM_NESTED = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=getattr(torch, "float16"), bnb_4bit_use_double_quant=True) # For QLORA and GEMMA
CONFIG_4BITS_NESTED = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_use_double_quant=True) # For QLORA
CONFIG_8BITS = BitsAndBytesConfig(load_in_8bit=True)

In [30]:
# Load utils
def load_processed_dataset(file_path:str) -> Dataset:
    data_dict = {}
    with open(file_path, 'r') as fp:
        id, questions, answers = json.load(fp)

        data_dict['id'] = id
        data_dict['questions'] = questions
        data_dict['answers'] = answers

    return Dataset.from_dict(data_dict)

def load_tokenized_dataset(file_path:str) -> Dataset:
    data_dict = {}
    with open(file_path, 'r') as fp:
        id, questions, answers, text, input_id = json.load(fp)

        data_dict['id'] = id
        data_dict['questions'] = questions
        data_dict['answers'] = answers
        data_dict['text'] = text
        data_dict['input_ids'] = input_id


    return Dataset.from_dict(data_dict)

def load_datasets_from_directory(directory_path: str, type='tokenized') -> tuple:

    expected_files = {"train.json", "dev.json", "test.json"}
    actual_files = set(os.listdir(directory_path))

    if expected_files != actual_files:
        raise ValueError(f"Directory must contain exactly these files: {expected_files}")

    if type == 'tokenized':
        train_dataset = load_tokenized_dataset(os.path.join(directory_path, "train.json"))
        dev_dataset = load_tokenized_dataset(os.path.join(directory_path, "dev.json"))
        test_dataset = load_tokenized_dataset(os.path.join(directory_path, "test.json"))
    else:
        train_dataset = load_processed_dataset(os.path.join(directory_path, "train.json"))
        dev_dataset = load_processed_dataset(os.path.join(directory_path, "dev.json"))
        test_dataset = load_processed_dataset(os.path.join(directory_path, "test.json"))

    return (train_dataset, dev_dataset, test_dataset)

def load_model(base_model: str, bnb_config:BitsAndBytesConfig=None, on_gpu:bool=False, use_cache:bool=False, pretraining_tp:int=1) -> AutoModelForCausalLM:
    if on_gpu:
        print("in here")
        base_model_loaded = AutoModelForCausalLM.from_pretrained(base_model, quantization_config=bnb_config, device_map={"": 0})
        print(base_model)
    else:
        base_model_loaded = AutoModelForCausalLM.from_pretrained(base_model)

    base_model_loaded.config.use_cache = use_cache
    base_model_loaded.config.pretraining_tp = pretraining_tp

    tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

    return base_model_loaded, tokenizer

# Input text preprocessing
def preprocess_prompt_icl(hf_model: str, loaded_tokenizer:AutoTokenizer, ds: Dataset, experiment, k_shot: int=1,
               max_k_shot_token_length=200, seed=42, sample: int=1000):
    ds = ds.shuffle(seed=seed)
    eval_sample = ds.select(range(sample))

    def filter_by_token_length(example):
        tokens = loaded_tokenizer(example['text'], return_tensors="pt", truncation=False)
        return tokens.input_ids.size(1) <= max_k_shot_token_length

    print(f'Running prompt injection for: {experiment}')
    prompt_insert = "Answer this question truthfully:"

    if experiment == 'zero_shot':
        prompt_insert = "Answer the question truthfully:"
        results = process_samples(eval_sample, hf_model, prompt_insert, loaded_tokenizer)

    elif experiment == 'k_shot':
        filtered_dataset_for_k_shot =  ds.filter(filter_by_token_length)
        print(f"Number of examples in the dataset: {len(filtered_dataset_for_k_shot)}")
        if len(filtered_dataset_for_k_shot) < k_shot:
            raise ValueError(f"Dataset has less than {k_shot} examples")

        prompt_insert = "Answer the question truthfully. Follow these examples:"
        prompt_insert += "\n".join(filtered_dataset_for_k_shot['questions'][:k_shot])
        prompt_insert += "\n"
        prompt_insert += 'Question:'

        results = process_samples(eval_sample, hf_model, prompt_insert, loaded_tokenizer)
    eval_sample = datasets.concatenate_datasets([eval_sample, results], axis=1)

    return eval_sample



def process_samples(sample_data, model_name, prompt_insert, tokenizer):
    model_to_insert_point = {
        'google/gemma-7b': "user",
        'meta-llama/Llama-2-7b-hf': "<s>",
        'mistralai/Mistral-7B-v0.1': "[INST]"
    }

    original_dataset = []
    new_tokenizations = []

    for example in sample_data:
        text = example['questions']
        insertion_point = text.find(model_to_insert_point[model_name]) + len(model_to_insert_point[model_name])
        new_text = text[:insertion_point] + " " + prompt_insert + " " + text[insertion_point:]

        inputs = tokenizer(new_text, return_tensors="pt")
        original_dataset.append(example['id'].split('-')[0])
        new_tokenizations.append(inputs.input_ids)
    processed_samples = {'prompt_tokenizations': new_tokenizations, 'original_dataset': original_dataset}
    out = Dataset.from_dict(processed_samples)
    return out



# Predict

def predict(trained_model:SFTTrainer, tokenizer:AutoTokenizer, eval_sample:Dataset, model_name:str, prompted:bool=False):
    if prompted==True:
        assert 'prompt_tokenizations' in list(eval_sample.features.keys()), f"Eval Data needs the following column: 'prompt_tokenizations', but instead has { list(eval_sample.features.keys()) }"
        token_col = 'prompt_tokenizations'
    else:
        assert 'input_ids' in list(eval_sample.features.keys()), f"Eval Data needs the following column: 'input_ids', but instead has { list(eval_sample.features.keys()) }"
        token_col = 'input_ids'
    eval_sample.set_format("torch", device="cuda")
    predictions = []
    latencies = []
    for inp in eval_sample[token_col]:
        # inp = torch.tensor(inp, dtype=int)
        start = time.time()
        outp = trained_model.generate(inp, max_new_tokens=20, return_dict_in_generate=True, output_scores=True)
        end = time.time()
        pred = tokenizer.batch_decode(outp['sequences'], skip_special_tokens=True)

        predictions.append(pred[0])
        latencies.append(end - start)

    return predictions, latencies


In [6]:
from huggingface_hub import notebook_login
notebook_login() # use your access token here!

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [8]:
# Load tokenized test datasets
gemma_test = load_tokenized_dataset(os.path.join("/content/drive/MyDrive/Efficient LLM Benchmarks/UnifiedQA Data Curation/tokenized_NEW/Gemma_NEW", "test.json"))
# llama_test = load_tokenized_dataset(os.path.join("/content/drive/MyDrive/Efficient LLM Benchmarks/UnifiedQA Data Curation/tokenized_NEW/Llama_NEW", "test.json"))


In [6]:
print(gemma_test['id'][0])
print('--------------------------------------------------------------')
print(gemma_test['questions'][0])
print('--------------------------------------------------------------')
print(gemma_test['answers'][0])
print('----------------------------------------------------------------------------')

narrativeqa-test-0
--------------------------------------------------------------
<bos><start_of_turn>user
who is mark hunter? \n  mark hunter (slater), a high school student in a sleepy suburb of phoenix, arizona, starts an fm pirate radio station that broadcasts from the basement of his parents' house. mark is a loner, an outsider, whose only outlet for his teenage angst and aggression is his unauthorized radio station. his pirate station's theme song is "everybody knows" by leonard cohen and there are glimpses of cassettes by such alternative musicians as the jesus and mary chain, camper van beethoven, primal scream, soundgarden, ice-t, bad brains, concrete blonde, henry rollins, and the pixies. by day, mark is seen as a loner, hardly talking to anyone around him; by night, he expresses his outsider views about what is wrong with american society. when he speaks his mind about what is going on at his school and in the community, more and more of his fellow students tune in to hear h

In [15]:
gemma_model, gemma_tokenizer = load_model(base_model="google/gemma-7b", bnb_config=CONFIG_4BITS_NORM, on_gpu=True, use_cache=False, pretraining_tp=1)
# llama_model, llama_tokenizer = load_model(base_model="meta-llama/Llama-2-7b-hf", bnb_config=CONFIG_4BITS_NORM, on_gpu=True, use_cache=True, pretraining_tp=1)


in here


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

google/gemma-7b


In [31]:
def strip_output_text(output:str, model_name:str):
  processed = output
  if model_name == 'google/gemma-7b':
    if 'Answer:' in processed:
      processed = processed[processed.find("Answer:")+7:]
    if 'model' in processed:
      processed = processed[processed.find('model')+5:]
    # Returns the whole input string as well; cut off this part
    for repl in ['\n',  'Step 1/', 'Step 2/',]:
        processed = processed.replace(repl, '')
    for strp in ['The answer is:']:
        processed = processed.strip(strp)

  return processed



def strip_answers(answer_text:str, model_name:str):
  out = answer_text
  if model_name == 'google/gemma-7b':
    for strp in ['<start_of_turn>model\n', '<end_of_turn>']:
      out = out.replace(strp, '')
  elif model_name == 'meta-llama/Llama-2-7b-hf':
    out = []
  elif model_name == 'mistralai/Mistral-7B-v0.1':
    out = []
  return out


def prediction_wrapper(trained_model:SFTTrainer, tokenizer:AutoTokenizer, ds:Dataset, model_name:str, add_prompt:bool=False, sample:int=1000, seed:int=42, save_path:str=''):
    def add_dataset_name_col(ds):
        original_dataset = []
        for example in ds:
            original_dataset.append(example['id'].split('-')[0])
        eval_sample = datasets.concatenate_datasets([ds, Dataset.from_dict({'original_dataset': original_dataset})], axis=1)
        return eval_sample

    if add_prompt == True and sample > 0:
         eval_sample = preprocess_prompt_icl(model_name, tokenizer, ds, experiment='zero_shot', sample=sample, seed=seed)
    elif add_prompt == False and sample > 0:
        ds = ds.shuffle(seed=seed)
        sample_data = ds.select(range(sample))
        eval_sample = add_dataset_name_col(sample_data)
    elif add_prompt == True and sample == 0:
         eval_sample = preprocess_prompt_icl(model_name, tokenizer, ds, experiment='zero_shot', sample=ds.shape[0], seed=seed)
    else:
        eval_sample = add_dataset_name_col(ds)
    print("eval_sample generated")
    predictions, latencies = predict(trained_model, tokenizer, eval_sample, model_name, prompted=add_prompt)
    print("predictions generated")
    # predictions = [predictions[i][len(eval_sample['questions'][i]):] for i in range(len(eval_sample['questions']))]
    predictions = [strip_output_text(s, model_name) for s in predictions]

    answers_stripped = [strip_answers(s, model_name) for s in eval_sample['answers']]


    pred_ds = Dataset.from_dict({
        'predictions': [p.lower() for p in predictions],
        'ground_truth':answers_stripped,
        'original_dataset':eval_sample['original_dataset'],
        'latencies': latencies})

    if len(save_path) > 0:

        dir = save_path.split('/')[:-1]
        print(os.path.join(*dir))
        os.makedirs(f"/{os.path.join(*dir)}", exist_ok=True)
        print(save_path)
        with open(save_path, "w") as f:
            json.dump([pred_ds['predictions'], pred_ds['ground_truth'], pred_ds['original_dataset'], pred_ds['latencies']], f)

    return pred_ds

def compute_accuracy(scores:list):
    num_correct = 0
    for score in scores:
        if score == 1:
            num_correct += 1

    accuracy = 100.0 * ((num_correct) / len(scores))
    return accuracy

def compute_ppl(predictions, trained_model):
    # This is currently wrong; needs a string for model_id that points to a TRAINED model (which we don't have yet)
    ppl = perplexity.compute(predictions=predictions, model_id=trained_model)['mean_perplexity']
    return ppl

def throughput(latencies:list, predictions:list):
    print('computing throughput')
    through_put = []
    for l, p in zip(latencies, predictions):
        output_tokens = len(p)
        through_put.append(output_tokens / l)

    avg_through_put = sum(through_put) / len(through_put)
    return avg_through_put

def compute_rouge(predictions:list, ground_truth:list):
    print('computing similarity for summarization')
    scores = rouge.compute(predictions=predictions, references=ground_truth, use_aggregator=False)
    return scores['rougeL'] # longest common subsequence-based ROUGE



def jaccard(str1, str2):
    if str1 == str2:
        return 1.0
    if " " in str1 or " " in str2:
        str1_split = str1.split(" ")
        str2_split = str2.split(" ")
        overlap = list(set(str1_split) & set(str2_split))
        return len(overlap) / max(len(str1_split), len(str2_split))
    else:
        return 0.0


def compute_similarity(predictions:list, ground_truth:list):
    print('computing similarity for multiple choice')
    # scores = cosine_similarity.compute(predictions=predictions, references=ground_truth, model_type="distilbert-base-uncased")
    # scores = scores['f1']
    scores = []
    for p, l in zip(predictions, ground_truth):
      scores.append(jaccard(p,l))
    return scores


def compute_scores(original_dataset:str, predictions:list, ground_truth:list):

    ds_metric_map = {
        'ai2_science_elementary': 'mc',
        'ai2_science_middle': 'mc',
        'arc_easy': 'mc',
        'arc_hard': 'mc',
        'narrativeqa': 'rouge',
        'openbookqa' : 'mc',
        'race_string': 'mc'}

    assert original_dataset in ds_metric_map, f"Please define a metric mapping for dataset {original_dataset}"

    metric = ds_metric_map[original_dataset]

    if metric == 'rouge':
        scores = compute_rouge(predictions, ground_truth)
    elif metric == 'mc':
        scores = compute_similarity(predictions, ground_truth)

    return scores


def evaluate_predictions(pred_ds:Dataset, model_name:str):

    assert list(pred_ds.features.keys()) == ['predictions', 'ground_truth', 'original_dataset', 'latencies'], f"Prediction dataset must have ['predictions', 'ground_truth', 'original_dataset'] in columns, currently has {list(pred_ds.features.keys()) }."


    original_datasets = set(pred_ds['original_dataset'])
    filt = {}
    for ds in original_datasets:
        filt[ds] = pred_ds.filter(lambda ex: ex['original_dataset'] == ds)

    scores = []
    for ds, data in filt.items():
        scores.extend(compute_scores(ds, pred_ds['predictions'], pred_ds['ground_truth']))

    accuracy = compute_accuracy(scores)

    print("computing perplexity")

    tp = throughput(pred_ds['latencies'], pred_ds['predictions'])

    return scores, accuracy, tp

def load_saved_predictions(file_path:str):
    data_dict = {}
    with open(file_path, 'r') as fp:
        predictions, ground_truth, original_dataset, latencies = json.load(fp)

        data_dict['predictions'] = predictions
        data_dict['ground_truth'] = ground_truth
        data_dict['original_dataset'] = original_dataset
        data_dict['latencies'] = latencies

    return Dataset.from_dict(data_dict)


def predict_and_evaluate(trained_model:SFTTrainer, tokenizer:AutoTokenizer, ds:Dataset, model_name:str, add_prompt:bool=False, sample:int=1000, seed:int=42, return_predictions:bool=False):
      print("calculating predictions")
      pred_ds = prediction_wrapper(trained_model, tokenizer, ds, model_name, add_prompt, sample, seed)
      print("calculating metrics")
      metrics = evaluate_predictions(pred_ds, model_name)

      if return_predictions:
        return metrics, pred_ds
      else:
        return metrics


In [32]:
# ppl = perplexity.compute(predictions=pred_ds['predictions'], model_id=gemma_model)


In [None]:
pred_ds = prediction_wrapper(gemma_model, gemma_tokenizer, gemma_test, 'google/gemma-7b', add_prompt=False, sample=100, save_path='/content/drive/MyDrive/Efficient LLM Benchmarks/Experiments/predictions/pred_test_gemma.json')

Flattening the indices:   0%|          | 0/100 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/100 [00:00<?, ? examples/s]

eval_sample generated


In [None]:
pred_ds = prediction_wrapper(gemma_model, gemma_tokenizer, gemma_test, 'google/gemma-7b', add_prompt=True, sample=100, save_path='/content/drive/MyDrive/Efficient LLM Benchmarks/Experiments/predictions/pred_test_with_prompt_gemma.json')

In [11]:
pred_ds = load_saved_predictions('/content/drive/MyDrive/Efficient LLM Benchmarks/Experiments/predictions/pred_test_gemma.json')
pred_ds.shape

(100, 4)

In [12]:
metrics = evaluate_predictions(pred_ds, 'google/gemma-7b')
scores, accuracy, thoughput = metrics
accuracy, thoughput # throughput is tokens per second

Filter:   0%|          | 0/100 [00:00<?, ? examples/s]

Filter:   0%|          | 0/100 [00:00<?, ? examples/s]

Filter:   0%|          | 0/100 [00:00<?, ? examples/s]

Filter:   0%|          | 0/100 [00:00<?, ? examples/s]

Filter:   0%|          | 0/100 [00:00<?, ? examples/s]

Filter:   0%|          | 0/100 [00:00<?, ? examples/s]

Filter:   0%|          | 0/100 [00:00<?, ? examples/s]

computing similarity for multiple choice
computing similarity for summarization
computing similarity for multiple choice
computing similarity for multiple choice
computing similarity for multiple choice
computing similarity for multiple choice
computing similarity for multiple choice
computing perplexity
computing throughput


(28.999999999999996, 50.95321429665555)

In [59]:
metrics, pred_ds = predict_and_evaluate(gemma_model, gemma_tokenizer, gemma_test, 'google/gemma-7b', add_prompt=False, sample=50, return_predictions=True)

calculating predictions
False 50


Flattening the indices:   0%|          | 0/50 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/50 [00:00<?, ? examples/s]

eval_sample generated
predictions generated
calculating metrics


Filter:   0%|          | 0/50 [00:00<?, ? examples/s]

Filter:   0%|          | 0/50 [00:00<?, ? examples/s]

Filter:   0%|          | 0/50 [00:00<?, ? examples/s]

Filter:   0%|          | 0/50 [00:00<?, ? examples/s]

Filter:   0%|          | 0/50 [00:00<?, ? examples/s]

Filter:   0%|          | 0/50 [00:00<?, ? examples/s]

Filter:   0%|          | 0/50 [00:00<?, ? examples/s]

computing rouge
computing cosine similarity
computing cosine similarity
computing cosine similarity
computing cosine similarity
computing cosine similarity
computing cosine similarity
computing perplexity
computing throughput


In [58]:
scores, accuracy, thoughput = metrics
accuracy, thoughput

(12.5, 31.185435494372665)