In [None]:
# Mount Google Drive inside the Colab VM; the dataset paths used later in this
# notebook live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install datasets
!pip install accelerate
!pip install bitsandbytes
!pip install peft
!pip install trl
!pip install transformers
!pip install evaluate
!pip install rouge_score
!pip install bert_score

Collecting datasets
  Downloading datasets-2.19.0-py3-none-any.whl (542 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/542.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━[0m [32m245.8/542.0 kB[0m [31m7.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m23.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0

In [None]:
import os
import gc
import json
import torch
import logging
import pandas as pd

from collections import defaultdict
from datasets import Dataset
import datasets
import bitsandbytes
import evaluate
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, HfArgumentParser, TrainingArguments, pipeline
from trl import SFTTrainer
# Metric objects from the `evaluate` library, loaded once for the whole notebook.
rouge = evaluate.load("rouge")
# NOTE: despite its name, this object is the BERTScore metric ("bertscore"),
# not a plain cosine-similarity function.
cosine_similarity = evaluate.load("bertscore")

In [None]:
# Quantization presets (bitsandbytes), meant to be passed to `load_model` as `bnb_config`.

# 4-bit loading with bfloat16 compute dtype. For QLORA
CONFIG_4BITS = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# 4-bit NF4 loading with float16 compute dtype. For QLORA and GEMMA
CONFIG_4BITS_NORM = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# Same as CONFIG_4BITS_NORM plus double (nested) quantization. For QLORA and GEMMA
CONFIG_4BITS_NORM_NESTED = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

# 4-bit loading with double (nested) quantization. For QLORA
CONFIG_4BITS_NESTED = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
)

# 8-bit loading.
CONFIG_8BITS = BitsAndBytesConfig(load_in_8bit=True)

In [None]:
# Load utils
def load_processed_dataset(file_path:str) -> Dataset:
    """Load a processed (untokenized) split from a JSON file.

    The file's top-level JSON value must unpack into exactly three elements,
    in this order: ids, questions, answers. They become the dataset columns.

    Args:
        file_path: Path to the JSON file.

    Returns:
        A `Dataset` with the columns 'id', 'questions' and 'answers'.

    Raises:
        ValueError: If the top-level JSON value does not unpack into exactly
            three elements.
    """
    # JSON is UTF-8 by specification; do not rely on the platform default encoding.
    with open(file_path, 'r', encoding='utf-8') as fp:
        # Named `ids` (not `id`) so the builtin `id` is not shadowed.
        ids, questions, answers = json.load(fp)

    return Dataset.from_dict({
        'id': ids,
        'questions': questions,
        'answers': answers,
    })

def load_tokenized_dataset(file_path:str) -> Dataset:
    """Load a tokenized split from a JSON file.

    The file's top-level JSON value must unpack into exactly five elements,
    in this order: ids, questions, answers, text, input ids. They become the
    dataset columns.

    Args:
        file_path: Path to the JSON file.

    Returns:
        A `Dataset` with the columns 'id', 'questions', 'answers', 'text'
        and 'input_ids'.

    Raises:
        ValueError: If the top-level JSON value does not unpack into exactly
            five elements.
    """
    # JSON is UTF-8 by specification; do not rely on the platform default encoding.
    with open(file_path, 'r', encoding='utf-8') as fp:
        # Named `ids` (not `id`) so the builtin `id` is not shadowed.
        ids, questions, answers, text, input_ids = json.load(fp)

    return Dataset.from_dict({
        'id': ids,
        'questions': questions,
        'answers': answers,
        'text': text,
        'input_ids': input_ids,
    })

def load_datasets_from_directory(directory_path: str, type='tokenized') -> tuple:
    """Load the train/dev/test splits stored in one directory.

    Args:
        directory_path: Directory whose contents must be exactly
            train.json, dev.json and test.json.
        type: 'tokenized' loads each file with `load_tokenized_dataset`;
            any other value loads it with `load_processed_dataset`.

    Returns:
        A tuple `(train_dataset, dev_dataset, test_dataset)`.

    Raises:
        ValueError: If the directory listing differs from the three expected files.
    """
    expected_files = {"train.json", "dev.json", "test.json"}
    if set(os.listdir(directory_path)) != expected_files:
        raise ValueError(f"Directory must contain exactly these files: {expected_files}")

    # Choose the loader once instead of repeating the three calls per branch.
    loader = load_tokenized_dataset if type == 'tokenized' else load_processed_dataset

    train_dataset, dev_dataset, test_dataset = (
        loader(os.path.join(directory_path, split_file))
        for split_file in ("train.json", "dev.json", "test.json")
    )
    return (train_dataset, dev_dataset, test_dataset)

def load_model(base_model: str, bnb_config:BitsAndBytesConfig=None, on_gpu:bool=False, use_cache:bool=False, pretraining_tp:int=1) -> tuple:
    """Load a causal language model together with its tokenizer.

    Args:
        base_model: Model identifier or path given to `from_pretrained`.
        bnb_config: Quantization config. Only used when `on_gpu` is True;
            it is ignored on the non-GPU path.
        on_gpu: When True, load with `quantization_config=bnb_config` and
            `device_map={"": 0}`; otherwise load with library defaults.
        use_cache: Value written to `model.config.use_cache`.
        pretraining_tp: Value written to `model.config.pretraining_tp`.

    Returns:
        A tuple `(model, tokenizer)`. The tokenizer uses its EOS token as the
        padding token and pads on the right.
    """
    if on_gpu:
        base_model_loaded = AutoModelForCausalLM.from_pretrained(base_model, quantization_config=bnb_config, device_map={"": 0})
        print(base_model)
    else:
        base_model_loaded = AutoModelForCausalLM.from_pretrained(base_model)

    base_model_loaded.config.use_cache = use_cache
    base_model_loaded.config.pretraining_tp = pretraining_tp

    # NOTE(review): trust_remote_code=True allows code from the model repo to run;
    # confirm it is really needed for the models used here.
    tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
    # Reuse EOS as the padding token and pad on the right.
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

    return base_model_loaded, tokenizer

# Input text preprocessing
def preprocess_prompt_icl(hf_model: str, ds: Dataset, experiment, k_shot: int=1,
               max_k_shot_token_length=200, seed=42, sample: int=1000):
    """Sample an evaluation subset and attach prompt-augmented tokenizations.

    The dataset is shuffled with `seed`, the first `sample` rows are selected
    and an instruction is inserted into every selected question through
    `process_samples`.

    Args:
        hf_model: Model identifier; used to load the tokenizer and to pick the
            insertion marker inside `process_samples`.
        ds: Dataset with at least the columns 'id' and 'questions'
            ('text' is also read for the 'k_shot' experiment).
        experiment: 'zero_shot' or 'k_shot'.
        k_shot: Number of example questions added to the prompt ('k_shot' only).
        max_k_shot_token_length: Maximum token length of the 'text' field for
            an example to be eligible as a k-shot example.
        seed: Shuffle seed.
        sample: Number of rows to evaluate; capped at the dataset size.

    Returns:
        The sampled dataset with the extra columns produced by
        `process_samples` ('prompt_tokenizations', 'original_dataset').

    Raises:
        ValueError: If `experiment` is unknown, or if fewer than `k_shot`
            examples satisfy the token-length filter.
    """
    # Fail early: previously an unknown experiment only surfaced at the end as
    # an unbound `results` variable.
    if experiment not in ('zero_shot', 'k_shot'):
        raise ValueError(f"Unknown experiment {experiment!r}; expected 'zero_shot' or 'k_shot'")

    ds = ds.shuffle(seed=seed)
    # Never request more rows than the dataset holds.
    eval_sample = ds.select(range(min(sample, len(ds))))

    # NOTE(review): `device_map` is a model-loading argument and is presumably
    # ignored by the tokenizer; confirm and drop it.
    loaded_tokenizer = AutoTokenizer.from_pretrained(hf_model, device_map={"": 0})

    def filter_by_token_length(example):
        # Keep only examples whose full text fits the k-shot token budget.
        tokens = loaded_tokenizer(example['text'], return_tensors="pt", truncation=False)
        return tokens.input_ids.size(1) <= max_k_shot_token_length

    print(f'Running prompt injection for: {experiment}')

    if experiment == 'zero_shot':
        prompt_insert = "Answer the question truthfully:"
    else:
        filtered_dataset_for_k_shot = ds.filter(filter_by_token_length)
        print(f"Number of examples in the dataset: {len(filtered_dataset_for_k_shot)}")
        if len(filtered_dataset_for_k_shot) < k_shot:
            raise ValueError(f"Dataset has less than {k_shot} examples")

        # NOTE(review): the examples come from the same shuffled dataset as
        # `eval_sample` (so they may overlap with it), and only the 'questions'
        # column is inserted, without the answers. Confirm both are intended.
        prompt_insert = "Answer the question truthfully. Follow these examples:"
        prompt_insert += "\n".join(filtered_dataset_for_k_shot['questions'][:k_shot])
        prompt_insert += "\n"
        prompt_insert += 'Question:'

    results = process_samples(eval_sample, hf_model, prompt_insert, loaded_tokenizer)
    # Column-wise concatenation: the prompt columns are attached to the sampled rows.
    return datasets.concatenate_datasets([eval_sample, results], axis=1)

def process_samples(sample_data, model_name, prompt_insert, tokenizer):
    """Insert an instruction into every question and tokenize the result.

    The instruction is placed directly after a model-specific marker inside
    the question text, surrounded by single spaces.

    Args:
        sample_data: Iterable of examples with the keys 'id' and 'questions'.
        model_name: One of 'google/gemma-7b', 'meta-llama/Llama-2-7b-hf',
            'mistralai/Mistral-7B-v0.1'.
        prompt_insert: Instruction text to insert.
        tokenizer: Callable used as `tokenizer(text, return_tensors="pt")`;
            its `.input_ids` are stored.

    Returns:
        A `Dataset` with the columns 'prompt_tokenizations' and
        'original_dataset' (the part of the example id before the first '-').

    Raises:
        KeyError: If `model_name` has no known insertion marker.
        ValueError: If a question does not contain the marker. Previously the
            prompt was silently inserted at a wrong offset in that case.
    """
    model_to_insert_point = {
        'google/gemma-7b': "user",
        'meta-llama/Llama-2-7b-hf': "<s>",
        'mistralai/Mistral-7B-v0.1': "[INST]"
    }
    marker = model_to_insert_point[model_name]

    original_dataset = []
    new_tokenizations = []

    for example in sample_data:
        text = example['questions']
        marker_start = text.find(marker)
        if marker_start == -1:
            raise ValueError(
                f"Insertion marker {marker!r} for model {model_name!r} not found "
                f"in the question of example {example['id']!r}"
            )
        insertion_point = marker_start + len(marker)
        new_text = text[:insertion_point] + " " + prompt_insert + " " + text[insertion_point:]

        # NOTE(review): the tokenizations recorded in this notebook start with
        # two identical leading ids, which suggests that a BOS token already
        # present in the text is added a second time by the tokenizer. Confirm
        # whether `add_special_tokens=False` is wanted here.
        inputs = tokenizer(new_text, return_tensors="pt")
        original_dataset.append(example['id'].split('-')[0])
        new_tokenizations.append(inputs.input_ids)

    # The debug dump of every tokenization was removed; it flooded the cell output.
    return Dataset.from_dict({
        'prompt_tokenizations': new_tokenizations,
        'original_dataset': original_dataset,
    })


# Predict

def predict(trained_model:SFTTrainer, tokenizer:AutoTokenizer, eval_sample:Dataset, model_name:str, prompted:bool=False, max_new_tokens:int=20):
    """Generate one continuation per example and return the decoded texts.

    Args:
        trained_model: Object exposing `.generate(...)`. It is annotated as
            `SFTTrainer`, but this notebook passes the causal LM returned by
            `load_model`.
        tokenizer: Tokenizer used to decode the generated sequences.
        eval_sample: Dataset holding the token-id column to read.
        model_name: Unused here; kept for interface compatibility.
        prompted: When True read 'prompt_tokenizations', otherwise 'input_ids'.
        max_new_tokens: Maximum number of tokens generated per example.

    Returns:
        A list of decoded strings, one per example. Each string is the full
        decoded sequence returned by `generate`, with special tokens skipped.

    Raises:
        AssertionError: If the required token-id column is missing.
    """
    token_col = 'prompt_tokenizations' if prompted else 'input_ids'
    assert token_col in list(eval_sample.features.keys()), f"Eval Data needs the following column: '{token_col}', but instead has { list(eval_sample.features.keys()) }"

    # Build the inputs on the model's device when it exposes one; None keeps
    # torch's default placement.
    device = getattr(trained_model, "device", None)
    # Pass the pad token id explicitly so `generate` does not have to guess it.
    pad_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id

    predictions = []
    for token_ids in eval_sample[token_col]:
        input_ids = torch.tensor(token_ids, dtype=torch.long, device=device)
        if input_ids.dim() == 1:
            # `generate` expects a batch dimension.
            input_ids = input_ids.unsqueeze(0)

        sequences = trained_model.generate(
            input_ids,
            # Each call holds one unpadded sequence, so every position is attended to.
            attention_mask=torch.ones_like(input_ids),
            pad_token_id=pad_token_id,
            max_new_tokens=max_new_tokens,
        )
        predictions.append(tokenizer.batch_decode(sequences, skip_special_tokens=True)[0])

    return predictions

def prediction_wrapper(trained_model:SFTTrainer, tokenizer:AutoTokenizer, ds:Dataset, model_name:str, add_prompt:bool=False, sample:int=1000, seed:int=42, save_path:str=''):
    """Select evaluation examples, optionally add the zero-shot prompt, and predict.

    Args:
        trained_model: Model forwarded to `predict`.
        tokenizer: Tokenizer forwarded to `predict`.
        ds: Dataset with an 'id' column plus the token-id column `predict` reads.
        model_name: Model identifier forwarded to `preprocess_prompt_icl` and `predict`.
        add_prompt: When True, build zero-shot prompt tokenizations with
            `preprocess_prompt_icl` and predict from them.
        sample: Number of examples to evaluate after shuffling with `seed`
            (capped at the dataset size). A value of 0 or less uses every example.
        seed: Shuffle seed.
        save_path: Currently unused.

    Returns:
        The list of decoded predictions returned by `predict`.
    """
    def add_dataset_name_col(data):
        # The source dataset name is the part of the example id before the first '-'.
        names = [example['id'].split('-')[0] for example in data]
        return datasets.concatenate_datasets([data, Dataset.from_dict({'original_dataset': names})], axis=1)

    n_rows = len(ds)
    if add_prompt:
        # Pass the dataset by keyword. The previous positional call handed the
        # tokenizer in as `ds` and the dataset in as `experiment`.
        n_examples = min(sample, n_rows) if sample > 0 else n_rows
        eval_sample = preprocess_prompt_icl(model_name, ds=ds, experiment='zero_shot', sample=n_examples, seed=seed)
    elif sample > 0:
        shuffled = ds.shuffle(seed=seed)
        eval_sample = add_dataset_name_col(shuffled.select(range(min(sample, n_rows))))
    else:
        # Whole dataset, original order. This branch used to fail because the
        # helper read an unbound outer variable instead of its argument.
        eval_sample = add_dataset_name_col(ds)

    predictions = predict(trained_model, tokenizer, eval_sample, model_name, prompted=add_prompt)
    return predictions

In [None]:
# Interactive Hugging Face Hub login; the access token is entered in the widget
# this call renders, so it is never written into the notebook.
from huggingface_hub import notebook_login
notebook_login() # use your access token here!

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Load tokenized test datasets

# Folder on the mounted Drive that holds one sub-folder of tokenized splits per model.
# Change it in this one place when the data moves.
TOKENIZED_DATA_DIR = "/content/drive/MyDrive/Efficient LLM Benchmarks/UnifiedQA Data Curation/tokenized_NEW"

# Only the Mistral test split is loaded; uncomment a line to evaluate another model.
# gemma_test = load_tokenized_dataset(os.path.join(TOKENIZED_DATA_DIR, "Gemma_NEW", "test.json"))
# llama_test = load_tokenized_dataset(os.path.join(TOKENIZED_DATA_DIR, "Llama_NEW", "test.json"))
mistral_test = load_tokenized_dataset(os.path.join(TOKENIZED_DATA_DIR, "Mistral_NEW", "test.json"))

In [None]:
# Add prompt for zero-shot (regular inference prompt)
# Only the Mistral call is active. The Gemma and Llama calls are commented out,
# so `gemma_test_processed` and `llama_test_processed` are not created by this cell.
# gemma_test_processed = preprocess_prompt_icl("google/gemma-7b", ds=gemma_test, experiment='zero_shot', sample=5)
# del gemma_test
# llama_test_processed = preprocess_prompt_icl("meta-llama/Llama-2-7b-hf", ds=llama_test, experiment='zero_shot', sample=10)
mistral_test_processed = preprocess_prompt_icl("mistralai/Mistral-7B-v0.1", ds=mistral_test, experiment='zero_shot', sample=20)

Running prompt injection for: zero_shot
[[[1, 1, 733, 16289, 28793, 26307, 272, 2996, 5307, 3071, 28747, 28705, 690, 2170, 3222, 9105, 1074, 1272, 1485, 645, 298, 3408, 354, 3601, 1098, 771, 28804, 414, 28711, 325, 28708, 28731, 17336, 381, 325, 28726, 28731, 2367, 4042, 1089, 28712, 296, 325, 28717, 28731, 7255, 28718, 1254, 325, 28715, 28731, 484, 5638, 410, 4081, 733, 28748, 16289, 28793, 13, 13]], [[1, 1, 733, 16289, 28793, 26307, 272, 2996, 5307, 3071, 28747, 28705, 693, 1235, 277, 14468, 2136, 298, 913, 778, 17663, 28742, 28713, 26210, 1491, 1729, 28804, 414, 28711, 28705, 264, 15451, 4463, 4580, 28725, 461, 1953, 1780, 1359, 28764, 28725, 349, 21670, 3101, 298, 3168, 395, 396, 7515, 3088, 1938, 3142, 486, 264, 21296, 843, 13985, 2971, 438, 516, 9585, 28723, 3153, 12414, 25646, 17663, 1191, 8377, 6694, 1002, 28725, 304, 272, 865, 12236, 349, 277, 14468, 23536, 479, 28725, 1359, 28764, 28742, 28713, 4505, 720, 840, 16391, 304, 264, 9311, 7092, 392, 693, 659, 4241, 264, 7092, 369, 

Flattening the indices:   0%|          | 0/20 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/20 [00:00<?, ? examples/s]

In [None]:
# Show the id, question and answer of the first processed Gemma example.
# NOTE: `gemma_test_processed` only exists when the Gemma preprocessing line in
# the zero-shot cell is uncommented and run first.
print(gemma_test_processed['id'][0])
print('--------------------------------------------------------------')
print(gemma_test_processed['questions'][0])
print('--------------------------------------------------------------')
print(gemma_test_processed['answers'][0])
print('----------------------------------------------------------------------------')

arc_easy-test-1898
--------------------------------------------------------------
<bos><start_of_turn>user
which organelle converts glucose to energy for cellular work? \n (a) nucleus (b) mitochondrion (c) vacuole (d) chloroplast<end_of_turn>


--------------------------------------------------------------
<start_of_turn>model
mitochondrion<end_of_turn>
----------------------------------------------------------------------------


In [None]:
# Show the id, question, answer and prompt tokenization of the first processed Llama example.
# NOTE: `llama_test_processed` only exists when the Llama preprocessing line in
# the zero-shot cell is uncommented and run first.
print(llama_test_processed['id'][0])
print('--------------------------------------------------------------')
print(llama_test_processed['questions'][0])
print('--------------------------------------------------------------')
print(llama_test_processed['answers'][0])
print('----------------------------------------------------------------------------')
print(llama_test_processed['prompt_tokenizations'][0])

arc_easy-test-1898
--------------------------------------------------------------
<s>Input:
which organelle converts glucose to energy for cellular work? \n (a) nucleus (b) mitochondrion (c) vacuole (d) chloroplast


--------------------------------------------------------------
Output:
mitochondrion
----------------------------------------------------------------------------
[[1, 1, 29871, 673, 278, 1139, 8760, 3730, 29901, 10567, 29901, 13, 4716, 2894, 1808, 29436, 3144, 1682, 852, 304, 5864, 363, 3038, 1070, 664, 29973, 320, 29876, 313, 29874, 29897, 22699, 375, 313, 29890, 29897, 1380, 2878, 898, 29878, 291, 313, 29883, 29897, 11757, 29884, 1772, 313, 29881, 29897, 521, 5095, 459, 4230, 13, 13]]


In [None]:
# print(mistral_test_processed['id'][0])
# print('--------------------------------------------------------------')
# print(mistral_test_processed['questions'][0])
# print('--------------------------------------------------------------')
# print(mistral_test_processed['answers'][0])
# print('----------------------------------------------------------------------------')

31317
--------------------------------------------------------------
narrativeqa-test-0
--------------------------------------------------------------
<bos><start_of_turn>user
who is mark hunter? \n  mark hunter (slater), a high school student in a sleepy suburb of phoenix, arizona, starts an fm pirate radio station that broadcasts from the basement of his parents' house. mark is a loner, an outsider, whose only outlet for his teenage angst and aggression is his unauthorized radio station. his pirate station's theme song is "everybody knows" by leonard cohen and there are glimpses of cassettes by such alternative musicians as the jesus and mary chain, camper van beethoven, primal scream, soundgarden, ice-t, bad brains, concrete blonde, henry rollins, and the pixies. by day, mark is seen as a loner, hardly talking to anyone around him; by night, he expresses his outsider views about what is wrong with american society. when he speaks his mind about what is going on at his school and in 

In [None]:
# Load models
# Only Mistral is loaded here (4-bit NF4 with nested quantization, on GPU 0, KV cache enabled).
# The Gemma and Llama lines are kept commented out for switching models.
# gemma_model, gemma_tokenizer = load_model(base_model="google/gemma-7b", bnb_config=CONFIG_4BITS_NORM, on_gpu=True, use_cache=True, pretraining_tp=1)
# llama_model, llama_tokenizer = load_model(base_model="meta-llama/Llama-2-7b-hf", bnb_config=CONFIG_4BITS_NORM, on_gpu=True, use_cache=True, pretraining_tp=1)
mistral_model, mistral_tokenizer = load_model(base_model="mistralai/Mistral-7B-v0.1", bnb_config=CONFIG_4BITS_NORM_NESTED, on_gpu=True, use_cache=True,  pretraining_tp=1) # Andrey


in here


config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

mistralai/Mistral-7B-v0.1


In [None]:
# This is the function that still needs to be updated for stripping for each model
import re
def _normalize_answer(text: str, drop_shell_noise: bool = False) -> str:
    """Keep ASCII letters and whitespace, optionally drop 'binbash'/'echo', collapse whitespace."""
    text = re.sub(r'[^a-zA-Z\s]+', '', text)
    if drop_shell_noise:
        text = re.sub(r'\bbinbash\b|\becho\b', '', text, flags=re.IGNORECASE)
    return re.sub(r'\s+', ' ', text).strip()


def strip_output_text(output:str, model_name:str):
    """Extract the answer part of a decoded generation and normalize it.

    The decoded text also contains the input prompt. The answer is taken as
    the text between a model-specific start marker and end marker, then
    reduced to ASCII letters separated by single spaces.

    Args:
        output: Decoded model output (prompt plus generated continuation).
        model_name: 'google/gemma-7b', 'meta-llama/Llama-2-7b-hf' or
            'mistralai/Mistral-7B-v0.1'.

    Returns:
        The normalized answer string.

    Raises:
        ValueError: If `model_name` is not one of the supported models
            (previously `None` was returned silently).
    """
    if model_name == 'google/gemma-7b':
        # NOTE(review): the first occurrence of "model" is used as the start of
        # the answer; a question that itself contains "model" would be cut too
        # early. Confirm against real Gemma outputs.
        start_marker, end_marker, drop_shell_noise = "model", "Explanation", False
    elif model_name in ('meta-llama/Llama-2-7b-hf', 'mistralai/Mistral-7B-v0.1'):
        # NOTE(review): Mistral reuses the Llama "Output:" marker here; confirm
        # that Mistral generations really contain it.
        start_marker, end_marker, drop_shell_noise = "Output:", "\n\n", True
    else:
        raise ValueError(f"Unsupported model_name {model_name!r}")

    marker_pos = output.find(start_marker)
    # Without a start marker fall back to the beginning of the text instead of
    # slicing from an offset derived from -1.
    start_idx = 0 if marker_pos == -1 else marker_pos + len(start_marker)

    end_idx = output.find(end_marker, start_idx)
    if end_idx == -1:
        # No end marker: keep everything up to the end. Slicing to -1 would
        # drop the last character.
        end_idx = len(output)

    return _normalize_answer(output[start_idx:end_idx], drop_shell_noise)

In [None]:
# Run unprompted inference (add_prompt=False) on 10 shuffled test examples.
# Only the Mistral call is active, so only `predictions_mistral` is created by this cell.
# predictions_gemma = prediction_wrapper(gemma_model, gemma_tokenizer, gemma_test, 'google/gemma-7b', add_prompt=False, sample=10)
# predictions_llama = prediction_wrapper(llama_model, llama_tokenizer, llama_test, "meta-llama/Llama-2-7b-hf", add_prompt=False, sample=10)
predictions_mistral = prediction_wrapper(mistral_model, mistral_tokenizer, mistral_test, "mistralai/Mistral-7B-v0.1", add_prompt=False, sample=10)

Flattening the indices:   0%|          | 0/10 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/10 [00:00<?, ? examples/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attentio

In [None]:
# Reduce every raw Mistral generation to its normalized answer text.
# The predictions and the model name given to `strip_output_text` must belong to
# the same model. The previous version stripped `predictions_llama`, which no
# active cell creates, using the Mistral model name.
predictions_stripped = [
    strip_output_text(prediction, 'mistralai/Mistral-7B-v0.1')
    for prediction in predictions_mistral
]
predictions_stripped

['mitochondrion',
 'lt nielsen',
 'the civil war',
 'dantes inferno',
 'he gets an eye transplant to avoid the optical recognition program he gets an eye transplant to avoid the optical recognition program',
 'natural laws can explain everything in nature',
 'it was once underwater',
 'from a daisys leaves into its underground support system from roots to leaves of a daffodil from a roses leaves to the',
 'on the observation deck of the ge building david and elise kiss',
 'john hull']

In [None]:
# Display the raw Llama generations (prompt plus continuation).
# NOTE: `predictions_llama` only exists when the Llama prediction line above is uncommented and run.
predictions_llama

['Input:\nwhich organelle converts glucose to energy for cellular work? \\n (a) nucleus (b) mitochondrion (c) vacuole (d) chloroplast\n\n Output:\nmitochondrion\n\nExplanation:\n\\begin{itemize}\n\\item The mitochondr',
 "Input:\nwho does catherine pay to look into nick's psychiatric file? \\n  a retired rock star, johnny boz, is stabbed to death with an ice pick during sex by a mysterious blonde woman at his apartment. homicide detective nick curran investigates, and the only suspect is catherine tramell, boz's bisexual girlfriend and a crime novelist who has written a novel that mirrors the crime. it is concluded that either catherine herself did it or someone trying to frame her out of spite. tramell is uncooperative and taunting in the investigation, smoking in the interrogation room and exposing her bare genitalia in front of the officers. she presents alibis and passes a lie detector test. nick discovers that catherine has a habit of befriending murderers, including her girlfrien

In [None]:
# Display the normalized answers again, for comparison with the raw generations.
predictions_stripped

['mitochondrion',
 'lt nielsen',
 'the civil war',
 'dantes inferno',
 'he gets an eye transplant to avoid the optical recognition program he gets an eye transplant to avoid the optical recognition program',
 'natural laws can explain everything in nature',
 'it was once underwater',
 'from a daisys leaves into its underground support system from roots to leaves of a daffodil from a roses leaves to the',
 'on the observation deck of the ge building david and elise kiss',
 'john hull']

In [None]:
# Predict from the prompt-augmented Llama sample.
# Fixes the misspelled names (`llama_tokensizer`, `llama_test_preprocessed`) and
# supplies the required `model_name` argument.
# NOTE(review): prompted=True is assumed because `llama_test_processed` carries the
# 'prompt_tokenizations' column; confirm. The Llama model and dataset cells must be
# uncommented and run before this cell.
pred = predict(llama_model, llama_tokenizer, llama_test_processed, "meta-llama/Llama-2-7b-hf", prompted=True)