## Setup

In [1]:
%pip install transformers
%pip install datasets
%pip install peft
%pip install bitsandbytes
%pip install sentencepiece

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.15.0 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.6
Collecting peft
  Downloading peft-0.7.0-py3-none-any.whl (168 kB)
[2K     [90m━━━━━━━━

In [2]:
import pandas as pd
import numpy as np
import os
import torch
from tqdm import tqdm
from datasets import Dataset
from datasets import load_dataset, concatenate_datasets
from transformers import GPT2LMHeadModel, GPT2TokenizerFast
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import AutoPeftModelForCausalLM
from transformers import pipeline
from transformers import BitsAndBytesConfig
import sentencepiece

In [3]:
import warnings

from google.colab import drive

# Attach Google Drive at /content/drive, forcing a fresh mount if one already exists.
drive.mount('/content/drive', force_remount=True)

# Suppress every Python warning from this point on.
warnings.simplefilter("ignore")

Mounted at /content/drive


In [4]:
%cd drive/MyDrive/CS\ 182\ Final\ Project/Phase\ 3

/content/drive/.shortcut-targets-by-id/1hzhdcGA40OipfzF0SRT7omKvzCSi0q4r/CS 182 Final Project/Phase 3


## Dataset

In [5]:
def add_label_train(example):
  """Mark a row as coming from the training split.

  Sets (or overwrites) the 'dataset' key of `example` to 'train' in place and
  returns the same mapping, which is the shape `Dataset.map` expects.
  """
  example.update(dataset='train')
  return example

def add_label_val(example):
  """Mark a row as coming from the validation split.

  Sets (or overwrites) the 'dataset' key of `example` to 'val' in place and
  returns the same mapping, which is the shape `Dataset.map` expects.
  """
  example.update(dataset='val')
  return example

dataset_path = "qwedsacf/grade-school-math-instructions"

# Load the hub dataset's "train" split and carve 15% of it off as validation.
data = load_dataset(dataset_path, split="train").train_test_split(test_size=0.15, seed=123)

# Perplexity evaluation: hold out 500 rows from each side of the split.
training_data = data['train'].train_test_split(test_size=500, seed=123)
validation_data = data['test'].train_test_split(test_size=500, seed=123)

# *_humaneval subsets: a separate, smaller draw (50 rows per side, different seed)
# taken from the same 85/15 split.
training_data_humaneval = data['train'].train_test_split(test_size=50, seed=321)
validation_data_humaneval = data['test'].train_test_split(test_size=50, seed=321)

# Tag each held-out row with the side it came from, then stack both sides.
test_data = concatenate_datasets([
    training_data['test'].map(add_label_train),
    validation_data['test'].map(add_label_val),
])

test_data_humaneval = concatenate_datasets([
    training_data_humaneval['test'].map(add_label_train),
    validation_data_humaneval['test'].map(add_label_val),
])

test_data

Downloading readme:   0%|          | 0.00/852 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/2.55M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/8792 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Dataset({
    features: ['INSTRUCTION', 'RESPONSE', 'SOURCE', 'dataset'],
    num_rows: 1000
})

In [6]:
# preconditioning = 'You are a helpful assistant. Given a word problem, you need to break it down into steps and solve them step-by-step.'
# def preprocess(example):
#   example['text'] = "<s>[INST] <<SYS>>"+preconditioning+"<</SYS>>"+example['INSTRUCTION']+"[/INST]"+example["RESPONSE"]+"</s>"
#   return example

# test_data = test_data.map(preprocess)

## Test pipeline

In [7]:
def get_test_ppl(model, tokenizer, dataset, exp_name, base_model=False, device='cuda'):
  """Score every reference response under `model` and save the results as CSV.

  For each row the prompt (system preamble + INSTRUCTION) and the full text
  (prompt + RESPONSE) are tokenized separately. The label positions covered by
  the prompt are set to -100 (the label value Hugging Face causal-LM losses
  skip), so the loss returned by the model is intended to cover only the
  response tokens.

  NOTE(review): the prompt length is measured by tokenizing the prompt on its
  own; this assumes the tokenizer splits the prompt identically when the
  response is appended -- confirm for the tokenizer in use.

  Args:
    model: called as `model(input_ids, labels=...)`; must return an object
      with a scalar `.loss` tensor.
    tokenizer: called as `tokenizer(text, return_tensors='pt')`; must return
      an object exposing `.input_ids`.
    dataset: iterable of mappings with 'INSTRUCTION' and 'RESPONSE' keys that
      `pd.DataFrame` can also consume.
    exp_name: file stem of the CSV written to ./inference_results/.
    base_model: when False the prompt is wrapped in the
      "<s>[INST] <<SYS>>...<</SYS>>...[/INST]" template and "</s>" is appended
      to the response; when True the pieces are joined with single spaces.
    device: device the input and label tensors are moved to.

  Returns:
    A DataFrame with the dataset's columns plus 'nll' (the per-example loss)
    and 'ppl' (exp of that loss).
  """
  # Loop-invariant system preamble.
  preconditioning = 'You are a helpful assistant. Given a word problem, you need to break it down into steps and solve them step-by-step.'

  # Make sure the destination exists *before* the slow scoring loop, so a
  # missing directory cannot throw away the whole run at the final to_csv.
  out_path = f'./inference_results/{exp_name}.csv'
  os.makedirs(os.path.dirname(out_path), exist_ok=True)

  nlls = []

  for example in tqdm(dataset):
    instruction = example['INSTRUCTION']
    response = example['RESPONSE']

    if not base_model:
      instruction = "<s>[INST] <<SYS>>"+preconditioning+"<</SYS>>"+instruction+"[/INST]"
      full = instruction + response+"</s>"
    else:
      instruction = preconditioning + " " + instruction
      full = instruction + " " + response

    # Number of leading tokens that belong to the prompt and must not be scored.
    instruction_encoding_length = tokenizer(instruction, return_tensors="pt").input_ids.shape[1]
    input_ids = tokenizer(full, return_tensors='pt').input_ids
    target_ids = input_ids.clone()
    target_ids[:, :instruction_encoding_length] = -100

    with torch.no_grad():
      output = model(input_ids.to(device), labels=target_ids.to(device))
    nlls.append(output.loss.item())

  df = pd.DataFrame(dataset)
  df['nll'] = nlls
  # Plain float64 NumPy column instead of a float32 torch tensor handed to pandas.
  df['ppl'] = np.exp(np.asarray(nlls, dtype=np.float64))
  df.to_csv(out_path, index=False)
  return df


def get_test_responses(model, tokenizer, dataset, exp_name, base_model=False, device='cuda', max_new_tokens=50):
  """Generate a continuation for every instruction and save the results as CSV.

  Args:
    model: object whose `generate(input_ids=..., max_new_tokens=...,
      pad_token_id=...)` returns a batch of token-id sequences.
    tokenizer: called as `tokenizer(text, return_tensors='pt')`; must expose
      `.input_ids` on the result, plus `decode` and `eos_token_id`.
    dataset: iterable of mappings with an 'INSTRUCTION' key that
      `pd.DataFrame` can also consume.
    exp_name: file stem of the CSV written to ./inference_results/.
    base_model: when False the prompt is wrapped in the
      "<s>[INST] <<SYS>>...<</SYS>>...[/INST]" template; when True the system
      preamble and the instruction are joined with a single space.
    device: device the prompt token ids are moved to.
    max_new_tokens: generation budget per example. Defaults to 50, the value
      that used to be hard-coded.

  Returns:
    A DataFrame with the dataset's columns plus a 'responses' column holding
    the decoded text of everything after the first `prompt length` tokens of
    each generated sequence.
  """
  # Loop-invariant system preamble.
  preconditioning = 'You are a helpful assistant. Given a word problem, you need to break it down into steps and solve them step-by-step.'

  # Make sure the destination exists *before* the slow generation loop, so a
  # missing directory cannot throw away the whole run at the final to_csv.
  out_path = f'./inference_results/{exp_name}.csv'
  os.makedirs(os.path.dirname(out_path), exist_ok=True)

  responses = []

  for example in tqdm(dataset):
    instruction = example['INSTRUCTION']

    if not base_model:
      instruction = "<s>[INST] <<SYS>>"+preconditioning+"<</SYS>>"+instruction+"[/INST]"
    else:
      instruction = preconditioning + " " + instruction

    with torch.no_grad():
      input_ids = tokenizer(instruction, return_tensors="pt").input_ids.to(device)
      instruction_encoding_length = input_ids.shape[1]
      generated_output = model.generate(
          input_ids=input_ids, max_new_tokens=max_new_tokens, pad_token_id=tokenizer.eos_token_id
      )
      # Drop the leading prompt-length tokens and decode only what follows.
      generated_text = tokenizer.decode(generated_output[0][instruction_encoding_length:])
      responses.append(generated_text)

  df = pd.DataFrame(dataset)
  df['responses'] = responses
  df.to_csv(out_path, index=False)
  return df

In [None]:
# Checkpoint directory to evaluate, and the hub id used to load the tokenizer.
model_path = "./train_results/gpt2-xl-r8-a32/gpt2-xl-problem-solver"
base_model_path = "gpt2-xl"
# base_model_path = 'openlm-research/open_llama_3b_v2'

# 4-bit NF4 weights with float16 compute; double quantization is switched off.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype=torch.float16,
)

# The tokenizer comes from the base model id; EOS is reused as the pad token
# and padding is applied on the right.
tokenizer = AutoTokenizer.from_pretrained(base_model_path, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# del model
# Load the quantized model with every module placed on device 0.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map={"": 0},
    quantization_config=quant_config,
)
model.config.pretraining_tp = 1
# NOTE(review): use_cache=False is kept from the original cell; it presumably
# slows generate() without changing its output -- confirm before flipping it.
model.config.use_cache = False

In [None]:
# Score the 1000-row held-out set with the fine-tuned model (chat-style prompt template).
results_ppl = get_test_ppl(
    model=model,
    tokenizer=tokenizer,
    dataset=test_data,
    exp_name='gpt2-r8-a32-ppl',
    base_model=False,
    device="cuda",
)

In [None]:
# results_responses = get_test_responses(model, tokenizer, test_data_humaneval, 'gpt2-r64-a32-responses', base_model=False, device="cuda")

100%|██████████| 100/100 [09:31<00:00,  5.71s/it]
