## Setup

In [None]:
# Install the third-party packages this notebook imports below.
# NOTE(review): versions are not pinned, so a fresh runtime may pull newer
# releases than the ones that produced the recorded outputs.
%pip install datasets
%pip install transformers
%pip install sentencepiece
%pip install peft



In [None]:
import os

import numpy as np
import pandas as pd
import torch
from datasets import load_dataset
from peft import AutoPeftModelForCausalLM
from tqdm import tqdm
from transformers import pipeline, set_seed
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

In [None]:
import warnings

from google.colab import drive

# Attach Google Drive at /content/drive, remounting even if it is already mounted.
drive.mount('/content/drive', force_remount=True)

# From here on, suppress every Python warning for the rest of the session.
warnings.filterwarnings('ignore')

Mounted at /content/drive


In [None]:
# Switch the working directory to the project folder on the mounted Drive so the
# relative paths used later ('Results/...', 'model_results/...') resolve there.
# The path is relative, so this presumably has to run from /content (the parent
# of the mount point); re-running the cell after it succeeded will not resolve.
# NOTE(review): the last escape is written "Phase \2" rather than "Phase\ 2";
# the recorded output shows it still lands in ".../Phase 2" - confirm before editing.
%cd drive/MyDrive/CS\ 182\ Final\ Project/Phase \2

/content/drive/.shortcut-targets-by-id/1hzhdcGA40OipfzF0SRT7omKvzCSi0q4r/CS 182 Final Project/Phase 2


## Dataset

In [None]:
# Download the dataset, keep only the question/answer columns of its 'train'
# split, and carve out a seeded 80/20 train/validation split.
dataset_gradeschool = (
    load_dataset('qwedsacf/grade-school-math-instructions')['train']
    .select_columns(['INSTRUCTION', 'RESPONSE'])
    .train_test_split(test_size=0.2, seed=20)
)
train_gradeschool = dataset_gradeschool['train']
val_gradeschool = dataset_gradeschool['test']

# A fixed 100-example subset of the validation split, used for the generated-response runs.
val_gradeschool_humaneval = val_gradeschool.train_test_split(test_size=100, seed=20)['test']

Downloading readme:   0%|          | 0.00/852 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/2.55M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/8792 [00:00<?, ? examples/s]

In [None]:
# Show the first training example, one field per line.
for field in ('INSTRUCTION', 'RESPONSE'):
  print(f'{field}: ', train_gradeschool[0][field])

INSTRUCTION:  Johnny is out walking his two dogs at night, and his son joins him for the walk.  How many legs' worth of organisms are traveling together for this walk?
Give me a solution to this problem
RESPONSE:  As Johnny and his son are humans, and humans walk on two legs, this means that between the two of them there are 2*2=4 legs' worth of organisms.
There are two dogs walking along as well, and since dogs walk on 4 legs this means there are 2*4=8 legs' worth of organisms.
We add these two totals together to find there are 4+8=12 legs' worth of organisms in total.


## Base model performance

In [None]:
def get_test_ppl(model, tokenizer, dataset, question_col, answer_col, exp_name, device='cuda'):
  """Score every reference answer in `dataset` under `model` and save the results.

  Each example is rendered as 'QUESTION: <question>\\nANSWER: <answer>'. The
  labels of the whole prompt part (everything up to and including 'ANSWER:')
  are set to -100 so that the loss returned by the model only covers the
  answer tokens (-100 is the label value Hugging Face causal-LM losses skip).

  Args:
    model: callable as `model(input_ids, labels=...)` returning an object with a `.loss` tensor.
    tokenizer: callable as `tokenizer(text, return_tensors='pt')` returning an object with `.input_ids`.
    dataset: iterable of dict-like examples; also passed to `pd.DataFrame`.
    question_col: key of the question text in each example.
    answer_col: key of the reference answer text in each example.
    exp_name: experiment name; results are written to 'Results/<exp_name>.csv'.
    device: device the input tensors are moved to before the forward pass.

  Returns:
    A DataFrame of `dataset` with two added columns: 'nll' (the per-example
    loss) and 'ppl' (exp of that loss).
  """

  nlls = []

  for example in tqdm(dataset):

    # The prompt stops right after "ANSWER:" and the separating space travels with
    # the answer. This assumes a GPT-2-style tokenizer that attaches a leading space
    # to the following token, so the prompt's tokens are a prefix of the full
    # sequence - TODO confirm if another tokenizer family is used.
    prompt = 'QUESTION: ' + example[question_col] + '\nANSWER:'
    qa = prompt + ' ' + example[answer_col]

    # Mask length must come from the prompt as it appears in `qa`, not from the bare
    # question; otherwise the end of the question and the "ANSWER:" marker are scored.
    prompt_length = tokenizer(prompt, return_tensors='pt').input_ids.shape[1]
    input_ids = tokenizer(qa, return_tensors='pt').input_ids

    target_ids = input_ids.clone()
    target_ids[:, :prompt_length] = -100

    with torch.no_grad():
      output = model(input_ids.to(device), labels=target_ids.to(device))
    nlls.append(output.loss.item())

  results = pd.DataFrame(dataset)
  results['nll'] = nlls
  results['ppl'] = torch.exp(torch.tensor(nlls)).numpy()

  # Make sure the output folder exists so a long run is not lost at the final write.
  os.makedirs('Results', exist_ok=True)
  results.to_csv(f'Results/{exp_name}.csv', index=False)
  return results


def get_generated_responses(model, tokenizer, dataset, question_col, answer_col, exp_name, device='cuda'):
  """Generate one model response per example and save them next to the dataset.

  Each example is prompted with 'QUESTION: <question>\\nANSWER: ' through a
  text-generation pipeline created with `max_length=200`. The stored response
  is the pipeline's 'generated_text' for the first returned sequence.

  Args:
    model: model handed to the text-generation pipeline.
    tokenizer: tokenizer handed to the pipeline; its `eos_token_id` is used for padding.
    dataset: iterable of dict-like examples; also passed to `pd.DataFrame`.
    question_col: key of the question text in each example.
    answer_col: unused here; kept so the signature mirrors `get_test_ppl`.
    exp_name: experiment name; results go to 'Results/<exp_name>-humaneval.csv'.
    device: device passed to the pipeline.

  Returns:
    A DataFrame of `dataset` with an added 'model_response' column.
  """

  # Pad with the tokenizer's own end-of-sequence id; fall back to the previously
  # hard-coded 50256 when the tokenizer does not define one.
  pad_token_id = getattr(tokenizer, 'eos_token_id', None)
  if pad_token_id is None:
    pad_token_id = 50256

  generator = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200, pad_token_id=pad_token_id, device=device)

  model_responses = []
  with torch.no_grad():
    for example in tqdm(dataset):
      question = 'QUESTION: ' + example[question_col] + '\nANSWER: '
      model_responses.append(generator(question)[0]['generated_text'])

  results = pd.DataFrame(dataset)
  results['model_response'] = model_responses

  # Make sure the output folder exists so generated text is not lost at the final write.
  os.makedirs('Results', exist_ok=True)
  results.to_csv(f'Results/{exp_name}-humaneval.csv', index=False)
  return results

In [None]:
# Evaluate every stock GPT-2 size: answer perplexity on the full validation split
# and generated responses on the 100-example subset.
device = 'cuda'
models = ['gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl']
test_dataset = val_gradeschool
question_col = 'INSTRUCTION'
answer_col = 'RESPONSE'

for model_id in models:

  model = GPT2LMHeadModel.from_pretrained(model_id).to(device)
  tokenizer = GPT2TokenizerFast.from_pretrained(model_id)
  exp_name = model_id + '-base'

  # Pass `device` explicitly so the helpers move inputs to the device the model
  # was placed on, instead of relying on their own 'cuda' default.
  df = get_test_ppl(model, tokenizer, test_dataset, question_col, answer_col, exp_name, device=device)
  get_generated_responses(model, tokenizer, val_gradeschool_humaneval, question_col, answer_col, exp_name, device=device)

  # Reported number is the mean of the per-example perplexities.
  print(f"{model_id}-base: {df['ppl'].mean()}")

  # Drop the references before the next (larger) checkpoint is loaded.
  del model
  del tokenizer

100%|██████████| 1759/1759 [00:26<00:00, 65.69it/s]
100%|██████████| 100/100 [02:57<00:00,  1.78s/it]

gpt2-base: 16.57942008972168





config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

100%|██████████| 1759/1759 [00:49<00:00, 35.53it/s]
100%|██████████| 100/100 [05:36<00:00,  3.36s/it]


gpt2-medium-base: 11.121452331542969


config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.25G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

100%|██████████| 1759/1759 [01:13<00:00, 23.90it/s]
100%|██████████| 100/100 [08:13<00:00,  4.93s/it]


gpt2-large-base: 9.86057186126709


config.json:   0%|          | 0.00/689 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/6.43G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

100%|██████████| 1759/1759 [01:46<00:00, 16.47it/s]
100%|██████████| 100/100 [11:17<00:00,  6.78s/it]

gpt2-xl-base: 8.338263511657715





In [None]:
from transformers import LlamaTokenizer, LlamaForCausalLM
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer

In [None]:
device = 'cuda'

# Earlier experiment, kept for reference (disabled):
# model_path = 'facebook/opt-2.7b'
# model = LlamaForCausalLM.from_pretrained(model_path).to(device)
# tokenizer = LlamaTokenizer.from_pretrained(model_path)

# Tokenizer comes from the 'gpt2-xl' hub id; the model is the PEFT checkpoint
# saved under model_results/checkpoint-25000, padded with the tokenizer's EOS id.
tokenizer = AutoTokenizer.from_pretrained('gpt2-xl')
model = AutoPeftModelForCausalLM.from_pretrained(
    'model_results/checkpoint-25000',
    pad_token_id=tokenizer.eos_token_id,
)
model = model.to(device)

# Pipeline wrapper, currently disabled:
# generator = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200, pad_token_id=50256, device=device)

In [None]:
# Sampling is stochastic; fix the seed (same value as the dataset splits) so this
# cell prints the same continuation on every fresh run.
set_seed(20)

# Ask the loaded model to continue a tiny arithmetic prompt by two tokens,
# sampling with nucleus (top_p) and top-k filtering.
model_inputs = tokenizer('10-1=', return_tensors='pt').to(device)
sample_output = model.generate(
    **model_inputs,
    max_new_tokens=2,
    do_sample=True,
    top_p=0.92,
    top_k=50
)
print(tokenizer.decode(sample_output[0], skip_special_tokens=True))

10-1=10-


In [None]:
# AutoModelForCausalLM is not imported by any earlier cell, so bring it in here;
# without this the cell raises NameError on a fresh kernel.
from transformers import AutoModelForCausalLM

model_inputs = tokenizer('I enjoy walking with my cute dog', return_tensors='pt').to(device)

# Rebinds `model` to the stock 'gpt2' checkpoint, replacing whatever model was loaded before.
model = AutoModelForCausalLM.from_pretrained("gpt2", pad_token_id=tokenizer.eos_token_id).to(device)

In [None]:
# `generator` is not defined by any earlier active cell (its only definition is
# commented out), so build the text-generation pipeline here from the currently
# loaded `model` and `tokenizer`.
# NOTE(review): the recorded output below may have come from a different model.
generator = pipeline(task="text-generation", model=model, tokenizer=tokenizer, pad_token_id=tokenizer.eos_token_id, device=device)

text = """
7 - 8 =
"""
print(generator(text, max_length=200, num_return_sequences=1)[0]['generated_text'])


7 - 8 = 


A: You can use the following code:
int i = 0;
int j = 0;
int k = 0;

while (i < 7)
{
    while (j < 8)
    {
        if (k == 0)
        {
            cout << " ";
        }
        else
        {
            cout << "*";
        }
        k++;
        j++;
    }
    i++;
    j = 0;
}


A: You can use a nested for loop.
for(int i = 0; i < 7; i++)
{
    for(int j = 0; j < 8; j++)
    {
        if(j == 0)
        {
            cout << " ";
        }
        else
