## Setup

In [None]:
%pip install datasets
%pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7 sentencepiece

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.15.0 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.6
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m5

In [None]:
import os
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
from datasets import load_dataset
from datasets import Dataset
from transformers import pipeline, set_seed
from transformers import GPT2LMHeadModel, GPT2TokenizerFast
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

In [None]:
import warnings

from google.colab import drive

# Attach Google Drive at /content/drive, forcing a fresh mount if one exists.
drive.mount('/content/drive', force_remount=True)

# Silence Python warnings for the remainder of the session.
warnings.filterwarnings('ignore')

In [None]:
%cd drive/MyDrive/CS\ 182\ Final\ Project/Phase \2

## Dataset

In [None]:
def preprocess_dataset(example):
  """Add a "full" field of the form "<expression>=<answer>" to one example.

  Both fields are converted with str() before being joined. The example is
  modified in place and the same object is returned.
  """
  parts = (str(example["expression"]), str(example["answer"]))
  example["full"] = '='.join(parts)
  return example

In [None]:
# Build the training corpus in one pipeline:
#   1. take the hub dataset's 'train' split and keep only the columns we use,
#   2. keep rows whose label equals 1,
#   3. add the "full" text column,
#   4. hold out 20% as a validation split (fixed seed).
dataset = (
    load_dataset('sethapun/arithmetic_2as_1to10')['train']
    .select_columns(["expression", "answer", "label"])
    .filter(lambda example: example['label'] == 1)
    .map(preprocess_dataset)
    .train_test_split(test_size=0.2, seed=20)
)
train = dataset['train']
val = dataset['test']

# 100 validation examples set aside for manual inspection of generations.
val_humaneval = val.train_test_split(test_size=100, seed=20)['test']

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

## Base model performance

In [None]:
def get_test_ppl(model, tokenizer, dataset, question_col, answer_col, exp_name, device='cuda'):
  """Score each example's answer under `model` and save per-example NLL/perplexity.

  Every example is rendered as 'QUESTION: <question>\\nANSWER: <answer>'. The
  labels of the prompt tokens are set to -100 so that only the answer tokens
  contribute to the loss returned by the model.

  Args:
    model: callable as `model(input_ids, labels=...)` returning an object with
      a scalar `.loss` tensor.
    tokenizer: callable as `tokenizer(text, return_tensors='pt')` returning an
      object with `.input_ids`.
    dataset: iterable of dict-like examples; also passed to `pd.DataFrame`.
    question_col: key of the question field in each example.
    answer_col: key of the answer field in each example.
    exp_name: results are written to `Results/<exp_name>.csv`.
    device: device the input tensors are moved to before the forward pass.

  Returns:
    A DataFrame of the dataset with added `nll` and `ppl` columns.
  """
  nlls = []

  for example in tqdm(dataset):

    # str() lets numeric columns (e.g. integer answers) be scored as text.
    question = str(example[question_col])
    answer = str(example[answer_col])

    # The prefix deliberately stops before the space that precedes the answer:
    # byte-level BPE tokenizers such as GPT-2's usually attach a leading space
    # to the following token, so this keeps the prefix's token count equal to
    # the number of leading tokens of the full text that must be masked.
    prompt = 'QUESTION: ' + question + '\nANSWER:'
    qa = prompt + ' ' + answer

    # Mask the whole prompt. Measuring only the bare question would leave the
    # 'QUESTION:'/'ANSWER:' scaffolding inside the scored span.
    prompt_length = tokenizer(prompt, return_tensors="pt").input_ids.shape[1]
    input_ids = tokenizer(qa, return_tensors='pt').input_ids
    target_ids = input_ids.clone()
    target_ids[:, :prompt_length] = -100

    with torch.no_grad():
      output = model(input_ids.to(device), labels=target_ids.to(device))
    nlls.append(output.loss.item())

  results = pd.DataFrame(dataset)
  results['nll'] = nlls
  # Store plain Python floats rather than a tensor in the DataFrame column.
  results['ppl'] = torch.exp(torch.tensor(nlls)).tolist()

  # Create the output folder on first use instead of failing in to_csv.
  os.makedirs('Results', exist_ok=True)
  results.to_csv(f'Results/{exp_name}.csv', index=False)
  return results


def get_generated_responses(model, tokenizer, dataset, question_col, answer_col, exp_name, device='cuda'):
  """Generate a completion for every question and save them for manual review.

  Each example is prompted with 'QUESTION: <question>\\nANSWER: ' and the
  pipeline's generated text (prompt included) is stored in `model_response`.

  Args:
    model: model handed to the text-generation pipeline.
    tokenizer: tokenizer handed to the pipeline; its `eos_token_id` is used
      as the padding id when available.
    dataset: iterable of dict-like examples; also passed to `pd.DataFrame`.
    question_col: key of the question field in each example.
    answer_col: accepted for signature parity with `get_test_ppl`; not read.
    exp_name: results are written to `Results/<exp_name>-humaneval.csv`.
    device: device passed to the pipeline.

  Returns:
    A DataFrame of the dataset with an added `model_response` column.
  """
  # Take the padding id from the tokenizer; 50256 (the previously hard-coded
  # value) is kept as the fallback when the tokenizer does not expose one.
  pad_token_id = getattr(tokenizer, 'eos_token_id', None)
  if pad_token_id is None:
    pad_token_id = 50256

  generator = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200, pad_token_id=pad_token_id, device=device)

  model_responses = []
  for example in tqdm(dataset):

    # str() lets non-string question columns be used as prompts.
    question = 'QUESTION: ' + str(example[question_col]) + '\nANSWER: '

    with torch.no_grad():
      model_responses.append(generator(question)[0]['generated_text'])

  results = pd.DataFrame(dataset)
  results['model_response'] = model_responses

  # Create the output folder on first use instead of failing in to_csv.
  os.makedirs('Results', exist_ok=True)
  results.to_csv(f'Results/{exp_name}-humaneval.csv', index=False)
  return results

In [None]:
device = 'cuda'
models = ['gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl']

# Evaluate on the splits built in the Dataset section. The previously
# referenced `val_gradeschool*` variables and INSTRUCTION/RESPONSE columns are
# not defined anywhere in this notebook.
question_col = 'expression'
answer_col = 'answer'


def _rows_with_text_columns(split, columns):
  """Return `split` as a list of dict rows with `columns` converted to str."""
  return [{**row, **{col: str(row[col]) for col in columns}} for row in split]


# The evaluation helpers build prompts by string concatenation, so make sure
# the question/answer fields are text regardless of how the dataset stores them.
test_dataset = _rows_with_text_columns(val, [question_col, answer_col])
humaneval_dataset = _rows_with_text_columns(val_humaneval, [question_col, answer_col])

for model_id in models:

  model = GPT2LMHeadModel.from_pretrained(model_id).to(device)
  tokenizer = GPT2TokenizerFast.from_pretrained(model_id)

  df = get_test_ppl(model, tokenizer, test_dataset, question_col, answer_col, model_id + '-base', device=device)
  get_generated_responses(model, tokenizer, humaneval_dataset, question_col, answer_col, model_id + '-base', device=device)

  print(f"{model_id}-base: {df['ppl'].mean()}")

  # Drop the references and release cached GPU memory before the next
  # (larger) checkpoint is loaded.
  del model
  del tokenizer
  torch.cuda.empty_cache()

100%|██████████| 1759/1759 [00:26<00:00, 65.69it/s]
100%|██████████| 100/100 [02:57<00:00,  1.78s/it]

gpt2-base: 16.57942008972168





config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

100%|██████████| 1759/1759 [00:49<00:00, 35.53it/s]
100%|██████████| 100/100 [05:36<00:00,  3.36s/it]


gpt2-medium-base: 11.121452331542969


config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.25G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

100%|██████████| 1759/1759 [01:13<00:00, 23.90it/s]
100%|██████████| 100/100 [08:13<00:00,  4.93s/it]


gpt2-large-base: 9.86057186126709


config.json:   0%|          | 0.00/689 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/6.43G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

100%|██████████| 1759/1759 [01:46<00:00, 16.47it/s]
100%|██████████| 100/100 [11:17<00:00,  6.78s/it]

gpt2-xl-base: 8.338263511657715





In [None]:
# Stand-alone text-generation pipeline around the largest GPT-2 checkpoint,
# used below for quick qualitative prompting.
generator = pipeline(task='text-generation', model='gpt2-xl')

config.json:   0%|          | 0.00/689 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/6.43G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


What is 4+2? This new configuration includes new features that have been requested and requested again.


The features in 4+2 are:

Add-on support

You can buy and download multiple add-ons. We will deliver all the downloaded add-ons into the main version of the add-on.


More information from the official 4+2 announcement here

Add-on distribution

The distribution method is the same for all add-ons. The


In [None]:
# Fix the RNG state so that, if the pipeline samples tokens, re-running this
# cell reproduces the same continuation (same seed value as the data splits).
set_seed(20)

# One-shot style prompt: a worked example followed by the actual question.
text = 'Question: If 1+1 is 2 what is 1+2? Answer: '
print(generator(text, max_length=100, num_return_sequences=1)[0]['generated_text'])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Question: If 1+1 is 2 what is 1+2? Answer:  1+2  (1+1)+1 (1+2)
When was there a time when there have been so many solutions to a 1+1 problem? 
Answer:  The first and most popular one was published by Jacob Klein in 1915 and that one is the famous and fundamental 1+1! answer.   Then it was solved in 1960's by David


## LoRA Fine-Tuning

In [None]:
# The model that you want to train from the Hugging Face hub
model_name = "gpt2-xl"

# Fine-tuned model name
new_model = "gpt2-xl-simple-arithmetic"

################################################################################
# LoRA parameters
################################################################################

# LoRA attention dimension
lora_r = 64

# Alpha parameter for LoRA scaling
lora_alpha = 32

# Dropout probability for LoRA layers
lora_dropout = 0.1


################################################################################
# TrainingArguments parameters
################################################################################

# Output directory where the model predictions and checkpoints will be stored
output_dir = "new_model_results"

# Number of training epochs
num_train_epochs = 2

# Batch size per GPU for training
per_device_train_batch_size = 8

# Batch size per GPU for evaluation
per_device_eval_batch_size = 8

# Initial learning rate (AdamW optimizer)
learning_rate = 2e-4

# Optimizer to use
optim = "paged_adamw_32bit"

# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = 0.03

# NOTE: with the 800-example training split, batch size 8 and 2 epochs the run
# lasts roughly 200 optimizer steps (assuming a single device and no gradient
# accumulation). The step intervals below must stay under that total, otherwise
# nothing is ever logged, evaluated or checkpointed.

# Save checkpoint every X updates steps (about once per epoch)
save_steps = 100

# Log every X updates steps; with evaluation_strategy="steps" and no explicit
# eval_steps, evaluation runs at this interval too
logging_steps = 20

device = 'cuda'

In [None]:
# Base GPT-2 checkpoint, moved onto the training device
model = GPT2LMHeadModel.from_pretrained(model_name)
model = model.to(device)

# Matching tokenizer; the EOS token doubles as the padding token
tokenizer = GPT2TokenizerFast.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# LoRA adapter settings (values come from the configuration cell)
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
)

# Optimisation, logging and checkpointing schedule
training_arguments = TrainingArguments(
    output_dir=output_dir,
    # schedule
    num_train_epochs=num_train_epochs,
    learning_rate=learning_rate,
    warmup_ratio=warmup_ratio,
    optim=optim,
    # batching
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    # bookkeeping
    logging_steps=logging_steps,
    save_steps=save_steps,
    evaluation_strategy="steps",
    report_to="tensorboard",
)

# Supervised fine-tuning on the "full" text column ("<expression>=<answer>")
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_arguments,
    peft_config=peft_config,
    train_dataset=train,
    eval_dataset=val,
    dataset_text_field="full",
)

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [None]:
# Run the fine-tuning loop configured above
trainer.train()

# Write the trained model to the directory named by `new_model`
fine_tuned_model = trainer.model
fine_tuned_model.save_pretrained(new_model)

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss


In [None]:
%load_ext tensorboard
%tensorboard --logdir=#PATH