## Setup

In [1]:
%pip install datasets
%pip install transformers
%pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7 sentencepiece

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.15.0 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.6
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m

In [2]:
import os
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
from datasets import load_dataset
from datasets import Dataset
from transformers import pipeline, set_seed
from transformers import GPT2LMHeadModel, GPT2TokenizerFast
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

In [3]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

import warnings
warnings.filterwarnings('ignore')

Mounted at /content/drive


In [4]:
%cd drive/MyDrive/CS\ 182\ Final\ Project/Phase \2

/content/drive/.shortcut-targets-by-id/1hzhdcGA40OipfzF0SRT7omKvzCSi0q4r/CS 182 Final Project/Phase 2


## Dataset

In [5]:
dataset = load_dataset('abeiler/Num_Rep_Arithmetic') ## Fill in Dataset
dataset = dataset['train'].select_columns(["output"]) # select appropriate column names from dataset
dataset = dataset.train_test_split(test_size=0.2, seed=20)
train, val = dataset['train'], dataset['test']

val_humaneval = val.train_test_split(test_size=100, seed=20)['test']

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/22.1M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [7]:
train[:10]

{'output': ['73497 / 1 = 73497',
  '6821770 + 21649 = 6843419',
  '4601711158 * 0 = 0',
  '785 * 1 = 785',
  '23579612 / 6 = 3929935 R 2',
  '5765 * 7048 = 7048 * (5000 + 700 + 60 + 5) = 7048 * 5000 + 7048 * 700 + 7048 * 60 + 7048 * 5 = 35240000 + 4933600 + 422880 + 35240 = 40173600 + 422880 + 35240 = 40596480 + 35240 = 40631720',
  '9469209 - 9981716 = -512507',
  '1885 / 2 = 942 R 1',
  '364 * 23595 = 23595 * (300 + 60 + 4) = 23595 * 300 + 23595 * 60 + 23595 * 4 = 7078500 + 1415700 + 94380 = 8494200 + 94380 = 8588580',
  '0 * 4781 = 0']}

## Base model performance

In [None]:
def get_test_ppl(model, tokenizer, dataset, question_col, answer_col, exp_name, device='cuda'):

  nlls = []
  model_responses = []

  for example in tqdm(dataset):

    question = example[question_col]
    answer = example[answer_col]
    qa = 'QUESTION: ' + question + '\nANSWER: ' + answer

    q_encoding_length = tokenizer(question, return_tensors="pt").input_ids.shape[1]
    qa_encoding = tokenizer(qa, return_tensors='pt')
    input_ids = qa_encoding.input_ids
    target_ids = input_ids.clone()
    target_ids[:, :q_encoding_length] = -100

    with torch.no_grad():
      output = model(input_ids.to(device), labels=target_ids.to(device))
      nlls.append(output.loss.item())

  results = pd.DataFrame(dataset)
  results['nll'] = nlls
  results['ppl'] = torch.exp(torch.tensor(nlls))
  results.to_csv(f'Results/{exp_name}.csv', index=False)
  return results


def get_generated_responses(model, tokenizer, dataset, question_col, answer_col, exp_name, device='cuda'):

  model_responses = []
  generator = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200, pad_token_id=50256, device=device)

  for example in tqdm(dataset):

    question = 'QUESTION: ' + example[question_col] + '\nANSWER: '

    with torch.no_grad():
      model_responses.append(generator(question)[0]['generated_text'])

  results = pd.DataFrame(dataset)
  results['model_response'] = model_responses
  results.to_csv(f'Results/{exp_name}-humaneval.csv', index=False)

In [None]:
device = 'cuda'
models = ['gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl']
test_dataset = val_gradeschool
question_col = 'INSTRUCTION'
answer_col = 'RESPONSE'

for model_id in models:

  model = GPT2LMHeadModel.from_pretrained(model_id).to(device)
  tokenizer = GPT2TokenizerFast.from_pretrained(model_id)

  df = get_test_ppl(model, tokenizer, test_dataset, question_col, answer_col, model_id + '-base')
  get_generated_responses(model, tokenizer, val_gradeschool_humaneval, question_col, answer_col, model_id + '-base')

  print(f"{model_id}-base: {df['ppl'].mean()}")
  del model
  del tokenizer

100%|██████████| 1759/1759 [00:26<00:00, 65.69it/s]
100%|██████████| 100/100 [02:57<00:00,  1.78s/it]

gpt2-base: 16.57942008972168





config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

100%|██████████| 1759/1759 [00:49<00:00, 35.53it/s]
100%|██████████| 100/100 [05:36<00:00,  3.36s/it]


gpt2-medium-base: 11.121452331542969


config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.25G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

100%|██████████| 1759/1759 [01:13<00:00, 23.90it/s]
100%|██████████| 100/100 [08:13<00:00,  4.93s/it]


gpt2-large-base: 9.86057186126709


config.json:   0%|          | 0.00/689 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/6.43G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

100%|██████████| 1759/1759 [01:46<00:00, 16.47it/s]
100%|██████████| 100/100 [11:17<00:00,  6.78s/it]

gpt2-xl-base: 8.338263511657715





In [None]:
generator = pipeline('text-generation', model='gpt2-xl')

config.json:   0%|          | 0.00/689 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/6.43G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


What is 4+2? This new configuration includes new features that have been requested and requested again.


The features in 4+2 are:

Add-on support

You can buy and download multiple add-ons. We will deliver all the downloaded add-ons into the main version of the add-on.


More information from the official 4+2 announcement here

Add-on distribution

The distribution method is the same for all add-ons. The


In [None]:
text = 'Question: If 1+1 is 2 what is 1+2? Answer: '
print(generator(text, max_length=100, num_return_sequences=1)[0]['generated_text'])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Question: If 1+1 is 2 what is 1+2? Answer:  1+2  (1+1)+1 (1+2)
When was there a time when there have been so many solutions to a 1+1 problem? 
Answer:  The first and most popular one was published by Jacob Klein in 1915 and that one is the famous and fundamental 1+1! answer.   Then it was solved in 1960's by David


## LoRA Fine-Tuning

In [None]:
# The model that you want to train from the Hugging Face hub
model_name = "model_results/checkpoint-25000"

# Fine-tuned model name
new_model = "gpt2-xl-arith"

################################################################################
# LoRA parameters
################################################################################

# LoRA attention dimension
lora_r = 64

# Alpha parameter for LoRA scaling
lora_alpha = 32

# Dropout probability for LoRA layers
lora_dropout = 0.1


################################################################################
# TrainingArguments parameters
################################################################################

# Output directory where the model predictions and checkpoints will be stored
output_dir = "model_results"

# Number of training epochs
num_train_epochs = 3

# Batch size per GPU for training
per_device_train_batch_size = 8

# Batch size per GPU for evaluation
per_device_eval_batch_size = 8

# Initial learning rate (AdamW optimizer)
learning_rate = 2e-4

# Optimizer to use
optim = "paged_adamw_32bit"

# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = 0.03

# Save checkpoint every X updates steps
save_steps = 5000

# Log every X updates steps
logging_steps = 2500

device = 'cuda'

In [None]:
from peft import AutoPeftModelForCausalLM

In [None]:
# Load base model
model =  AutoPeftModelForCausalLM.from_pretrained(model_name).to(device)

# Load GPT2 tokenizer
tokenizer = GPT2TokenizerFast.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Load LoRA configuration
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    task_type="CAUSAL_LM",
)

# Set training parameters
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    warmup_ratio=warmup_ratio,
    report_to="tensorboard",
    evaluation_strategy="steps",
)

# Set supervised fine-tuning parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=train,
    eval_dataset=val,
    peft_config=peft_config,
    dataset_text_field="output",
    tokenizer=tokenizer,
    args=training_arguments,
)

config.json:   0%|          | 0.00/689 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/6.43G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Map:   0%|          | 0/80000 [00:00<?, ? examples/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

In [None]:
for param in model.parameters():
  param.requires_grad = True

In [None]:
# Train model
trainer.train()

# Save trained model
trainer.model.save_pretrained(new_model)

Step,Training Loss,Validation Loss
2500,1.419,1.370889
5000,1.413,1.370889
7500,1.3909,1.370889
10000,1.4019,1.370889
12500,1.4069,1.370889
15000,1.3997,1.370889
17500,1.4115,1.370889
20000,1.4133,1.370889


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-15-59603b6f35ac>", line 2, in <cell line: 2>
    trainer.train()
  File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", line 1539, in train
    return inner_training_loop(
  File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", line 1901, in _inner_training_loop
    self._maybe_log_save_evaluate(tr_loss, model, trial, epoch, ignore_keys_for_eval)
  File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", line 2212, in _maybe_log_save_evaluate
    self.log(logs)
  File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", line 2570, in log
    self.control = self.callback_handler.on_log(self.args, self.state, self.control, logs)
  File "/usr/local/lib/python3.10/dist-packages/transformers/trainer_callback.py"

In [None]:
%load_ext tensorboard
%tensorboard --logdir=#PATH

Check Some Outputs

In [None]:
device = 'cuda'
# model_path = 'facebook/opt-2.7b'
# model = LlamaForCausalLM.from_pretrained(model_path).to(device)
# tokenizer = LlamaTokenizer.from_pretrained(model_path)

tokenizer = AutoTokenizer.from_pretrained('gpt2-xl')
model = AutoPeftModelForCausalLM.from_pretrained('model_results/checkpoint-20000', pad_token_id=tokenizer.eos_token_id).to(device)
# generator = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200, pad_token_id=50256, device=device)

In [None]:
generator = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200, pad_token_id=50256, device=device)

The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MusicgenForCausalLM', 'MvpForCausalLM', 'OpenLlamaForCausalLM', 'OpenAIGPTLMHeadModel', 'OPTForCausalLM', 'PegasusForCausalLM', 'PLBartForCausalLM', 'ProphetNetForCausalLM', 'QDQBertLMHeadModel', 'ReformerModelWithLMHead', 'RemBertForCausal

In [None]:
text = """
3 + 5 =
"""
print(generator(text, max_length=10, num_return_sequences=1)[0]['generated_text'])


3 + 5 =
7 = 10 =
