## Setup

In [1]:
%pip install -q  torch peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7 accelerate sentencepiece

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.5/92.5 MB[0m [31m19.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m89.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.4/77.4 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.7/265.7 kB[0m [31m34.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m85.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m119.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m50.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━

In [2]:
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline
)
from peft import LoraConfig
from trl import SFTTrainer
from peft import AutoPeftModelForCausalLM

In [3]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

import warnings
warnings.filterwarnings('ignore')

Mounted at /content/drive


In [4]:
%cd drive/MyDrive/CS\ 182\ Final\ Project/Phase \3

/content/drive/.shortcut-targets-by-id/1hzhdcGA40OipfzF0SRT7omKvzCSi0q4r/CS 182 Final Project/Phase 3


## Dataset

In [5]:
dataset_path = "qwedsacf/grade-school-math-instructions"
data = load_dataset(dataset_path, split="train")
data

Downloading readme:   0%|          | 0.00/852 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/2.55M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/8792 [00:00<?, ? examples/s]

Dataset({
    features: ['INSTRUCTION', 'RESPONSE', 'SOURCE'],
    num_rows: 8792
})

In [6]:
data = data.train_test_split(test_size=0.15, seed=123)
training_data, validation_data = data['train'], data['test']

In [7]:
preconditioning = 'You are a helpful assistant. Given a word problem, you need to break it down into steps and solve them step-by-step.'
def preprocess(example):
  example['text'] = "<s>[INST] <<SYS>>"+preconditioning+"<</SYS>>"+example['INSTRUCTION']+"[/INST]"+example["RESPONSE"]+"</s>"
  return example

training_data = training_data.map(preprocess)
validation_data = validation_data.map(preprocess)

Map:   0%|          | 0/7473 [00:00<?, ? examples/s]

Map:   0%|          | 0/1319 [00:00<?, ? examples/s]

In [8]:
training_data

Dataset({
    features: ['INSTRUCTION', 'RESPONSE', 'SOURCE', 'text'],
    num_rows: 7473
})

In [9]:
validation_data

Dataset({
    features: ['INSTRUCTION', 'RESPONSE', 'SOURCE', 'text'],
    num_rows: 1319
})

## Model

In [None]:
# base_model_name = "NousResearch/Llama-2-7b-chat-hf"
# base_model_name = 'openlm-research/open_llama_3b_v2'
output_dir = "./train_results/aaa/"
base_model_name = "NousResearch/Llama-2-7b-chat-hf"
refined_model = output_dir + "gpt2-xl-problem-solver"

tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=False
)

model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=quant_config,
    device_map={"": 0}
)
model.config.use_cache = False
model.config.pretraining_tp = 1

tokenizer_config.json:   0%|          | 0.00/746 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/583 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/179 [00:00<?, ?B/s]

In [None]:
# LoRA Config
peft_parameters = LoraConfig(
    lora_alpha=32,
    lora_dropout=0.1,
    r=128,
    bias="none",
    task_type="CAUSAL_LM"
)

# Training Params
train_params = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="steps",
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    save_steps=450,
    logging_steps=225,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
    report_to="tensorboard"
)

# Trainer
fine_tuning = SFTTrainer(
    model=model,
    train_dataset=training_data,
    eval_dataset=training_data,
    peft_config=peft_parameters,
    dataset_text_field="text",
    tokenizer=tokenizer,
    args=train_params
)

Map:   0%|          | 0/7473 [00:00<?, ? examples/s]

Map:   0%|          | 0/7473 [00:00<?, ? examples/s]

In [None]:
fine_tuning.evaluate()

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'eval_loss': 2.636303663253784,
 'eval_runtime': 372.6118,
 'eval_samples_per_second': 20.056,
 'eval_steps_per_second': 2.509}

In [None]:
# Training
fine_tuning.train()

# Save Model
fine_tuning.model.save_pretrained(refined_model)

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
225,1.513,1.302718
450,1.2911,1.273565
675,1.262,1.210714
900,1.2286,1.207301
1125,1.1986,1.176751
1350,1.1882,1.167471
1575,1.1666,1.157061
1800,1.1535,1.143389
2025,1.1354,1.14749
2250,1.1166,1.129435


## Experiment

In [None]:
del base_model

In [None]:
tokenizer = AutoTokenizer.from_pretrained('gpt2-xl')
model = AutoPeftModelForCausalLM.from_pretrained('./gpt2-xl-problem-solver', torch_dtype=torch.float16, device_map='auto',
                                                 pad_token_id=tokenizer.eos_token_id)
base_model = AutoModelForCausalLM.from_pretrained('gpt2-xl', torch_dtype=torch.float16, device_map='auto',
                                                 pad_token_id=tokenizer.eos_token_id)

Some weights of GPT2LMHeadModel were not initialized from the model checkpoint at gpt2-xl and are newly initialized: ['lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of GPT2LMHeadModel were not initialized from the model checkpoint at gpt2-xl and are newly initialized: ['lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
prompt = """<s>[INST] <<SYS>>
You are a helpful assistant. You are given simple problems and need to solve step-by-step.
<</SYS>>
Milton has some books about zoology and 4 times as many books about botany. If he has 80 books total, how many zoology books does he have?\nCan you solve this problem?
[/INST]"""
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to('cuda')

generation_output = model.generate(
    input_ids=input_ids, max_new_tokens=40
)
print(tokenizer.decode(generation_output[0]))

<s>[INST] <<SYS>>
You are a helpful assistant. You are given simple problems and need to solve step-by-step.
<</SYS>>
Milton has some books about zoology and 4 times as many books about botany. If he has 80 books total, how many zoology books does he have?
Can you solve this problem?
[/INST]Milton has 4 * 80 = 320 books about botany.
Therefore, Milton has 320 - 80 = 240 zoology books.</s>Milton has 240 / 4 = 60 botany books


In [None]:
prompt = """
You are a helpful assistant. You are given simple problems and need to solve step-by-step.
Milton has some books about zoology and 4 times as many books about botany. If he has 80 books total, how many zoology books does he have?\nCan you solve this problem?
"""
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to('cuda')

generation_output =  base_model.generate(
    input_ids=input_ids, max_new_tokens=100, do_sample=True, top_k=50, top_p=0.95
)
print(tokenizer.decode(generation_output[0]))


You are a helpful assistant. You are given simple problems and need to solve step-by-step.
Milton has some books about zoology and 4 times as many books about botany. If he has 80 books total, how many zoology books does he have?
Can you solve this problem?
Milton is a very good person to have at work. He gets the job done without giving up. If you want to learn more about him and his books, visit the Links on the Top of the page. Here's a quick summary: In many respects, he embodies "the human factor" in the workplace. He is a good worker because he never complains. He is a good friend because he doesn't seem to have any real competition in the company. If you like someone who is this


In [None]:
validation_data[5]

{'INSTRUCTION': 'Find the answer to the math problem: Barkley gets 10 new dog bones at the beginning of the month.  After 5 months, he has 8 bones available and has buried the rest.  How many bones has he buried?',
 'RESPONSE': 'He gets 10 new bones, every month for 5 months so he has 10*5 = 50 dog bones\nHe has 8 bones available which means he has buried 50-8 = 42 bones',
 'SOURCE': 'grade-school-math',
 'text': '<s>[INST] <<SYS>>You are a helpful assistant. Given a word problem, you need to break it down into steps and solve them step-by-step.<</SYS>>Find the answer to the math problem: Barkley gets 10 new dog bones at the beginning of the month.  After 5 months, he has 8 bones available and has buried the rest.  How many bones has he buried?[/INST]He gets 10 new bones, every month for 5 months so he has 10*5 = 50 dog bones\nHe has 8 bones available which means he has buried 50-8 = 42 bones</s>'}

In [None]:
training_data[3]

{'INSTRUCTION': 'In a 90-minute soccer game, Mark played 20 minutes, then rested after. He then played for another 35 minutes. How long was he on the sideline?\nGive me a solution to this problem',
 'RESPONSE': 'Mark played for a total of 20 + 35 = 55 minutes.\nSo, he was on the sideline for 90 - 55 = 35 minutes.',
 'SOURCE': 'grade-school-math',
 'text': '<s>[INST] <<SYS>>You are a helpful assistant. Given a word problem, you need to break it down into steps and solve them step-by-step.<</SYS>>In a 90-minute soccer game, Mark played 20 minutes, then rested after. He then played for another 35 minutes. How long was he on the sideline?\nGive me a solution to this problem[/INST]Mark played for a total of 20 + 35 = 55 minutes.\nSo, he was on the sideline for 90 - 55 = 35 minutes.</s>'}