# Fine-Tunning `gemma-2b-it` LLM with Lora for solving math problems

In [27]:
%%capture
%pip install -U bitsandbytes
%pip install -U transformers
%pip install -U accelerate
%pip install -U peft
%pip install -U datasets
%pip install -U huggingface-cli

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [28]:
class CFG:
  model_name = "google/gemma-2b-it"
  dataset_name = "lighteval/MATH"
  seed = 25

In [29]:
from transformers import (
    BitsAndBytesConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    pipeline,
    AutoConfig,
    DataCollatorForLanguageModeling
)

from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model
)

import os, torch
import pandas as pd
from IPython.display import display, Markdown, Latex
from datasets import Dataset ,load_dataset

In [30]:
dataset = load_dataset(CFG.dataset_name, split="train+test")
dataset.description

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


'MATH is a dataset of 12,500 challenging competition mathematics problems. Each\nproblem in Math has a full step-by-step solution which can be used to teach\nmodels to generate answer derivations and explanations.\n'

In [31]:
def print_problem(row):
  for key in row.keys():
    display(Markdown(f"- ***{key}***".upper()))
    display(Latex(row[key]))
    display()

def get_answer(solution):
  return solution.split("boxed{")[-1].split("}$")[0]

dataset = dataset.map(lambda row: {"answer": get_answer(row["solution"])})
print_problem(dataset[0])

- ***PROBLEM***

<IPython.core.display.Latex object>

- ***LEVEL***

<IPython.core.display.Latex object>

- ***TYPE***

<IPython.core.display.Latex object>

- ***SOLUTION***

<IPython.core.display.Latex object>

- ***ANSWER***

<IPython.core.display.Latex object>

In [32]:
def int_answer(answer):
  try:
    if int(answer):
      return True
    else:
      return False
  except ValueError:
    return False

dataset = dataset.filter(lambda row: int_answer(row["answer"]))
print_problem(dataset[0])

- ***PROBLEM***

<IPython.core.display.Latex object>

- ***LEVEL***

<IPython.core.display.Latex object>

- ***TYPE***

<IPython.core.display.Latex object>

- ***SOLUTION***

<IPython.core.display.Latex object>

- ***ANSWER***

<IPython.core.display.Latex object>

In [33]:
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient

user_clients = UserSecretsClient()
secret_hf = user_clients.get_secret("HUGGINGFACE_TOKEN")
login(secret_hf)

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [34]:
import wandb

secret_wandb = user_clients.get_secret("wandb")

# W&B follow up session
wandb.login(key=secret_wandb)
run = wandb.init(
    project = "ft gemma for math problems",
    job_type = "training",
    anonymous = "allow"
)



VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

In [35]:
config = AutoConfig.from_pretrained(CFG.model_name)
bnb_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_use_double_quant = True,
    bnb_4bit_quant_type = "nf4",
    bnb_4bit_compute_dtype = torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(
    CFG.model_name,
    quantization_config = bnb_config,
    # config=config,
    device_map = {"":0})

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [36]:
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [37]:
gemma_2b_it_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]

lora_config = LoraConfig(
    r = 8,
    lora_alpha = 32,
    target_modules = gemma_2b_it_modules,
    lora_dropout = 5e-2,
    bias = "none",
    task_type = "CAUSAL_LM"
)

model = get_peft_model(model, lora_config)

In [38]:
tokenizer = AutoTokenizer.from_pretrained(CFG.model_name)
tokenizer.padding_slide = "right"
tokenizer.pad_token = tokenizer.eos_token
tokenizer.add_eos_token = True
tokenizer.add_bos_token, tokenizer.add_eos_token

(True, True)

In [40]:
def add_prompt(rows):
    prompt = f"Question: {rows['problem']}\nSolution: {rows['solution']}"
    return prompt

train_data = dataset
train_data = train_data.map(lambda row: tokenizer(row["solution"]), batched=True)

Map:   0%|          | 0/5476 [00:00<?, ? examples/s]

In [41]:
trainer_args = TrainingArguments(
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    warmup_steps=2,
    # num_train_epochs=1,
    max_steps = 25,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=1,
    output_dir = "./gemma-2b-math-solver",
    optim = "paged_adamw_8bit"
)

trainer = Trainer(
    model = model,
    train_dataset = train_data,
    args = trainer_args,
    # data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
    data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
)

model.config.use_cache = False
trainer.train()

max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss
1,2.9424
2,2.188
3,2.3986
4,2.2618
5,2.0014
6,1.902
7,2.049
8,1.8151
9,1.5352
10,1.9187


TrainOutput(global_step=25, training_loss=1.816903977394104, metrics={'train_runtime': 91.2869, 'train_samples_per_second': 1.095, 'train_steps_per_second': 0.274, 'total_flos': 233183109918720.0, 'train_loss': 1.816903977394104, 'epoch': 0.018261504747991233})

In [42]:
model_to_save = trainer.model.module if hasattr(trainer.model, 'module') else trainer.model  # Take care of distributed/parallel training
model_to_save.save_pretrained("./gemma-2b-math-solver")

In [43]:
lora_config = LoraConfig.from_pretrained('./gemma-2b-math-solver')
peft_model = get_peft_model(model, lora_config)

In [59]:
problem = "solve this prblem: {problem}".format(problem=train_data[27]["problem"])
device = "cuda"

inputs = tokenizer(problem, return_tensors="pt").to(device)
outputs = model.generate(**inputs, max_length=2048)
Markdown(tokenizer.decode(outputs[0], skip_special_tokens=False))

<bos>solve this prblem: The sum of two numbers is $45$. Their difference is $3$. What is the lesser of the two numbers?<eos>The answer is 25.

Let x and y be the two numbers.

Then, x + y = 45 and x - y = 3.

Adding these two equations together, we get:

(x + y) + (x - y) = 45 + 3

2x = 48

x = 24

Therefore, y = 21.

So, the lesser of the two numbers is 25.<eos>

In [60]:
Markdown(train_data[27]["solution"])

Let $x,y$ be the larger and smaller numbers, respectively. We have $x+y=45$ and $x-y=3$. Thus: $y=\frac{1}{2}((x+y)-(x-y))=\frac{1}{2}(45-3)=\boxed{21}$.