# Finetuning Using Google Gemma's Model

In [1]:
!pip3 install -q -U bitsandbytes==0.42.0
!pip3 install -q -U peft==0.8.2
!pip3 install -q -U trl==0.7.10
!pip3 install -q -U accelerate==0.27.1
!pip3 install -q -U datasets==2.17.0
!pip3 install -q -U transformers==4.38.0

In [2]:
import os
import transformers
import torch
from google.colab import userdata
from datasets import load_dataset
from trl import SFTTrainer
from peft import LoraConfig
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import BitsAndBytesConfig, GemmaTokenizer

In [3]:
os.environ["HF_TOKEN"] = "******"

### Prerequisites
* nf4(4-bit NormalFloat(NF4)) : https://www.kaggle.com/code/lorentzyeung/what-s-4-bit-quantization-how-does-it-help-llama2


In [4]:
model_id = "google/gemma-2b"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [32]:
tokenizer = AutoTokenizer.from_pretrained(model_id, token=os.environ['HF_TOKEN'])
model = AutoModelForCausalLM.from_pretrained(model_id,
                                             quantization_config=bnb_config,
                                             device_map={"":0},
                                             token=os.environ['HF_TOKEN'])

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

### Before Fine-Tuning

In [6]:
text = "Quote: The tragedy of life is not that people fail"
device = "cuda:0"
inputs = tokenizer(text, return_tensors="pt").to(device)

outputs = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Quote: The tragedy of life is not that people fail. It is that people succeed.

The above quote is a great reminder that we should not be


In [7]:
text = "Quote: All you need is a good memory and the ability to see things in their"
device = "cuda:0"
inputs = tokenizer(text, return_tensors="pt").to(device)

outputs = model.generate(**inputs, max_new_tokens=20, temperature=0.1)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))



Quote: All you need is a good memory and the ability to see things in their true form.

The <b>Memory</b> is a character in the <i>Mega Man X</i>


In [8]:
text = "Quote: When you have eliminated the impossible, whatever remains, however improbable,"
device = "cuda:0"
inputs = tokenizer(text, return_tensors="pt").to(device)

outputs = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Quote: When you have eliminated the impossible, whatever remains, however improbable, must be the truth.

- Sir Arthur Conan Doyle

The above quote is a great example of


#### The Quotes are not what we wanted, the LLM is not Trained on these Quotes

## Fine-Tune Prep

In [9]:
os.environ["WANDB_DISABLED"] = "false"

In [10]:
lora_config = LoraConfig(
    r = 8,
    target_modules = ["q_proj", "o_proj", "k_proj", "v_proj",
                      "gate_proj", "up_proj", "down_proj"],
    task_type = "CAUSAL_LM",
)

In [12]:
with open('quotes.txt', 'r') as file:
    data = file.read().replace('\n', '')

quotes = data.split(".")

In [13]:
stripped_quotes = [quote.strip() for quote in quotes]

In [14]:
mydata = {'quote' : stripped_quotes}

In [15]:
from datasets import Dataset
dataset = Dataset.from_dict(mydata)

In [16]:
dataset

Dataset({
    features: ['quote'],
    num_rows: 125
})

In [17]:
def formatting_func(example):
    text = f"Quote: {example['quote'][0]}"
    return [text]

In [33]:
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    args=transformers.TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        warmup_steps=2,
        max_steps=35,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=5,
        output_dir="outputs",
        optim="paged_adamw_8bit"
    ),
    peft_config=lora_config,
    formatting_func=formatting_func,
)



Map:   0%|          | 0/125 [00:00<?, ? examples/s]



In [34]:
trainer.train()

Step,Training Loss
5,0.7419
10,0.4029
15,0.2253
20,0.1458
25,0.1078
30,0.0775
35,0.062


TrainOutput(global_step=35, training_loss=0.25188413177217756, metrics={'train_runtime': 11.7887, 'train_samples_per_second': 11.876, 'train_steps_per_second': 2.969, 'total_flos': 9201608908800.0, 'train_loss': 0.25188413177217756, 'epoch': 35.0})

In [35]:
text = "Quote: The tragedy of life is not that people fail"
device = "cuda:0"
inputs = tokenizer(text, return_tensors="pt").to(device)

outputs = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Quote: The tragedy of life is not that people fail, but that they could succeed with so little effort. -Modelo T. A. Edison

The


In [24]:
text = "Quote: All you need is a good memory and the ability to see things in their"
device = "cuda:0"
inputs = tokenizer(text, return_tensors="pt").to(device)

outputs = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Quote: All you need is a good memory and the ability to see things in their true colors. — <strong>George R R Martin</strong>
The tragedy of life is not that people


In [22]:
text = "Quote: When you have eliminated the impossible, whatever remains, however improbable,"
device = "cuda:0"
inputs = tokenizer(text, return_tensors="pt").to(device)

outputs = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Quote: When you have eliminated the impossible, whatever remains, however improbable, must be the truth. — Sir Arthur Conan Doyle

Quote: The tragedy of life is not that


#### LLM now accurately predict the quotes to some extent

## Saving the model

In [25]:
model.save_pretrained("gemma_finetuned_model")
tokenizer.save_pretrained("gemma_finetuned_tokenizer")

('gemma_finetuned_tokenizer/tokenizer_config.json',
 'gemma_finetuned_tokenizer/special_tokens_map.json',
 'gemma_finetuned_tokenizer/tokenizer.model',
 'gemma_finetuned_tokenizer/added_tokens.json',
 'gemma_finetuned_tokenizer/tokenizer.json')

In [30]:
from transformers import AutoTokenizer, GemmaForCausalLM

# Load the saved tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("gemma_finetuned_tokenizer")
model = GemmaForCausalLM.from_pretrained("gemma_finetuned_model")

`low_cpu_mem_usage` was None, now set to True since model is quantized.
Some weights of the model checkpoint at gemma_finetuned_model were not used when initializing GemmaForCausalLM: ['model.layers.0.mlp.down_proj.base_layer.weight', 'model.layers.0.mlp.down_proj.base_layer.weight.absmax', 'model.layers.0.mlp.down_proj.base_layer.weight.quant_map', 'model.layers.0.mlp.down_proj.base_layer.weight.quant_state.bitsandbytes__nf4', 'model.layers.0.mlp.down_proj.lora_A.default.weight', 'model.layers.0.mlp.down_proj.lora_B.default.weight', 'model.layers.0.mlp.gate_proj.base_layer.weight', 'model.layers.0.mlp.gate_proj.base_layer.weight.absmax', 'model.layers.0.mlp.gate_proj.base_layer.weight.quant_map', 'model.layers.0.mlp.gate_proj.base_layer.weight.quant_state.bitsandbytes__nf4', 'model.layers.0.mlp.gate_proj.lora_A.default.weight', 'model.layers.0.mlp.gate_proj.lora_B.default.weight', 'model.layers.0.mlp.up_proj.base_layer.weight', 'model.layers.0.mlp.up_proj.base_layer.weight.absmax', 'm

In [36]:
text = "Quote: The tragedy of life is not that people fail"
device = "cuda:0"
inputs = tokenizer(text, return_tensors="pt").to(device)

outputs = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Quote: The tragedy of life is not that people fail, but that they could succeed with so little effort. -Modelo T. A. Edison

The
