# Configure the Hugging Face Token

In [1]:
from google.colab import userdata

# Name of the stored secret to read; the resulting value is passed as
# `token=` when the tokenizer and model are loaded further down.
hf_secret_name = 'HF_TOKEN'
token = userdata.get(hf_secret_name)

# Required installations

In [1]:
!pip3 install -q -U bitsandbytes==0.42.0
!pip3 install -q -U peft==0.8.2
!pip3 install -q -U trl==0.7.10
!pip3 install -q -U accelerate==0.27.1
!pip3 install -q -U datasets==2.17.0
!pip3 install -q -U transformers==4.38.0

# Importing necessary libraries

In [3]:
# import transformers
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import BitsAndBytesConfig, GemmaTokenizer

# for LORA training
import torch
from trl import SFTTrainer
from peft import LoraConfig
from datasets import load_dataset

### NF4 (4-bit NormalFloat)
- What is 4-bit quantization, and how does it help models like Llama 2 and Gemma?
- Reference: [Lorentz Kaggle Notebook](https://www.kaggle.com/code/lorentzyeung/what-s-4-bit-quantization-how-does-it-help-llama2)


# Model and Tokenizer config

In [4]:
# Identifier handed to `from_pretrained` for both the tokenizer and the model.
model_id = "google/gemma-2b"

# Quantization settings: load the weights in 4-bit using the "nf4" quant type,
# with bfloat16 as the compute dtype.
quantization_settings = {
    "load_in_4bit": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_compute_dtype": torch.bfloat16,
}
bnb_config = BitsAndBytesConfig(**quantization_settings)

In [None]:
# The same access token is used for both downloads.
auth_kwargs = {"token": token}

tokenizer = AutoTokenizer.from_pretrained(model_id, **auth_kwargs)

# Load the causal LM with the quantization config defined earlier.
# The device map {"": 0} assigns the whole model to device index 0.
Model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map={"": 0},
    quantization_config=bnb_config,
    **auth_kwargs,
)

# Testing model

In [10]:
device = "cuda:0"
text = "Franz Kafka quote: How about if I sleep a little bit longer and"

# Tokenize the prompt into PyTorch tensors and move them to the target device.
inputs = tokenizer(text, return_tensors="pt")
inputs = inputs.to(device)

# Generate at most 30 new tokens, then decode the first returned sequence
# with special tokens stripped.
outputs = Model.generate(max_new_tokens=30, **inputs)
completion = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(completion)

Franz Kafka quote: How about if I sleep a little bit longer and then I'll go to the doctor?

The doctor: I'm sorry, but you're already dead.

Franz Kafka quote:


# LoRA Configuration

In [13]:
import os

# Turn off Weights & Biases reporting for the training run. The variable is
# named WANDB_DISABLED, so it must hold a truthy value to take effect; the
# value "false" would leave W&B reporting enabled.
os.environ["WANDB_DISABLED"] = "true"

In [15]:
# Names of the modules that LoRA is applied to.
lora_target_modules = [
    "q_proj", "o_proj", "k_proj", "v_proj",
    "gate_proj", "up_proj", "down_proj",
]

lora_config = LoraConfig(
    r=8,                                 # rank
    target_modules=lora_target_modules,  # targets
    task_type="CAUSAL_LM",               # LLM task type
)

In [None]:
# Load the quotes dataset and, in batched mode, run the tokenizer over the
# "quote" column so the tokenizer's outputs are added to every split.
data = load_dataset("Abirate/english_quotes").map(
    lambda batch: tokenizer(batch["quote"]),
    batched=True,
)

In [None]:
# data["train"]["quote"]

In [18]:
def formatting_func(example):
    """Build one "Quote/Author" training string per row.

    Args:
        example: Mapping with "quote" and "author" entries. Normally a batch,
            where each entry is a list holding one item per row; a single row
            whose entries are plain strings is accepted as well.

    Returns:
        list[str]: One formatted string per row, in input order.
    """
    quotes, authors = example["quote"], example["author"]

    # A single (unbatched) row carries plain strings; treat it as a batch of one.
    if isinstance(quotes, str):
        quotes, authors = [quotes], [authors]

    # Format every row. Returning only the first row would silently drop the
    # rest of each batch from the training data.
    return [
        f"Quote: {quote}\nAuthor: {author}"
        for quote, author in zip(quotes, authors)
    ]

In [19]:
# Display the train split's summary (its features and number of rows).
data["train"]

Dataset({
    features: ['quote', 'author', 'tags', 'input_ids', 'attention_mask'],
    num_rows: 2508
})

# Finetuning

In [None]:
# Optimisation and logging settings for the fine-tuning run.
training_args = transformers.TrainingArguments(
    output_dir="outputs",
    max_steps=100,
    warmup_steps=2,
    learning_rate=2e-4,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    fp16=True,
    optim="paged_adamw_8bit",
    logging_steps=1,
)

# Supervised fine-tuning trainer: wires together the loaded model, the train
# split, the LoRA config and the function that formats rows into text.
trainer = SFTTrainer(
    model=Model,
    args=training_args,
    train_dataset=data["train"],
    formatting_func=formatting_func,
    peft_config=lora_config,
)

In [None]:
# Run the fine-tuning loop; keep the returned value in a name and show it as
# the cell's result.
train_output = trainer.train()
train_output

In [23]:
device = "cuda:0"
text = "Quote: A woman is like a tea bag;"

# Encode the prompt as PyTorch tensors on the target device.
inputs = tokenizer(text, return_tensors="pt").to(device)

# Generate at most 20 new tokens and print the decoded first sequence,
# leaving out special tokens.
outputs = Model.generate(max_new_tokens=20, **inputs)
decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(decoded)

Quote: A woman is like a tea bag; you can’t tell how strong she is until you put her in hot water
Author: Eleanor


In [32]:
device = "cuda:0"
text = "You only live"

# Encode the prompt as PyTorch tensors, then move them to the target device.
inputs = tokenizer(text, return_tensors="pt")
inputs = inputs.to(device)

# Generate at most 50 new tokens and print the decoded first sequence,
# leaving out special tokens.
outputs = Model.generate(max_new_tokens=50, **inputs)
first_sequence = outputs[0]
print(tokenizer.decode(first_sequence, skip_special_tokens=True))

You only live once, but if you do it right, once is enough.
Quote: “The only real prison is fear, and the only real freedom is freedom from fear”
Author: Aung San Suu Kyi
Quote: “The most wasted
