In [None]:
# Installing Dependencies
# bitsandbytes : Lightweight CUDA wrappers for k‑bit quantization
# peft : (e.g., LoRA) o adapt large pretrained models by training a small set of extra parameters instead of all weights, reducing compute and storage
# trl : Tools to post‑train/align transformers with RLHF and related methods

!pip3 install bitsandbytes peft trl accelerate datasets transformers

In [4]:
import os
import transformers
import torch
from datasets import load_dataset
from trl import SFTTrainer
from peft import LoraConfig
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import BitsAndBytesConfig, GemmaTokenizer

In [5]:
# Make the Hugging Face access token available to the later cells, which read
# os.environ['HF_TOKEN'] when loading the tokenizer and the model.
# The token has to be exported in the environment that launched the kernel; it is
# never hard-coded here. `os.get` does not exist, so the lookup uses os.getenv, and
# a missing token fails early with a clear message instead of an opaque TypeError
# from assigning None into os.environ.
hf_token = os.getenv("HF_TOKEN")
if not hf_token:
    raise RuntimeError(
        "HF_TOKEN is not set. Export your Hugging Face access token "
        "before starting the notebook kernel."
    )
os.environ["HF_TOKEN"] = hf_token

In [6]:
model_id = "google/gemma-2b"

# Quantization settings, collected first and then expanded into the config object:
#   load_in_4bit           - load the model with 4-bit quantized weights to reduce GPU memory
#   bnb_4bit_quant_type    - "nf4" selects NormalFloat4 quantization
#   bnb_4bit_compute_dtype - run computations in bfloat16 while the weights stay 4-bit
quantization_kwargs = dict(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)
bnb_config = BitsAndBytesConfig(**quantization_kwargs)

In [7]:
# The tokenizer and the causal-LM weights are both fetched for `model_id`,
# authenticating with the token stored in the HF_TOKEN environment variable.
hub_token = os.environ['HF_TOKEN']

# Tokenizer matching the model id (same vocabulary/subword rules as the model).
tokenizer = AutoTokenizer.from_pretrained(model_id, token=hub_token)

# Model weights, quantized according to `bnb_config`.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map={"": 0},  # the empty-string key places all model modules on GPU 0
    token=hub_token,
)

tokenizer_config.json:   0%|          | 0.00/33.6k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

In [8]:
text = "Quote: Imagination is more,"
device = "cuda:0"  # GPU 0 is used for the computation

# Turn the prompt into PyTorch tensors, then move them onto the chosen GPU.
inputs = tokenizer(text, return_tensors="pt")
inputs = inputs.to(device)

# Ask the model for at most 20 additional tokens.
outputs = model.generate(**inputs, max_new_tokens=20)

# Decode sequence 0 of the result, leaving out special tokens, and print it.
completion = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(completion)

Quote: Imagination is more, than knowledge.

I am a self-taught artist, born in 1985 in


In [9]:
# Same prompt as the previous cell, this time without the trailing comma.
text = "Quote: Imagination is more"
device = "cuda:0"

# Tokenize to PyTorch tensors and place them on the GPU.
inputs = tokenizer(text, return_tensors="pt")
inputs = inputs.to(device)

# Generate up to 20 new tokens and print the decoded text without special tokens.
outputs = model.generate(**inputs, max_new_tokens=20)
completion = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(completion)

Quote: Imagination is more important than knowledge. Knowledge is limited. Imagination encircles the world.

- Albert Einstein

The


In [10]:
# Sets the WANDB_DISABLED environment variable (a Weights & Biases setting) for this process.
# NOTE(review): the value "false" presumably leaves W&B reporting enabled; if the goal
# is to switch W&B off for this run, this likely needs to be "true" — confirm the intent.
os.environ["WANDB_DISABLED"] = "false"

In [11]:
# LoRA adapter settings.
#   r              - rank (size) of the low-rank adapters
#   target_modules - names of the linear layers that receive adapters
#   task_type      - tells PEFT this is a causal language modeling task
attention_projections = ["q_proj", "o_proj", "k_proj", "v_proj"]
feed_forward_projections = ["gate_proj", "up_proj", "down_proj"]

lora_config = LoraConfig(
    r=8,
    target_modules=attention_projections + feed_forward_projections,
    task_type="CAUSAL_LM",
)

In [None]:
'''
    --> r : The LoRA rank (size of the low-rank adapters). Higher r increases adapter capacity (and memory/compute),
        lower r is lighter. r=8 is a common, efficient default.
    --> target_modules : Specifies which linear layers LoRA is applied to; the list covers both the attention and the feed-forward layers.
    --> task_type : Tells PEFT the task is causal language modeling, so it wires the adapters appropriately for an autoregressive decoder model.
'''

In [12]:
from datasets import load_dataset


def _tokenize_quotes(samples):
    """Tokenize the "quote" field of a batch of dataset rows."""
    return tokenizer(samples["quote"])


# Download the quotes dataset, then tokenize the quote text in batches; the
# tokenizer's output is added to each row alongside the original columns.
# NOTE(review): the resulting `input_ids` column may make SFTTrainer treat the
# dataset as already tokenized and skip `formatting_func`, in which case the
# author text would not be part of the training input — confirm against the
# installed TRL version.
data = load_dataset("Abirate/english_quotes").map(_tokenize_quotes, batched=True)

README.md: 0.00B [00:00, ?B/s]

quotes.jsonl:   0%|          | 0.00/647k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2508 [00:00<?, ? examples/s]

Map:   0%|          | 0/2508 [00:00<?, ? examples/s]

In [13]:
# Display the "quote" column of the train split as the cell output.
data['train']['quote']

Column(['“Be yourself; everyone else is already taken.”', "“I'm selfish, impatient and a little insecure. I make mistakes, I am out of control and at times hard to handle. But if you can't handle me at my worst, then you sure as hell don't deserve me at my best.”", "“Two things are infinite: the universe and human stupidity; and I'm not sure about the universe.”", '“So many books, so little time.”', '“A room without books is like a body without a soul.”'])

In [14]:
def formatting_func(example):
    """Format dataset rows as training text for the trainer.

    Each row becomes the line ``Quote: <quote>`` followed by a newline and
    ``Author: <author>``.

    Args:
        example: Either a batch (``example["quote"]`` and ``example["author"]``
            are lists of equal length) or a single row (both are scalars).

    Returns:
        A list of formatted strings: one per row for a batch, or a one-element
        list for a single row.

    The previous implementation indexed ``[0]`` unconditionally, which kept only
    the first row of every batch (silently dropping the rest) and, for a single
    row, produced only the first character of the quote and the author.
    """
    quotes, authors = example["quote"], example["author"]

    # A single (non-batched) row carries scalars; wrap them so both call styles
    # go through the same code path.
    if not isinstance(quotes, (list, tuple)):
        quotes, authors = [quotes], [authors]

    # NOTE(review): the list return type is kept for backward compatibility; a
    # trainer that calls this once per row may expect a plain string instead —
    # verify against the installed TRL version.
    return [
        f"Quote: {quote}\nAuthor: {author}"
        for quote, author in zip(quotes, authors)
    ]

In [15]:
# Display the train split of the loaded/tokenized dataset; the cell output lists
# its features (columns) and the number of rows.
data['train']

Dataset({
    features: ['quote', 'author', 'tags', 'input_ids', 'attention_mask'],
    num_rows: 2508
})

In [16]:
# Hyper-parameters for the fine-tuning run, built separately from the trainer.
# NOTE(review): fp16 mixed precision is requested here while the quantization
# config computes in bfloat16 — confirm this combination is intended.
training_args = transformers.TrainingArguments(
    output_dir="outputs",
    per_device_train_batch_size=1,   # one sample per device per step
    gradient_accumulation_steps=4,   # 4 accumulated steps -> effective batch size of about 4
    warmup_steps=2,                  # short warmup before the full learning rate
    max_steps=100,                   # 100 optimization steps in total
    learning_rate=2e-4,
    fp16=True,                       # float16 mixed precision
    logging_steps=1,                 # log every step
    optim="paged_adamw_8bit",        # paged 8-bit AdamW optimizer
)

# Supervised fine-tuning trainer: the quantized Gemma model is trained on the
# train split, with the LoRA settings passed in through `peft_config`.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=data["train"],
    peft_config=lora_config,
    formatting_func=formatting_func,
)



Truncating train dataset:   0%|          | 0/2508 [00:00<?, ? examples/s]

In [17]:
# Run the training loop with the trainer configured above; the returned
# TrainOutput summary is displayed as the cell's result.
trainer.train()

Step,Training Loss
1,2.5615
2,1.6267
3,2.4808
4,2.7506
5,2.2993
6,2.4736
7,2.8786
8,2.2327
9,3.1764
10,2.2137


TrainOutput(global_step=100, training_loss=2.0560099977254866, metrics={'train_runtime': 114.0965, 'train_samples_per_second': 3.506, 'train_steps_per_second': 0.876, 'total_flos': 189744345784320.0, 'train_loss': 2.0560099977254866})

In [18]:
# Try a prompt after training.
text = "Quote: A woman is like a tea bag;"
device = "cuda:0"

# Tokenize the prompt into PyTorch tensors on the GPU.
inputs = tokenizer(text, return_tensors="pt")
inputs = inputs.to(device)

# Generate at most 20 new tokens, then decode sequence 0 without special tokens.
outputs = model.generate(**inputs, max_new_tokens=20)
completion = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(completion)

Quote: A woman is like a tea bag; you can't tell how strong she is until you put her in hot water.

I'


In [19]:
# A second prompt after training.
text = "Quote: Outside of a dog, a book is man's"
device = "cuda:0"

# Encode the prompt as PyTorch tensors and move them to the GPU.
inputs = tokenizer(text, return_tensors="pt")
inputs = inputs.to(device)

# Let the model add up to 20 tokens; print the decoded result without special tokens.
outputs = model.generate(**inputs, max_new_tokens=20)
completion = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(completion)

Quote: Outside of a dog, a book is man's best friend. Inside of a dog, it's too dark to read.

-Groucho
