Install the required packages

In [36]:
# Install PyTorch, TensorBoard and torchvision
%pip install "torch>=2.4.0" tensorboard torchvision

# Install Hugging Face Transformers (>= 4.51.3) for the Gemma 3 model classes used below
%pip install "transformers>=4.51.3"

# Install the remaining Hugging Face libraries at pinned versions, plus the
# tokenizer dependencies (protobuf, sentencepiece).
# NOTE(review): torch and transformers above only have lower bounds, so a fresh
# runtime may resolve newer releases than the pinned trl/peft versions were
# tested with; consider pinning exact versions for reproducibility.
%pip install  --upgrade \
  "datasets==3.3.2" \
  "accelerate==1.4.0" \
  "evaluate==0.4.3" \
  "bitsandbytes==0.45.3" \
  "trl==0.15.2" \
  "peft==0.14.0" \
  "pillow==11.1.0" \
  protobuf \
  sentencepiece



Mount Google Drive in the Colab runtime

In [None]:
# Mount Google Drive into the Colab runtime (force_remount=True re-mounts even if already mounted)
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Set the working directory to the dl_project_fall_2025 checkout on Drive so that
# the relative paths and the `src.` imports used in later cells resolve
import os
os.chdir("/content/drive/MyDrive/DL_Project_2025/dl_project_fall_2025")

# Autoreload doesn't work in Google Colab, so use importlib.reload to re-import
# project modules after editing them
from importlib import reload

Adding PATs (Personal Access Tokens) for both Hugging Face and GitHub

In [27]:
# Import the Colab secrets helper and the Hugging Face login function
from google.colab import userdata
from huggingface_hub import login

# Retrieve the GitHub PAT and the Hugging Face token from Colab secrets
GH_TOKEN = userdata.get('git_token') # Ensure you stored your GitHub PAT under the secret name 'git_token'
hf_token = userdata.get('hugging_face') # Hugging Face token, stored under the secret name 'hugging_face'
login(hf_token)

# Configure Git to use the PAT directly in the remote URL for the 'origin'.
# NOTE(review): this stores the token in plain text in the repository's
# .git/config (which lives on Drive here); reset the remote URL before
# sharing the folder.
!git remote set-url origin https://{GH_TOKEN}@github.com/7yashwanth7/dl_project_fall_2025.git
# The next two lines set the commit identity; replace the values with your own
!git config --global user.email "7yashwanth7@gmail.com" # Modify to your own Git email
!git config --global user.name "7yashwanth7"

Importing the functions

In [None]:
import torch
from transformers import AutoProcessor, AutoModelForImageTextToText, BitsAndBytesConfig
from datasets import load_dataset
from PIL import Image

from src.llmft.data_preprocessing import preprocess
from src.llmft.data_preprocessing import preprocess_utils

In [28]:
# Inspect the working tree of the Drive checkout before staging anything
!git status

Refresh index: 100% (35/35), done.
On branch google_colab_development
Your branch is up to date with 'origin/google_colab_development'.

Changes not staged for commit:
  (use "git add <file>..." to update what will be committed)
  (use "git restore <file>..." to discard changes in working directory)
	[31mmodified:   src/llmft/__pycache__/__init__.cpython-312.pyc[m
	[31mmodified:   src/llmft/data_preprocessing/__pycache__/__init__.cpython-312.pyc[m
	[31mmodified:   src/llmft/data_preprocessing/__pycache__/preprocess.cpython-312.pyc[m
	[31mmodified:   src/llmft/data_preprocessing/__pycache__/preprocess_utils.cpython-312.pyc[m
	[31mmodified:   src/llmft/data_preprocessing/preprocess.py[m

Untracked files:
  (use "git add <file>..." to include in what will be committed)
	[31mgemma-3-cui-finetuned-sample1/[m
	[31mgemma-product-description/[m

no changes added to commit (use "git add" and/or "git commit -a")


In [30]:
# Stage only the preprocessing source file; the modified __pycache__/*.pyc files
# and the untracked model output directories reported by `git status` are
# deliberately left out of the commit
!git add src/llmft/data_preprocessing/preprocess.py

In [32]:
# Commit the staged preprocessing change on the current branch
!git commit -m "added preprocessing of dataset"

[google_colab_development 8da6669] added preprocessing of dataset
 1 file changed, 51 insertions(+)


In [40]:
# Sync with the remote: fetch, replay local commits on top of the remote
# google_colab_development branch, then push the result
!git fetch origin
!git pull --rebase origin google_colab_development
!git push origin google_colab_development

remote: Enumerating objects: 11, done.[K
remote: Counting objects:   9% (1/11)[Kremote: Counting objects:  18% (2/11)[Kremote: Counting objects:  27% (3/11)[Kremote: Counting objects:  36% (4/11)[Kremote: Counting objects:  45% (5/11)[Kremote: Counting objects:  54% (6/11)[Kremote: Counting objects:  63% (7/11)[Kremote: Counting objects:  72% (8/11)[Kremote: Counting objects:  81% (9/11)[Kremote: Counting objects:  90% (10/11)[Kremote: Counting objects: 100% (11/11)[Kremote: Counting objects: 100% (11/11), done.[K
remote: Compressing objects:  12% (1/8)[Kremote: Compressing objects:  25% (2/8)[Kremote: Compressing objects:  37% (3/8)[Kremote: Compressing objects:  50% (4/8)[Kremote: Compressing objects:  62% (5/8)[Kremote: Compressing objects:  75% (6/8)[Kremote: Compressing objects:  87% (7/8)[Kremote: Compressing objects: 100% (8/8)[Kremote: Compressing objects: 100% (8/8), done.[K
remote: Total 8 (delta 4), reused 0 (delta 0), pack-reused 0 (

Load Model Configuration and Dataset

In [None]:
# Load the default settings (YAML) and the raw CUI mapping (JSON) from the repository
defaults = preprocess_utils.read_yaml('src/llmft/config/defaults.yaml')
cui_mapping_json = preprocess_utils.read_json('mapping_files/cui_mapping.json')
# Build the CUI mapping used below; its keys are later turned into "<CUI>" special tokens
cui_mapping = preprocess_utils.get_cui_mapping(cui_mapping_json)

# Load the ROCOv2 radiology dataset from the Hugging Face Hub.
# NOTE(review): only the "test" split is loaded and it is later passed to the
# trainer as train_dataset; confirm this is intentional (e.g. a small sample run).
dataset = load_dataset("eltorio/ROCOv2-radiology", split="test")

Pre-process the dataset

In [None]:
# Keep the "image" column undecoded (decode=False) so each row carries the raw
# image dict instead of a decoded PIL image; decoding happens later in the collator
from datasets import Image as HFImage
dataset = dataset.cast_column("image", HFImage(decode=False))

# Processed Dataset: format_batch is applied to batches of up to 1024 rows,
# together with the CUI mapping and the default settings.
# presumably format_batch adds the "messages" column read by collate_fn; verify in preprocess.py
processed_ds = dataset.map(
    lambda b: preprocess.format_batch(b, cui_mapping, defaults),
    batched=True,
    batch_size=1024,
)

Train Model

In [None]:
# Hugging Face model id
model_id = "google/gemma-3-4b-pt" # or `google/gemma-3-12b-pt`, `google/gemma-3-27b-pt`
# Require CUDA compute capability >= 8, because the model is loaded and trained in bfloat16
if torch.cuda.get_device_capability()[0] < 8:
    raise ValueError("GPU does not support bfloat16, please use a GPU that supports bfloat16.")
# Define model init arguments
model_kwargs = dict(
    attn_implementation="eager", # Use "flash_attention_2" when running on Ampere or newer GPU
    torch_dtype=torch.bfloat16, # What torch dtype to use, defaults to auto
    device_map="auto", # Place the model weights on the available device(s) automatically
)
# BitsAndBytesConfig int-4 config: NF4 quantization with double quantization;
# the compute and storage dtypes reuse the bfloat16 dtype chosen above
model_kwargs["quantization_config"] = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=model_kwargs["torch_dtype"],
    bnb_4bit_quant_storage=model_kwargs["torch_dtype"],
)
# Load model and processor.
# NOTE(review): the weights are the pretrained ("-pt") checkpoint while the
# processor comes from the instruction-tuned ("-it") one, presumably to get
# the chat template used by apply_chat_template in the collator; confirm.
model = AutoModelForImageTextToText.from_pretrained(model_id, **model_kwargs)
processor = AutoProcessor.from_pretrained("google/gemma-3-4b-it")

In [None]:
# Adding Special CUI Tokens: one "<CUI>" token per key of the CUI mapping
tokenizer = processor.tokenizer
cui_tokens = [f"<{cui}>" for cui in cui_mapping.keys()]
# add_tokens returns how many of the tokens were new to the vocabulary
num_added = tokenizer.add_tokens(cui_tokens)
# Resize the model's token embeddings to match the enlarged tokenizer
model.resize_token_embeddings(len(tokenizer))
# Make sure the processor references the updated tokenizer (same object as above)
processor.tokenizer = tokenizer

In [None]:
# LoRA adapter configuration handed to SFTTrainer via peft_config
from peft import LoraConfig
peft_config = LoraConfig(
    lora_alpha=16,                 # LoRA scaling factor
    lora_dropout=0.05,             # dropout applied inside the LoRA layers
    r=16,                          # LoRA rank
    bias="none",                   # do not train bias terms
    target_modules="all-linear",   # attach adapters to all linear layers
    task_type="CAUSAL_LM",
    # Train and save these modules in full alongside the adapters; presumably
    # needed so the newly added CUI token embeddings are learned (confirm)
    modules_to_save=[
        "lm_head",
        "embed_tokens",
    ],
)

In [None]:
from PIL import Image as PILImage
import io

def load_pil(img):
    """
    Return an RGB PIL image for a dataset image value.

    Handles:
    - HF Image with decode=False -> dict with {path, bytes}; "bytes" is
      preferred when present, otherwise the file at "path" is opened
    - Already-decoded PIL images (anything exposing ``convert``)

    The source image is opened in a ``with`` block so its underlying file
    handle is released as soon as the RGB copy has been created, instead of
    staying open until garbage collection.

    Raises:
        ValueError: if a dict has neither usable "bytes" nor "path", or if
            ``img`` is of an unsupported type.
    """
    if isinstance(img, dict):
        raw_bytes = img.get("bytes")
        if raw_bytes is not None:
            # convert() loads the pixel data and returns an independent copy,
            # so closing the source image afterwards is safe.
            with PILImage.open(io.BytesIO(raw_bytes)) as source:
                return source.convert("RGB")

        path = img.get("path")
        if path:
            with PILImage.open(path) as source:
                return source.convert("RGB")

        raise ValueError("Image dict missing both 'bytes' and 'path'.")

    if hasattr(img, "convert"):
        return img.convert("RGB")

    raise ValueError(f"Unsupported image type: {type(img)}")


def collate_fn(examples):
    """
    Turn a list of dataset rows into one padded training batch.

    For every row the chat template is rendered from ``example["messages"]``
    (as text, without a generation prompt) and the picture is decoded from
    ``example["image"]`` via ``load_pil``. The processor then tokenizes the
    texts and processes the images together. ``labels`` is a copy of
    ``input_ids`` in which padding and image-related token ids are replaced
    by -100.

    Returns the processor output with an added "labels" entry.
    """
    prompt_texts, pil_images = [], []
    for row in examples:
        # Render the conversation (it contains the image placeholder) as plain text
        rendered = processor.apply_chat_template(
            row["messages"],
            add_generation_prompt=False,
            tokenize=False,
        )
        prompt_texts.append(rendered.strip())

        # The actual image comes from the dataset column, not from the messages
        pil_images.append(load_pil(row["image"]))

    # Tokenize the texts and process the images in one call
    batch = processor(
        text=prompt_texts,
        images=pil_images,
        return_tensors="pt",
        padding=True,
    )

    tok = processor.tokenizer
    labels = batch["input_ids"].clone()

    # Collect every token id that must be masked out of the labels
    masked_ids = []

    # Padding token, when the tokenizer defines one
    if tok.pad_token_id is not None:
        masked_ids.append(tok.pad_token_id)

    # Begin-of-image special token, when the tokenizer exposes it
    boi_token = tok.special_tokens_map.get("boi_token", None)
    if boi_token is not None:
        masked_ids.append(tok.convert_tokens_to_ids(boi_token))

    # Hard-coded extra image token id carried over from the original setup
    masked_ids.append(262144)

    for token_id in masked_ids:
        labels[labels == token_id] = -100

    batch["labels"] = labels
    return batch

In [None]:
# Training configuration for the SFT run
from trl import SFTConfig
args = SFTConfig(
    # NOTE(review): "gemma-product-description" looks like a leftover name; with
    # push_to_hub=True it is also used as the Hub repository id. Confirm it is intended.
    output_dir="gemma-product-description",     # directory to save and repository id
    num_train_epochs=1,                         # number of training epochs
    per_device_train_batch_size=1,              # batch size per device during training
    gradient_accumulation_steps=4,              # number of steps before performing a backward/update pass
    gradient_checkpointing=False,                # gradient checkpointing is disabled (enable it to save memory)
    optim="adamw_torch_fused",                  # use fused adamw optimizer
    logging_steps=50,                            # log every 50 steps
    save_strategy="epoch",                      # save checkpoint every epoch
    learning_rate=2e-4,                         # learning rate, based on QLoRA paper
    bf16=True,                                  # use bfloat16 precision
    max_grad_norm=0.3,                          # max gradient norm based on QLoRA paper
    # NOTE(review): with lr_scheduler_type="constant" the warmup ratio likely has no
    # effect ("constant_with_warmup" is the variant that applies warmup); confirm.
    warmup_ratio=0.03,                          # warmup ratio based on QLoRA paper
    lr_scheduler_type="constant",               # use constant learning rate scheduler
    push_to_hub=True,                           # push model to hub
    report_to="tensorboard",                    # report metrics to tensorboard
    gradient_checkpointing_kwargs={
        "use_reentrant": False
    },  # non-reentrant checkpointing (only relevant when gradient_checkpointing is enabled)
    dataset_text_field="",                      # need a dummy field for collator
    dataset_kwargs={"skip_prepare_dataset": True},  # important for collator
)
# Keep the "messages" and "image" columns so the custom collator can read them
args.remove_unused_columns = False # important for collator

In [None]:
# @title Fine-tune with SFTTrainer
from trl import SFTTrainer
# Build the trainer from the quantized model, the LoRA config, the processed
# dataset and the custom multimodal collator
trainer = SFTTrainer(
    model=model,
    args=args,
    train_dataset=processed_ds,
    peft_config=peft_config,
    processing_class=processor,
    data_collator=collate_fn,
)
# Start training; checkpoints go to args.output_dir (and to the Hub, since push_to_hub=True)
trainer.train()
# Additionally save the final model into a separate local directory
trainer.save_model("gemma-3-cui-finetuned-sample1")  # saves into this directory

In [None]:
# Release cached, currently unused GPU memory held by PyTorch's allocator.
# NOTE(review): `model` and `trainer` are still referenced at this point, so
# their memory is presumably not released by this call; delete them first if
# the goal is to free the GPU.
torch.cuda.empty_cache()