In [6]:
# Install Pytorch & other libraries
%pip install "torch>=2.4.0" tensorboard torchvision

# Install Gemma release branch from Hugging Face
%pip install "transformers>=4.51.3"

# Install Hugging Face libraries
%pip install  --upgrade \
  "datasets==3.3.2" \
  "accelerate==1.4.0" \
  "evaluate==0.4.3" \
  "bitsandbytes==0.45.3" \
  "trl==0.15.2" \
  "peft==0.14.0" \
  "pillow==11.1.0" \
  protobuf \
  sentencepiece



In [34]:
#Mount the notebook on to the google drive
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Set the working directory to dl_project_fall_2025
import os
os.chdir("/content/drive/MyDrive/DL_Project_2025/dl_project_fall_2025")

# Auto relaod doesnt work in google colab, so you can use reload to reload your function calls
from importlib import reload

Mounted at /content/drive


In [47]:
# Importing git token and huggig face tokens
from google.colab import userdata

# Retrieve the GitHub Token from Colab secrets
GH_TOKEN = userdata.get('git_token') # Ensure you stored your PAT under the secret name 'GH_TOKEN'
hf_token = userdata.get('hugging_face')

# Configure Git to use the PAT directly in the remote URL for the 'origin'
!git remote set-url origin https://{GH_TOKEN}@github.com/7yashwanth7/dl_project_fall_2025.git
!git config --global user.email "7yashwanth7@gmail.com" # Modify to your username and pwd
!git config --global user.name "7yashwanth7"

Git remote 'origin' URL updated successfully with GitHub Personal Access Token.


In [48]:
import torch
from transformers import AutoProcessor, AutoModelForImageTextToText, BitsAndBytesConfig
from datasets import load_dataset
from PIL import Image

from src.llmft.data_preprocessing import preprocess
from src.llmft.data_preprocessing import preprocess_utils

In [58]:
# Load model_config and json files
defaults = preprocess_utils.read_yaml('src/llmft/config/defaults.yaml')
cui_mapping_json = preprocess_utils.read_json('mapping_files/cui_mapping.json')
cui_mapping = preprocess_utils.get_cui_mapping(cui_mapping_json)

# Load dataset from the hub
dataset = load_dataset("eltorio/ROCOv2-radiology", split="test")

Resolving data files:   0%|          | 0/27 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/27 [00:00<?, ?it/s]

In [None]:
i = 0
sample_dataset = []
while i <100:
  sample_dataset.append(format_data(dataset[i], cui_mapping))
  i += 1

In [None]:
import json

with open("mapping_files/cui_mapping.json", "r") as f:
    cui_mapping_json = json.load(f)

In [None]:
def get_cui_mapping(items):
    """
    Takes a list of dicts with keys:
        - 'CUI'
        - 'Canonical name'
    Returns a dict mapping 'CUI' -> 'Canonical name'
    """
    return {item['CUI']: item['Canonical name'] for item in items}

cui_mapping = get_cui_mapping(cui_mapping_json)

In [None]:
# Login into Hugging Face Hub
from huggingface_hub import login

hf_token = userdata.get('hugging_face') # If you are running inside a Google Colab
login(hf_token)

In [None]:
# Hugging Face model id
model_id = "google/gemma-3-4b-pt" # or `google/gemma-3-12b-pt`, `google/gemma-3-27-pt`
# Check if GPU benefits from bfloat16
if torch.cuda.get_device_capability()[0] < 8:
    raise ValueError("GPU does not support bfloat16, please use a GPU that supports bfloat16.")
# Define model init arguments
model_kwargs = dict(
    attn_implementation="eager", # Use "flash_attention_2" when running on Ampere or newer GPU
    torch_dtype=torch.bfloat16, # What torch dtype to use, defaults to auto
    device_map="auto", # Let torch decide how to load the model
)
# BitsAndBytesConfig int-4 config
model_kwargs["quantization_config"] = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=model_kwargs["torch_dtype"],
    bnb_4bit_quant_storage=model_kwargs["torch_dtype"],
)
# Load model and processor
model = AutoModelForImageTextToText.from_pretrained(model_id, **model_kwargs)
processor = AutoProcessor.from_pretrained("google/gemma-3-4b-it")

config.json:   0%|          | 0.00/815 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json:   0%|          | 0.00/90.6k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.64G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

processor_config.json:   0%|          | 0.00/70.0 [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/1.61k [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

In [None]:
# Adding Special CUI Tokens
tokenizer = processor.tokenizer
cui_tokens = [f"<{cui}>" for cui in cui_mapping.keys()]
num_added = tokenizer.add_tokens(cui_tokens)
model.resize_token_embeddings(len(tokenizer))
processor.tokenizer = tokenizer

In [None]:
from peft import LoraConfig
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.05,
    r=16,
    bias="none",
    target_modules="all-linear",
    task_type="CAUSAL_LM",
    modules_to_save=[
        "lm_head",
        "embed_tokens",
    ],
)

In [None]:
def process_vision_info(messages: list[dict]) -> list[Image.Image]:
    image_inputs = []
    # Iterate through each conversation
    for msg in messages:
        # Get content (ensure it's a list)
        content = msg.get("content", [])
        if not isinstance(content, list):
            content = [content]
        # Check each content element for images
        for element in content:
            if isinstance(element, dict) and (
                "image" in element or element.get("type") == "image"
            ):
                # Get the image and convert to RGB
                if "image" in element:
                    image = element["image"]
                else:
                    image = element
                image_inputs.append(image.convert("RGB"))
    return image_inputs

In [None]:
from trl import SFTConfig
args = SFTConfig(
    output_dir="gemma-product-description",     # directory to save and repository id
    num_train_epochs=1,                         # number of training epochs
    per_device_train_batch_size=1,              # batch size per device during training
    gradient_accumulation_steps=4,              # number of steps before performing a backward/update pass
    gradient_checkpointing=True,                # use gradient checkpointing to save memory
    optim="adamw_torch_fused",                  # use fused adamw optimizer
    logging_steps=5,                            # log every 5 steps
    save_strategy="epoch",                      # save checkpoint every epoch
    learning_rate=2e-4,                         # learning rate, based on QLoRA paper
    bf16=True,                                  # use bfloat16 precision
    max_grad_norm=0.3,                          # max gradient norm based on QLoRA paper
    warmup_ratio=0.03,                          # warmup ratio based on QLoRA paper
    lr_scheduler_type="constant",               # use constant learning rate scheduler
    push_to_hub=True,                           # push model to hub
    report_to="tensorboard",                    # report metrics to tensorboard
    gradient_checkpointing_kwargs={
        "use_reentrant": False
    },  # use reentrant checkpointing
    dataset_text_field="",                      # need a dummy field for collator
    dataset_kwargs={"skip_prepare_dataset": True},  # important for collator
)
args.remove_unused_columns = False # important for collator
# Create a data collator to encode text and image pairs
def collate_fn(examples):
    texts = []
    images = []
    for example in examples:
        image_inputs = process_vision_info(example["messages"])
        text = processor.apply_chat_template(
            example["messages"], add_generation_prompt=False, tokenize=False
        )
        texts.append(text.strip())
        images.append(image_inputs)
    # Tokenize the texts and process the images
    batch = processor(text=texts, images=images, return_tensors="pt", padding=True)
    # The labels are the input_ids, and we mask the padding tokens and image tokens in the loss computation
    labels = batch["input_ids"].clone()
    # Mask image tokens
    image_token_id = [
        processor.tokenizer.convert_tokens_to_ids(
            processor.tokenizer.special_tokens_map["boi_token"]
        )
    ]
    # Mask tokens for not being used in the loss computation
    labels[labels == processor.tokenizer.pad_token_id] = -100
    labels[labels == image_token_id] = -100
    labels[labels == 262144] = -100 # This specific ID is likely another special image-related token for Gemma.
    batch["labels"] = labels
    return batch

In [None]:
# @title Default title text
from trl import SFTTrainer
trainer = SFTTrainer(
    model=model,
    args=args,
    train_dataset=sample_dataset,
    peft_config=peft_config,
    processing_class=processor,
    data_collator=collate_fn,
)
# Start training, the model will be automatically saved to the Hub and the output directory
trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 1, 'bos_token_id': 2, 'pad_token_id': 0}.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
5,2.8819
10,1.315
15,0.9505
20,0.7609
25,0.6176


TrainOutput(global_step=25, training_loss=1.3051609325408935, metrics={'train_runtime': 228.9694, 'train_samples_per_second': 0.437, 'train_steps_per_second': 0.109, 'total_flos': 1022662821906720.0, 'train_loss': 1.3051609325408935})

In [None]:
trainer.save_model("gemma-3-cui-finetuned-sample1")  # saves into this directory

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...ription/training_args.bin: 100%|##########| 6.16kB / 6.16kB            

  ...85108.421a8837f0f2.7076.0: 100%|##########| 9.50kB / 9.50kB            

  ...scription/tokenizer.model: 100%|##########| 4.69MB / 4.69MB            

  ...escription/tokenizer.json:  74%|#######4  | 25.1MB / 33.7MB            

  ...adapter_model.safetensors:   0%|          | 8.32MB / 2.86GB            

No files have been modified since last commit. Skipping to prevent empty commit.


In [None]:
# free the memory again

torch.cuda.empty_cache()

In [None]:
output_dir="gemma-3-cui-finetuned"

In [None]:
import torch
from transformers import AutoModelForImageTextToText, AutoProcessor, BitsAndBytesConfig
from peft import PeftModel
from datasets import load_dataset

base_model_id = "google/gemma-3-4b-pt"
processor_id  = "google/gemma-3-4b-it"          # <-- IMPORTANT: same as training
adapter_dir   = "gemma-3-cui-finetuned-sample1"

# 1) Load processor (IT) and tokenizer
processor = AutoProcessor.from_pretrained(processor_id)
tokenizer = processor.tokenizer

# 2) Re-add CUI tokens exactly like during training
#    (you used plain codes like "C0041618", not "<C0041618>")
cui_tokens = list(cui_mapping.keys())
num_added = tokenizer.add_tokens(cui_tokens)
print("Added", num_added, "CUI tokens")

processor.tokenizer = tokenizer

# 3) Load base model (PT) with 4-bit quant
model_kwargs = dict(
    attn_implementation="eager",
    torch_dtype=torch.bfloat16,
    device_map="auto",
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_quant_storage=torch.bfloat16,
    ),
)

model = AutoModelForImageTextToText.from_pretrained(
    base_model_id,
    **model_kwargs,
)

# 4) Resize embeddings to match tokenizer (base vocab + CUI tokens)
model.resize_token_embeddings(len(tokenizer))

# 5) Attach LoRA adapter you trained
model = PeftModel.from_pretrained(model, adapter_dir)
model.eval()

Added 1934 CUI tokens


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Gemma3ForConditionalGeneration(
      (model): Gemma3Model(
        (vision_tower): SiglipVisionModel(
          (vision_model): SiglipVisionTransformer(
            (embeddings): SiglipVisionEmbeddings(
              (patch_embedding): Conv2d(3, 1152, kernel_size=(14, 14), stride=(14, 14), padding=valid)
              (position_embedding): Embedding(4096, 1152)
            )
            (encoder): SiglipEncoder(
              (layers): ModuleList(
                (0-26): 27 x SiglipEncoderLayer(
                  (layer_norm1): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
                  (self_attn): SiglipAttention(
                    (k_proj): lora.Linear4bit(
                      (base_layer): Linear4bit(in_features=1152, out_features=1152, bias=True)
                      (lora_dropout): ModuleDict(
                        (default): Dropout(p=0.05, inplace=False)
                      )
                  

In [None]:
sample_generate = dataset[1001]
image = sample_generate["image"]

In [None]:
system_message = "You are a digital radiologist who can understand the medical scan of images code the concepts and provide captions"

user_prompt = """Create a description based on the provided image and return the description of the image with details of the scan as captions, the concepts and their descriptions, only the concepts that are extracted"""

messages = [
    {
        "role": "system",
        "content": [
            {"type": "text", "text": system_message},
        ],
    },
    {
        "role": "user",
        "content": [
            {"type": "text", "text": user_prompt},
            {"type": "image", "image": image},
        ],
    },
]


In [None]:
# Turn chat messages into a single string prompt
chat_text = processor.apply_chat_template(
    messages,
    add_generation_prompt=True,   # <-- important for inference
    tokenize=False,
)

# Build model inputs (batch size 1)
inputs = processor(
    text=[chat_text],
    images=[image],
    return_tensors="pt",
    padding=True,
)

# Move to correct device
inputs = {k: v.to(model.device) for k, v in inputs.items()}

# Generate
with torch.no_grad():
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=256,
        do_sample=True,
        temperature=0.9,
        top_p=0.9,
    )

# Option 1: decode only the new tokens (without re-printing the prompt)
gen_only_ids = generated_ids[:, inputs["input_ids"].shape[-1]:]
output_text = tokenizer.decode(gen_only_ids[0], skip_special_tokens=True)

print("MODEL OUTPUT:\n")
print(output_text)

MODEL OUTPUT:

Caption: Anterior view of a patient with a 13-month-old posterior-facing craniopagus conjoined twins (one twin is on the left). Note the posterior fused craniocaudal fusion of the spinal cord with a large cervicomedullary junction. Arrows indicate the cervicomedullary junction.
Concept descriptions: C0040405: Ultrasonography
Concepts: C0040405


In [None]:
dataset[1001]

{'image': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=473x330>,
 'image_id': 'ROCOv2_2023_train_001004',
 'caption': 'Apical four chamber view showing near complete dissolution of right ventricular thrombus (red arrow) and marked decrease in left ventricular thrombus (white arrow) after six months of anticoagulation.',
 'cui': ['C0041618']}