In [1]:
import os
# Fixes potential memory fragmentation issues
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Optional: Add this to potentially reduce overall memory usage if needed
# os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [2]:
# =========================
# 0. INSTALL (run once)
# =========================
!pip install "transformers>=4.41.0" "datasets>=2.19.0" "accelerate>=0.30.0" \
              peft trl bitsandbytes pillow

# If you use Unsloth:
!pip install unsloth





In [3]:
from google.colab import drive
drive.mount('/content/drive')

# My Drive/medgemma_finetune/data
!cp -r "/content/drive/MyDrive/data" "/content/medgemma_finetune/"

Mounted at /content/drive


In [4]:
%cd /content/medgemma_finetune
!python make_instructions.py


/content/medgemma_finetune
[INFO] Dataset: kidney CT
  - Class: Normal
    -> Used 40 images
  - Class: Cyst
    -> Used 40 images
  - Class: Tumor
    -> Used 40 images
  - Class: Stone
    -> Used 40 images
[INFO] Dataset: Breast MRI
  - Class: Malignant
    -> Used 40 images
  - Class: Benign
    -> Used 40 images
[INFO] Dataset: mammography
  - Class: Malignant
    -> Used 40 images
  - Class: Benign
    -> Used 40 images
[INFO] Dataset: Brain Tumor MRI images
  - Class: Healthy
    -> Used 40 images
  - Class: Tumor
    -> Used 40 images
[INFO] Dataset: lung cancer
  - Class: Malignant cases
    -> Used 40 images
  - Class: Bengin cases
    -> Used 40 images
  - Class: Normal cases
    -> Used 40 images
[INFO] Dataset: Brain Tumor CT scan Images
  - Class: Healthy
    -> Used 40 images
  - Class: Tumor
    -> Used 40 images
[INFO] Collected 600 examples total.
[INFO] Train: 510 | Val: 90
[OK] Wrote train_instructions.jsonl and val_instructions.jsonl in /content/medgemma_finetune


In [3]:
import os
from pathlib import Path
from typing import Any

import torch
from datasets import load_dataset
from PIL import Image

from transformers import (
    AutoProcessor,
    AutoModelForImageTextToText,
    BitsAndBytesConfig,
)
from peft import LoraConfig
from trl import SFTConfig, SFTTrainer



In [4]:
PROJECT_ROOT = Path(".")  # current folder
TRAIN_JSONL = "/content/medgemma_finetune/train_instructions.jsonl"
VAL_JSONL   = "/content/medgemma_finetune/val_instructions.jsonl"


MODEL_ID = "unsloth/medgemma-4b-it"   # <-- CHANGE THIS to your Unsloth model id if needed

# Tiny ‚Äúsafer than your GPA‚Äù training hyperparams
NUM_EPOCHS = 1
LEARNING_RATE = 1e-5       # you can go 5e-6 if you want to be extra safe
BATCH_SIZE = 4             # adjust by VRAM
GRAD_ACCUM = 2             # effective batch = BATCH_SIZE * GRAD_ACCUM




In [5]:
data = load_dataset(
    "json",
    data_files={
        "train": str(TRAIN_JSONL),
        "validation": str(VAL_JSONL),
    },
)
print(data)

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['image_path', 'dataset', 'class_name', 'prompt', 'target'],
        num_rows: 510
    })
    validation: Dataset({
        features: ['image_path', 'dataset', 'class_name', 'prompt', 'target'],
        num_rows: 90
    })
})


In [6]:
# =========================
# 3. FORMAT DATA: image + messages
# =========================

def format_data(example: dict[str, Any]) -> dict[str, Any]:
    """
    Expects each JSONL row to have:
      - image_path: path to image
      - prompt: user prompt text
      - target: JSON string with {label_key, confidence, summary}
    Adds:
      - image: PIL image (RGB)
      - messages: chat-style messages for MedGemma
    """
    img_path = Path(example["image_path"])
    if not img_path.is_absolute():
        img_path = PROJECT_ROOT / img_path

    image = Image.open(img_path).convert("RGB")
    example["image"] = image

    example["messages"] = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": example["prompt"]},
            ],
        },
        {
            "role": "assistant",
            "content": [
                {"type": "text", "text": example["target"]},
            ],
        },
    ]
    return example


data = data.map(format_data)
print(data["train"][0])

Map:   0%|          | 0/510 [00:00<?, ? examples/s]

Map:   0%|          | 0/90 [00:00<?, ? examples/s]

{'image_path': '/content/medgemma_finetune/data/kidney CT/Stone/Stone- (442).jpg', 'dataset': 'kidney CT', 'class_name': 'Stone', 'prompt': 'You are an assistant radiologist. Modality: CT abdomen (kidneys). Task: Identify the main kidney-related finding and summarize key observations. Analyze the given medical image and respond ONLY with valid JSON.\n\nThe JSON must contain the keys: "kidney_ct_finding", "confidence", "summary".\n', 'target': '{"kidney_ct_finding": "Stone", "confidence": 0.75, "summary": "Hyperdense focus in the urinary tract compatible with a renal/ureteric stone. Recommend review by a qualified specialist and additional imaging if clinically indicated."}', 'image': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=512x512 at 0x79B268D97F50>, 'messages': [{'content': [{'text': None, 'type': 'image'}, {'text': 'You are an assistant radiologist. Modality: CT abdomen (kidneys). Task: Identify the main kidney-related finding and summarize key observations. Analyze the 

In [7]:
from huggingface_hub import login
login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv‚Ä¶

In [9]:
from google.colab import userdata
access_token = userdata.get('HF_API_KEY')

In [10]:
# =======================================================
# STEP 1: INSPECT MODEL STRUCTURE
# =======================================================
import torch.nn as nn

print("\n--- Model Submodule Names ---")
# List all top-level module names to find the Vision Encoder
for name, module in model.named_children():
    print(f"Top-level Module: {name} (Type: {type(module).__name__})")

print("\n--- Vision Encoder Check ---")
# If 'vision_tower' exists, show its structure
if hasattr(model, 'vision_tower'):
    print("\n'vision_tower' structure:")
    for name, param in model.vision_tower.named_parameters():
        print(f"  {name}: {param.requires_grad}")
        # Only print the first 5 params to avoid huge output
        if name.split('.')[0] == name.split('.')[-1] and len(list(model.vision_tower.named_parameters())) > 5:
            break


--- Model Submodule Names ---


NameError: name 'model' is not defined

In [11]:
import torch
import gc
# ‚ö†Ô∏è Ensure this is one of the first imports in your entire notebook!
from unsloth import FastLanguageModel
from peft import LoraConfig

# =======================================================
# 1. MODEL LOADING
# =======================================================
model_id = "google/medgemma-4b-it"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)
model = AutoModelForImageTextToText.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto"
)

processor = AutoProcessor.from_pretrained(model_id)
processor.tokenizer.padding_side = "right"


Please restructure your imports with 'import unsloth' at the top of your file.
  from unsloth import FastLanguageModel


ü¶• Unsloth: Will patch your computer to enable 2x faster free finetuning.
ü¶• Unsloth Zoo will now patch everything to make training faster!


  import trl.experimental.openenv.utils as openenv_utils


config.json:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/90.6k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.64G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/156 [00:00<?, ?B/s]

processor_config.json:   0%|          | 0.00/70.0 [00:00<?, ?B/s]

chat_template.jinja:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

In [12]:
from peft import get_peft_model

In [13]:
# === Step 5: LoRA configuration ===
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.05,
    r=8,
    bias="none",
    target_modules="all-linear",
    task_type="CAUSAL_LM",
    modules_to_save=["lm_head", "embed_tokens"]
)
peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"],
    lora_dropout=0.05,
    task_type="CAUSAL_LM",
)

In [14]:
model = get_peft_model(model, peft_config)

In [15]:
model.print_trainable_parameters()

trainable params: 16,394,240 || all params: 4,316,473,712 || trainable%: 0.3798


In [16]:
# =========================
# 6. COLLATE FN (WITH MAX LENGTH FIX)
# =========================

def collate_fn(examples: list[dict[str, Any]]):
    texts = []
    images = []

    for example in examples:
        images.append([example["image"].convert("RGB")])
        texts.append(
            processor.apply_chat_template(
                example["messages"],
                add_generation_prompt=False,
                tokenize=False,
            ).strip()
        )

    batch = processor(
        text=texts,
        images=images,
        return_tensors="pt",
        padding=True,
        truncation=True,  # ADD THIS
        max_length=2048,   # ADD THIS to prevent overflow
    )

    # Create labels
    labels = batch["input_ids"].clone()

    # Mask padding tokens
    labels[labels == processor.tokenizer.pad_token_id] = -100

    # Simple approach: Mask everything except the assistant response
    # This is often better for image-text models
    labels[:, :batch["input_ids"].shape[1] - len(example["messages"][1]["content"][0]["text"])] = -100

    batch["labels"] = labels
    return batch

In [17]:
output_dir = "medgemma-qlora-finetune"
os.makedirs(output_dir, exist_ok=True)

In [18]:
from trl import SFTConfig

args = SFTConfig(
    output_dir=output_dir,
    num_train_epochs=1,
    per_device_train_batch_size=1,        # optimized for low VRAM
    gradient_accumulation_steps=8,
    gradient_checkpointing=True,
    optim="adamw_torch_fused",
    learning_rate=1e-5,
    bf16=False,
    fp16=True,
    max_grad_norm=0.3,
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",
    save_strategy="epoch",
    push_to_hub=True,
    logging_steps=0.1,
    eval_strategy="no",  # train-only dataset
    report_to="none",
    gradient_checkpointing_kwargs={"use_reentrant": False},
    dataset_kwargs={"skip_prepare_dataset": True},
    remove_unused_columns=False,
    label_names=["labels"],
    dataloader_num_workers=8,
    dataloader_pin_memory=True
)

In [19]:
# =========================
# 8. TRAINER & TRAIN
# =========================

# You can optionally subsample validation to speed up:
eval_dataset = data["validation"]  # .shuffle().select(range(200))

trainer = SFTTrainer(
    model=model,
    args=args,
    train_dataset=data["train"],
    eval_dataset=eval_dataset,
    processing_class=processor,
    data_collator=collate_fn,
)

print("[INFO] Starting training...")
trainer.train()

[INFO] Starting training...


Step,Training Loss
7,
14,
21,
28,
35,
42,
49,
56,
63,


TrainOutput(global_step=64, training_loss=nan, metrics={'train_runtime': 1841.0462, 'train_samples_per_second': 0.277, 'train_steps_per_second': 0.035, 'total_flos': 4247702075325600.0, 'train_loss': nan, 'entropy': nan, 'num_tokens': 194465.0, 'mean_token_accuracy': 0.0, 'epoch': 1.0})

In [20]:
print("[INFO] Starting training...")
trainer.train()

[INFO] Starting training...


KeyboardInterrupt: 

In [21]:
# Save final adapter
trainer.save_model()  # saves LoRA weights into output_dir
print("[OK] Training complete. LoRA adapter saved in:", args.output_dir)

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...inetune/training_args.bin: 100%|##########| 6.42kB / 6.42kB            

  ...-finetune/tokenizer.model: 100%|##########| 4.69MB / 4.69MB            

  ...adapter_model.safetensors:  25%|##5       | 16.7MB / 65.7MB            

  ...a-finetune/tokenizer.json:  50%|#####     | 16.7MB / 33.4MB            

No files have been modified since last commit. Skipping to prevent empty commit.


[OK] Training complete. LoRA adapter saved in: medgemma-qlora-finetune


In [2]:
import shutil

FOLDER_PATH = "/content/medgemma-qlora-finetune"      # <- example: "medgemma_adapter"
ZIP_NAME = FOLDER_PATH + ".zip"

shutil.make_archive(FOLDER_PATH, 'zip', FOLDER_PATH)
print("[OK] Created ZIP:", ZIP_NAME)


[OK] Created ZIP: /content/medgemma-qlora-finetune.zip


In [3]:
from google.colab import files
files.download(ZIP_NAME)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [22]:
save_path = "medgemma-4b-finetuned123"

In [24]:
trainer.save_model(save_path)
print(f"[OK] Complete model saved to: {save_path}")

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...inetune/training_args.bin: 100%|##########| 6.42kB / 6.42kB            

  ...-finetune/tokenizer.model: 100%|##########| 4.69MB / 4.69MB            

  ...a-finetune/tokenizer.json:  50%|#####     | 16.7MB / 33.4MB            

  ...adapter_model.safetensors:  25%|##5       | 16.7MB / 65.7MB            

No files have been modified since last commit. Skipping to prevent empty commit.


[OK] Complete model saved to: medgemma-4b-finetuned123


In [1]:
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM   # or other class you used
from peft import PeftModel

# 1) Same base model you used for training
BASE_MODEL_ID = "google/medgemma-4b-it"   # <-- change if you used another MedGemma variant

# 2) Directory where your LoRA adapter is saved
#    Typically this is trainer.args.output_dir, e.g. "medgemma_finetune" or similar
LORA_DIR = "/content/medgemma_finetune"   # <-- PUT YOUR ACTUAL ADAPTER / OUTPUT DIR HERE

# 3) Where to save the full-precision merged model
MERGED_FP16_DIR = "/content/medgemma-4b-finetuned-merged-fp16"
os.makedirs(MERGED_FP16_DIR, exist_ok=True)

print("Loading base model in FP16 (no bitsandbytes)...")
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_ID,
    torch_dtype=torch.float16,
    device_map="cpu"   # or "auto" if you have enough VRAM, but cpu is safest for Colab
)

print("Loading LoRA adapter...")
model = PeftModel.from_pretrained(
    base_model,
    LORA_DIR
)

print("Merging LoRA into base weights...")
merged_model = model.merge_and_unload()   # this produces a standard FP16 model

print("Saving merged FP16 model...")
merged_model.save_pretrained(MERGED_FP16_DIR, safe_serialization=True)

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID)
tokenizer.save_pretrained(MERGED_FP16_DIR)

print("[OK] Merged FP16 model saved to:", MERGED_FP16_DIR)


ModuleNotFoundError: Could not import module 'PreTrainedModel'. Are this object's requirements defined correctly?

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os

# Use the NEW merged directory (no bitsandbytes)
HF_MODEL_DIR = "/content/medgemma-4b-finetuned-merged-fp16"

OUT_FP16 = "/content/medgemma-4b-finetuned-f16.gguf"

print("HF_MODEL_DIR exists:", os.path.isdir(HF_MODEL_DIR))

%cd /content/llama.cpp

!python convert_hf_to_gguf.py "$HF_MODEL_DIR" \
    --outfile "$OUT_FP16" \
    --outtype f16

import os
print("FP16 GGUF exists:", os.path.isfile(OUT_FP16))


