# Qwen2.5-VL Vision-Language Fine-Tuning

Colab-friendly notebook that fine-tunes a Qwen2.5-VL (vision-language) model on ROCO radiology data with LoRA. It mirrors the existing Gemma training flow but swaps in the Qwen model, batching, and image handling.


In [None]:
# Install dependencies (run once per runtime)
%pip install -q --upgrade   torch torchvision tensorboard   transformers==4.45.2   datasets==3.3.2   accelerate==1.4.0   bitsandbytes==0.45.3   trl==0.15.2   peft==0.14.0   pillow==11.1.0   protobuf sentencepiece


In [None]:
# (Optional) Mount Google Drive in Colab and set project root
try:
    from google.colab import drive
    drive.mount('/content/drive', force_remount=False)
    %cd /content/drive/MyDrive/DL_Project_2025/dl_project_fall_2025
except Exception as e:
    print('Drive not mounted; staying in local runtime.')


In [None]:
# Environment and paths
import os
import sys
from pathlib import Path

# The project root is the current working directory; its `src` folder is put on
# sys.path so the local package can be imported.
project_root = Path.cwd()
_src_dir = str(project_root / "src")
if _src_dir not in sys.path:  # re-running this cell must not pile up duplicates
    sys.path.append(_src_dir)

# python-dotenv is optional: a missing package is ignored silently, while any
# other failure while reading .env is reported but never aborts the notebook.
try:
    from dotenv import load_dotenv
except ImportError:
    pass
else:
    try:
        load_dotenv(project_root / ".env")
    except Exception as exc:
        print(f"Could not load .env: {type(exc).__name__}: {exc}")

# Either environment variable name is accepted; token_kwargs stays empty when no
# token is set so it can be splatted into Hub calls unconditionally.
hf_token = os.getenv("HUGGINGFACE_TOKEN") or os.getenv("HUGGINGFACE_HUB_TOKEN")
token_kwargs = {"token": hf_token} if hf_token else {}
print("HF token loaded" if hf_token else "HF token not set; proceeding without one.")


In [None]:
# Imports
import torch
from datasets import load_dataset
from transformers import AutoProcessor, AutoModelForVision2Seq, BitsAndBytesConfig
from peft import PeftModel, LoraConfig
from trl import SFTConfig, SFTTrainer
from pathlib import Path
from importlib import reload

from llmft.data_preprocessing import preprocess_utils


In [None]:
# Read the prompt defaults (YAML) and the CUI mapping (JSON) from the project tree
defaults_path = project_root / 'src/llmft/config/defaults.yaml'
cui_mapping_path = project_root / 'mapping_files/cui_mapping.json'

defaults = preprocess_utils.read_yaml(defaults_path)

# The raw JSON is kept under its own name; the processed mapping is derived from it
cui_mapping_json = preprocess_utils.read_json(cui_mapping_path)
cui_mapping = preprocess_utils.get_cui_mapping(cui_mapping_json)

print(f"Loaded {len(cui_mapping)} CUI mapping entries.")


In [None]:
# Dataset loading (ROCO radiology)
from datasets import Image as HFImage

train_split = "train"  # change to "validation" or custom split
max_samples = None      # set to an int to subset

raw_ds = load_dataset("eltorio/ROCOv2-radiology", split=train_split, **token_kwargs)
if max_samples:
    # Clamp to the split size so an oversized max_samples does not make
    # select() fail with an out-of-range index.
    raw_ds = raw_ds.select(range(min(max_samples, len(raw_ds))))
# Keep images as raw bytes/paths; they are decoded to PIL later, per batch.
raw_ds = raw_ds.cast_column("image", HFImage(decode=False))
print(raw_ds)


## Model & LoRA setup
Using a Qwen2.5-VL model with 4-bit loading + LoRA. Adjust the model ID for your hardware (pick a smaller model if needed).


In [None]:
# Model IDs (pick a size your GPU can handle)
base_model_id = "Qwen/Qwen2.5-VL-3B-Instruct"

# Load processor/tokenizer.
# token_kwargs (built in the environment cell) forwards the Hugging Face token
# when one is set, so Hub access here matches the dataset download.
processor = AutoProcessor.from_pretrained(base_model_id, **token_kwargs)
tokenizer = processor.tokenizer

# Add CUI tokens (wrapped in <> to avoid collisions)
cui_tokens = [f"<{cui}>" for cui in cui_mapping.keys()]
num_added = tokenizer.add_tokens(cui_tokens)
print(f"Added {num_added} CUI tokens")
processor.tokenizer = tokenizer

# Quantization config: 4-bit NF4 weights with double quantization, bf16 compute
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load base model (quantized, placed automatically across available devices)
model = AutoModelForVision2Seq.from_pretrained(
    base_model_id,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    **token_kwargs,
)

# Resize embeddings to include new tokens; must run after add_tokens so that
# len(tokenizer) already counts the CUI tokens.
model.resize_token_embeddings(len(tokenizer))

# LoRA adapter settings, collected in one mapping and expanded into LoraConfig
lora_settings = {
    "r": 16,
    "lora_alpha": 16,
    "lora_dropout": 0.05,
    "bias": "none",
    "target_modules": "all-linear",
    "task_type": "CAUSAL_LM",
    # These modules are listed separately from the LoRA targets; presumably so the
    # resized embedding/output layers are trained and saved in full.
    "modules_to_save": ["lm_head", "embed_tokens"],
}
peft_config = LoraConfig(**lora_settings)


In [None]:
# Helper: decode HF image entries to PIL
from PIL import Image as PILImage
import io

def load_pil(img):
    """Return an RGB PIL image for a dataset image entry.

    Args:
        img: Either a dict carrying a non-None "bytes" entry or a non-empty
            "path" entry, or any object exposing a PIL-style ``convert`` method.

    Returns:
        The image converted to RGB mode.

    Raises:
        ValueError: If ``img`` matches none of the supported forms.
    """
    if isinstance(img, dict):
        if img.get("bytes") is not None:
            return PILImage.open(io.BytesIO(img["bytes"])).convert("RGB")
        if img.get("path"):
            # convert() returns a fully loaded copy, so the file handle can be
            # closed as soon as the conversion is done.
            with PILImage.open(img["path"]) as opened:
                return opened.convert("RGB")
    if hasattr(img, "convert"):
        return img.convert("RGB")
    # f-string so the offending type actually appears in the message.
    raise ValueError(f"Unsupported image format: {type(img)}")


In [None]:
# Data collator for TRL SFTTrainer
# Prompt texts shared by every example, taken from the loaded defaults
system_message, user_prompt = defaults["system_message"], defaults["user_prompt"]


def build_messages(example):
    """Assemble the chat turns (system, then user) for one dataset example.

    The user turn carries the text prompt followed by the example's image,
    decoded to PIL via ``load_pil``.
    """
    system_turn = {
        "role": "system",
        "content": [{"type": "text", "text": system_message}],
    }
    user_parts = [
        {"type": "text", "text": user_prompt},
        {"type": "image", "image": load_pil(example["image"])},
    ]
    user_turn = {"role": "user", "content": user_parts}
    return [system_turn, user_turn]

def collate_fn(batch):
    """Turn a list of dataset rows into padded model inputs with labels.

    Args:
        batch: List of dataset rows; each row must provide an "image" entry
            that ``load_pil`` can decode.

    Returns:
        A dict of tensors produced by the processor, plus "labels": a copy of
        "input_ids" with padding positions set to -100 so they are excluded
        from the loss. Tensors are left on the CPU.
    """
    # Decode every image once and reuse it for both the chat template and the
    # processor call (load_pil accepts an already-decoded PIL image).
    images = [load_pil(ex["image"]) for ex in batch]
    prompts = [
        processor.apply_chat_template(
            build_messages({**ex, "image": image}),
            add_generation_prompt=True,
            tokenize=False,
        )
        for ex, image in zip(batch, images)
    ]
    model_inputs = processor(text=prompts, images=images, return_tensors="pt", padding=True)

    labels = model_inputs["input_ids"].clone()
    # Mask padding via the attention mask rather than the pad token id, so a pad
    # id that coincides with a real token cannot hide genuine positions.
    labels[model_inputs["attention_mask"] == 0] = -100
    model_inputs["labels"] = labels
    # NOTE(review): the messages contain no assistant turn, so the labels cover
    # only the prompt tokens - confirm which dataset column holds the target
    # text and append it before relying on this for supervised training.

    # Do not move tensors to the GPU here: the Trainer places each batch on the
    # right device, and pinned-memory data loading expects CPU tensors.
    return {k: v for k, v in model_inputs.items()}


## Training
Configure epochs, batch size, and run SFTTrainer.


In [None]:
# Training configuration
output_dir = "qwen-vl-cui-finetuned"
args = SFTConfig(
    output_dir=output_dir,
    num_train_epochs=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,  # effective batch size of 4 per device
    learning_rate=2e-4,
    logging_steps=10,
    save_steps=200,
    save_total_limit=2,
    # The custom collator reads the raw "image" column, so the trainer must not
    # drop columns that the model's forward() does not name.
    remove_unused_columns=False,
    # Rows are turned into model inputs by the collator; skip SFTTrainer's own
    # text tokenization/packing of the dataset.
    dataset_kwargs={"skip_prepare_dataset": True},
)


In [None]:
# Build the SFT trainer from the pieces prepared in the earlier cells
trainer_kwargs = dict(
    model=model,
    args=args,
    train_dataset=raw_ds,
    peft_config=peft_config,      # LoRA settings applied by the trainer
    processing_class=processor,   # processor holding the extended tokenizer
    data_collator=collate_fn,     # custom image + text batching
)
trainer = SFTTrainer(**trainer_kwargs)


In [None]:
# Train and save
# Runs the training loop with the trainer built above, then writes the result
# to output_dir (the same directory passed to SFTConfig).
trainer.train()
trainer.save_model(output_dir)


In [None]:
# Optional: push to Hub (set HUGGINGFACE_TOKEN)
# trainer.push_to_hub()


In [None]:
# Cleanup
import gc
import torch

# Collect unreachable Python objects first so the tensors they hold are handed
# back to the caching allocator before it is asked to release unused blocks.
# Objects that are still referenced (e.g. `model`, `trainer`) are not freed;
# `del` them beforehand if their memory should be reclaimed as well.
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()
