# LoRA fine-tune Qwen2.5-VL on local images + captions

Use this notebook when you already have a folder of images and a mapping from each image ID (the file name without `.jpg`) to its caption text.

What it does:
- Builds train/val datasets from a caption dictionary and image files in a local folder
- Sets up Qwen2.5-VL-3B-Instruct in bf16 with LoRA adapters (no 4-bit quantization) to run on a single Colab GPU
- Runs a short training loop to verify loss decreases
- Saves LoRA adapters

Adjust paths, the caption map, and the train/val ID splits in the config cell below.

In [None]:
# Install deps (pin to avoid ABI issues on Colab)
!pip install -q --force-reinstall \
    torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 \
    transformers==4.44.2 accelerate peft \
    numpy==1.26.4 pandas==2.2.2 pillow==10.3.0 datasets tqdm

In [None]:
# Switch off the TensorFlow and Flax backends via environment flags so that
# jax/tf builds compiled against a different numpy are not pulled in.
import os

os.environ.update({
    'USE_TF': '0',
    'TRANSFORMERS_NO_TF': '1',
    'USE_FLAX': '0',
    'TRANSFORMERS_NO_FLAX': '1',
})


In [None]:
# Configuration cell: data locations, the caption map (image_id -> text),
# the train/val ID splits and the training hyperparameters.
# Every image is looked up as image_id + '.jpg' inside IMAGE_DIR.
from pathlib import Path

BASE_DIR = Path('/content/local_data')  # change to your root folder
IMAGE_DIR = BASE_DIR.joinpath('images')  # folder containing image_id.jpg

# Fill in your captions here: key = image_id (no '.jpg' suffix), value = caption text.
caption_map = {
    # 'image_001': 'Your caption text here',
    # 'image_002': 'Another caption',
}

# Default split by insertion order: first 20 IDs for training, the next 10 for
# validation. Replace with your own splits.
TRAIN_IDS = list(caption_map)[:20]
VAL_IDS = list(caption_map)[20:30]

MODEL_ID = 'Qwen/Qwen2.5-VL-3B-Instruct'  # Qwen2.5 3B vision-language

# Training hyperparameters
BATCH_SIZE = 2
EPOCHS = 1
LR = 2e-4
MAX_STEPS = 30  # stop early for a quick loss check


In [None]:
# Build datasets from dict + image folder
from PIL import Image
from torch.utils.data import Dataset

class ImageCaptionDataset(Dataset):
    """Map-style dataset that yields an RGB image together with its caption.

    Args:
        ids: Iterable of image IDs; it is copied into a list on construction.
        caption_map: Mapping from image ID to caption text.
        image_dir: Directory holding one ``<image_id>.jpg`` file per ID.
    """

    def __init__(self, ids, caption_map, image_dir):
        self.image_dir = Path(image_dir)
        self.caption_map = caption_map
        self.ids = list(ids)

    def __len__(self):
        """Return the number of image IDs in this split."""
        return len(self.ids)

    def __getitem__(self, idx):
        """Load sample ``idx`` as ``{'image': <RGB PIL image>, 'caption': <text>}``.

        The image is opened first, so a missing file surfaces before a
        missing caption (``KeyError``) does.
        """
        key = self.ids[idx]
        rgb = Image.open(self.image_dir / f"{key}.jpg").convert('RGB')
        return {'image': rgb, 'caption': self.caption_map[key]}

# Instantiate one dataset per split and print a quick sanity summary.
train_ds = ImageCaptionDataset(ids=TRAIN_IDS, caption_map=caption_map, image_dir=IMAGE_DIR)
val_ds = ImageCaptionDataset(ids=VAL_IDS, caption_map=caption_map, image_dir=IMAGE_DIR)
print(
    'Train samples:', len(train_ds),
    'Val samples:', len(val_ds),
)
print('Example IDs:', TRAIN_IDS[:3])


In [None]:
# Processor and prompt builder
from transformers import AutoProcessor

# Load the processor (tokenizer + image preprocessing) that belongs to MODEL_ID.
# NOTE(review): trust_remote_code=True allows code shipped in the model repo to
# run locally; confirm it is actually required for this checkpoint.
processor = AutoProcessor.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
)

def build_prompt(caption=None):
    """Render the chat-formatted text for one image-captioning example.

    Args:
        caption: Target caption for training. When given, it is rendered as the
            assistant turn by the chat template, so the text ends with the
            template's end-of-turn marker and the model is trained to stop
            after the caption. When ``None``, only the user turn plus the
            generation prompt is returned (the form to use at inference time).

    Returns:
        The prompt string. It contains a single image placeholder that the
        processor pairs with the image passed alongside this text.
    """
    messages = [
        {
            'role': 'user',
            'content': [
                {'type': 'image'},
                {'type': 'text', 'text': 'Provide a concise caption for this image.'},
            ],
        }
    ]
    if caption is None:
        return processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # Let the template close the assistant turn instead of concatenating the raw
    # caption: a bare "prompt + caption" string has no end-of-turn token, so the
    # fine-tuned model would never learn where a caption ends.
    messages.append(
        {'role': 'assistant', 'content': [{'type': 'text', 'text': caption}]}
    )
    return processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)


In [None]:
# Collate fn
import torch
from torch.utils.data import DataLoader

def collate_fn(batch):
    """Turn a list of dataset samples into one padded model batch with labels.

    Args:
        batch: List of dicts with keys ``'image'`` and ``'caption'``, as
            produced by the dataset's ``__getitem__``.

    Returns:
        A dict with every entry the processor produced plus ``'labels'``: a
        copy of ``input_ids`` where padding positions and image placeholder
        tokens are set to -100 so they are ignored by the loss.
    """
    texts = [build_prompt(ex['caption']) for ex in batch]
    images = [ex['image'] for ex in batch]
    inputs = processor(text=texts, images=images, padding=True, return_tensors='pt')

    labels = inputs['input_ids'].clone()
    labels[inputs['attention_mask'] == 0] = -100

    # The image is represented in input_ids by placeholder tokens; predicting
    # them carries no training signal, so exclude them from the loss as well.
    tokenizer = processor.tokenizer
    for token in ('<|vision_start|>', '<|image_pad|>', '<|vision_end|>'):
        token_id = tokenizer.convert_tokens_to_ids(token)
        # Skip tokens this tokenizer does not know (None or the unk id).
        if token_id is not None and token_id != tokenizer.unk_token_id:
            labels[labels == token_id] = -100

    return {**inputs, 'labels': labels}

# Wrap both splits in DataLoaders; only the training split is shuffled.
train_loader = DataLoader(
    train_ds,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=collate_fn,
)
val_loader = DataLoader(
    val_ds,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=collate_fn,
)

# Pull one training batch and show each entry's tensor shape (or its type when
# the entry is not a tensor).
batch = next(iter(train_loader))
print({
    name: (value.shape if torch.is_tensor(value) else type(value))
    for name, value in batch.items()
})

In [None]:
# Model without 4-bit (bf16/fp16), avoiding bitsandbytes issues on Python 3.12
from transformers import AutoModelForImageTextToText
from peft import LoraConfig, get_peft_model

# Load the base model in bf16 and let accelerate place it on the available device(s).
model = AutoModelForImageTextToText.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
    device_map='auto',
    trust_remote_code=True,
)
# Use non-reentrant checkpointing: with a frozen base model the inputs to a
# checkpointed block do not require grad, and the reentrant implementation
# would then cut the gradient flow to the LoRA weights inside that block.
model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={'use_reentrant': False})

# Attention and MLP projection layers that receive LoRA adapters.
target_modules = ['q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj']
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=target_modules,
    lora_dropout=0.05,
    bias='none',
    # The model is decoder-only, so it must be wrapped as a causal LM; the
    # seq2seq wrapper forwards decoder_* arguments the model does not take.
    task_type='CAUSAL_LM',
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


In [None]:
# Training loop (short run to check loss decline)
from transformers import get_linear_schedule_with_warmup
from tqdm.auto import tqdm

# Use torch.optim.AdamW: transformers.AdamW is deprecated and no longer exists
# in newer transformers releases. weight_decay/eps are set to the values the
# transformers implementation used by default so the update rule is unchanged.
# Only the trainable (LoRA) parameters are handed to the optimizer.
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.AdamW(trainable_params, lr=LR, eps=1e-6, weight_decay=0.0)

# Cap the run at MAX_STEPS optimizer steps; warm up for ~3% of them (at least one).
total_steps = min(MAX_STEPS, EPOCHS * len(train_loader))
warmup = max(1, int(0.03 * total_steps))
scheduler = get_linear_schedule_with_warmup(optimizer, warmup, total_steps)

model.train()
step = 0
for epoch in range(EPOCHS):
    pbar = tqdm(train_loader, desc=f'Epoch {epoch+1}/{EPOCHS}')
    for batch in pbar:
        if step >= total_steps:
            break
        # Move tensor entries to the model's device; leave anything else untouched.
        batch = {k: v.to(model.device) if torch.is_tensor(v) else v for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
        pbar.set_postfix({'loss': round(loss.item(), 4)})
        step += 1
    # Propagate the early stop out of the epoch loop as well.
    if step >= total_steps:
        break

print('Finished steps:', step)

In [None]:
# Simple validation loss pass (optional)
import torch.nn.functional as F

# Average the model's loss over the validation loader without tracking gradients.
model.eval()
val_losses = []
with torch.no_grad():
    for batch in tqdm(val_loader, desc='Val'):
        batch = {k: v.to(model.device) if torch.is_tensor(v) else v for k, v in batch.items()}
        outputs = model(**batch)
        val_losses.append(outputs.loss.item())

# The validation split is empty whenever caption_map has 20 or fewer entries
# (VAL_IDS is the slice [20:30]); report that instead of dividing by zero.
if val_losses:
    avg_val = sum(val_losses) / len(val_losses)
    print('Val loss:', round(avg_val, 4))
else:
    avg_val = float('nan')
    print('Val loss: n/a (validation set is empty)')

In [None]:
# Persist the LoRA adapter weights and the processor files side by side
# under BASE_DIR so they can be reloaded together.
OUT_DIR = BASE_DIR.joinpath('qwen2vl_lora_adapters')
OUT_DIR.mkdir(exist_ok=True, parents=True)
for artifact in (model, processor):
    artifact.save_pretrained(OUT_DIR)
print('Saved adapters to', OUT_DIR)

In [None]:
# Quick qualitative check: caption the first validation image and print it
# next to its reference caption.
model.eval()
if len(val_ds) == 0:
    print('No validation samples; skipping qualitative check.')
else:
    example = val_ds[0]
    # Build an inference prompt WITHOUT the caption. Reusing a val_loader batch
    # would feed the reference caption (and a `labels` tensor) into generate().
    messages = [
        {
            'role': 'user',
            'content': [
                {'type': 'image'},
                {'type': 'text', 'text': 'Provide a concise caption for this image.'},
            ],
        }
    ]
    prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(text=[prompt], images=[example['image']], return_tensors='pt').to(model.device)
    with torch.no_grad():
        generated_ids = model.generate(**inputs, max_new_tokens=64)
    # generate() returns prompt + continuation; keep only the newly generated tokens.
    new_tokens = generated_ids[:, inputs['input_ids'].shape[1]:]
    out_text = processor.batch_decode(new_tokens, skip_special_tokens=True)[0]
    print('Generated:', out_text)
    print('Reference:', example['caption'])