In [1]:
import json
import torch
import logging
import datasets
from PIL import Image
from tqdm.auto import tqdm
from datasets import Dataset
from datasets import load_dataset
from torchvision import transforms

### Set Up Logger

In [2]:
# Clear previous handlers to avoid duplicate logs in Jupyter
for handler in logging.root.handlers[:]:
    logging.root.removeHandler(handler)

# Configure logging
logging.basicConfig(
    level=logging.INFO,  # Change to DEBUG for more verbosity
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[logging.StreamHandler()]  # Ensures it logs to Jupyter cell output
)

logger = logging.getLogger(__name__)
logger.info("Logging is set up in the notebook!")

2025-07-08 06:27:09,558 - INFO - Logging is set up in the notebook!


### Load the MultiDomain Data

In [3]:
prefix = "Generate a one word or single number answer for the given image and question"

In [4]:
def prepend_prefix(example):
    example['question'] = prefix + ': ' + example['question']
    return example

In [5]:
dataset = load_dataset("dutta18/multi-domain-VQA-1.5K")

In [6]:
train_set, val_set = dataset['train'], dataset['validation']

In [7]:
train_set = train_set.map(prepend_prefix)
val_set = val_set.map(prepend_prefix)

In [8]:
len(train_set), len(val_set)

(1500, 600)

### Importing Model

In [9]:
import torch
from torch.utils.data import DataLoader
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
from transformers import AutoProcessor, BitsAndBytesConfig, Idefics3ForConditionalGeneration

In [10]:
model_id = "HuggingFaceTB/SmolVLM-Instruct"

In [11]:
device = 'cuda:0'

### Initialize Quantisation Configs

In [12]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

### Load the Model

In [13]:
model = Idefics3ForConditionalGeneration.from_pretrained(
    model_id,
    quantization_config = bnb_config,
    torch_dtype = torch.bfloat16, 
    _attn_implementation = "flash_attention_2",
    device_map = 'auto'
)

processor = AutoProcessor.from_pretrained(model_id, local_files_only=True)

Some kwargs in processor config are unused and will not have any effect: image_seq_len. 


In [14]:
#model

### Intialize DORA Configs

In [15]:
dora_config = LoraConfig(
    r = 16,
    lora_alpha = 16*2,
    lora_dropout = 0.05,
    target_modules = ['o_proj','k_proj','q_proj', 'v_proj'],
    init_lora_weights = "gaussian",
    inference_mode = False,
    use_dora = True
)

In [16]:
model = prepare_model_for_kbit_training(model)
smolvlm_qdora_model = get_peft_model(model, dora_config)

### Report the Trainable Params: ~ 9.6 M

In [17]:
def report_trainable_params():
    
    # Simple param report
    trainable = sum(p.numel() for p in smolvlm_qdora_model.parameters() if p.requires_grad)
    print(f"Total trainable params: {trainable/1e6:.1f} M")

In [18]:
report_trainable_params()

Total trainable params: 9.6 M


In [19]:
image_token_id = processor.tokenizer.additional_special_tokens_ids[processor.tokenizer.additional_special_tokens.index("<image>")]

In [20]:
def collate_fn(examples):
    texts = []
    images = []

    for example in examples:
        image = example["image"]
        question = example["question"]
        answer = example["answer"]
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Answer briefly."},
                    {"type": "image"},
                    {"type": "text", "text": question}
                ]
            },
            {
                "role": "assistant",
                "content": [{"type": "text", "text": answer}]
            }
        ]
        text = processor.apply_chat_template(messages, add_generation_prompt=False)
        texts.append(text.strip())
        images.append([image])

    # Batch using processor
    batch = processor(text=texts, images=images, return_tensors="pt", padding=True)

    # Manually set labels
    labels = batch["input_ids"].clone()
    labels[labels == processor.tokenizer.pad_token_id] = -100
    labels[labels == image_token_id] = -100
    batch["labels"] = labels

    # Now cast pixel_values explicitly
    batch["pixel_values"] = batch["pixel_values"].to(torch.bfloat16)

    return batch

### Validation Function

In [21]:
@torch.no_grad
def do_validation():

    smolvlm_qdora_model.eval()
    val_loss = 0.0
    
    for batch in tqdm(val_loader, desc="Validating"):
        batch = {k: v.to(device) for k, v in batch.items()}

        with autocast(device_type='cuda', dtype=torch.bfloat16):
            outputs = smolvlm_qdora_model.eval()(**batch)
            
        loss = outputs["loss"]
        val_loss += loss.item()

    avg_val_loss = val_loss / len(val_loader)
    smolvlm_qdora_model.eval().train()
    return avg_val_loss

### Training Hyperparams

In [22]:
from torch.amp import autocast

In [23]:
batch_ = 4
max_epochs = 10
grad_accum_steps = 2

In [24]:
train_loader = DataLoader(train_set, batch_size=batch_, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_set, batch_size=batch_, shuffle=False, collate_fn=collate_fn)

In [25]:
optimizer = torch.optim.AdamW(smolvlm_qdora_model.parameters(), lr=1e-5, weight_decay=0.01)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.95)

In [26]:
global_step = 0
best_val_loss = float("inf")

In [27]:
_ = smolvlm_qdora_model.train()
smolvlm_qdora_model.use_cache = False

## Native PyTorch Training Loop

##### I am using val_loss as the checkpointing criteria, but any other metric which test text generation quality can be used here.

##### MAX GPU USAGE = 23 GB on NVIDIA A40 Card (Adjust LORA Rank, batch size, grad_accum_steps accordingly)

In [None]:
for epoch in tqdm(range(max_epochs)):  
    
    accumulated_loss = 0
    
    for idx, batch in enumerate(train_loader):
        batch = {k: v.to(device, non_blocking=True) for k, v in batch.items()}

        with autocast(device_type='cuda', dtype=torch.bfloat16):
            outputs = smolvlm_qdora_model(**batch)
            loss = outputs["loss"] / grad_accum_steps

        loss.backward()
        accumulated_loss += loss.item()
        
        if (idx + 1) % grad_accum_steps == 0: 
            
            optimizer.step()
            optimizer.zero_grad()
            global_step += 1

            logger.info(f"[ Epoch {epoch+1} | idx: {idx} | Optim Step {global_step} | Train Loss: {loss.item():.4f} ]")

            if global_step % 60 == 0:
                avg_val_loss = do_validation()
                logger.info(f"Val Loss @ Optim step: {global_step} -> {avg_val_loss:.4f}\n")
            
                if avg_val_loss < best_val_loss:
                    best_val_loss = avg_val_loss
                    smolvlm_qdora_model.save_pretrained('../chkpts/SmolVLM-MultiDomain-QDORA-chkpt-16R.pt')
                    logger.info(f"***** ✅ Checkpoint Saved *****\n")
    
    scheduler.step() 
    logger.info(f"Epoch {epoch+1} completed. Avg loss: {accumulated_loss / len(train_loader):.4f}")

  0%|          | 0/10 [00:00<?, ?it/s]

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
  return fn(*args, **kwargs)
The input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in torch.bfloat16.
The input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in torch.bfloat16.
2025-07-08 06:28:05,054 - INFO - [ Epoch 1 | idx: 1 | Optim Step 1 | Train Loss: 0.6868 ]
2025-07-08 06:28:20,267 - INFO - [ Epoch 1 | idx: 3 | Optim Step 2 | Train Loss: 0.6764 ]
2025-07-08 06:28:35,542 - INFO - [ Epoch 1 | idx: 5 | Optim Step 3 | Train Loss: 0.7361 ]
2025-07-08 06:28:51,724 - INFO - [ Epoch 1 | idx: 7 | Optim Step 4 | Train Loss: 0.6870 ]
2025-07-08 06:29:06,961 - INFO - [ Epoch 1 | idx: 9 | Optim Step 5 | Train Loss: 0.7205 ]
