In [2]:
!pip install transformers accelerate timm datasets -q

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import pandas as pd
import os
from tqdm import tqdm
from sklearn.model_selection import train_test_split

In [3]:
# 2. DEVICE CONFIG

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", DEVICE)

torch.backends.cudnn.enabled = True
torch.backends.cudnn.benchmark = True

Using device: cpu


In [4]:
# 3. PATHS (HARDCODED)

IMAGES_PATH = "/kaggle/input/flickr30k/Images"
CAPTIONS_FILE = "/kaggle/input/flickr30k/captions.txt"

In [6]:
# 4. LOAD DATASET

df = pd.read_csv(CAPTIONS_FILE)
df.columns = ['image', 'caption']  # Ensure only two columns: image, caption
print("Sample data:")
print(df.head())

# Split into Train (90%) and Validation (10%)
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)


Sample data:
            image                                            caption
0  1000092795.jpg   Two young guys with shaggy hair look at their...
1  1000092795.jpg   Two young , White males are outside near many...
2  1000092795.jpg   Two men in green shirts are standing in a yard .
3  1000092795.jpg       A man in a blue shirt standing in a garden .
4  1000092795.jpg            Two friends enjoy time spent together .


In [None]:
# 5. DATASET CLASS

class Flickr30kDataset(Dataset):
    def __init__(self, dataframe, image_path, processor):
        self.dataframe = dataframe
        self.image_path = image_path
        self.processor = processor

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]
        image_file = os.path.join(self.image_path, row['image'])
        image = Image.open(image_file).convert('RGB')
        caption = row['caption']

        inputs = self.processor(
            images=image,
            text=caption,
            return_tensors="pt",
            padding="max_length",
            max_length=30,
            truncation=True
        )
        return {
            "pixel_values": inputs["pixel_values"].squeeze(),
            "input_ids": inputs["input_ids"].squeeze(),
            "attention_mask": inputs["attention_mask"].squeeze()
        }

# 6. INIT PROCESSOR & MODEL

processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
model = model.to(DEVICE)

# 7. CREATE DATASETS & DATALOADERS

train_dataset = Flickr30kDataset(train_df, IMAGES_PATH, processor)
val_dataset = Flickr30kDataset(val_df, IMAGES_PATH, processor)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, num_workers=2, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False, num_workers=2, pin_memory=True)

# 8. TRAINING SETUP

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
scaler = torch.cuda.amp.GradScaler(enabled=(DEVICE=="cuda")) 

epochs = 3 
best_val_loss = float("inf")


# 9. TRAINING + VALIDATION LOOP

def evaluate(model, val_loader):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            pixel_values = batch["pixel_values"].to(DEVICE)
            input_ids = batch["input_ids"].to(DEVICE)
            attention_mask = batch["attention_mask"].to(DEVICE)

            outputs = model(
                pixel_values=pixel_values,
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=input_ids
            )
            total_loss += outputs.loss.item()
    return total_loss / len(val_loader)

for epoch in range(epochs):
    model.train()
    running_loss = 0
    loop = tqdm(train_loader, leave=True)
    
    for batch in loop:
        pixel_values = batch["pixel_values"].to(DEVICE)
        input_ids = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)

        with torch.cuda.amp.autocast(enabled=(DEVICE=="cuda")):
            outputs = model(
                pixel_values=pixel_values,
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=input_ids
            )
            loss = outputs.loss

        optimizer.zero_grad()
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        running_loss += loss.item()
        loop.set_description(f"Epoch [{epoch+1}/{epochs}]")
        loop.set_postfix(train_loss=loss.item())

    avg_train_loss = running_loss / len(train_loader)
    val_loss = evaluate(model, val_loader)

    print(f"\nEpoch {epoch+1}: Train Loss = {avg_train_loss:.4f}, Val Loss = {val_loss:.4f}")

    # Save best model
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        model.save_pretrained("/kaggle/working/blip-flickr30k-best")
        processor.save_pretrained("/kaggle/working/blip-flickr30k-best")
        print("✅ Saved Best Model!")


# 10. FINAL SAVE MODEL

model.save_pretrained("/kaggle/working/blip-flickr30k-final")
processor.save_pretrained("/kaggle/working/blip-flickr30k-final")
print("Final Model saved!")


# 11. INFERENCE (Caption Generation)

model.eval()
test_image_path = os.path.join(IMAGES_PATH, df.iloc[0]['image'])
test_image = Image.open(test_image_path).convert('RGB')

inputs = processor(images=test_image, return_tensors="pt").to(DEVICE)

with torch.no_grad():
    generated_ids = model.generate(**inputs, max_length=30)
    caption = processor.decode(generated_ids[0], skip_special_tokens=True)

print("Generated Caption:", caption)

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


preprocessor_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/506 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

  scaler = torch.cuda.amp.GradScaler(enabled=(DEVICE=="cuda"))  # Mixed precision only if GPU

  with torch.cuda.amp.autocast(enabled=(DEVICE=="cuda")):

Epoch [1/3]:   0%|          | 0/17878 [00:54<?, ?it/s][A
Epoch [1/3]:   0%|          | 0/17878 [00:54<?, ?it/s, train_loss=8.17][A
Epoch [1/3]:   0%|          | 1/17878 [00:54<269:59:40, 54.37s/it, train_loss=8.17][A
Epoch [1/3]:   0%|          | 1/17878 [01:35<269:59:40, 54.37s/it, train_loss=8.17][A
Epoch [1/3]:   0%|          | 1/17878 [01:35<269:59:40, 54.37s/it, train_loss=7]   [A
Epoch [1/3]:   0%|          | 2/17878 [01:35<230:58:39, 46.52s/it, train_loss=7][A
Epoch [1/3]:   0%|          | 2/17878 [02:11<230:58:39, 46.52s/it, train_loss=7][A
Epoch [1/3]:   0%|          | 2/17878 [02:11<230:58:39, 46.52s/it, train_loss=7.71][A
Epoch [1/3]:   0%|          | 3/17878 [02:11<208:28:23, 41.99s/it, train_loss=7.71][A
Epoch [1/3]:   0%|          | 3/17878 [02:47<208:28:23, 41.99s/it, train_loss=7.71][A
Epoch [1/3]:   0%|     

In [13]:
!ls
