In [2]:
!pip install transformers accelerate timm datasets nltk -q

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import pandas as pd
import os
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import nltk
from nltk.translate.bleu_score import sentence_bleu

nltk.download('punkt')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# 2. DEVICE CONFIG

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", DEVICE)

torch.backends.cudnn.enabled = True
torch.backends.cudnn.benchmark = True

Using device: cuda


In [4]:
# 3. PATHS (HARDCODED)

IMAGES_PATH = "/kaggle/input/flickr30k/Images"
CAPTIONS_FILE = "/kaggle/input/flickr30k/captions.txt"

In [5]:
# 4. LOAD DATASET

df = pd.read_csv(CAPTIONS_FILE)
df.columns = ['image', 'caption']  # only 2 columns

# Remove any NaN values
df = df.dropna()

print("Sample data:")
print(df.head())


Sample data:
            image                                            caption
0  1000092795.jpg   Two young guys with shaggy hair look at their...
1  1000092795.jpg   Two young , White males are outside near many...
2  1000092795.jpg   Two men in green shirts are standing in a yard .
3  1000092795.jpg       A man in a blue shirt standing in a garden .
4  1000092795.jpg            Two friends enjoy time spent together .


In [6]:
# 5. SPLIT INTO TRAIN, VAL, TEST

train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Reducing test size for quick evaluation, earlier it was taking too lng
test_df = test_df.sample(n=100, random_state=42)

print(f"Train: {len(train_df)}, Val: {len(val_df)}, Test: {len(test_df)}")


Train: 127131, Val: 15891, Test: 100


In [7]:
# 6. DATASET CLASS

class Flickr30kDataset(Dataset):
    def __init__(self, dataframe, image_path, processor):
        self.dataframe = dataframe
        self.image_path = image_path
        self.processor = processor

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]
        image_file = os.path.join(self.image_path, row['image'])
        image = Image.open(image_file).convert('RGB')
        caption = row['caption']

        inputs = self.processor(
            images=image,
            text=caption,
            return_tensors="pt",
            padding="max_length",
            max_length=30,
            truncation=True
        )
        return {
            "pixel_values": inputs["pixel_values"].squeeze(),
            "input_ids": inputs["input_ids"].squeeze(),
            "attention_mask": inputs["attention_mask"].squeeze()
        }

In [8]:
# 7. INIT PROCESSOR & MODEL

processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
model = model.to(DEVICE)


# 8. CREATE DATALOADERS

train_dataset = Flickr30kDataset(train_df, IMAGES_PATH, processor)
val_dataset = Flickr30kDataset(val_df, IMAGES_PATH, processor)
test_dataset = Flickr30kDataset(test_df, IMAGES_PATH, processor)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, num_workers=2, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False, num_workers=2, pin_memory=True)


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


preprocessor_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/506 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

In [9]:
# 9. TRAINING SETUP

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
scaler = torch.amp.GradScaler("cuda")

epochs = 3 

In [None]:
# 10. TRAINING LOOP

for epoch in range(epochs):
    model.train()
    train_loss = 0.0
    loop = tqdm(train_loader, leave=True)
    for batch in loop:
        pixel_values = batch["pixel_values"].to(DEVICE)
        input_ids = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)

        with torch.autocast("cuda", enabled=(DEVICE=="cuda")):
            outputs = model(
                pixel_values=pixel_values,
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=input_ids
            )
            loss = outputs.loss

        optimizer.zero_grad()
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        train_loss += loss.item()
        loop.set_description(f"Epoch [{epoch+1}/{epochs}]")
        loop.set_postfix(loss=loss.item())

    avg_train_loss = train_loss / len(train_loader)
    print(f"Epoch {epoch+1}, Train Loss: {avg_train_loss:.4f}")

    # Validation
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for batch in val_loader:
            pixel_values = batch["pixel_values"].to(DEVICE)
            input_ids = batch["input_ids"].to(DEVICE)
            attention_mask = batch["attention_mask"].to(DEVICE)

            with torch.autocast("cuda", enabled=(DEVICE=="cuda")):
                outputs = model(
                    pixel_values=pixel_values,
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=input_ids
                )
                val_loss += outputs.loss.item()

    avg_val_loss = val_loss / len(val_loader)
    print(f"Validation Loss: {avg_val_loss:.4f}")


Epoch [1/3]:   1%|▏         | 202/15892 [01:43<2:15:26,  1.93it/s, loss=1.49] 

In [None]:
# 11. SAVING MODEL

model.save_pretrained("/kaggle/working/blip-flickr30k")
processor.save_pretrained("/kaggle/working/blip-flickr30k")
print("Model saved!")


In [None]:
# 12. TESTING AND BLEU SCORE

model.eval()
bleu_scores = []

for idx in range(len(test_dataset)):
    data = test_dataset[idx]
    pixel_values = data["pixel_values"].unsqueeze(0).to(DEVICE)

    with torch.no_grad():
        generated_ids = model.generate(pixel_values=pixel_values, max_length=30)
        caption = processor.decode(generated_ids[0], skip_special_tokens=True)

    reference = [test_df.iloc[idx]['caption'].split()]
    candidate = caption.split()
    bleu = sentence_bleu(reference, candidate)
    bleu_scores.append(bleu)

print(f"Average BLEU Score on Test Set: {sum(bleu_scores)/len(bleu_scores):.4f}")

In [13]:
!ls
