In [13]:
# blip_kaggle_train_infer_fixed.py
# pip install -q transformers timm sentencepiece accelerate

#import os
#import random
#from pathlib import Path
#from collections import defaultdict

#import pandas as pd
#from PIL import Image

#import torch
#from torch.utils.data import Dataset, DataLoader
#from torch.optim import AdamW
#from transformers import BlipProcessor, BlipForConditionalGeneration, get_linear_schedule_with_warmup
#from torch.cuda.amp import GradScaler, autocast


In [None]:
# 1. INSTALL & IMPORT
!pip install transformers accelerate timm datasets -q

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import pandas as pd
import os
from tqdm import tqdm

In [None]:
# 2. DEVICE CONFIG

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", DEVICE)

torch.backends.cudnn.enabled = True
torch.backends.cudnn.benchmark = True

In [None]:
# 3. PATHS (HARDCODED)

IMAGES_PATH = "/kaggle/input/flickr30k/Images"
CAPTIONS_FILE = "/kaggle/input/flickr30k/captions.txt"

# 4. LOAD DATASET

df = pd.read_csv(CAPTIONS_FILE)
df.columns = ['image', 'caption']  # only 2 columns

print("Sample data:")
print(df.head())

In [None]:
# 5. DATASET CLASS

class Flickr30kDataset(Dataset):
    def __init__(self, dataframe, image_path, processor):
        self.dataframe = dataframe
        self.image_path = image_path
        self.processor = processor

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]
        image_file = os.path.join(self.image_path, row['image'])
        image = Image.open(image_file).convert('RGB')
        caption = row['caption']

        inputs = self.processor(
            images=image,
            text=caption,
            return_tensors="pt",
            padding="max_length",
            max_length=30,
            truncation=True
        )
        return {
            "pixel_values": inputs["pixel_values"].squeeze(),
            "input_ids": inputs["input_ids"].squeeze(),
            "attention_mask": inputs["attention_mask"].squeeze()
        }


# 6. INIT PROCESSOR & MODEL

processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
model = model.to(DEVICE)


# 7. CREATE DATASET & DATALOADER

dataset = Flickr30kDataset(df, IMAGES_PATH, processor)
dataloader = DataLoader(dataset, batch_size=8, shuffle=True, num_workers=2, pin_memory=True)


# 8. TRAINING SETUP

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
scaler = torch.cuda.amp.GradScaler(enabled=(DEVICE=="cuda"))  # Mixed precision only if GPU

epochs = 1  # Increase for better results
model.train()

for epoch in range(epochs):
    loop = tqdm(dataloader, leave=True)
    for batch in loop:
        pixel_values = batch["pixel_values"].to(DEVICE)
        input_ids = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)

        with torch.cuda.amp.autocast(enabled=(DEVICE=="cuda")):
            outputs = model(
                pixel_values=pixel_values,
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=input_ids
            )
            loss = outputs.loss

        optimizer.zero_grad()
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        loop.set_description(f"Epoch [{epoch+1}/{epochs}]")
        loop.set_postfix(loss=loss.item())


# 9. SAVE MODEL

model.save_pretrained("/kaggle/working/blip-flickr30k")
processor.save_pretrained("/kaggle/working/blip-flickr30k")

print("Model saved!")


# 10. INFERENCE (Caption Generation)

model.eval()
test_image_path = os.path.join(IMAGES_PATH, df.iloc[0]['image'])
test_image = Image.open(test_image_path).convert('RGB')

inputs = processor(images=test_image, return_tensors="pt").to(DEVICE)

with torch.no_grad():
    generated_ids = model.generate(**inputs, max_length=30)
    caption = processor.decode(generated_ids[0], skip_special_tokens=True)

print("Generated Caption:", caption)

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m989.6 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m33.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m32.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m37.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

2025-08-31 12:27:50.879186: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1756643271.255236      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1756643271.363519      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Using device: cpu


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Sample data:
            image                                            caption
0  1000092795.jpg   Two young guys with shaggy hair look at their...
1  1000092795.jpg   Two young , White males are outside near many...
2  1000092795.jpg   Two men in green shirts are standing in a yard .
3  1000092795.jpg       A man in a blue shirt standing in a garden .
4  1000092795.jpg            Two friends enjoy time spent together .


preprocessor_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/506 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

  scaler = torch.cuda.amp.GradScaler(enabled=(DEVICE=="cuda"))  # Mixed precision only if GPU

  with torch.cuda.amp.autocast(enabled=(DEVICE=="cuda")):

Epoch [1/1]:   0%|          | 0/19865 [00:43<?, ?it/s][A
Epoch [1/1]:   0%|          | 0/19865 [00:43<?, ?it/s, loss=6.87][A
Epoch [1/1]:   0%|          | 1/19865 [00:43<238:34:55, 43.24s/it, loss=6.87][A
Epoch [1/1]:   0%|          | 1/19865 [01:16<238:34:55, 43.24s/it, loss=6.87][A
Epoch [1/1]:   0%|          | 1/19865 [01:16<238:34:55, 43.24s/it, loss=6.55][A
Epoch [1/1]:   0%|          | 2/19865 [01:16<207:23:23, 37.59s/it, loss=6.55][A
Epoch [1/1]:   0%|          | 2/19865 [01:47<207:23:23, 37.59s/it, loss=6.55][A
Epoch [1/1]:   0%|          | 2/19865 [01:47<207:23:23, 37.59s/it, loss=6.6] [A
Epoch [1/1]:   0%|          | 3/19865 [01:47<190:02:55, 34.45s/it, loss=6.6][A
Epoch [1/1]:   0%|          | 3/19865 [02:17<190:02:55, 34.45s/it, loss=6.6][A
Epoch [1/1]:   0%|          | 3/19865 [02:17<190:02:55, 34.45s/it, loss=7