# CLIP + GPT-2 Image Captioning Model

This notebook implements an image captioning system that combines CLIP's vision encoder with GPT-2's language generation capabilities. The model learns to generate descriptive captions for images using the Flickr30k dataset.

## 🔧 Setup and Dependencies


In [1]:

!pip install transformers torch torchvision datasets accelerate wandb

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import (
    CLIPVisionModel, CLIPProcessor,
    GPT2LMHeadModel, GPT2Tokenizer,
    get_linear_schedule_with_warmup
)
from torch.optim import AdamW
import pandas as pd
import numpy as np
from PIL import Image
import json
import os
from tqdm import tqdm
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

2025-09-10 13:46:21.400969: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1757511981.788420      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1757511981.898376      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Using device: cuda


## 📊 Dataset Preparation

Setting up the Flickr30k dataset loader with robust CSV parsing to handle different file formats.


In [2]:
class Flickr30kDataset(Dataset):
    def __init__(self, csv_file, images_dir, clip_processor, gpt2_tokenizer, max_length=50):
        self.images_dir = images_dir
        self.clip_processor = clip_processor
        self.gpt2_tokenizer = gpt2_tokenizer
        self.max_length = max_length
        
        # Load CSV with correct separator
        print(f"Loading dataset from: {csv_file}")
        self.data = self.load_csv_robust(csv_file)
        
        print(f"Dataset shape: {self.data.shape}")
        print(f"Columns: {list(self.data.columns)}")
        self.prepare_data()
        self.gpt2_tokenizer.pad_token = self.gpt2_tokenizer.eos_token
        try:
            image_files = [f for f in os.listdir(images_dir) if f.lower().endswith(('.jpg', '.jpeg', '.png'))]
            print(f"Found {len(image_files)} images in {images_dir}")
        except:
            print(f"Warning: Could not access images directory {images_dir}")
        
    def load_csv_robust(self, csv_file):
        strategies = [
            {'sep': '|'},
            
            {'sep': '|', 'on_bad_lines': 'skip'},
            
            {'sep': ',', 'on_bad_lines': 'skip'},
            
            {'sep': '\t', 'on_bad_lines': 'skip'},
        ]
        
        for i, params in enumerate(strategies):
            try:
                print(f"Trying strategy {i+1}: {params}")
                df = pd.read_csv(csv_file, **params)
                if len(df.columns) > 1:
                    print(f"✅ Successfully loaded with strategy {i+1}")
                    print(f"Shape: {df.shape}, Columns: {list(df.columns)}")
                    return df
                else:
                    print(f"⚠️ Strategy {i+1} only found 1 column, trying next...")
            except Exception as e:
                print(f"❌ Strategy {i+1} failed: {str(e)[:100]}")
                continue
        raise ValueError("All CSV loading strategies failed!")
    
    def prepare_data(self):
        """Prepare data based on the actual CSV structure"""
        print("Original columns:", list(self.data.columns))
        if len(self.data.columns) >= 3:
            # Standard format: image_name | comment_number | comment
            self.data['image'] = self.data.iloc[:, 0]  # First column: image name
            self.data['caption'] = self.data.iloc[:, 2]  # Third column: caption
            print("Using columns 0 (image) and 2 (caption)")
        elif len(self.data.columns) >= 2:
            # Two columns: image | caption
            self.data['image'] = self.data.iloc[:, 0]
            self.data['caption'] = self.data.iloc[:, 1]
            print("Using columns 0 (image) and 1 (caption)")
        else:
            raise ValueError("CSV must have at least 2 columns")

        initial_size = len(self.data)
        self.data = self.data.dropna(subset=['caption'])
        self.data = self.data[self.data['caption'].astype(str).str.len() > 0]
        final_size = len(self.data)
        
        print(f"Removed {initial_size - final_size} rows with missing/empty captions")
        self.data = self.data.reset_index(drop=True)
        
        print(f"Final dataset prepared with {len(self.data)} samples")
        if len(self.data) > 0:
            sample = self.data.iloc[0]  # ✅ Fixed: added [0] index
            print(f"Sample - Image: {sample.get('image', 'N/A')}")
            print(f"Sample - Caption: {str(sample.get('caption', 'N/A'))[:100]}...")
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        
        image_name = str(row['image']).strip()
        if not image_name.lower().endswith(('.jpg', '.jpeg', '.png', '.gif')):
            image_name += '.jpg'
            
        image_path = os.path.join(self.images_dir, image_name)
        caption = str(row['caption']).strip()
        

        try:
            image = Image.open(image_path).convert('RGB')
            image_inputs = self.clip_processor(images=image, return_tensors="pt")
            pixel_values = image_inputs['pixel_values'].squeeze(0)
        except Exception as e:
            # Create dummy image for missing files
            pixel_values = torch.zeros(3, 224, 224)
        
        # Process caption
        caption_tokens = self.gpt2_tokenizer.encode(
            caption, 
            add_special_tokens=True, 
            max_length=self.max_length, 
            padding='max_length', 
            truncation=True,
            return_tensors='pt'
        ).squeeze(0)
        
        return {
            'pixel_values': pixel_values,
            'caption_tokens': caption_tokens,
            'caption_text': caption
        }


## 🏗️ Model Architecture

Building the CLIP-GPT2 model that bridges vision and language understanding.


In [15]:
class ClipGPT2Model(nn.Module):
    def __init__(self, clip_model_name="openai/clip-vit-base-patch32", 
                 gpt2_model_name="gpt2", mapping_dim=512):
        super().__init__()
        

        self.clip_vision = CLIPVisionModel.from_pretrained(clip_model_name)
        self.clip_vision.eval()  # Freeze CLIP during training
        for param in self.clip_vision.parameters():
            param.requires_grad = False
            

        self.gpt2 = GPT2LMHeadModel.from_pretrained(gpt2_model_name)
        

        if self.gpt2.config.pad_token_id is None:
            self.gpt2.config.pad_token_id = self.gpt2.config.eos_token_id

        clip_dim = self.clip_vision.config.hidden_size  # 512 for base CLIP
        gpt2_dim = self.gpt2.config.hidden_size  # 768 for GPT-2
        
        self.clip_to_gpt2 = nn.Sequential(
            nn.Linear(clip_dim, mapping_dim),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(mapping_dim, gpt2_dim),
            nn.LayerNorm(gpt2_dim)
        )
        self.prefix_length = 10
        
    def get_image_features(self, pixel_values):
        with torch.no_grad():
            image_features = self.clip_vision(pixel_values=pixel_values).pooler_output
        return image_features
    
    def forward(self, pixel_values, caption_tokens):
        batch_size = pixel_values.shape[0]
        
        image_features = self.get_image_features(pixel_values)  # [batch_size, 512]
        
        image_embeddings = self.clip_to_gpt2(image_features)  # [batch_size, 768]

        image_embeddings = image_embeddings.unsqueeze(1).expand(
            batch_size, self.prefix_length, -1
        )
        
        caption_embeddings = self.gpt2.transformer.wte(caption_tokens)  # [batch_size, seq_len, 768]
        
        
        combined_embeddings = torch.cat([image_embeddings, caption_embeddings], dim=1)
        
        
        image_mask = torch.ones(batch_size, self.prefix_length, device=pixel_values.device)
        caption_mask = (caption_tokens != self.gpt2.config.pad_token_id).float()
        attention_mask = torch.cat([image_mask, caption_mask], dim=1)
        
        
        outputs = self.gpt2(
            inputs_embeds=combined_embeddings,
            attention_mask=attention_mask,
            labels=torch.cat([
                torch.full((batch_size, self.prefix_length), -100, device=pixel_values.device),
                caption_tokens
            ], dim=1)
        )
        
        return outputs
    
    def generate_caption(self, pixel_values, tokenizer, max_length=30, num_beams=None):
        self.eval()
        with torch.no_grad():
            B = pixel_values.size(0)
            img_feats = self.get_image_features(pixel_values)                 
            img_embeds = self.clip_to_gpt2(img_feats).unsqueeze(1)           
            img_embeds = img_embeds.expand(B, self.prefix_length, img_embeds.size(-1))  
    
            generated = torch.zeros(B, 0, dtype=torch.long, device=pixel_values.device)
            finished = torch.zeros(B, dtype=torch.bool, device=pixel_values.device)
    
            for _ in range(max_length):
                if generated.size(1) > 0:
                    cap_embeds = self.gpt2.transformer.wte(generated)         # [B, T, gpt2_dim]
                    inputs_embeds = torch.cat([img_embeds, cap_embeds], dim=1)
                else:
                    inputs_embeds = img_embeds
    
                out = self.gpt2(inputs_embeds=inputs_embeds)
                logits = out.logits[:, -1, :]                                 # [B, vocab]
                next_token = torch.argmax(logits, dim=-1, keepdim=True)       # [B, 1]
    

                generated = torch.cat([generated, next_token], dim=1)         # [B, T+1]
    

                eos = (next_token.squeeze(1) == tokenizer.eos_token_id)       # [B]
                finished |= eos
    
                if finished.all():
                    break
    
            return generated



## ⚙️ Training Configuration

Setting up the training pipeline with data loaders and model initialization.


In [4]:
# Configuration
config = {
    'batch_size': 16,
    'learning_rate': 5e-5,
    'num_epochs': 2,
    'warmup_steps': 1000,
    'max_grad_norm': 1.0,
    'save_steps': 1000,
    'eval_steps': 500,
    'logging_steps': 100,
}

clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token


model = ClipGPT2Model().to(device)

full_dataset = Flickr30kDataset(
    csv_file='/kaggle/input/flickr-image-dataset/flickr30k_images/results.csv',
    images_dir='/kaggle/input/flickr-image-dataset/flickr30k_images/flickr30k_images', 
    clip_processor=clip_processor,
    gpt2_tokenizer=gpt2_tokenizer
)

train_size = int(0.9 * len(full_dataset))
val_size = len(full_dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(full_dataset, [train_size, val_size])

train_loader = DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=True, num_workers=2)
val_loader = DataLoader(val_dataset, batch_size=config['batch_size'], shuffle=False, num_workers=2)

print(f"Training samples: {len(train_dataset)}")
print(f"Validation samples: {len(val_dataset)}")


print("\n=== Testing Dataset ===")
try:
    sample = next(iter(train_loader))
    print(f"✅ Batch pixel_values shape: {sample['pixel_values'].shape}")
    print(f"✅ Batch caption_tokens shape: {sample['caption_tokens'].shape}")
    print(f"✅ Sample caption: {sample['caption_text'][0][:100]}...")
except Exception as e:
    print(f"❌ Error: {e}")


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Loading dataset from: /kaggle/input/flickr-image-dataset/flickr30k_images/results.csv
Trying strategy 1: {'sep': '|'}
✅ Successfully loaded with strategy 1
Shape: (158915, 3), Columns: ['image_name', ' comment_number', ' comment']
Dataset shape: (158915, 3)
Columns: ['image_name', ' comment_number', ' comment']
Original columns: ['image_name', ' comment_number', ' comment']
Using columns 0 (image) and 2 (caption)
Removed 1 rows with missing/empty captions
Final dataset prepared with 158914 samples
Sample - Image: 1000092795.jpg
Sample - Caption:  Two young guys with shaggy hair look at their hands while hanging out in the yard ....
Found 31783 images in /kaggle/input/flickr-image-dataset/flickr30k_images/flickr30k_images
Training samples: 143022
Validation samples: 15892

=== Testing Dataset ===
✅ Batch pixel_values shape: torch.Size([16, 3, 224, 224])
✅ Batch caption_tokens shape: torch.Size([16, 50])
✅ Sample caption: A man cuts a cake while others crowd around him ....


### Data Loading & Model Setup

Initializing processors, tokenizers, and creating train/validation splits.


In [16]:
import sys
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup
from tqdm.auto import tqdm  # Better for Kaggle notebooks
import matplotlib.pyplot as plt

optimizer = AdamW(model.parameters(), lr=config['learning_rate'], weight_decay=0.01)
total_steps = len(train_loader) * config['num_epochs']  # Now uses 2 epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer, 
    num_warmup_steps=config['warmup_steps'], 
    num_training_steps=total_steps
)

print(f"Total training steps: {total_steps}")
print(f"Steps per epoch: {len(train_loader)}")
sys.stdout.flush()  
def train_epoch(model, train_loader, optimizer, scheduler, epoch):
    model.train()
    total_loss = 0
    progress_bar = tqdm(train_loader, desc=f'Epoch {epoch+1}', 
                       dynamic_ncols=True, leave=True)
    
    for step, batch in enumerate(progress_bar):
        
        pixel_values = batch['pixel_values'].to(device)
        caption_tokens = batch['caption_tokens'].to(device)
        
       
        outputs = model(pixel_values, caption_tokens)
        loss = outputs.loss
        
        
        loss.backward()
        
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), config['max_grad_norm'])
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

        total_loss += loss.item()
        avg_loss = total_loss / (step + 1)

        progress_bar.set_postfix({
            'loss': f'{loss.item():.4f}',
            'avg': f'{avg_loss:.4f}',
            'lr': f'{scheduler.get_last_lr()[0]:.2e}'
        })
        

        if step % config['logging_steps'] == 0 and step > 0:
            print(f"\n[Step {step:4d}] Loss: {loss.item():.4f}, Avg: {avg_loss:.4f}")
            sys.stdout.flush()
    
    return total_loss / len(train_loader)

def validate_epoch(model, val_loader):
    model.eval()
    total_loss = 0
    
    with torch.no_grad():
        progress_bar = tqdm(val_loader, desc='Validating', 
                           dynamic_ncols=True, leave=True)
        for batch in progress_bar:
            pixel_values = batch['pixel_values'].to(device)
            caption_tokens = batch['caption_tokens'].to(device)
            
            outputs = model(pixel_values, caption_tokens)
            total_loss += outputs.loss.item()
            
            current_val_loss = total_loss / (len([b for b in progress_bar if True]) + 1)
            progress_bar.set_postfix({'val_loss': f'{current_val_loss:.4f}'})
    
    return total_loss / len(val_loader)

train_losses = []
val_losses = []
best_val_loss = float('inf')

print("🚀 Starting 2-epoch training...")
print(f"Dataset: {len(train_loader) * config['batch_size']} training samples")
print(f"Batch size: {config['batch_size']}")
print(f"Learning rate: {config['learning_rate']}")
sys.stdout.flush()

for epoch in range(config['num_epochs']):  # Will loop 2 times
    print(f"\n{'='*50}")
    print(f"--- EPOCH {epoch + 1}/{config['num_epochs']} ---")
    print(f"{'='*50}")
    sys.stdout.flush()
    
    train_loss = train_epoch(model, train_loader, optimizer, scheduler, epoch)
    train_losses.append(train_loss)
    
    print(f"\n📊 Epoch {epoch + 1} Training Results:")
    print(f"   Final Training Loss: {train_loss:.4f}")
    sys.stdout.flush()
    
    
    print("🔍 Running validation...")
    sys.stdout.flush()
    val_loss = validate_epoch(model, val_loader)
    val_losses.append(val_loss)
    
    print(f"📊 Epoch {epoch + 1} Validation Results:")
    print(f"   Training Loss: {train_loss:.4f}")
    print(f"   Validation Loss: {val_loss:.4f}")
    
    if len(val_losses) > 1:
        improvement = val_losses[-2] - val_losses[-1]
        print(f"   Improvement: {improvement:+.4f}")
    
    sys.stdout.flush()
    
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'train_loss': train_loss,
            'val_loss': val_loss,
            'config': config
        }, 'best_clip_gpt2_model.pth')
        print(f"🏆 NEW BEST MODEL SAVED!")
        print(f"   Best Validation Loss: {val_loss:.4f}")
    else:
        print(f"   Previous best: {best_val_loss:.4f}")
    
    sys.stdout.flush()

print(f"\n{'='*50}")
print("✅ 2-EPOCH TRAINING COMPLETED!")
print(f"{'='*50}")
print(f"Final Training Loss: {train_losses[-1]:.4f}")
print(f"Final Validation Loss: {val_losses[-1]:.4f}")
print(f"Best Validation Loss: {best_val_loss:.4f}")
sys.stdout.flush()

# Plot training curves
plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
epochs = range(1, len(train_losses) + 1)
plt.plot(epochs, train_losses, 'b-o', label='Training Loss', linewidth=2, markersize=8)
plt.plot(epochs, val_losses, 'r-s', label='Validation Loss', linewidth=2, markersize=8)
plt.xlabel('Epoch', fontsize=12)
plt.ylabel('Loss', fontsize=12)
plt.title('Training Progress (2 Epochs)', fontsize=14, fontweight='bold')
plt.legend(fontsize=11)
plt.grid(True, alpha=0.3)
plt.xticks(epochs)

plt.subplot(1, 2, 2)
x = ['Epoch 1', 'Epoch 2']
plt.bar([i-0.2 for i in range(1, len(train_losses)+1)], train_losses, 
        width=0.4, alpha=0.7, label='Train Loss', color='blue')
plt.bar([i+0.2 for i in range(1, len(val_losses)+1)], val_losses, 
        width=0.4, alpha=0.7, label='Val Loss', color='red')
plt.xlabel('Epoch', fontsize=12)
plt.ylabel('Loss', fontsize=12)
plt.title('Loss Comparison', fontsize=14, fontweight='bold')
plt.legend(fontsize=11)
plt.xticks(range(1, len(train_losses)+1), x)

plt.tight_layout()
plt.show()

print("📈 Training curves displayed successfully!")


Total training steps: 17878
Steps per epoch: 8939
🚀 Starting 2-epoch training...
Dataset: 143024 training samples
Batch size: 16
Learning rate: 5e-05

--- EPOCH 1/2 ---


Epoch 1:   0%|          | 0/8939 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [7]:
import torch

torch.save({
    'epoch': 0,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'scheduler_state_dict': scheduler.state_dict(),
    'train_loss': train_losses[-1] if len(train_losses) > 0 else None,
    'val_loss': val_losses[-1] if len(val_losses) > 0 else None,
    'config': config
}, 'ckpt_after_epoch1.pth')

print("Manual checkpoint saved: ckpt_after_epoch1.pth")


Manual checkpoint saved: ckpt_after_epoch1.pth


### Model Checkpointing

Saving model states for resuming training later.


In [19]:
import os
print('CWD:', os.getcwd())
print('Exists:', os.path.exists('ckpt_after_epoch1.pth'))


CWD: /kaggle/working
Exists: True


In [21]:
import torch

ckpt_path = '/kaggle/working/ckpt_after_epoch1.pth'
ckpt = torch.load(ckpt_path, map_location=device)

clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token

model = ClipGPT2Model().to(device)
model.load_state_dict(ckpt['model_state_dict'])
model.eval()

print("Loaded epoch index:", ckpt.get('epoch'))
print("Train loss saved:", ckpt.get('train_loss'))
print("Val loss saved:", ckpt.get('val_loss'))


Loaded epoch index: 0
Train loss saved: 0.7523651102776362
Val loss saved: None


In [22]:
from PIL import Image

@torch.no_grad()
def caption_image(image_path, model, clip_processor, tokenizer, device, max_length=30):
    image = Image.open(image_path).convert('RGB')
    inputs = clip_processor(images=image, return_tensors="pt").to(device)
    pixel_values = inputs['pixel_values']
    tokens = model.generate_caption(pixel_values, tokenizer, max_length=max_length)
    return tokenizer.decode(tokens, skip_special_tokens=True)


In [23]:
batch = next(iter(val_loader))
images = batch['pixel_values'][:4].to(device)
true_caps = batch['caption_text'][:4]

model.eval()
with torch.no_grad():
    gen_tokens = model.generate_caption(images, gpt2_tokenizer, max_length=30)
pred_caps = [gpt2_tokenizer.decode(t, skip_special_tokens=True) for t in gen_tokens]

for i, (gt, pr) in enumerate(zip(true_caps, pred_caps)):
    print(f"\n[{i}]")
    print("GT :", gt)
    print("PR :", pr)



[0]
GT : A kid with Green shoes , red shorts , and a blue hat and shirt stands on a stretch of pavement .
PR : A young boy in a red shirt and blue jeans is riding a skateboard .

[1]
GT : Construction men working on scaffolds .
PR : Construction workers are working on a building .A man in a blue shirt is standing

[2]
GT : A woman in a beaded hat holding a sleeping baby .
PR : A woman is holding a baby in a red coat .A woman is holding a

[3]
GT : A man eating olives and drinking .
PR : A man with glasses and a green shirt is sitting at a table with a drink .


In [24]:
import torch

ckpt = torch.load('/kaggle/working/ckpt_after_epoch1.pth', map_location=device)

model.load_state_dict(ckpt['model_state_dict'])
optimizer.load_state_dict(ckpt['optimizer_state_dict'])
scheduler.load_state_dict(ckpt['scheduler_state_dict'])

start_epoch = ckpt.get('epoch', 0) + 1 
print("Resuming from epoch index:", start_epoch)


Resuming from epoch index: 1


In [25]:
for epoch in range(start_epoch, config['num_epochs']):
    print(f"\n--- EPOCH {epoch + 1}/{config['num_epochs']} ---")
    train_loss = train_epoch(model, train_loader, optimizer, scheduler, epoch)
    print(f"Train Loss: {train_loss:.4f}")

    val_loss = validate_epoch(model, val_loader)
    print(f"Val Loss: {val_loss:.4f}")

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'train_loss': train_loss,
            'val_loss': val_loss,
            'config': config
        }, 'best_clip_gpt2_model.pth')
        print("🏆 NEW BEST MODEL SAVED!")



--- EPOCH 2/2 ---


Epoch 2:   0%|          | 0/8939 [00:00<?, ?it/s]


[Step  100] Loss: 0.8123, Avg: 0.6774

[Step  200] Loss: 0.5878, Avg: 0.6762

[Step  300] Loss: 0.5588, Avg: 0.6779

[Step  400] Loss: 0.6740, Avg: 0.6794

[Step  500] Loss: 0.5406, Avg: 0.6799

[Step  600] Loss: 0.6473, Avg: 0.6779

[Step  700] Loss: 0.7479, Avg: 0.6782

[Step  800] Loss: 0.6385, Avg: 0.6793

[Step  900] Loss: 0.6327, Avg: 0.6802

[Step 1000] Loss: 0.6947, Avg: 0.6805

[Step 1100] Loss: 0.6506, Avg: 0.6801

[Step 1200] Loss: 0.5248, Avg: 0.6812

[Step 1300] Loss: 0.6622, Avg: 0.6805

[Step 1400] Loss: 0.8951, Avg: 0.6806

[Step 1500] Loss: 0.6997, Avg: 0.6825

[Step 1600] Loss: 0.6088, Avg: 0.6821

[Step 1700] Loss: 0.7029, Avg: 0.6828

[Step 1800] Loss: 0.6418, Avg: 0.6846

[Step 1900] Loss: 0.6646, Avg: 0.6840

[Step 2000] Loss: 0.6708, Avg: 0.6830

[Step 2100] Loss: 0.7133, Avg: 0.6831

[Step 2200] Loss: 0.7303, Avg: 0.6834

[Step 2300] Loss: 0.6080, Avg: 0.6829

[Step 2400] Loss: 0.7446, Avg: 0.6823

[Step 2500] Loss: 0.5653, Avg: 0.6825

[Step 2600] Loss: 0.8701

Validating:   0%|          | 0/994 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [26]:
import torch, os

train_loss_last = train_losses[-1] if 'train_losses' in globals() and len(train_losses)>0 else None
val_loss_last   = val_losses[-1] if 'val_losses'   in globals() and len(val_losses)>0 else None

save_path = '/kaggle/working/ckpt_after_epoch2.pth'
torch.save({
    'epoch': 1,  # epoch index for epoch 2
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'scheduler_state_dict': scheduler.state_dict(),
    'train_loss': train_loss_last,
    'val_loss': val_loss_last,
    'config': config
}, save_path)

print('Manual checkpoint saved to:', save_path, '| Exists:', os.path.exists(save_path))


Manual checkpoint saved to: /kaggle/working/ckpt_after_epoch2.pth | Exists: True


In [27]:
import os, torch
path = '/kaggle/working/ckpt_after_epoch2.pth'
print('Exists:', os.path.exists(path))
ckpt = torch.load(path, map_location='cpu')
print('Keys:', list(ckpt.keys()))
print('Epoch index:', ckpt.get('epoch'))  
print('Train loss:', ckpt.get('train_loss'))
print('Val loss:', ckpt.get('val_loss'))


Exists: True
Keys: ['epoch', 'model_state_dict', 'optimizer_state_dict', 'scheduler_state_dict', 'train_loss', 'val_loss', 'config']
Epoch index: 1
Train loss: None
Val loss: None


In [29]:
import torch
from transformers import CLIPProcessor, GPT2Tokenizer

ckpt = torch.load('/kaggle/working/ckpt_after_epoch2.pth', map_location=device)

clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token

model = ClipGPT2Model().to(device)
model.load_state_dict(ckpt['model_state_dict'])
model.eval()

print("Model loaded successfully for evaluation")



Model loaded successfully for evaluation


## 📊 Model Evaluation

Evaluating the model using BLEU scores and other metrics.


In [30]:
from nltk.translate.bleu_score import corpus_bleu, sentence_bleu
import numpy as np

def evaluate_bleu_scores(model, val_loader, tokenizer, device, num_samples=1000):
    model.eval()
    references = []  
    candidates = []  
    
    sample_count = 0
    with torch.no_grad():
        for batch in val_loader:
            if sample_count >= num_samples:
                break
                
            pixel_values = batch['pixel_values'].to(device)
            true_caps = batch['caption_text']
            
            gen_tokens = model.generate_caption(pixel_values, tokenizer, max_length=30)
            pred_caps = [tokenizer.decode(t, skip_special_tokens=True) for t in gen_tokens]
            
            for true_cap, pred_cap in zip(true_caps, pred_caps):
                references.append([true_cap.split()])  # Reference as list of words
                candidates.append(pred_cap.split())     # Candidate as list of words
                sample_count += 1
                
                if sample_count >= num_samples:
                    break
    
    bleu1 = corpus_bleu(references, candidates, weights=(1, 0, 0, 0))
    bleu2 = corpus_bleu(references, candidates, weights=(0.5, 0.5, 0, 0))  
    bleu3 = corpus_bleu(references, candidates, weights=(0.33, 0.33, 0.33, 0))
    bleu4 = corpus_bleu(references, candidates, weights=(0.25, 0.25, 0.25, 0.25))
    
    return {
        'BLEU-1': bleu1,
        'BLEU-2': bleu2, 
        'BLEU-3': bleu3,
        'BLEU-4': bleu4,
        'samples_evaluated': len(candidates)
    }

results = evaluate_bleu_scores(model, val_loader, gpt2_tokenizer, device)
print("=== BLEU Score Results ===")
for metric, score in results.items():
    if metric != 'samples_evaluated':
        print(f"{metric}: {score:.4f}")
print(f"Evaluated on {results['samples_evaluated']} samples")


=== BLEU Score Results ===
BLEU-1: 0.2631
BLEU-2: 0.1479
BLEU-3: 0.0875
BLEU-4: 0.0517
Evaluated on 1000 samples


### Advanced Metrics

Computing METEOR, ROUGE, and CIDEr scores for comprehensive evaluation.


In [32]:
!pip install pycocoevalcap

# Comprehensive evaluation function
def comprehensive_evaluation(model, val_loader, tokenizer, device, num_samples=500):
    from pycocoevalcap.bleu.bleu import Bleu
    from pycocoevalcap.meteor.meteor import Meteor
    from pycocoevalcap.rouge.rouge import Rouge
    from pycocoevalcap.cider.cider import Cider
    
    model.eval()
    gts = {}  
    res = {}  
    img_id = 0
    with torch.no_grad():
        for batch in val_loader:
            if img_id >= num_samples:
                break
                
            pixel_values = batch['pixel_values'].to(device)
            true_caps = batch['caption_text']
            
            gen_tokens = model.generate_caption(pixel_values, tokenizer, max_length=30)
            pred_caps = [tokenizer.decode(t, skip_special_tokens=True) for t in gen_tokens]
            
            for true_cap, pred_cap in zip(true_caps, pred_caps):
                gts[img_id] = [true_cap]
                res[img_id] = [pred_cap]
                img_id += 1
                
                if img_id >= num_samples:
                    break
    scorers = [
        (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
        (Meteor(), "METEOR"),
        (Rouge(), "ROUGE_L"),
        (Cider(), "CIDEr")
    ]
    
    eval_results = {}
    for scorer, method in scorers:
        score, scores = scorer.compute_score(gts, res)
        if isinstance(method, list):
            for sc, scs, m in zip(score, scores, method):
                eval_results[m] = sc
        else:
            eval_results[method] = score
    
    return eval_results
try:
    comp_results = comprehensive_evaluation(model, val_loader, gpt2_tokenizer, device)
    print("=== Comprehensive Evaluation Results ===")
    for metric, score in comp_results.items():
        print(f"{metric}: {score:.4f}")
except ImportError:
    print("pycocoevalcap not available. Install it for comprehensive metrics.")


Collecting pycocoevalcap
  Downloading pycocoevalcap-1.2-py3-none-any.whl.metadata (3.2 kB)
Downloading pycocoevalcap-1.2-py3-none-any.whl (104.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.3/104.3 MB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: pycocoevalcap
Successfully installed pycocoevalcap-1.2
{'testlen': 8882, 'reflen': 6646, 'guess': [8882, 8382, 7882, 7382], 'correct': [2306, 686, 226, 94]}
ratio: 1.3364429732167715
=== Comprehensive Evaluation Results ===
Bleu_1: 0.2596
Bleu_2: 0.1458
Bleu_3: 0.0848
Bleu_4: 0.0528
METEOR: 0.1382
ROUGE_L: 0.2742
CIDEr: 0.3776


In [33]:
import torch
from PIL import Image
from transformers import CLIPProcessor, GPT2Tokenizer
ckpt = torch.load('/kaggle/working/ckpt_after_epoch2.pth', map_location=device)
model.load_state_dict(ckpt['model_state_dict'])
model.eval()

print("✅ Model loaded and ready for caption generation!")


✅ Model loaded and ready for caption generation!


## 🎯 Sample Caption Generation

Testing the trained model on sample images from the dataset.


In [34]:
@torch.no_grad()
def generate_caption_for_image(img_path, model, clip_processor, tokenizer, device, max_length=30):
    """Generate caption for a single image"""
    image = Image.open(img_path).convert('RGB')
    inputs = clip_processor(images=image, return_tensors='pt').to(device)
    pixel_values = inputs['pixel_values']
    tokens = model.generate_caption(pixel_values, tokenizer, max_length=max_length)
    caption = tokenizer.decode(tokens[0], skip_special_tokens=True)
    return caption
test_images = [
    '/kaggle/input/flickr-image-dataset/flickr30k_images/flickr30k_images/1000092795.jpg',
    '/kaggle/input/flickr-image-dataset/flickr30k_images/flickr30k_images/1000268201.jpg',
    '/kaggle/input/flickr-image-dataset/flickr30k_images/flickr30k_images/1000344755.jpg'
]

print("🎯 Generated Captions:")
for i, img_path in enumerate(test_images):
    caption = generate_caption_for_image(img_path, model, clip_processor, gpt2_tokenizer, device)
    print(f"[{i+1}] {caption}")


🎯 Generated Captions:
[1] A man and a woman are walking down a sidewalk .
[2] A little girl is sitting on a wooden bench outside .
[3] A man is standing on a ladder fixing a window .
