In [107]:
import torch

# Query CUDA availability once and report the device this notebook will use.
cuda_ok = torch.cuda.is_available()
print("CUDA available:", cuda_ok)
print("Device:", torch.device("cuda") if cuda_ok else torch.device("cpu"))

if cuda_ok:
    # Index 0 is the first visible GPU.
    print("GPU Name:", torch.cuda.get_device_name(0))

CUDA available: True
Device: cuda
GPU Name: GRID A100X-10C


In [None]:
!pip install transformers datasets torch torchvision evaluate pycocoevalcap scikit-learn matplotlib tqdm


Step 2: Load 5k Images from MSCOCO (25%)

In [None]:
!pip install ipywidgets --upgrade
!jupyter nbextension enable --py widgetsnbextension


In [110]:
import urllib.request
import zipfile
import os

# Create destination directory
data_dir = r"deepreel/data"
os.makedirs(data_dir, exist_ok=True)


def _download_and_extract(url, zip_path, extract_to):
    """Download `url` to `zip_path` and unpack it into `extract_to`, skipping finished work.

    A marker file next to the archive records a completed extraction, so
    re-running the cell does not download or unpack again. The download goes to
    a ``.part`` file first and is renamed only on success, so an interrupted
    transfer never leaves a truncated archive under the final name.
    """
    done_marker = zip_path + ".extracted"
    if os.path.exists(done_marker):
        return

    if not os.path.exists(zip_path):
        partial_path = zip_path + ".part"
        urllib.request.urlretrieve(url, partial_path)
        os.replace(partial_path, zip_path)

    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_to)

    # Written last: its presence means the extraction above finished.
    with open(done_marker, "w") as marker_file:
        marker_file.write("ok\n")


# Download and extract val2017.zip
val_url = "http://images.cocodataset.org/zips/val2017.zip"
val_zip_path = os.path.join(data_dir, "val2017.zip")
_download_and_extract(val_url, val_zip_path, data_dir)

# Download and extract annotations_trainval2017.zip
ann_url = "http://images.cocodataset.org/annotations/annotations_trainval2017.zip"
ann_zip_path = os.path.join(data_dir, "annotations_trainval2017.zip")
_download_and_extract(ann_url, ann_zip_path, data_dir)

In [111]:
import json
from pathlib import Path
from collections import defaultdict

# Locations of the extracted images and the caption annotation file
data_dir = Path("deepreel/data")
images_dir = data_dir / "val2017"
annotations_file = data_dir / "annotations" / "captions_val2017.json"

# Parse the COCO caption annotations
with open(annotations_file, 'r') as f:
    captions_data = json.load(f)

# image_id -> file name
id_to_filename = dict(
    (img['id'], img['file_name']) for img in captions_data['images']
)

# image_id -> list of every caption annotated for that image
id_to_captions = defaultdict(list)
for ann in captions_data['annotations']:
    caption_list = id_to_captions[ann['image_id']]
    caption_list.append(ann['caption'])

# Show the first image and its captions as a quick sanity check
image_id = next(iter(id_to_filename))
print(f"Image file: {id_to_filename[image_id]}")
print("Captions:")
print(id_to_captions[image_id])


Image file: 000000397133.jpg
Captions:
['A man is in a kitchen making pizzas.', 'Man in apron standing on front of oven with pans and bakeware', 'A baker is working in the kitchen rolling dough.', 'A person standing by a stove in a kitchen.', 'A table with pies being made and a person standing near a wall with pots and pans hanging on the wall.']


In [112]:
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch
import os

# Load the pretrained CLIP checkpoint, switch it to inference mode and move it to the GPU
CLIP_CHECKPOINT = "openai/clip-vit-base-patch32"
clip_model = CLIPModel.from_pretrained(CLIP_CHECKPOINT)
clip_model = clip_model.eval().cuda()
clip_processor = CLIPProcessor.from_pretrained(CLIP_CHECKPOINT)

def get_image_embedding(image_path):
    """Encode one image file with the notebook-level CLIP model.

    Args:
        image_path: Path (str or path-like) of the image to encode.

    Returns:
        The CLIP image features with the batch dimension squeezed away, on the
        same device as `clip_model`. No gradient graph is attached.
    """
    # Image.open is lazy and keeps the file open; the context manager closes it
    # once convert() has produced an independent in-memory RGB copy.
    with Image.open(image_path) as raw_image:
        image = raw_image.convert("RGB")

    # Follow the model's device instead of assuming "cuda".
    inputs = clip_processor(images=image, return_tensors="pt").to(clip_model.device)
    with torch.no_grad():
        embedding = clip_model.get_image_features(**inputs)
    return embedding.squeeze(0)  # [512]


In [113]:
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# Reuse the end-of-text token as the padding token.
tokenizer.pad_token = tokenizer.eos_token


def tokenize_caption(caption, max_length=32):
    """Tokenize `caption` to exactly `max_length` ids (truncated or padded).

    Returns the 1-D tensor of input ids with the batch dimension removed.
    """
    encoded = tokenizer(
        caption,
        truncation=True,
        padding="max_length",
        max_length=max_length,
        return_tensors="pt",
    )
    token_ids = encoded.input_ids
    return token_ids.squeeze(0)  # [max_length]


In [114]:
import torch
from torch.utils.data import Dataset
import random

class CocoGPTDataset(Dataset):
    """Pairs each image's CLIP embedding with one randomly chosen, prompted caption.

    Args:
        image_paths: Sequence of image file paths, aligned index-by-index with `image_ids`.
        image_ids: Sequence of image ids used to look up captions.
        id_to_captions: Mapping from image id to a list of caption strings.
        tokenizer: Object with an `encode(text, max_length=..., truncation=...,
            padding=..., return_tensors=...)` method returning a `[1, max_length]` tensor.
        max_length: Fixed token length of every returned caption.
        cache_embeddings: When True (default) each image is encoded once and the
            result is reused on later accesses. The embedding comes from
            `get_image_embedding`, which does not depend on the sampled caption,
            so re-encoding on every epoch only repeats work. Pass False to
            re-encode on every access.
    """

    def __init__(self, image_paths, image_ids, id_to_captions, tokenizer, max_length=32,
                 cache_embeddings=True):
        self.image_paths = image_paths
        self.image_ids = image_ids
        self.id_to_captions = id_to_captions
        self.tokenizer = tokenizer
        self.max_length = max_length
        # index -> embedding; None disables caching
        self._embedding_cache = {} if cache_embeddings else None

    def __len__(self):
        return len(self.image_paths)

    def _embedding_for(self, idx):
        """Return the image embedding for `idx`, computing it at most once when caching is on."""
        if self._embedding_cache is None:
            return get_image_embedding(self.image_paths[idx])
        if idx not in self._embedding_cache:
            self._embedding_cache[idx] = get_image_embedding(self.image_paths[idx])
        return self._embedding_cache[idx]

    def __getitem__(self, idx):
        """Return `(img_embedding, tokens)` for sample `idx`.

        A caption is drawn at random on every call, so repeated accesses to the
        same index can yield different token tensors.
        """
        img_id = self.image_ids[idx]

        # Get a random caption and prepend the fixed text prompt
        raw_caption = random.choice(self.id_to_captions[img_id])
        prompt = "An image of "
        full_caption = prompt + raw_caption

        # Get image features
        img_embedding = self._embedding_for(idx)

        # Tokenize with padding and truncation
        tokens = self.tokenizer.encode(
            full_caption,
            max_length=self.max_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt"
        ).squeeze(0)  # shape: [max_length]

        return img_embedding, tokens



In [121]:
# Use the first 1250 images for this run.
image_ids = list(id_to_filename.keys())[:1250]
# CocoGPTDataset expects (image_paths, image_ids, id_to_captions, tokenizer),
# so resolve each id to its file path first.
train_image_paths = [images_dir / id_to_filename[img_id] for img_id in image_ids]
dataset = CocoGPTDataset(train_image_paths, image_ids, id_to_captions, tokenizer)



In [116]:
from transformers import GPT2LMHeadModel, GPT2Config
import torch.nn as nn

class ImageCaptioningGPT2(nn.Module):
    """GPT-2 language model conditioned on an image embedding used as a prefix token.

    The image embedding is linearly projected to GPT-2's hidden size and placed
    in front of the caption token embeddings.

    Args:
        gpt_model_name: Name passed to `GPT2LMHeadModel.from_pretrained`.
        embed_dim: Size of the incoming image embedding.

    Note:
        The constructor reads the notebook-level `tokenizer` for the vocabulary
        size and the padding id.
    """

    def __init__(self, gpt_model_name="gpt2", embed_dim=512):
        super().__init__()
        self.gpt = GPT2LMHeadModel.from_pretrained(gpt_model_name)
        self.gpt.resize_token_embeddings(len(tokenizer))

        self.map_img_to_gpt = nn.Linear(embed_dim, self.gpt.config.n_embd)
        self.tokenizer_pad_id = tokenizer.pad_token_id

    def _mask_padding(self, labels):
        """Return a copy of `labels` with every pad position after the first set to -100.

        The first pad token per row stays as a target so the model still learns
        to emit an end marker. The remaining pad positions are ignored by the
        loss, which would otherwise be averaged mostly over padding. The input
        tensor is not modified.
        """
        if self.tokenizer_pad_id is None:
            return labels
        is_pad = labels == self.tokenizer_pad_id
        # Running count of pads along the sequence: > 1 marks the second pad onward.
        trailing_pad = is_pad & (is_pad.long().cumsum(dim=1) > 1)
        return labels.masked_fill(trailing_pad, -100)

    def forward(self, img_embedding, input_ids, labels=None):
        """Run GPT-2 on `[image token] + caption tokens`.

        Args:
            img_embedding: Image embeddings, one row per sample.
            input_ids: Caption token ids, one row per sample.
            labels: Optional target ids with the same layout as `input_ids`.

        Returns:
            The output of the wrapped GPT-2 model (its `.loss` is used by the
            training loops when `labels` is given).
        """
        # Map image -> GPT2 embedding space and add a sequence dimension
        img_token = self.map_img_to_gpt(img_embedding).unsqueeze(1)
        tok_embeds = self.gpt.transformer.wte(input_ids)

        # Concatenate image token at start
        input_embeds = torch.cat([img_token, tok_embeds], dim=1)

        if labels is not None:
            labels = self._mask_padding(labels)
            # The image token has no target: give it an ignored label so the
            # label sequence lines up with the extended input sequence.
            ignore_column = labels.new_full((labels.shape[0], 1), -100)
            labels = torch.cat([ignore_column, labels], dim=1)

        output = self.gpt(inputs_embeds=input_embeds, labels=labels)
        return output


In [None]:
from torch.utils.data import DataLoader

BATCH_SIZE = 16
# Shuffled mini-batches over the training dataset
loader = DataLoader(dataset, shuffle=True, batch_size=BATCH_SIZE)

print(f"Dataset ready with {len(dataset)} samples.")

# Fresh captioning model on the GPU, optimised with AdamW
model = ImageCaptioningGPT2()
model = model.cuda()
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)


In [None]:
import torch.nn.functional as F
import random


EPOCHS = 20
model.train()

for epoch in range(EPOCHS):
    # Sum of per-batch losses for this epoch
    total_loss = 0
    for img_feats, cap_tokens in loader:
        img_feats, cap_tokens = img_feats.cuda(), cap_tokens.cuda()

        # The caption tokens serve as both input and target.
        optimizer.zero_grad()
        outputs = model(img_feats, cap_tokens, labels=cap_tokens)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss = total_loss + loss.item()

    avg_loss = total_loss / len(loader)
    print(f"Epoch {epoch+1}/{EPOCHS} - Loss: {avg_loss:.4f}")


Step 10: Prepare the validation set for inference. We reuse 200 images (the last 200 entries of `image_ids`):


In [122]:
# Last 200 ids of the current image_ids list
val_ids = image_ids[-200:]
# CocoGPTDataset expects (image_paths, image_ids, id_to_captions, tokenizer).
val_paths = [images_dir / id_to_filename[img_id] for img_id in val_ids]
val_dataset = CocoGPTDataset(val_paths, val_ids, id_to_captions, tokenizer)
val_loader = DataLoader(val_dataset, batch_size=1)


Step 11: Generate Captions from Model

In [123]:
def generate_caption(model, img_embedding, tokenizer, max_length=32, num_beams=5):
    """Generate a caption for one image embedding with beam search.

    Args:
        model: Captioning model exposing `map_img_to_gpt` and `gpt`
            (with `gpt.transformer.wte` and `gpt.generate`).
        img_embedding: 1-D image embedding; it is moved to the model's device.
        tokenizer: Tokenizer providing `encode`, `decode` and `pad_token_id`.
        max_length: Forwarded to `generate` as `max_length`.
        num_beams: Beam width forwarded to `generate`.

    Returns:
        The decoded text of the first returned sequence, special tokens removed.

    The model is switched to eval mode for the call and put back into the mode
    it was in afterwards, so calling this between training epochs does not
    silently leave the model in eval mode.
    """
    device = next(model.parameters()).device
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            img_embedding = img_embedding.to(device).unsqueeze(0)

            # 1. Get the prompt text and its embeddings
            prompt_text = "An image of"
            prompt_ids = tokenizer.encode(prompt_text, return_tensors="pt").to(device)
            prompt_embeds = model.gpt.transformer.wte(prompt_ids)

            # 2. Project the image and prepend as token
            img_token = model.map_img_to_gpt(img_embedding).unsqueeze(1)

            # 3. Combine image token + prompt embeddings
            input_embeds = torch.cat([img_token, prompt_embeds], dim=1)
            # Every prefix position is real input; pass the mask explicitly
            # because it cannot be derived from pad ids when embeddings are given.
            attention_mask = torch.ones(
                input_embeds.shape[:2], dtype=torch.long, device=device
            )

            # 4. Generate from this combined context
            generated_ids = model.gpt.generate(
                inputs_embeds=input_embeds,
                attention_mask=attention_mask,
                max_length=max_length,
                num_beams=num_beams,
                early_stopping=True,
                pad_token_id=tokenizer.pad_token_id,
            )
    finally:
        model.train(was_training)

    # 5. Decode and return
    return tokenizer.decode(generated_ids[0], skip_special_tokens=True)


In [None]:
print("Checking sample predictions:\n")

for i in range(5):
    img_id = val_ids[i]
    img_path = images_dir / id_to_filename[img_id]
    img_embedding = get_image_embedding(img_path)
    # Generate once and reuse the result for printing.
    caption = generate_caption(model, img_embedding, tokenizer)

    print(f"Image ID: {img_id}")
    print("Predicted:", caption)
    # Look the references up directly: `refs` is only built in a later cell.
    print("Reference:", id_to_captions[img_id][:2])
    print("—" * 50)


In [125]:
!pip install evaluate nltk pycocoevalcap




Evaluate with BLEU & CIDEr

In [None]:
!git clone https://github.com/tylin/coco-caption
!pip install -e coco-caption


In [136]:
# Accumulators for generated captions and their reference caption lists
preds, refs = [], []

In [137]:
for img_id in val_ids:
    img_path = images_dir / id_to_filename[img_id]

    # Embed the image, then decode a caption from the trained model
    img_embedding = get_image_embedding(img_path)
    generated_caption = generate_caption(model, img_embedding, tokenizer)

    # Keep prediction and references aligned by position
    preds += [generated_caption]
    refs += [id_to_captions[img_id]]

In [138]:
# Predictions as COCO-style result records
results = []
for img_id, caption in zip(val_ids, preds):
    results.append({"image_id": int(img_id), "caption": caption})

# References grouped under their integer image id
from collections import defaultdict
gts = defaultdict(list)
for img_id, captions in zip(val_ids, refs):
    key = int(img_id)
    for cap in captions:
        gts[key].append({"caption": cap})


In [None]:
!rm -rf coco-caption
!git clone https://github.com/tylin/coco-caption.git

# Overwrite coco-caption/setup.py with a minimal setuptools script so the
# cloned repository can be installed with `pip install -e`.
with open("coco-caption/setup.py", "w") as f:
    f.write("""
from setuptools import setup, find_packages
setup(
    name='coco-caption',
    version='1.0',
    packages=find_packages(),
    package_dir={'': '.'},
)
""")
!pip install -e coco-caption
!2to3 -w coco-caption/pycocoevalcap



In [140]:
from pycocoevalcap.eval import COCOEvalCap

# Monkey-patch the evaluate function to exclude METEOR
def patched_evaluate(self):
    """Replacement for `COCOEvalCap.evaluate` that scores only BLEU-1..4 and CIDEr.

    Collects the ground-truth and result annotations for every image id in
    `self.params['image_id']`, tokenizes both with `PTBTokenizer`, runs the
    scorers and stores the results through `setEval`, `setImgToEvalImgs` and
    `setEvalImgs`. METEOR is deliberately left out.
    """
    imgIds = self.params['image_id']
    gts = {imgId: self.coco.imgToAnns[imgId] for imgId in imgIds}
    res = {imgId: self.cocoRes.imgToAnns[imgId] for imgId in imgIds}

    # Named ptb_tokenizer so it is not confused with the notebook's GPT-2 `tokenizer`.
    from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
    ptb_tokenizer = PTBTokenizer()
    gts = ptb_tokenizer.tokenize(gts)
    res = ptb_tokenizer.tokenize(res)

    # Set up scorers (no METEOR)
    from pycocoevalcap.bleu.bleu import Bleu
    from pycocoevalcap.cider.cider import Cider
    scorers = [
        (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
        (Cider(), "CIDEr"),
    ]

    for scorer, method in scorers:
        print(f"computing {scorer.method()} score...")
        score, scores = scorer.compute_score(gts, res)
        if isinstance(method, list):
            # One scorer reporting several metrics (BLEU-1..4): unpack in parallel.
            for sc, scs, m in zip(score, scores, method):
                self.setEval(sc, m)
                self.setImgToEvalImgs(scs, gts.keys(), m)
                print(f"{m}: {sc:.3f}")
        else:
            self.setEval(score, method)
            self.setImgToEvalImgs(scores, gts.keys(), method)
            print(f"{method}: {score:.3f}")
    self.setEvalImgs()

COCOEvalCap.evaluate = patched_evaluate


In [141]:
from collections import defaultdict
from pycocotools.coco import COCO
from pycocoevalcap.eval import COCOEvalCap
import tempfile, json

# Step 1: Normalize image IDs to int (very important)
results_fixed = [{"image_id": int(r["image_id"]), "caption": r["caption"]} for r in results]

# Step 2: Wrap references in COCO-style dicts
gts_dict = defaultdict(list)
for img_id, cap_list in zip(val_ids, refs):
    for cap in cap_list:
        gts_dict[int(img_id)].append({"caption": cap})

# Step 3: Save both to JSON files for pycocoevalcap
# Every annotation gets its own id. Numbering per image would give all
# captions of one image the same id, and they would collide in any lookup
# keyed by annotation id.
gt_annotations = []
for k, v in gts_dict.items():
    for d in v:
        gt_annotations.append(
            {"image_id": int(k), "id": len(gt_annotations), "caption": d["caption"]}
        )

with tempfile.NamedTemporaryFile(mode="w+", delete=False, suffix=".json") as gt_file:
    gt_data = {
        "images": [{"id": int(k)} for k in gts_dict.keys()],
        "annotations": gt_annotations,
        "type": "captions",
        "info": "evaluation"
    }
    json.dump(gt_data, gt_file)
    gt_path = gt_file.name

with tempfile.NamedTemporaryFile(mode="w+", delete=False, suffix=".json") as pred_file:
    json.dump(results_fixed, pred_file)
    pred_path = pred_file.name

# Step 4: Monkey-patch tokenizer (to avoid Java)
import pycocoevalcap.tokenizer.ptbtokenizer as pt_mod
import re, string

def simple_tokenize(self, captions_for_image):
    """Pure-Python stand-in for `PTBTokenizer.tokenize`.

    For each image id, lower-cases every caption, strips ASCII punctuation and
    collapses runs of whitespace to single spaces. Returns a dict mapping the
    same image ids to lists of normalised caption strings.
    """
    punctuation_re = re.compile(f"[{re.escape(string.punctuation)}]")

    def _normalise(entry):
        without_punct = punctuation_re.sub("", entry['caption'].lower())
        return " ".join(without_punct.split())

    return {
        img_id: [_normalise(c) for c in caps]
        for img_id, caps in captions_for_image.items()
    }

# Replace PTBTokenizer.tokenize with the pure-Python tokenizer defined above.
pt_mod.PTBTokenizer.tokenize = simple_tokenize




In [None]:
# Step 5: load references and predictions, then score them
coco = COCO(gt_path)
cocoRes = coco.loadRes(pred_path)
cocoEval = COCOEvalCap(coco, cocoRes)
cocoEval.evaluate()

# Report every metric recorded by the evaluator
for metric in cocoEval.eval:
    score = cocoEval.eval[metric]
    print(f"{metric}: {score:.4f}")

In [None]:
import matplotlib.pyplot as plt

# Metric names and values in the order the evaluator recorded them
metrics = list(cocoEval.eval)
scores = [cocoEval.eval[name] for name in metrics]

# One bar per metric
fig = plt.figure(figsize=(10, 5))
ax = fig.gca()
ax.bar(metrics, scores)
ax.set_title("Evaluation Metrics for DeepReel GPT Captioning")
ax.set_ylabel("Score")
plt.xticks(rotation=45)
ax.grid(True, axis='y')
plt.show()


In [None]:
# Spot-check the first prediction record and its reference captions.
print("Sample result:", results[0])
print("Sample refs:", refs[0])


In [145]:
from tqdm import tqdm
from collections import defaultdict
import matplotlib.pyplot as plt
import numpy as np

def train_and_evaluate(dataset_fraction, image_ids, image_paths, image_id_to_captions, tokenizer,
                       epochs=20, batch_size=32, lr=1e-4, num_eval=100):
    """Train a fresh captioner on a leading slice of the data and score it with COCOEvalCap.

    Args:
        dataset_fraction: Share of `image_ids` to use; the first
            `int(len(image_ids) * dataset_fraction)` entries are taken.
        image_ids: Sequence of image ids.
        image_paths: Sequence of image paths aligned with `image_ids`.
        image_id_to_captions: Mapping from image id to its caption strings.
        tokenizer: Tokenizer handed to `CocoGPTDataset` and `generate_caption`.
        epochs: Number of training epochs (default 20, as before).
        batch_size: Training batch size (default 32, as before).
        lr: Adam learning rate (default 1e-4, as before).
        num_eval: Number of images to caption and score (default 100, as before).

    Returns:
        Dict mapping each metric name in `cocoEval.eval` to a float.

    Raises:
        ValueError: If the requested fraction selects no images.

    Notes:
        * The scored images are the first `num_eval` images of the training
          slice itself, so the metrics describe fit on seen images.
        * `PTBTokenizer.tokenize` is replaced globally by a pure-Python tokenizer.
    """
    import json
    import os
    import tempfile

    print(f"\n Training with {int(dataset_fraction * 100)}% of dataset")

    # Sample a subset
    subset_size = int(len(image_ids) * dataset_fraction)
    if subset_size <= 0:
        raise ValueError(
            f"dataset_fraction={dataset_fraction!r} selects no images out of {len(image_ids)}"
        )
    selected_ids = image_ids[:subset_size]
    selected_paths = image_paths[:subset_size]
    print(f" Selected {subset_size} samples")

    # Create dataset and dataloader
    print("Creating dataset and dataloader")
    dataset = CocoGPTDataset(selected_paths, selected_ids, image_id_to_captions, tokenizer)
    loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)
    print(f" DataLoader ready with {len(dataset)} samples")

    # Initialize model and optimizer
    print(" Initializing model...")
    model = ImageCaptioningGPT2().cuda()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    # Train
    print(" Starting training...")
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for img_feats, cap_tokens in loader:
            img_feats, cap_tokens = img_feats.cuda(), cap_tokens.cuda()
            outputs = model(img_feats, cap_tokens, labels=cap_tokens)
            loss = outputs.loss

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
        avg_loss = total_loss / len(loader)
        print(f"Epoch {epoch+1}/{epochs} - Loss: {avg_loss:.4f}")

    # Generate captions
    print(" Generating predictions")
    preds, refs, val_ids = [], [], []
    for img_id, img_path in zip(selected_ids[:num_eval], selected_paths[:num_eval]):
        img_embedding = get_image_embedding(img_path)
        caption = generate_caption(model, img_embedding, tokenizer)
        preds.append(caption)
        refs.append(image_id_to_captions[img_id][:5])
        val_ids.append(int(img_id))
    print("Caption generation complete")

    # Format predictions for COCOEval
    print("Formatting results for COCOEval")
    results = [{"image_id": int(iid), "caption": c} for iid, c in zip(val_ids, preds)]

    print("Patching tokenizer...")
    import pycocoevalcap.tokenizer.ptbtokenizer as pt_mod
    def simple_tokenize(self, captions_for_image):
        import re, string
        final = {}
        for img_id, caps in captions_for_image.items():
            toks = []
            for c in caps:
                text = c['caption'].lower()
                text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)
                toks.append(" ".join(text.split()))
            final[img_id] = toks
        return final
    pt_mod.PTBTokenizer.tokenize = simple_tokenize

    # Every reference caption gets its own annotation id; numbering per image
    # would give all captions of one image the same id.
    flat_refs = [(iid, cap) for iid, caps in zip(val_ids, refs) for cap in caps]
    gt_data = {
        "images": [{"id": iid} for iid in val_ids],
        "annotations": [
            {"image_id": iid, "id": ann_id, "caption": cap}
            for ann_id, (iid, cap) in enumerate(flat_refs)
        ],
        "type": "captions",
        "info": "generated"
    }

    from pycocotools.coco import COCO
    from pycocoevalcap.eval import COCOEvalCap

    # Save to temp files, load them, and always remove them afterwards.
    print("Writing temp JSON files...")
    gt_path = pred_path = None
    try:
        with tempfile.NamedTemporaryFile(mode="w+", delete=False, suffix=".json") as gt_file:
            gt_path = gt_file.name
            json.dump(gt_data, gt_file)

        with tempfile.NamedTemporaryFile(mode="w+", delete=False, suffix=".json") as pred_file:
            pred_path = pred_file.name
            json.dump(results, pred_file)

        # Run COCOEval
        print("Running COCO Evaluation")
        coco = COCO(gt_path)
        cocoRes = coco.loadRes(pred_path)
    finally:
        for tmp_path in (gt_path, pred_path):
            if tmp_path is not None and os.path.exists(tmp_path):
                os.unlink(tmp_path)

    cocoEval = COCOEvalCap(coco, cocoRes)
    cocoEval.evaluate()

    print(" Evaluation complete.")
    return {metric: float(score) for metric, score in cocoEval.eval.items()}


In [146]:
# Every image id in the annotation file, and the matching image file path
image_ids = list(id_to_filename)
image_paths = []
for img_id in image_ids:
    image_paths.append(images_dir / id_to_filename[img_id])

In [None]:
# Baseline: train and evaluate without distillation
sizes = [0.50]
all_results = {}

for size in sizes:
    # Results are keyed by the percentage label, e.g. "50%"
    label = f"{int(size * 100)}%"
    scores = train_and_evaluate(size, image_ids, image_paths, id_to_captions, tokenizer)
    all_results[label] = scores

In [None]:
sizes_str = list(all_results)
metrics = ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4", "CIDEr"]

# One line chart per metric, dataset size on the x-axis
for metric in metrics:
    values = []
    for size in sizes_str:
        values.append(all_results[size][metric])

    plt.figure()
    plt.plot(sizes_str, values, marker='o')
    plt.title(f"{metric} vs Dataset Size")
    plt.xlabel("Dataset Size")
    plt.ylabel(metric)
    plt.grid(True)
    plt.show()


In [24]:
import random

def random_distill(image_ids, percentage, seed=None):
    """Randomly select a subset of image ids without replacement.

    Args:
        image_ids: Iterable of image ids to sample from.
        percentage: Fraction of the ids to keep, between 0 and 1 inclusive.
            The subset size is `int(len(image_ids) * percentage)`.
        seed: Optional seed. When given, a dedicated `random.Random(seed)` is
            used so the same subset is returned on every call and the global
            random state is left untouched. When None, the module-level
            generator is used, as before.

    Returns:
        A new list with the sampled ids.

    Raises:
        ValueError: If `percentage` is outside [0, 1].
    """
    if not 0 <= percentage <= 1:
        raise ValueError(f"percentage must be within [0, 1], got {percentage!r}")

    population = list(image_ids)
    k = int(len(population) * percentage)
    rng = random if seed is None else random.Random(seed)
    return rng.sample(population, k)


In [25]:
# image id -> image file path, built from the two aligned lists
id_to_filepath = dict(zip(image_ids, image_paths))

In [None]:
# Draw a random 25% subset and show its size and a few of its ids
sampled_ids = random_distill(image_ids, 0.25)
print(f"Sampled: {len(sampled_ids)} out of {len(image_ids)}")
print(f"Example: {sampled_ids[:5]}")

In [27]:
import time
def train_and_evaluate_distilled(selected_ids, id_to_filepath, id_to_captions, tokenizer, model_label="",
                                 epochs=20, batch_size=32, lr=1e-4, num_eval=100):
    """Train a fresh captioner on a chosen subset of images and score it with COCOEvalCap.

    Args:
        selected_ids: Image ids to train on.
        id_to_filepath: Mapping from image id to image file path.
        id_to_captions: Mapping from image id to its caption strings.
        tokenizer: Tokenizer handed to `CocoGPTDataset` and `generate_caption`.
        model_label: Free-text label printed with the sample count.
        epochs: Number of training epochs (default 20, as before).
        batch_size: Training batch size (default 32, as before).
        lr: Adam learning rate (default 1e-4, as before).
        num_eval: Number of images to caption and score (default 100, as before).

    Returns:
        A tuple `(scores, total_time, avg_infer_time)`: the metric dict from
        `cocoEval.eval` as floats, the wall-clock seconds for the whole call,
        and the mean seconds per generated caption.

    Raises:
        ValueError: If `selected_ids` is empty.

    Notes:
        * Captions are generated once, after training has finished. Generating
          inside the epoch loop repeated inference every epoch and, because
          `generate_caption` calls `model.eval()`, could leave the model in
          eval mode for the remaining epochs.
        * The scored images are the first `num_eval` of `selected_ids`, i.e.
          images the model was trained on.
        * `PTBTokenizer.tokenize` is replaced globally by a pure-Python tokenizer.
    """
    import json
    import os
    import tempfile

    if len(selected_ids) == 0:
        raise ValueError("selected_ids is empty; nothing to train on")

    print(f"\n Training on distilled subset: {len(selected_ids)} images")
    start_time = time.time()

    # Build paths from selected IDs
    selected_paths = [id_to_filepath[img_id] for img_id in selected_ids]

    # Create dataset and dataloader
    dataset = CocoGPTDataset(selected_paths, selected_ids, id_to_captions, tokenizer)
    loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)
    print(f" Model: {model_label} | Samples: {len(dataset)}")

    # Initialize model
    model = ImageCaptioningGPT2().cuda()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    # Train
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for img_feats, cap_tokens in loader:
            img_feats, cap_tokens = img_feats.cuda(), cap_tokens.cuda()
            outputs = model(img_feats, cap_tokens, labels=cap_tokens)
            loss = outputs.loss

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f" Epoch {epoch+1}/{epochs} - Loss: {total_loss / len(loader):.4f}")

    # Generate predictions for eval (once, after the last epoch)
    print("Generating predictions for 100 validation images...")
    preds, refs, val_ids = [], [], []
    start_infer = time.time()

    for img_id in selected_ids[:num_eval]:
        img_path = id_to_filepath[img_id]
        img_embedding = get_image_embedding(img_path)
        caption = generate_caption(model, img_embedding, tokenizer)
        preds.append(caption)
        refs.append(id_to_captions[img_id][:5])
        val_ids.append(int(img_id))

    total_infer_time = time.time() - start_infer
    # max(..., 1) keeps num_eval=0 from dividing by zero
    avg_infer_time = total_infer_time / max(len(preds), 1)
    print(f" Inference time per caption: {avg_infer_time:.4f} seconds")

    # Format for COCOEval
    results = [{"image_id": int(iid), "caption": c} for iid, c in zip(val_ids, preds)]

    # Tokenizer patch
    import pycocoevalcap.tokenizer.ptbtokenizer as pt_mod
    def simple_tokenize(self, captions_for_image):
        import re, string
        final = {}
        for img_id, caps in captions_for_image.items():
            toks = []
            for c in caps:
                text = c['caption'].lower()
                text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)
                toks.append(" ".join(text.split()))
            final[img_id] = toks
        return final
    pt_mod.PTBTokenizer.tokenize = simple_tokenize

    # Every reference caption gets its own annotation id; numbering per image
    # would give all captions of one image the same id.
    flat_refs = [(iid, cap) for iid, caps in zip(val_ids, refs) for cap in caps]
    gt_data = {
        "images": [{"id": iid} for iid in val_ids],
        "annotations": [
            {"image_id": iid, "id": ann_id, "caption": cap}
            for ann_id, (iid, cap) in enumerate(flat_refs)
        ],
        "type": "captions", "info": "distilled"
    }

    from pycocotools.coco import COCO
    from pycocoevalcap.eval import COCOEvalCap

    # Save temporary prediction/reference files, load them, and always remove them.
    gt_path = pred_path = None
    try:
        with tempfile.NamedTemporaryFile(mode="w+", delete=False, suffix=".json") as gt_file:
            gt_path = gt_file.name
            json.dump(gt_data, gt_file)

        with tempfile.NamedTemporaryFile(mode="w+", delete=False, suffix=".json") as pred_file:
            pred_path = pred_file.name
            json.dump(results, pred_file)

        coco = COCO(gt_path)
        cocoRes = coco.loadRes(pred_path)
    finally:
        for tmp_path in (gt_path, pred_path):
            if tmp_path is not None and os.path.exists(tmp_path):
                os.unlink(tmp_path)

    # Evaluate
    cocoEval = COCOEvalCap(coco, cocoRes)
    cocoEval.evaluate()

    print(" Evaluation complete.")
    total_time = time.time() - start_time
    return (
        {metric: float(score) for metric, score in cocoEval.eval.items()},
        total_time,
        avg_infer_time,
    )



In [None]:
# Results of the random-sampling runs, keyed by dataset percentage (int)
all_scores_random, all_times_random, all_infer_times_random = {}, {}, {}
fractions = [0.25, 0.5, 0.75, 1.0]


for frac in fractions:
    pct = int(frac * 100)
    print(f"\n RANDOM SAMPLING | Dataset: {pct}%")
    sampled_ids = random_distill(image_ids, frac)

    scores, duration, infer_time = train_and_evaluate_distilled(
        selected_ids=sampled_ids,
        id_to_filepath=id_to_filepath,
        id_to_captions=id_to_captions,
        tokenizer=tokenizer,
        model_label=f"Random - {pct}%",
    )

    all_scores_random[pct] = scores
    all_times_random[pct] = duration
    all_infer_times_random[pct] = infer_time



In [None]:
import matplotlib.pyplot as plt

metrics = ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4", "CIDEr"]
x_vals = sorted(all_scores_random)

# One figure per metric: score against dataset percentage
for metric in metrics:
    y_vals = []
    for size in x_vals:
        y_vals.append(all_scores_random[size][metric])

    plt.figure()
    plt.plot(x_vals, y_vals, marker='o', label='Random Distillation')
    plt.title(f"{metric} vs Dataset Size (Random)")
    plt.xlabel("Dataset Size (%)")
    plt.ylabel(metric)
    plt.xticks(x_vals)
    plt.grid(True)
    plt.legend()
    plt.show()


In [28]:
import torch
from torch.utils.data import DataLoader

def gradient_based_distill(image_ids, image_paths, id_to_captions, tokenizer, percentage):
    """Select the images whose captioning loss has the largest gradient norm.

    A freshly constructed `ImageCaptioningGPT2` (not trained in this function)
    is run on one sample at a time. For each sample the L2 norm over all
    parameter gradients of the loss is recorded, and the ids with the largest
    norms are returned.

    Args:
        image_ids: Sequence of image ids.
        image_paths: Sequence of image paths aligned with `image_ids`.
        id_to_captions: Mapping from image id to its caption strings.
        tokenizer: Tokenizer handed to `CocoGPTDataset`.
        percentage: Fraction of `image_ids` to keep; `int(len(image_ids) * percentage)`
            ids are returned.

    Returns:
        List of the selected ids, ordered from largest to smallest gradient norm.

    Raises:
        ValueError: If `image_ids` and `image_paths` differ in length.

    Notes:
        The dataset draws a random caption per access and the model is in train
        mode, so scores are not guaranteed to repeat between calls.
    """
    print("Starting true gradient-based scoring (via gradient norms)...")

    image_ids = list(image_ids)
    image_paths = list(image_paths)
    if len(image_ids) != len(image_paths):
        raise ValueError(
            f"image_ids ({len(image_ids)}) and image_paths ({len(image_paths)}) must align"
        )

    dataset = CocoGPTDataset(image_paths, image_ids, id_to_captions, tokenizer)
    # batch_size=1 and shuffle=False: batch i is exactly sample i, so the
    # loader can be zipped with image_ids below.
    loader = DataLoader(dataset, batch_size=1, shuffle=False)

    model = ImageCaptioningGPT2().cuda()
    model.train()

    id_gradnorm = []

    for (img_feat, cap_token), img_id in zip(loader, image_ids):
        img_feat, cap_token = img_feat.cuda(), cap_token.cuda()

        model.zero_grad()
        output = model(img_feat, cap_token, labels=cap_token)
        output.loss.backward()

        # Combine the per-tensor norms on the device and read the result back
        # once, instead of one host round trip per parameter tensor.
        grad_norms = [p.grad.norm(2) for p in model.parameters() if p.grad is not None]
        total_norm = torch.stack(grad_norms).norm(2).item() if grad_norms else 0.0

        id_gradnorm.append((img_id, total_norm))

    id_gradnorm.sort(key=lambda x: x[1], reverse=True)
    k = int(len(image_ids) * percentage)
    top_ids = [img_id for img_id, _ in id_gradnorm[:k]]

    print(f" Selected top {k} samples with highest gradient norms.")
    return top_ids


In [None]:
all_scores_gradient = {}
all_times_gradient = {}
all_infer_times_gradient = {}
fractions = [0.5, 0.75, 1.0]

# Score every image once. gradient_based_distill returns ids sorted by
# descending gradient norm, so percentage=1.0 yields the full ranking and each
# fraction is just a prefix of it. Re-scoring per fraction would repeat the
# full backward pass over the dataset for no benefit.
ranked_ids = gradient_based_distill(image_ids, image_paths, id_to_captions, tokenizer, 1.0)

for frac in fractions:
    print(f"\n GRADIENT-BASED | Dataset: {int(frac * 100)}%")
    # Same subset size the function itself would use for this fraction.
    sampled_ids = ranked_ids[:int(len(image_ids) * frac)]

    scores, duration, infer_time = train_and_evaluate_distilled(
        selected_ids=sampled_ids,
        id_to_filepath=id_to_filepath,
        id_to_captions=id_to_captions,
        tokenizer=tokenizer,
        model_label=f"Gradient-{int(frac * 100)}%"
    )

    all_scores_gradient[int(frac * 100)] = scores
    all_times_gradient[int(frac * 100)] = duration
    all_infer_times_gradient[int(frac * 100)] = infer_time


In [None]:
import matplotlib.pyplot as plt

metrics = ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4", "CIDEr"]
# Compare only sizes present in both experiments: the two runs use different
# fraction lists, and indexing a size missing from one dict raises KeyError.
sizes = sorted(set(all_scores_random) & set(all_scores_gradient))

for metric in metrics:
    y_random = [all_scores_random[size][metric] for size in sizes]
    y_gradient = [all_scores_gradient[size][metric] for size in sizes]

    plt.figure()
    plt.plot(sizes, y_random, marker='o', label='Random')
    plt.plot(sizes, y_gradient, marker='o', label='Gradient-Based')
    plt.title(f"{metric} vs Dataset Size")
    plt.xlabel("Dataset Size (%)")
    plt.ylabel(metric)
    plt.xticks(sizes)
    plt.grid(True)
    plt.legend()
    plt.show()


In [None]:
import matplotlib.pyplot as plt

# Only sizes measured in both experiments can be compared; a size missing from
# one dict would raise KeyError.
sizes = sorted(set(all_times_random) & set(all_times_gradient))
time_random = [all_times_random[size] for size in sizes]
time_gradient = [all_times_gradient[size] for size in sizes]

# The plotted durations are the total_time values returned by
# train_and_evaluate_distilled.
plt.figure()
plt.plot(sizes, time_random, marker='o', label="Random")
plt.plot(sizes, time_gradient, marker='o', label="Gradient-Based")
plt.title("Training Time vs Dataset Size")
plt.xlabel("Dataset Size (%)")
plt.ylabel("Time (seconds)")
plt.xticks(sizes)
plt.grid(True)
plt.legend()
plt.show()


In [None]:
# Only sizes measured in both experiments can be compared; a size missing from
# one dict would raise KeyError.
sizes = sorted(set(all_infer_times_random) & set(all_infer_times_gradient))
infer_random = [all_infer_times_random[size] for size in sizes]
infer_gradient = [all_infer_times_gradient[size] for size in sizes]

plt.figure()
plt.plot(sizes, infer_random, marker='o', label="Random")
plt.plot(sizes, infer_gradient, marker='o', label="Gradient-Based")
plt.title("Inference Time per Caption vs Dataset Size")
plt.xlabel("Dataset Size (%)")
plt.ylabel("Time (seconds)")
plt.xticks(sizes)
plt.grid(True)
plt.legend()
plt.show()


In [None]:
# Save the model weights
# `model` is the notebook-level model; the train_and_evaluate* helpers keep
# their models local and do not change this variable.
torch.save(model.state_dict(), "trained_gpt_captioner.pth")
print("Model saved as 'trained_gpt_captioner.pth'")


In [None]:

# Rebuild the architecture, restore the saved weights, then move to the GPU
model = ImageCaptioningGPT2()
saved_weights = torch.load("trained_gpt_captioner.pth")
model.load_state_dict(saved_weights)
model = model.cuda()
model.eval()


In [103]:
def generate_caption2(model, img_embedding, tokenizer, max_length=32):
    """Generate a caption for one image embedding using nucleus sampling.

    The model is put into eval mode. The projected image embedding is placed in
    front of the embeddings of the prompt "An image of", and the text is
    sampled from `model.gpt.generate` with `top_p=0.95` and `temperature=1.0`.
    All tensors are created on the device of `img_embedding`.

    Returns the decoded first sequence with special tokens skipped.
    """
    model.eval()
    device = img_embedding.device

    # Project the image into GPT-2's embedding space, shaped as one prefix token
    projected = model.map_img_to_gpt(img_embedding)
    img_token = projected.unsqueeze(0).unsqueeze(1)

    # Embed the textual prompt that guides generation
    prompt = "An image of"
    encoded_prompt = tokenizer(prompt, return_tensors="pt")
    input_ids = encoded_prompt.input_ids.to(device)
    prompt_embeds = model.gpt.transformer.wte(input_ids)

    # Image token first, prompt embeddings after it
    input_embeds = torch.cat([img_token, prompt_embeds], dim=1)

    sampling_kwargs = dict(
        max_length=max_length,
        do_sample=True,
        top_p=0.95,
        temperature=1.0,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )
    with torch.no_grad():
        generated_ids = model.gpt.generate(inputs_embeds=input_embeds, **sampling_kwargs)

    first_sequence = generated_ids[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)


In [None]:
# Save the model weights
torch.save(model.state_dict(), "trained_gpt_captioner01.pth")
print("Model saved as 'trained_gpt_captioner01.pth'")
# Round-trip check: rebuild the model from the file that was just written.
model = ImageCaptioningGPT2()
model.load_state_dict(torch.load("trained_gpt_captioner01.pth"))
model = model.cuda()
model.eval()
# Caption a local test image with the sampling-based generator.
caption = generate_caption2(model, get_image_embedding("test.jpg"), tokenizer)
print("Caption:", caption)


In [None]:
# Caption a second local test image with the reloaded model.
caption = generate_caption2(model, get_image_embedding("test2.jpg"), tokenizer)
print("Caption:", caption)