In [2]:
import torch
import time
import random
from torch.cuda.amp import GradScaler, autocast
from torch.optim.lr_scheduler import ReduceLROnPlateau
import torchvision.transforms.functional as F
from torchvision.datasets import VOCDetection
from torchvision.models.detection import fasterrcnn_resnet50_fpn
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt

# Config
NUM_EPOCHS = 5
BATCH_SIZE = 4
NUM_WORKERS = 4
PATIENCE = 2
PRINT_INTERVAL = 100
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(f"Config => epochs: {NUM_EPOCHS}, batch_size: {BATCH_SIZE}, workers: {NUM_WORKERS}, patience: {PATIENCE}")
print(f"Using device: {DEVICE}")

Config => epochs: 5, batch_size: 4, workers: 4, patience: 2
Using device: cpu


In [3]:
VOC_CLASSES = [
    "aeroplane","bird","car", "motorbike","person"
]

def prepare_sample(img, target):
    # Random horizontal flip
    if random.random() < 0.5:
        w = img.width
        img = F.hflip(img)
        objs = target['annotation']['object']
        if not isinstance(objs, list): objs = [objs]
        for o in objs:
            xmin, xmax = int(o['bndbox']['xmin']), int(o['bndbox']['xmax'])
            o['bndbox']['xmin'] = w - xmax
            o['bndbox']['xmax'] = w - xmin
    # To tensor & parse boxes
    img_tensor = F.to_tensor(img)
    objs = target['annotation']['object']
    if not isinstance(objs, list): objs = [objs]
    boxes = torch.tensor([
        [int(o['bndbox']['xmin']), int(o['bndbox']['ymin']),
         int(o['bndbox']['xmax']), int(o['bndbox']['ymax'])]
        for o in objs
    ], dtype=torch.float32)
    labels = torch.tensor([
        VOC_CLASSES.index(o['name'])+1 for o in objs
    ], dtype=torch.int64)
    return img_tensor, {"boxes": boxes, "labels": labels}

def collate_fn(batch):
    return tuple(zip(*batch))

# Load datasets
train_root = "pascalvoc/VOCtrainval_06-Nov-2007"
test_root  = "pascalvoc/VOCtest_06-Nov-2007"
train_ds = VOCDetection(root=train_root, year='2007', image_set='trainval', download=False, transforms=prepare_sample)
val_ds   = VOCDetection(root=test_root, year='2007', image_set='test', download=False, transforms=prepare_sample)
print("Train size:", len(train_ds), "Val size:", len(val_ds))

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn, num_workers=2)
val_loader   = DataLoader(val_ds,   batch_size=1,          shuffle=False, collate_fn=collate_fn, num_workers=2)
total_batches = len(train_loader)
print(f"Batches per epoch: {total_batches}")

Train size: 5011 Val size: 4952
Batches per epoch: 1253


In [4]:
from torchvision.models.detection import FasterRCNN_ResNet50_FPN_Weights

model = fasterrcnn_resnet50_fpn(weights=FasterRCNN_ResNet50_FPN_Weights.DEFAULT)
in_feat = model.roi_heads.box_predictor.cls_score.in_features
model.roi_heads.box_predictor.cls_score = torch.nn.Linear(in_feat, len(VOC_CLASSES)+1)
model.roi_heads.box_predictor.bbox_pred  = torch.nn.Linear(in_feat, 4*(len(VOC_CLASSES)+1))
model.to(DEVICE)
# Freeze backbone initially
for p in model.backbone.parameters():
    p.requires_grad = False

In [5]:
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.AdamW(params, lr=1e-4, weight_decay=1e-4)
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=PATIENCE)
scaler = GradScaler()

  scaler = GradScaler()


In [6]:
import h5py

history = {
    'total_loss': [], 'cls_loss': [], 'box_loss': [],
    'obj_loss': [], 'rpn_box_loss': [], 'val_loss': [], 'val_map50': []
}
best_val = float('inf')
epochs_no_improve = 0

from torchmetrics.detection import MeanAveragePrecision
val_metric = MeanAveragePrecision(iou_type="bbox", iou_thresholds=[0.5])

for epoch in range(1, NUM_EPOCHS + 1):
    start_time = time.time()
    model.train()
    running = {'total': 0, 'cls': 0, 'box': 0, 'obj': 0, 'rpn': 0}
    print(f"\n=== Epoch {epoch}/{NUM_EPOCHS} ===")

    # Unfreeze backbone after first epoch
    if epoch == 2:
        for p in model.backbone.parameters():
            p.requires_grad = True

    # --- Training Pass ---
    for batch_idx, (imgs, targets) in enumerate(train_loader, start=1):
        imgs = [im.to(DEVICE) for im in imgs]
        tgts = [{k: v.to(DEVICE) for k, v in t.items()} for t in targets]

        optimizer.zero_grad()
        with autocast():
            loss_dict = model(imgs, tgts)
            loss = sum(loss for loss in loss_dict.values())
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        # Accumulate losses
        running['cls']   += loss_dict['loss_classifier'].item()
        running['box']   += loss_dict['loss_box_reg'].item()
        running['obj']   += loss_dict['loss_objectness'].item()
        running['rpn']   += loss_dict['loss_rpn_box_reg'].item()
        running['total'] += loss.item()

        if batch_idx % PRINT_INTERVAL == 0 or batch_idx == total_batches:
            print(f"Batch {batch_idx}/{total_batches} - Loss: {loss.item():.4f}")

    # Record training metrics
    nB = total_batches
    history['cls_loss'].append(running['cls'] / nB)
    history['box_loss'].append(running['box'] / nB)
    history['obj_loss'].append(running['obj'] / nB)
    history['rpn_box_loss'].append(running['rpn'] / nB)
    history['total_loss'].append(running['total'] / nB)
    print(f"Epoch {epoch} done in {(time.time() - start_time)/60:.1f}m, "
          f"avg total loss: {history['total_loss'][-1]:.4f}")

    # --- Validation Pass ---
    model.eval()
    val_running = 0
    val_metric.reset()
    with torch.no_grad():
        for imgs, tgts in val_loader:
            imgs = [im.to(DEVICE) for im in imgs]
            tgts = [{k: v.to(DEVICE) for k, v in t.items()} for t in tgts]

            # validation loss
            model.train()
            loss_dict = model(imgs, tgts)
            val_running += sum(l.item() for l in loss_dict.values())

            # predictions for mAP
            model.eval()
            preds = model(imgs)
            val_metric.update(preds, tgts)

    val_loss = val_running / len(val_loader)
    history['val_loss'].append(val_loss)
    val_map50 = val_metric.compute()['map_50'].item()
    history['val_map50'].append(val_map50)
    print(f"Val loss: {val_loss:.4f}, mAP@0.5: {val_map50:.4f}")

    # --- Scheduler & Early Stopping ---
    scheduler.step(val_loss)
    if val_loss < best_val:
        best_val = val_loss
        epochs_no_improve = 0
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= PATIENCE:
            print("Early stopping triggered")
            break
    
    with h5py.File("/kaggle/working/faster_rcnn_scratch.h5", "w") as f:
        for key, value in model.state_dict().items():
            f.create_dataset(key, data=value.cpu().numpy())

    print("Best model saved as faster_rcnn_scratch.h5")

  from .autonotebook import tqdm as notebook_tqdm


KeyboardInterrupt: 

avg total loss: 0.3225
Val loss: 0.3211, mAP@0.5: 0.7698

Link to working project: https://drive.google.com/file/d/1whxdxrjAr0PDklQI44-poPF6Tb3KGOo4/view?usp=sharing

In [7]:
import h5py
import torch

h5_path = "faster_rcnn_scratch.h5"

state_dict = {}
with h5py.File(h5_path, "r") as f:
    for key in f.keys():
        state_dict[key] = torch.tensor(f[key][:])

model.load_state_dict(state_dict)
model.to(DEVICE)
model.eval()

RuntimeError: Error(s) in loading state_dict for FasterRCNN:
	size mismatch for roi_heads.box_predictor.cls_score.weight: copying a param with shape torch.Size([21, 1024]) from checkpoint, the shape in current model is torch.Size([6, 1024]).
	size mismatch for roi_heads.box_predictor.cls_score.bias: copying a param with shape torch.Size([21]) from checkpoint, the shape in current model is torch.Size([6]).
	size mismatch for roi_heads.box_predictor.bbox_pred.weight: copying a param with shape torch.Size([84, 1024]) from checkpoint, the shape in current model is torch.Size([24, 1024]).
	size mismatch for roi_heads.box_predictor.bbox_pred.bias: copying a param with shape torch.Size([84]) from checkpoint, the shape in current model is torch.Size([24]).

In [None]:
# === Section 7: Visualizing Test Results (Filtered & Annotated) ===

import random
import matplotlib.pyplot as plt
from torchvision.ops import box_iou
import torchvision.transforms.functional as F

# Ensure model is in eval mode
model.eval()

# Underlying VOC dataset
val_dataset = val_loader.dataset

# Number of examples to visualize
NUM_SAMPLES = 5
random_indices = random.sample(range(len(val_dataset)), k=NUM_SAMPLES)

# Confidence threshold for displayed boxes
SCORE_THRESH = 0.5

with torch.no_grad():
    for idx in random_indices:
        # 1) Load single sample
        img_tensor, target = val_dataset[idx]
        input_img = img_tensor.to(DEVICE).unsqueeze(0)

        # 2) Forward pass for raw predictions
        outputs = model(input_img)
        pred = outputs[0]

        # 3) Filter predictions by score
        keep = pred["scores"] > SCORE_THRESH
        filtered_boxes  = pred["boxes"][keep].cpu()
        filtered_scores = pred["scores"][keep].cpu()
        filtered_labels = pred["labels"][keep].cpu()

        # 4) Ground-truth boxes
        gt_boxes = target["boxes"].to(DEVICE)

        # 5) Compute IoU between filtered preds and GT
        if filtered_boxes.numel() and gt_boxes.numel():
            ious = box_iou(filtered_boxes.to(DEVICE), gt_boxes)
            iou_vals = ious.max(dim=1).values.cpu().tolist()
            avg_iou = sum(iou_vals) / len(iou_vals)
        else:
            avg_iou = 0.0

        # 6) Prepare image for plotting
        img_np = img_tensor.permute(1, 2, 0).cpu().numpy()
        fig, ax = plt.subplots(1, figsize=(8, 6))
        ax.imshow(img_np)

        # 7) Draw filtered predicted boxes (red) with label and score text
        for box, score, label in zip(filtered_boxes, filtered_scores, filtered_labels):
            x1, y1, x2, y2 = box.numpy()
            cls_name = VOC_CLASSES[label.item() - 1]  # assuming VOC_CLASSES defined
            rect = plt.Rectangle(
                (x1, y1),
                x2 - x1,
                y2 - y1,
                fill=False,
                edgecolor="red",
                linewidth=2,
            )
            ax.add_patch(rect)
            ax.text(
                x1,
                y1 - 5,
                f"{cls_name}: {score:.2f}",
                color="red",
                fontsize=10,
                backgroundcolor="white",
            )

        # 8) Draw ground-truth boxes (green dashed)
        for box in gt_boxes.cpu():
            x1, y1, x2, y2 = box.numpy()
            rect = plt.Rectangle(
                (x1, y1),
                x2 - x1,
                y2 - y1,
                fill=False,
                edgecolor="green",
                linewidth=2,
                linestyle="--",
            )
            ax.add_patch(rect)

        ax.set_title(f"Sample {idx} — Avg IoU (filtered preds→GT): {avg_iou:.3f}")
        ax.axis("off")
        plt.show()
