This codebase can be used for many purposes, including:

1. **Object Detection Pipeline**:
- Train/evaluate models (Faster R-CNN, Mask R-CNN, etc.)
- Process COCO-format datasets
- Calculate evaluation metrics (mAP, IoU)

2. **Key Components Available:**

Data Preparation:
- Image transforms/normalization
- Annotation processing
- Dataset utilities

Training Infrastructure:
- Distributed training setup
- Learning rate scheduling
- Loss tracking

Evaluation Tools:
- COCO metric calculation
- Prediction visualization
- Model performance analysis

3. **Experiment Ideas:**
- Transfer Learning:
  - Fine-tune pretrained models on custom data
  - Modify head architectures
- Data Augmentation:
  - Test different transform combinations
  - Visualize augmented samples

4. **Performance Optimization:**
- Benchmark training speed
- Profile memory usage
- Test mixed-precision training

5. **Visualization Capabilities:**
- Plot training metrics
- Visualize predictions vs ground truth
- Generate confusion matrices

In [None]:
# In vision_core.ipynb
import torch
from torchvision.models.detection import fasterrcnn_resnet50_fpn
from torch.utils.data import Dataset, DataLoader
import os
import sys

# ------------------- PATH SETUP -------------------
# This assumes your notebook is in 'lectures/vision_core/'
# and 'vision_core' is the package directory itself.
# Adds the 'lectures' directory to sys.path so 'from vision_core import ...' works.
try:
    # Running as a plain script/module: __file__ points at this file.
    notebook_dir = os.path.dirname(os.path.abspath(__file__))
except NameError:
    # Notebook kernels do not define __file__; fall back to the working
    # directory (presumably the folder holding the notebook -- verify if the
    # kernel is started from somewhere else).
    notebook_dir = os.getcwd()
parent_dir_of_vision_core = os.path.dirname(notebook_dir)  # This should be 'lectures'
if parent_dir_of_vision_core not in sys.path:
    sys.path.append(parent_dir_of_vision_core)

# Now import from vision_core
from vision_core import utils, engine # Assuming transforms is not strictly needed for this example

# ------------------- DEVICE SETUP -------------------
# Prefer the GPU whenever CUDA is usable; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

# ------------------- CUSTOM COLLATE FUNCTION FOR DETECTION -------------------
def collate_fn(batch):
    """Regroup a list of per-sample tuples into one tuple per field.

    A batch such as ``[(img0, tgt0), (img1, tgt1)]`` becomes
    ``((img0, img1), (tgt0, tgt1))``. Nothing is stacked, so samples may
    differ in size. An empty batch yields an empty tuple.
    """
    per_field = zip(*batch)
    return tuple(per_field)

# ------------------- MOCK DATASET -------------------
class FakeDataset(Dataset):
    """Synthetic detection dataset producing random images and random boxes.

    Each sample is an ``(image, target)`` pair where ``target`` is a dict with
    the keys ``boxes``, ``labels``, ``image_id``, ``area`` and ``iscrowd``.

    Args:
        num_samples: Number of samples reported by ``len()``.
        image_size: ``(channels, height, width)`` of every generated image.
            Height and width are expected to be at least 1.
        num_classes: Number of classes including background; labels are drawn
            from ``1 .. num_classes - 1``. Must be at least 2.
    """

    def __init__(self, num_samples=100, image_size=(3, 300, 400), num_classes=91):
        self.num_samples = num_samples
        self.image_size = image_size
        self.num_classes = num_classes

    def __len__(self):
        return self.num_samples

    def __getitem__(self, idx):
        """Return a freshly sampled ``(image, target)`` pair for index ``idx``.

        Samples are random on every call; ``idx`` only determines
        ``target["image_id"]``.
        """
        _, height, width = self.image_size

        # Create a random image
        image = torch.rand(self.image_size)

        # Random number of boxes per image (1 to 4).
        num_boxes = torch.randint(1, 5, (1,)).item()

        # Top-left corners are drawn so that at least one pixel of room is
        # left towards the right/bottom image border.
        x1 = torch.rand(num_boxes) * (width - 1)
        y1 = torch.rand(num_boxes) * (height - 1)
        # Bottom-right corners lie at least one pixel beyond the top-left
        # corner and never past the image border. The clamp only guards
        # against float rounding pushing a value just above the border.
        x2 = torch.clamp(x1 + 1 + torch.rand(num_boxes) * (width - 1 - x1), max=float(width))
        y2 = torch.clamp(y1 + 1 + torch.rand(num_boxes) * (height - 1 - y1), max=float(height))
        boxes = torch.stack([x1, y1, x2, y2], dim=1)

        # Label 0 is left out of the range (presumably reserved for background).
        labels = torch.randint(1, self.num_classes, (num_boxes,), dtype=torch.int64)

        target = {}
        target["boxes"] = boxes.to(dtype=torch.float32)
        target["labels"] = labels.to(dtype=torch.int64)
        target["image_id"] = torch.tensor([idx], dtype=torch.int64)
        target["area"] = (target["boxes"][:, 3] - target["boxes"][:, 1]) * \
                         (target["boxes"][:, 2] - target["boxes"][:, 0])
        # uint8 or int64 depending on PyTorch version, uint8 is safer for older versions
        target["iscrowd"] = torch.zeros((num_boxes,), dtype=torch.uint8)

        return image, target

    # This method is needed for engine.evaluate if using COCO-style evaluation
    def get_coco_api(self):
        """Return a minimal mock ``pycocotools`` COCO object for this dataset.

        The mock lists one image entry per sample and one category per label
        id, but no annotations, so it only serves to satisfy code that needs
        a COCO-like object. NOTE(review): metrics computed against it are not
        meaningful because the ground truth is empty.
        """
        from pycocotools.coco import COCO

        _, height, width = self.image_size
        mock_coco = COCO()
        mock_coco.dataset = {
            'images': [
                {'id': i, 'height': height, 'width': width}
                for i in range(self.num_samples)
            ],
            # 'categories' is included because category lookups on a COCO
            # object index this key directly.
            'categories': [{'id': i} for i in range(1, self.num_classes)],
            'annotations': [],
        }
        mock_coco.createIndex()
        return mock_coco

# ------------------- MODEL AND OPTIMIZER -------------------
# Build the detector and move it to the selected device. num_classes has to
# match the dataset's label range (COCO default is 91).
model = fasterrcnn_resnet50_fpn(pretrained=True, num_classes=91)
model.to(device)

# Hand only the parameters that still require gradients to the optimizer.
params = list(filter(lambda weight: weight.requires_grad, model.parameters()))
optimizer = torch.optim.SGD(
    params,
    lr=0.005,
    momentum=0.9,
    weight_decay=0.0005,
)

# Optional step schedule: scale the learning rate by gamma every step_size
# calls to lr_scheduler.step().
lr_scheduler = torch.optim.lr_scheduler.StepLR(
    optimizer,
    step_size=3,
    gamma=0.1,
)

# ------------------- DATA LOADERS -------------------
# Two independent synthetic datasets: one for training, one for validation.
train_dataset = FakeDataset(num_samples=100)
val_dataset = FakeDataset(num_samples=50)

# num_workers stays at 0 for simplicity; it can be raised when not on Windows
# or when a __main__ guard is used. Both loaders use the detection-specific
# collate_fn.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=2,
    shuffle=True,
    num_workers=0,
    collate_fn=collate_fn,
)

# Validation runs one image at a time, in a fixed order.
val_loader = DataLoader(
    dataset=val_dataset,
    batch_size=1,
    shuffle=False,
    num_workers=0,
    collate_fn=collate_fn,
)

# ------------------- TRAINING AND VALIDATION LOOP -------------------
num_epochs = 3
print(f"Starting training for {num_epochs} epochs...")

for epoch in range(num_epochs):
    # One full pass over the training loader; progress is logged every 10 iterations.
    engine.train_one_epoch(
        model,
        optimizer,
        train_loader,
        device,
        epoch,
        print_freq=10,
    )

    # Advance the learning-rate schedule once per epoch when one is configured.
    if lr_scheduler is not None:
        lr_scheduler.step()

    # Score the current weights on the validation loader.
    # NOTE(review): presumably engine.evaluate needs a COCO-like API from the
    # dataset, as torchvision's reference engine.py does -- confirm against
    # vision_core.engine.
    engine.evaluate(model, val_loader, device=device)

print("Training finished.")

Using device: cpu
Starting training for 3 epochs...
Epoch: [0]  [ 0/50]  eta: 0:02:59  lr: 0.000107  loss: 4.8147 (4.8147)  loss_classifier: 0.0143 (0.0143)  loss_box_reg: 0.0001 (0.0001)  loss_objectness: 0.2087 (0.2087)  loss_rpn_box_reg: 4.5915 (4.5915)  time: 3.5884  data: 0.0033
Epoch: [0]  [10/50]  eta: 0:02:09  lr: 0.001126  loss: 13.2961 (22.3868)  loss_classifier: 0.0214 (0.0233)  loss_box_reg: 0.0001 (0.0002)  loss_objectness: 0.2087 (0.2198)  loss_rpn_box_reg: 12.9991 (22.1435)  time: 3.2410  data: 0.0022


In [None]:
# Example of making a prediction (after training)
# Switch to inference mode so the model returns detections rather than losses.
model.eval()

# Take the first validation sample; its target is not needed here.
sample_image, _ = val_dataset[0]

# The model takes a list of image tensors; gradients are not tracked for inference.
with torch.no_grad():
    prediction = model([sample_image.to(device)])

print("\nExample prediction:")
print(prediction)