In [None]:
import torch
from torchvision.models.detection import fasterrcnn_resnet50_fpn
from torchvision.transforms import functional as F
import cv2
import matplotlib.pyplot as plt
import numpy as np


In [None]:
# Build torchvision's Faster R-CNN (ResNet-50 + FPN backbone) with COCO-pretrained
# weights and put it straight into inference mode; nn.Module.eval() returns the
# module itself, so the two steps can be chained.
# NOTE(review): recent torchvision releases deprecate `pretrained=` in favour of
# `weights=` — confirm against the installed version before migrating.
model = fasterrcnn_resnet50_fpn(pretrained=True).eval()

# Display names for the two COCO category ids this notebook cares about
# (1 -> person, 37 -> sports ball); ids missing from this map have no name here.
COCO_CLASSES = {
    1: "person",
    37: "sports ball",
}


In [None]:
# Path to the video.
# NOTE(review): cv2.VideoCapture decodes local files and direct stream URLs; a
# YouTube watch-page URL like this one is presumably not decodable as-is. If the
# open check below fails, download the clip and point `video_path` at the file.
video_path = 'https://www.youtube.com/watch?v=vUnuDTVHwGE&t=1s'

# Keep one frame out of every FRAME_STEP so inference time and memory stay bounded.
FRAME_STEP = 30

# Open the video and fail early with a clear message instead of an IndexError later.
cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
    cap.release()
    raise RuntimeError(f"Could not open video source: {video_path}")

frames = []         # sampled frames, converted to RGB for matplotlib / the model
frame_indices = []  # index of each sampled frame within the source video

frame_id = 0
try:
    frame_rate = cap.get(cv2.CAP_PROP_FPS)
    while True:
        ret, frame = cap.read()
        if not ret:
            # End of stream (or a decode failure): stop reading.
            break
        if frame_id % FRAME_STEP == 0:
            # OpenCV decodes to BGR; store RGB.
            frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            frame_indices.append(frame_id)
        frame_id += 1
finally:
    # Always free the capture handle, even if decoding raised.
    cap.release()

if not frames:
    raise RuntimeError(f"No frames could be read from video source: {video_path}")

# Show one example frame
plt.imshow(frames[0])
plt.axis("off")
plt.title(f"Example Frame (Frame {frame_indices[0]})")
plt.show()


In [None]:
# Function to run Faster R-CNN on a single frame
def detect_objects(model, frame, threshold=0.5):
    """Run a torchvision-style detection model on one frame and keep confident hits.

    Args:
        model: Detection model. It is called with a list holding one image tensor
            and must return a list whose first element is a dict with "boxes",
            "labels" and "scores" tensors.
        frame: Image accepted by ``F.to_tensor`` (the notebook passes RGB
            ``numpy`` arrays).
        threshold: Minimum score (inclusive) a detection needs to be kept.

    Returns:
        A ``(boxes, labels, scores)`` tuple of equally long lists: ``boxes`` holds
        one ``numpy`` array of four coordinates per detection, ``labels`` holds
        ints and ``scores`` holds floats.
    """
    # Run on whatever device the model lives on; models without parameters
    # (or plain callables) fall back to the CPU.
    try:
        device = next(model.parameters()).device
    except (AttributeError, StopIteration):
        device = torch.device("cpu")

    image = F.to_tensor(frame).to(device)

    # Inference only: no autograd bookkeeping needed.
    with torch.no_grad():
        detections = model([image])[0]

    # One boolean mask filters all three tensors consistently.
    keep = detections["scores"] >= threshold

    # Move results to the CPU before converting, so GPU models work too.
    boxes = list(detections["boxes"][keep].cpu().numpy())
    labels = detections["labels"][keep].cpu().tolist()
    scores = detections["scores"][keep].cpu().tolist()

    return boxes, labels, scores


In [None]:
# Visualization function
def visualize_detections(frame, boxes, labels, scores, class_map):
    """Draw every detection's box and caption onto ``frame`` and return it.

    Args:
        frame: Image to draw on. It is modified in place, so pass a copy if the
            original must stay untouched.
        boxes: Iterable of ``(x1, y1, x2, y2)`` boxes; coordinates are truncated
            to ints before drawing.
        labels: Class id of each box.
        scores: Confidence of each box, shown with two decimals.
        class_map: Mapping from class id to display name; ids that are missing
            are captioned "Unknown".

    Returns:
        The same ``frame`` object, after drawing.
    """
    color = (0, 255, 0)
    # Lowest allowed caption baseline: keeps the text inside the image for
    # boxes that touch (or sit close to) the top edge.
    min_text_y = 15

    for box, label, score in zip(boxes, labels, scores):
        x1, y1, x2, y2 = map(int, box)
        label_name = class_map.get(label, "Unknown")
        caption = f"{label_name} {score:.2f}"

        cv2.rectangle(frame, (x1, y1), (x2, y2), color, 2)

        # Prefer placing the caption just above the box; clamp it when that
        # position would fall outside the top of the image.
        text_y = max(y1 - 10, min_text_y)
        cv2.putText(frame, caption, (x1, text_y),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)
    return frame


In [None]:
# Detect on the first sampled frame.
sample_frame = frames[0]
boxes, labels, scores = detect_objects(model, sample_frame, threshold=0.5)

# Draw onto a copy so `sample_frame` itself stays free of annotations.
output_frame = visualize_detections(
    sample_frame.copy(), boxes, labels, scores, COCO_CLASSES
)

# Show the annotated frame.
fig, ax = plt.subplots()
ax.imshow(output_frame)
ax.axis("off")
ax.set_title("Detections")
plt.show()


In [None]:

# Annotate every sampled frame and write the result to a video file.
if not frames:
    raise RuntimeError("No frames available to annotate; load the video first.")

output_frames = []
for frame in frames:
    boxes, labels, scores = detect_objects(model, frame, threshold=0.5)
    # Draw on a copy so the RGB frames in `frames` stay clean.
    output_frame = visualize_detections(frame.copy(), boxes, labels, scores, COCO_CLASSES)
    output_frames.append(cv2.cvtColor(output_frame, cv2.COLOR_RGB2BGR))  # VideoWriter expects BGR

# Only every Nth source frame was kept, so writing at the source FPS would play
# the result N times too fast. Recover N from the recorded frame indices and
# scale the frame rate so the output keeps the source's real-time pace.
sampling_step = frame_indices[1] - frame_indices[0] if len(frame_indices) > 1 else 1
# Some sources report 0 (unknown) FPS; fall back to a common default in that case.
source_fps = frame_rate if frame_rate and frame_rate > 0 else 30.0
output_fps = source_fps / sampling_step

output_video_path = 'output_video.avi'
fourcc = cv2.VideoWriter_fourcc(*'XVID')
height, width = output_frames[0].shape[:2]
out = cv2.VideoWriter(output_video_path, fourcc, output_fps, (width, height))
if not out.isOpened():
    out.release()
    raise RuntimeError(f"Could not open video writer for {output_video_path}")

try:
    for frame in output_frames:
        out.write(frame)
finally:
    # Release even if a write fails, so the file handle is closed.
    out.release()
print(f"Output video saved at {output_video_path}")


## Faster R-CNN Workflow

1. **Region Proposal Network (RPN)**:
   - Proposes regions where objects might be located.
2. **Feature Pyramid Network (FPN)**:
   - Extracts features at different scales for accurate detection.
3. **RoI Pooling**:
   - Converts each region proposal into a fixed-size feature map for the classification and box-regression heads (torchvision's implementation uses RoIAlign for this step).
4. **Classification and Bounding Box Regression**:
   - Classifies objects and refines bounding boxes.

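This notebook visualizes only the final output of this pipeline: the bounding boxes, class labels, and confidence scores drawn on each sampled frame. The intermediate stages listed above are not shown individually.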
