# In [None]:
import torch
import torchvision.transforms as transforms
import torchvision.models as models
import cv2
import numpy as np

# Detect objects in every frame of a video with a pre-trained Faster R-CNN
# and write an annotated copy (bounding boxes + centroids) to a new file.

# Run on the GPU when one is available; fall back to the CPU otherwise.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the pre-trained Faster R-CNN model with ResNet-50 backbone.
# NOTE: newer torchvision releases prefer the `weights=` argument;
# `pretrained=True` is kept so older installs keep working.
model = models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
model.to(device)
model.eval()

# Define the transformation pipeline to preprocess frames
# (HxWxC uint8 array -> CxHxW float tensor scaled to [0, 1]).
transform = transforms.Compose([
    transforms.ToTensor()
])

# Input and output video file paths
input_file = 'ball.mp4'
output_file = 'output_video.mp4'

# Minimum confidence score a detection needs in order to be drawn.
threshold = 0.5

# Open the input video; the writer is created once its properties are known.
cap = cv2.VideoCapture(input_file)
out = None

try:
    # VideoCapture does not raise on a missing/unreadable file, so check
    # explicitly instead of silently producing an empty output video.
    if not cap.isOpened():
        raise OSError(f"Could not open input video: {input_file}")

    # Get frame rate, width, and height of the input video
    fps = cap.get(cv2.CAP_PROP_FPS)
    frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

    # Initialize the video writer for the output file
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(output_file, fourcc, fps, (frame_width, frame_height))

    # VideoWriter also fails silently (e.g. codec not available).
    if not out.isOpened():
        raise OSError(f"Could not open output video for writing: {output_file}")

    # Process each frame in the video
    while cap.isOpened():
        # Read a frame from the video
        ret, frame = cap.read()

        # If frame was not successfully read, exit the loop
        if not ret:
            break

        # OpenCV decodes frames as BGR, while the pre-trained detector was
        # trained on RGB images. Convert a copy for inference only; drawing
        # and writing below keep using the original BGR frame.
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        # Preprocess the frame: tensor conversion plus a leading batch dim
        frame_tensor = transform(rgb_frame).unsqueeze(0).to(device)

        # Forward pass through the model and get predictions
        with torch.no_grad():
            predictions = model(frame_tensor)

        # Extract boxes & scores from the predictions
        boxes = predictions[0]['boxes']
        scores = predictions[0]['scores']

        # Filter low-confidence detections based on the threshold
        boxes = boxes[scores > threshold]

        # Draw bounding boxes and centroids on the frame. The boxes are moved
        # to the CPU and converted to integers once per frame, not per box.
        for box in boxes.cpu().numpy().astype(np.int32):
            # Plain Python ints are accepted by every OpenCV build, whereas
            # NumPy scalar types are rejected by some versions.
            x1, y1, x2, y2 = box.tolist()

            # Draw a rectangle around the detected object
            cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)

            # Calculate the centroid coordinates of the box
            centroid_x = (x1 + x2) // 2
            centroid_y = (y1 + y2) // 2

            # Draw a filled circle at the centroid to represent it
            cv2.circle(frame, (centroid_x, centroid_y), 3, (0, 255, 0), -1)

        # Write the annotated frame to the output video
        out.write(frame)
finally:
    # Free used resources even if processing failed part-way, so the output
    # file is finalized and the input handle is not leaked.
    cap.release()
    if out is not None:
        out.release()

# Close any open windows from the code
cv2.destroyAllWindows()
