In [1]:
# %pip install torch torchvision opencv-python pillow numpy matplotlib deep-sort-realtime

In [2]:
import os

import cv2
import matplotlib.pyplot as plt
import numpy as np
import torch
import torchvision
import torchvision.models.detection
import torchvision.transforms as T
from deep_sort_realtime.deepsort_tracker import DeepSort
from PIL import Image

In [3]:
VIDEO_NAME = 'DSC_2411.mp4'
DEFAULT_FPS = 30.0  # used only when the container does not report a frame rate

# Build the paths with os.path.join so they resolve on every OS; hard-coded
# backslash separators only work on Windows.
video_path = os.path.join("tracking_rukomet", VIDEO_NAME)
output_dir = os.path.join("tracking_rukomet", "output_deepsort")
output_path = os.path.join(output_dir, VIDEO_NAME)

# Load the Faster R-CNN detector with its default pretrained weights and put
# it in evaluation mode.
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(weights="DEFAULT")
model.eval()

# Initialize DeepSORT tracker
tracker = DeepSort(max_age=30, n_init=3, nms_max_overlap=1.0)

# Open the input video and fail loudly if it cannot be read, instead of
# silently producing an empty output file later on.
cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
    raise FileNotFoundError(f"Could not open input video: {video_path}")

# Get video properties (named constants instead of the magic indices 3/4/5)
frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
# Keep the frame rate as a float: truncating e.g. 29.97 to 29 makes the output
# video play at the wrong speed / duration.
fps = cap.get(cv2.CAP_PROP_FPS)
if not fps or fps <= 0:
    fps = DEFAULT_FPS

# VideoWriter does not create missing folders, so make sure the target
# directory exists before opening the writer.
os.makedirs(output_dir, exist_ok=True)

# Define output video writer ('mp4v' codec for the MP4 container)
out = cv2.VideoWriter(output_path, cv2.VideoWriter_fourcc(*'mp4v'), fps, (frame_width, frame_height))
if not out.isOpened():
    cap.release()
    raise RuntimeError(f"Could not open output video for writing: {output_path}")


  self.model.load_state_dict(torch.load(model_wts_path))


In [4]:
# Preprocessing function
def preprocess(frame):
    """Turn a single frame into a batched tensor for the detector.

    Applies torchvision's ``ToTensor`` transform to ``frame`` and then inserts
    a leading batch dimension of size 1.

    Args:
        frame: Image accepted by ``T.ToTensor`` (the caller in this notebook
            passes a PIL image).

    Returns:
        The transformed tensor with an extra dimension at position 0.
    """
    to_tensor = T.ToTensor()
    image_tensor = to_tensor(frame)
    return image_tensor.unsqueeze(0)

# Detection function
def detect(frame, model):
    """Run ``model`` on ``frame`` with gradient tracking disabled.

    Args:
        frame: Input passed straight to ``model``.
        model: Callable invoked as ``model(frame)``.

    Returns:
        Whatever ``model(frame)`` returns.
    """
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        return model(frame)

In [5]:
# Process video frames: detect people, update the tracker, draw the confirmed
# tracks and append the annotated frame to the output video.
PERSON_LABEL = 1            # label id kept by the filter below ("person")
CONFIDENCE_THRESHOLD = 0.5  # detections with score <= this value are dropped
BOX_COLOR = (0, 255, 0)     # green in OpenCV's BGR channel order

try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        # The detector is fed an RGB PIL image; OpenCV decodes frames as BGR.
        frame_pil = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

        # Preprocess and detect objects
        frame_tensor = preprocess(frame_pil)
        predictions = detect(frame_tensor, model)

        # Extract detections for the single image in the batch
        result = predictions[0]
        boxes = result['boxes'].cpu().numpy()
        scores = result['scores'].cpu().numpy()
        labels = result['labels'].cpu().numpy()

        # Keep only confident "person" detections.
        keep = (scores > CONFIDENCE_THRESHOLD) & (labels == PERSON_LABEL)

        detections = []
        for box, score in zip(boxes[keep], scores[keep]):
            x1, y1, x2, y2 = map(int, box)
            # deep_sort_realtime expects ([left, top, width, height], confidence, class),
            # not corner coordinates, so convert x2/y2 into width/height.
            detections.append(([x1, y1, x2 - x1, y2 - y1], float(score), "person"))

        # Update DeepSORT tracker (the BGR frame is used for appearance embedding)
        tracks = tracker.update_tracks(detections, frame=frame)

        # Draw bounding boxes & track IDs for confirmed tracks only
        for track in tracks:
            if not track.is_confirmed():
                continue
            track_id = track.track_id
            left, top, right, bottom = (int(v) for v in track.to_tlbr())  # (x1, y1, x2, y2)

            # Draw bounding box
            cv2.rectangle(frame, (left, top), (right, bottom), BOX_COLOR, 2)

            # Draw ID label slightly above the box
            cv2.putText(frame, f"ID: {track_id}", (left, top - 10),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, BOX_COLOR, 2)

        # Write frame to output video
        out.write(frame)
finally:
    # Release resources even if a frame fails mid-video, so the output file
    # is finalized and the input handle is closed.
    cap.release()
    out.release()
    cv2.destroyAllWindows()