In [1]:
# %pip install torch torchvision opencv-python pillow numpy matplotlib deep-sort-realtime

In [2]:
import cv2
import torch
import torchvision
import numpy as np
from PIL import Image
import torchvision.transforms as T
from deep_sort_realtime.deepsort_tracker import DeepSort
import os

In [3]:
# --- Configuration -----------------------------------------------------------
VIDEO_NAME = 'DSC_2411.MOV'
DATA_DIR = "tracking_rukomet"

# Build the paths with os.path so the notebook also runs outside Windows
# (a hard-coded "\" is just an ordinary filename character on POSIX systems).
video_path = os.path.join(DATA_DIR, VIDEO_NAME)

# splitext() strips whatever extension the video has; a literal
# .replace('.MOV', ...) would leave e.g. "clip.mov" or "clip.mp4" unchanged and
# the predictions file would end up named like the video itself.
output_txt_path = os.path.join(
    DATA_DIR, "predictions", f"{os.path.splitext(VIDEO_NAME)[0]}_deepsort.txt"
)

# --- Detector ----------------------------------------------------------------
# Faster R-CNN (ResNet-50 FPN) with the default pretrained weights, switched to
# evaluation mode because it is only used for inference here.
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(weights="DEFAULT")
model.eval()

# --- Tracker -----------------------------------------------------------------
tracker = DeepSort(max_age=30, n_init=3, nms_max_overlap=1.0)

# --- Input video -------------------------------------------------------------
cap = cv2.VideoCapture(video_path)
# VideoCapture does not raise on a bad path; without this check the processing
# loop would simply be skipped and an empty predictions file would be written.
if not cap.isOpened():
    raise RuntimeError(f"Could not open video: {video_path}")

# --- Output location ---------------------------------------------------------
# Only the directory is created here; the predictions file itself is opened by
# the processing cell.
os.makedirs(os.path.dirname(output_txt_path), exist_ok=True)

  self.model.load_state_dict(torch.load(model_wts_path))


In [4]:
# Frame -> batched tensor
def preprocess(frame):
    """Convert one frame into a tensor with a leading batch dimension.

    The frame is run through torchvision's ``ToTensor`` transform and the
    result gets an extra dimension inserted at position 0.

    Args:
        frame: The image to convert (the caller in this notebook passes a
            PIL image).

    Returns:
        The transformed tensor with one additional leading dimension.
    """
    to_tensor = T.ToTensor()
    image_tensor = to_tensor(frame)
    return torch.unsqueeze(image_tensor, 0)

# Inference helper
def detect(frame, model):
    """Call ``model`` on ``frame`` with gradient tracking turned off.

    Args:
        frame: Input passed straight to the model.
        model: Callable to run (a detection model in this notebook).

    Returns:
        Whatever ``model(frame)`` returns, unchanged.
    """
    with torch.no_grad():
        return model(frame)

In [None]:
# Minimum detector score for a box to be handed to the tracker.
CONFIDENCE_THRESHOLD = 0.5
# Detector label that is kept (the "person" class).
PERSON_LABEL = 1

# Fail loudly instead of silently overwriting the results with an empty file,
# which is what would happen if the video path is wrong or this cell is re-run
# after the capture has already been released.
if not cap.isOpened():
    raise RuntimeError(
        f"Video capture for '{video_path}' is not open; "
        "check the path and re-run the setup cell."
    )

try:
    with open(output_txt_path, "w") as f:
        frame_id = 0

        # Process video frames
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            frame_id += 1  # Frame numbers in the output are 1-based

            # OpenCV delivers BGR; the detector is fed an RGB PIL image
            frame_pil = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

            # Preprocess and detect objects
            frame_tensor = preprocess(frame_pil)
            predictions = detect(frame_tensor, model)

            # Extract detections of the single image in the batch
            boxes = predictions[0]['boxes'].cpu().numpy()
            scores = predictions[0]['scores'].cpu().numpy()
            labels = predictions[0]['labels'].cpu().numpy()

            # Keep confident "person" detections only
            keep = (scores > CONFIDENCE_THRESHOLD) & (labels == PERSON_LABEL)

            # deep-sort-realtime expects each detection as
            # ([left, top, width, height], confidence, detection_class).
            # The detector boxes are corner pairs (x1, y1, x2, y2), so they
            # must be converted; passing the corners directly makes the
            # tracker treat x2/y2 as width/height.
            detections = []
            for box, score in zip(boxes[keep], scores[keep]):
                x1, y1, x2, y2 = map(int, box)
                detections.append(([x1, y1, x2 - x1, y2 - y1], float(score), "person"))

            # Update DeepSORT tracker (it receives the original BGR frame)
            tracks = tracker.update_tracks(detections, frame=frame)

            # Write bounding boxes of confirmed tracks to file
            for track in tracks:
                if not track.is_confirmed():
                    continue
                track_id = track.track_id
                bbox = track.to_tlbr()  # (x1, y1, x2, y2)

                # Convert to (x, y, w, h)
                x, y = int(bbox[0]), int(bbox[1])
                w, h = int(bbox[2] - bbox[0]), int(bbox[3] - bbox[1])

                # Line format: frame_id, track_id, x, y, w, h, 1,-1,-1,-1
                f.write(f"{frame_id},{track_id},{x},{y},{w},{h},1,-1,-1,-1\n")
finally:
    # Release the capture even if a frame fails part-way through the video
    cap.release()

cv2.destroyAllWindows()

print(f"Bounding boxes saved to: {output_txt_path}")