In [1]:
import cv2
import torch
import torchvision
import numpy as np
import os
from PIL import Image
import torchvision.transforms as T
from norfair import Detection, Tracker

In [2]:
# ========== SETTINGS ==========
# Paths are assembled with os.path so the notebook runs on any OS; a literal
# "\" separator is only a directory separator on Windows (elsewhere
# os.path.dirname() would return "" and os.makedirs("") would raise).
VIDEO_NAME = 'DSC_2411.MOV'
DATA_DIR = "tracking_rukomet"
video_path = os.path.join(DATA_DIR, VIDEO_NAME)
# Swap whatever extension the video has (".MOV", ".mov", ".mp4", ...) for the
# tracker-specific suffix instead of relying on an exact ".MOV" match.
output_txt_path = os.path.join(
    DATA_DIR, "predictions", f"{os.path.splitext(VIDEO_NAME)[0]}_norfair.txt"
)

# ========== LOAD MODEL ==========
# Pretrained torchvision Faster R-CNN, switched to evaluation mode for inference.
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(weights="DEFAULT")
model.eval()

# ========== INITIALIZE TRACKER ==========
tracker = Tracker(distance_function="euclidean", distance_threshold=30)

# ========== VIDEO CAPTURE ==========
cap = cv2.VideoCapture(video_path)
# Make sure the predictions folder exists before anything tries to write to it.
os.makedirs(os.path.dirname(output_txt_path), exist_ok=True)

In [3]:
# ========== PREPROCESS FUNCTION ==========
def preprocess(frame):
    """Convert ``frame`` to a tensor and add a leading axis of size one.

    The frame is passed through a torchvision ``Compose`` pipeline whose only
    step is ``ToTensor``; ``unsqueeze(0)`` then prepends the extra axis.
    """
    pipeline = T.Compose([T.ToTensor()])
    as_tensor = pipeline(frame)
    return as_tensor.unsqueeze(0)

# ========== DETECTION FUNCTION ==========
def detect(frame, model):
    """Call ``model`` on ``frame`` with gradient tracking disabled.

    Returns whatever ``model(frame)`` returns, unchanged.
    """
    with torch.no_grad():
        return model(frame)

In [None]:
# ========== MAIN LOOP ==========
# Runs detection + tracking on every frame read from `cap` and writes one
# MOTChallenge-style row per tracked object:
#   frame, id, bb_left, bb_top, bb_width, bb_height, conf, -1, -1, -1
CONFIDENCE_THRESHOLD = 0.5  # minimum detector score for a box to be tracked
PERSON_LABEL = 1            # detector class id that is kept ("person")

# Fail loudly instead of silently overwriting the output with an empty file
# when the capture is unusable (bad path, or this cell re-run after the video
# was already consumed and released).
if not cap.isOpened():
    raise RuntimeError(
        f"Video capture for {video_path!r} is not open; re-create `cap` before running this cell."
    )

try:
    with open(output_txt_path, "w") as f:
        frame_id = 0

        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            frame_id += 1  # frames are numbered starting at 1

            # OpenCV delivers BGR; convert to an RGB PIL image before tensorising.
            frame_pil = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            frame_tensor = preprocess(frame_pil)

            # Run Faster R-CNN; only one image is passed, so take result [0].
            prediction = detect(frame_tensor, model)[0]
            boxes = prediction['boxes'].detach().cpu().numpy()
            scores = prediction['scores'].detach().cpu().numpy()
            labels = prediction['labels'].detach().cpu().numpy()

            # Keep confident "person" boxes only.
            keep = (scores > CONFIDENCE_THRESHOLD) & (labels == PERSON_LABEL)

            detections = []
            for box, score in zip(boxes[keep], scores[keep]):
                x1, y1, x2, y2 = map(int, box)
                detections.append(
                    Detection(
                        # The tracker follows the box centre ...
                        points=np.array([[(x1 + x2) / 2, (y1 + y2) / 2]]),
                        scores=np.array([score]),
                        # ... and the box size rides along so it can be
                        # recovered per track when writing results.
                        data=(x2 - x1, y2 - y1),
                    )
                )

            # Update Norfair tracker
            tracked_objects = tracker.update(detections=detections)

            # Save tracking results
            for obj in tracked_objects:
                center_x, center_y = obj.estimate[0]
                # Size of the detection most recently matched to THIS track
                # (not whichever box the detector happened to list last).
                w, h = obj.last_detection.data
                # The row format stores the top-left corner, not the centre.
                left = int(center_x - w / 2)
                top = int(center_y - h / 2)

                f.write(f"{frame_id},{obj.id},{left},{top},{w},{h},1,-1,-1,-1\n")
finally:
    # Release resources even if detection or tracking raised mid-video.
    cap.release()
    cv2.destroyAllWindows()

print(f"Bounding boxes saved to: {output_txt_path}")