In [2]:
import cv2
import torch
import numpy as np
from ultralytics import YOLO
import torchvision.transforms as transforms
import torch.nn.functional as F

In [2]:
# Tuning knobs for person detection and tracking.
# NOTE(review): the tracking loop visible later in this notebook does not read
# either constant; it uses its own values (0.3 for IoU, 0.5 for feature
# distance) — confirm which values are intended.
CONFIDENCE_THRESHOLD = 0.1  # Confidence threshold for detecting persons
IOU_THRESHOLD = 0.1  # IoU threshold for tracking

In [29]:
# Select the CUDA device when PyTorch reports one is available, otherwise the CPU.
# NOTE(review): the model-loading code visible later in this notebook calls
# model.to('cpu') and never reads `device` — confirm whether that is intended.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

Using device: cuda


In [None]:
def compute_iou(box1, box2):
    """Return the intersection-over-union of two axis-aligned boxes.

    Args:
        box1: First box as (x1, y1, x2, y2) corner coordinates.
        box2: Second box in the same (x1, y1, x2, y2) layout.

    Returns:
        Overlap area divided by the combined area of both boxes, or 0 when
        the combined area is not positive.
    """
    ax1, ay1, ax2, ay2 = box1
    bx1, by1, bx2, by2 = box2

    # Width/height of the overlap rectangle; clamped to 0 when the boxes are disjoint.
    overlap_w = max(0, min(ax2, bx2) - max(ax1, bx1))
    overlap_h = max(0, min(ay2, by2) - max(ay1, by1))
    overlap = overlap_w * overlap_h

    # Union = both areas minus the part counted twice.
    combined = (ax2 - ax1) * (ay2 - ay1) + (bx2 - bx1) * (by2 - by1) - overlap

    if combined > 0:
        return overlap / combined
    return 0

# Function to extract a simple feature from the person's bounding box (e.g., color histogram)
def extract_features(frame, bbox):
    """Compute a normalised hue/saturation histogram for a bounding-box crop.

    The box is clamped to the frame before cropping. Without that, a negative
    coordinate would be interpreted by NumPy as "count from the end" and crop
    the wrong region, and a box lying outside the frame would yield an empty
    image that cv2.cvtColor rejects.

    Args:
        frame: BGR image as a NumPy array (height, width, channels).
        bbox: (x1, y1, x2, y2) pixel corners of the region to describe.

    Returns:
        A flattened 256 * 256 histogram over the H and S channels, normalised
        with cv2.normalize's default settings. If the clamped box has no area,
        a float32 zero vector of the same length is returned so callers can
        still compute distances against other feature vectors.
    """
    bins = 256  # bins per channel; kept at 256 so feature vectors stay comparable
    frame_h, frame_w = frame.shape[:2]
    x1, y1, x2, y2 = (int(v) for v in bbox)

    # Clamp every corner into [0, width] / [0, height].
    x1 = min(max(x1, 0), frame_w)
    x2 = min(max(x2, 0), frame_w)
    y1 = min(max(y1, 0), frame_h)
    y2 = min(max(y2, 0), frame_h)

    # Degenerate or fully out-of-frame box: nothing to describe.
    if x2 <= x1 or y2 <= y1:
        return np.zeros(bins * bins, dtype=np.float32)

    person_img = frame[y1:y2, x1:x2]  # Crop the person from the frame
    # Convert to HSV for better color distribution matching
    person_img_hsv = cv2.cvtColor(person_img, cv2.COLOR_BGR2HSV)
    # NOTE(review): OpenCV stores 8-bit hue in 0..179, so hue bins above 179
    # are presumably always empty; the range is left untouched to keep the
    # existing feature distances and thresholds valid.
    hist = cv2.calcHist([person_img_hsv], [0, 1], None, [bins, bins], [0, 256, 0, 256])
    return cv2.normalize(hist, hist).flatten()  # Flatten to a 1D vector

# --- Person detection + simple IoU/appearance tracking on a video file ---
#
# Each frame is run through YOLOv8; every 'person' detection is matched against
# the tracks of the previous frame using box overlap (compute_iou) and colour
# histogram distance (extract_features). Matched detections inherit the track's
# ID, unmatched ones get a fresh ID. Tracks not seen in the current frame are
# dropped.

MODEL_PATH = r'D:\Computer Vision\FYP\TASK 1\env\TrackNet-X\yolov8m.pt'
VIDEO_PATH = r'C:\Users\ahmad\Downloads\853889-hd_1920_1080_25fps.mp4'
PERSON_CLASS_ID = 0            # Class 0 is 'person'
MATCH_IOU_THRESHOLD = 0.3      # Minimum overlap with a previous box to be a match candidate
MATCH_FEATURE_DISTANCE = 0.5   # Maximum histogram distance for a candidate to be accepted

# Load YOLOv8 model
model = YOLO(MODEL_PATH)
model.to('cpu')

# Open video file
cap = cv2.VideoCapture(VIDEO_PATH)

# State carried from one frame to the next; the three lists are parallel.
person_ids = []        # (x1, y1, x2, y2, confidence) of each track in the previous frame
person_track_ids = []  # persistent ID of each of those tracks
person_features = []   # feature vector stored for each of those tracks
unique_id = 0          # next unused track ID

try:
    if not cap.isOpened():
        raise RuntimeError(f'Could not open video: {VIDEO_PATH}')

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        # Ultralytics expects NumPy images in OpenCV's BGR channel order, so the
        # frame is passed as-is (converting to RGB first would swap the channels).
        results = model([frame])  # Pass the frame as a batch of one image
        boxes = results[0].boxes

        xywh = boxes.xywh.tolist()  # centre-x, centre-y, width, height
        conf = boxes.conf.tolist()  # confidence scores
        cls = boxes.cls.tolist()    # class labels

        person_bboxes = []
        for (x, y, w, h), confidence, class_id in zip(xywh, conf, cls):
            if class_id != PERSON_CLASS_ID:
                continue
            x1, y1 = int(x - w / 2), int(y - h / 2)  # Convert to top-left corner
            x2, y2 = int(x1 + w), int(y1 + h)  # Calculate bottom-right corner
            person_bboxes.append((x1, y1, x2, y2, confidence))

        updated_person_ids = []
        updated_person_features = []
        claimed_track_ids = set()  # a previous track may be inherited by one detection only

        for bbox in person_bboxes:
            x1, y1, x2, y2, confidence = bbox
            current_feature = extract_features(frame, (x1, y1, x2, y2))
            matched = False

            # Match the current detection against the previous frame's tracks
            for prev_bbox, prev_track_id, prev_feature in zip(
                    person_ids, person_track_ids, person_features):
                if prev_track_id in claimed_track_ids:
                    continue

                iou = compute_iou((x1, y1, x2, y2), prev_bbox[:4])
                if iou <= MATCH_IOU_THRESHOLD:
                    continue

                feature_distance = np.linalg.norm(current_feature - prev_feature)
                if feature_distance < MATCH_FEATURE_DISTANCE:
                    # Reuse the track's own ID (not its list position) so the
                    # ID stays stable when the list is rebuilt each frame.
                    updated_person_ids.append((bbox, prev_track_id, confidence))
                    updated_person_features.append(prev_feature)
                    claimed_track_ids.add(prev_track_id)
                    matched = True
                    break

            # If no match was found, assign a new ID and feature
            if not matched:
                updated_person_ids.append((bbox, unique_id, confidence))
                updated_person_features.append(current_feature)
                unique_id += 1

        # Carry boxes, IDs and features over to the next frame
        person_ids = [bbox for bbox, _, _ in updated_person_ids]
        person_track_ids = [track_id for _, track_id, _ in updated_person_ids]
        person_features = updated_person_features

        # Draw bounding boxes and person IDs on the frame
        for (bbox, person_id, confidence) in updated_person_ids:
            x1, y1, x2, y2 = bbox[:4]
            cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
            # Keep the label inside the image for boxes touching the top edge.
            cv2.putText(frame, f'ID:{person_id} Conf:{confidence:.2f}', (x1, max(y1 - 10, 15)),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

        # Show the frame with bounding boxes and IDs
        cv2.imshow('Video', frame)

        # Exit if 'q' is pressed
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Release the video capture and close all OpenCV windows, even on error
    cap.release()
    cv2.destroyAllWindows()



0: 384x640 34 persons, 1 handbag, 1027.2ms
Speed: 4.0ms preprocess, 1027.2ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 36 persons, 1004.2ms
Speed: 3.0ms preprocess, 1004.2ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 36 persons, 1 handbag, 1033.2ms
Speed: 3.0ms preprocess, 1033.2ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 34 persons, 1016.2ms
Speed: 4.0ms preprocess, 1016.2ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 35 persons, 1 backpack, 1081.2ms
Speed: 2.0ms preprocess, 1081.2ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 35 persons, 1006.2ms
Speed: 2.0ms preprocess, 1006.2ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 34 persons, 1005.2ms
Speed: 2.0ms preprocess, 1005.2ms inference, 0.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 33 persons, 2 handbags, 1020.2