In [None]:
import cv2
import numpy as np
from ultralytics import YOLO
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Load the YOLOv8 weights from "yolov8m.pt" and move the model to the CPU.
# NOTE(review): presumably the weights are downloaded on first use if the file
# is not present locally — confirm against the installed ultralytics version.
model = YOLO("yolov8m.pt")
model.to('cpu')

In [None]:
# Input video for the tracker. The path is machine-specific; edit it to point
# at a local file before running the notebook.
video_path = r'C:\Users\ahmad\Downloads\3623819-hd_1920_1080_25fps.mp4'
cap = cv2.VideoCapture(video_path)

# VideoCapture does not raise when the file is missing or cannot be decoded;
# it only reports failure through isOpened(). Fail fast here so a bad path is
# not mistaken for "the video had no frames" further down.
if not cap.isOpened():
    raise IOError(f"Could not open video file: {video_path}")

In [None]:
# Tracker state and matching thresholds shared with the tracking loop.
tracked_people = {}  # track ID -> (center_x, center_y, features, frame number of last match)
person_id = 1  # Next ID to hand out to an unmatched detection; IDs start at 1
frame_count = 0  # Incremented once for every frame read from the video
max_distance = 50  # Max center-to-center distance (in pixels) for spatial matching
max_feature_similarity = 0.8  # Despite the name, the MINIMUM cosine similarity required for a match

In [None]:
def extract_features(image):
    """Return the mean per-channel color of ``image`` as a feature vector.

    The image is resized to 50x50 and averaged over both spatial axes, then
    divided by 255 (which normalizes to [0, 1] assuming 8-bit channels). The
    result is used as a crude proxy for clothing color.

    Args:
        image: Image array of shape (H, W) or (H, W, C).

    Returns:
        Array with one value per channel. For an empty image (zero height or
        width) an all-zero array of the same per-channel shape is returned
        instead of letting ``cv2.resize`` raise.
    """
    # A degenerate bounding box yields a zero-area crop, which cv2.resize
    # rejects. Return a neutral all-zero feature with the matching shape.
    if image.size == 0:
        return np.zeros(image.shape[2:], dtype=np.float64)

    # Resize to 50x50 and calculate the mean color as a proxy for clothing color
    resized = cv2.resize(image, (50, 50))
    mean_color = resized.mean(axis=(0, 1))
    return mean_color / 255  # Normalize color values

In [None]:
# Main tracking loop.
#
# For every frame: detect people with YOLO, describe each detection by its box
# center and a color feature, match detections to the tracks of the previous
# frame, then draw the boxes with their IDs.
#
# A track that is not matched in the current frame is dropped immediately; the
# frame number stored with each track is recorded but not used for aging.
#
# The try/finally guarantees the capture and the preview window are released
# even when something inside the loop raises.
try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        frame_count += 1
        frame_h, frame_w = frame.shape[:2]

        # Detect people using YOLOv8
        results = model(frame)
        boxes = results[0].boxes
        person_boxes = boxes[boxes.cls == 0]  # Filter for people class (class ID 0)

        current_detections = []
        for box in person_boxes:
            x1, y1, x2, y2 = map(int, box.xyxy[0])  # Bounding box coordinates

            # Clamp to the frame so a negative coordinate cannot wrap around
            # when slicing, and skip boxes that collapse to zero area (the
            # crop would be empty and cannot be resized).
            x1, x2 = max(0, x1), min(frame_w, x2)
            y1, y2 = max(0, y1), min(frame_h, y2)
            if x2 <= x1 or y2 <= y1:
                continue

            person_crop = frame[y1:y2, x1:x2]  # Crop person region for feature extraction
            center_x = (x1 + x2) // 2
            center_y = (y1 + y2) // 2
            confidence = float(box.conf[0])  # Plain float instead of a 0-dim tensor

            # Extract features for each detected person
            features = extract_features(person_crop)
            current_detections.append((center_x, center_y, x1, y1, x2, y2, confidence, features))

        # Match current detections with tracked people using position and features.
        new_tracked_people = {}
        assignments = []  # (track ID, detection) pairs for this frame, used for drawing
        for detection in current_detections:
            center_x, center_y, x1, y1, x2, y2, confidence, features = detection

            best_id = None
            best_distance = max_distance  # Candidates must be strictly closer than this
            for track_id, data in tracked_people.items():
                # Each existing track may be claimed by at most one detection
                # per frame; otherwise a later detection would overwrite an
                # earlier one and that person would silently disappear.
                if track_id in new_tracked_people:
                    continue

                prev_center_x, prev_center_y, prev_features, _last_frame = data

                # Cheap spatial check first; only nearby tracks pay for the
                # feature comparison.
                distance = np.hypot(center_x - prev_center_x, center_y - prev_center_y)
                if distance >= best_distance:
                    continue

                similarity = cosine_similarity([features], [prev_features])[0][0]
                if similarity > max_feature_similarity:
                    # Keep the nearest qualifying track, not merely the first.
                    best_id, best_distance = track_id, distance

            if best_id is None:
                # Assign a new ID for untracked person
                best_id = person_id
                person_id += 1

            new_tracked_people[best_id] = (center_x, center_y, features, frame_count)
            assignments.append((best_id, detection))

        # Update tracked people with new detections
        tracked_people = new_tracked_people

        # Draw tracking results on the frame, one box per detection.
        for track_id, (_cx, _cy, x1, y1, x2, y2, confidence, _features) in assignments:
            cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
            label = f'ID: {track_id} Conf: {confidence:.2f}'
            # Keep the label inside the image for boxes touching the top edge.
            label_y = max(y1 - 10, 15)
            cv2.putText(frame, label, (x1, label_y), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

        cv2.imshow('Tracked People', frame)

        # Press 'q' in the preview window to stop early.
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    cap.release()
    cv2.destroyAllWindows()


0: 384x640 (no detections), 1031.8ms
Speed: 4.0ms preprocess, 1031.8ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 14 persons, 3 handbags, 1012.3ms
Speed: 4.0ms preprocess, 1012.3ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 15 persons, 1 backpack, 4 handbags, 1 suitcase, 1023.3ms
Speed: 3.0ms preprocess, 1023.3ms inference, 0.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 16 persons, 1 backpack, 2 handbags, 1009.8ms
Speed: 2.0ms preprocess, 1009.8ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 15 persons, 1 backpack, 3 handbags, 1009.3ms
Speed: 2.0ms preprocess, 1009.3ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 14 persons, 4 handbags, 1011.3ms
Speed: 2.0ms preprocess, 1011.3ms inference, 0.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 14 persons, 1 backpack, 2 handbags, 1001.3ms
Speed: 2.0ms preprocess, 1001.3ms infe