In [1]:
import cv2
import numpy as np
from ultralytics import YOLO
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Build the YOLO detector from the "yolov8m.pt" weights file.
model = YOLO("yolov8m.pt")
# Keep the detector on the CPU. As the last expression of the cell, the value
# returned by .to() is echoed as the cell output (the long model repr below).
model.to('cpu')

YOLO(
  (model): DetectionModel(
    (model): Sequential(
      (0): Conv(
        (conv): Conv2d(3, 48, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(48, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
        (act): SiLU(inplace=True)
      )
      (1): Conv(
        (conv): Conv2d(48, 96, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(96, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
        (act): SiLU(inplace=True)
      )
      (2): C2f(
        (cv1): Conv(
          (conv): Conv2d(96, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn): BatchNorm2d(96, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
          (act): SiLU(inplace=True)
        )
        (cv2): Conv(
          (conv): Conv2d(192, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn): BatchNorm2d(96, eps=0.001, momentum=0.03, affine=True, track_running_

In [3]:
# Open the input video for frame-by-frame reading.
video_path = r'C:\Users\ahmad\Downloads\3623819-hd_1920_1080_25fp.mp4'
cap = cv2.VideoCapture(video_path)

# cv2.VideoCapture does not raise when the file is missing or unreadable; it
# just reports isOpened() == False. Fail loudly here, otherwise the processing
# loop further down silently never executes.
if not cap.isOpened():
    raise IOError(f"Could not open video file: {video_path}")

In [4]:
# Tracker state and matching thresholds shared by the processing loop.
tracked_people = {}  # Maps person ID -> (center_x, center_y, features, frame_count at last match)
person_id = 1  # Next ID to hand out; IDs start from 1
frame_count = 0  # Number of frames read so far
max_distance = 50  # Max center-to-center distance (in pixels) for spatial matching
max_feature_similarity = 0.8  # Despite the "max_" prefix this is the MINIMUM cosine similarity required for a match

In [5]:
def extract_features(image):
    """Return the mean color of an image crop as a simple feature vector.

    The crop is resized to 50x50 and averaged over both spatial axes, which
    yields one value per channel; the result is divided by 255.

    Args:
        image: Image array (height x width x channels), e.g. a person crop
            sliced out of an OpenCV frame.

    Returns:
        numpy.ndarray holding the per-channel mean divided by 255.

    Raises:
        ValueError: If ``image`` is None or contains no pixels. Without this
            guard an empty crop reaches ``cv2.resize`` and fails with an
            opaque OpenCV assertion error.
    """
    if image is None or image.size == 0:
        raise ValueError("extract_features() received an empty image crop")

    # Resize to 50x50 and calculate the mean color as a proxy for clothing color
    resized = cv2.resize(image, (50, 50))
    mean_color = resized.mean(axis=(0, 1))
    return mean_color / 255  # Normalize color values

In [6]:
# Main processing loop: detect people in each frame, associate every detection
# with at most one track from the previous frame, then draw the labelled boxes.
# The try/finally guarantees the capture and the preview window are released
# even if detection or feature extraction raises part-way through the video.
try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        frame_count += 1
        # Detect people using YOLOv8
        results = model(frame)
        boxes = results[0].boxes
        person_boxes = boxes[boxes.cls == 0]  # Filter for people class (class ID 0)

        current_detections = []
        for box in person_boxes:
            x1, y1, x2, y2 = map(int, box.xyxy[0])  # Bounding box coordinates
            person_crop = frame[y1:y2, x1:x2]  # Crop person region for feature extraction
            if person_crop.size == 0:
                # A box can collapse to zero width/height once its corners are
                # truncated to ints; there is nothing to extract features from.
                continue
            center_x = (x1 + x2) // 2
            center_y = (y1 + y2) // 2
            confidence = float(box.conf[0])  # plain float for formatting

            # Extract features for each detected person
            features = extract_features(person_crop)
            current_detections.append((center_x, center_y, x1, y1, x2, y2, confidence, features))

        # Match current detections with tracked people using position + features
        new_tracked_people = {}
        assignments = []  # (track_id, x1, y1, x2, y2, confidence), used for drawing
        for center_x, center_y, x1, y1, x2, y2, confidence, features in current_detections:
            best_id = None
            best_distance = max_distance  # candidates must be strictly closer than this

            for track_id, (prev_center_x, prev_center_y, prev_features, _last_frame) in tracked_people.items():
                if track_id in new_tracked_people:
                    # Already claimed by another detection in this frame. Re-using
                    # it would overwrite that detection's entry and drop its box.
                    continue

                distance = np.hypot(center_x - prev_center_x, center_y - prev_center_y)
                if distance >= best_distance:
                    continue

                similarity = cosine_similarity([features], [prev_features])[0][0]
                if similarity > max_feature_similarity:
                    # Keep the nearest qualifying track rather than the first one
                    # that happens to come up in dict order.
                    best_id, best_distance = track_id, distance

            if best_id is None:
                # Assign a new ID for untracked person
                best_id = person_id
                person_id += 1

            new_tracked_people[best_id] = (center_x, center_y, features, frame_count)
            assignments.append((best_id, x1, y1, x2, y2, confidence))

        # Update tracked people with new detections
        tracked_people = new_tracked_people

        # Draw tracking results on the frame. Each assignment already carries its
        # own box, so no second lookup by center coordinates is needed.
        for track_id, x1, y1, x2, y2, confidence in assignments:
            cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
            label = f'ID: {track_id} Conf: {confidence:.2f}'
            cv2.putText(frame, label, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

        cv2.imshow('Tracked People', frame)

        # Stop early when 'q' is pressed
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    cap.release()
    cv2.destroyAllWindows()


0: 384x640 (no detections), 2295.5ms
Speed: 128.0ms preprocess, 2295.5ms inference, 36.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 16 persons, 3 handbags, 1009.2ms
Speed: 3.0ms preprocess, 1009.2ms inference, 88.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 16 persons, 1 backpack, 4 handbags, 1 suitcase, 1001.2ms
Speed: 2.0ms preprocess, 1001.2ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 16 persons, 1 backpack, 2 handbags, 995.2ms
Speed: 3.0ms preprocess, 995.2ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 15 persons, 1 backpack, 3 handbags, 1000.2ms
Speed: 2.0ms preprocess, 1000.2ms inference, 0.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 14 persons, 4 handbags, 1001.2ms
Speed: 2.0ms preprocess, 1001.2ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 14 persons, 1 backpack, 2 handbags, 993.2ms
Speed: 2.0ms preprocess, 993.2ms infe

In [1]:
import torch
import os
import cv2
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from ultralytics import YOLO
from torchvision import models

# Load the YOLOv8 model
yolo_model = YOLO("yolov8m.pt")
yolo_model.to('cpu')

# Define the path to the locally stored checkpoint model
checkpoint_path = r'C:\Users\ahmad\.cache\torch\hub\checkpoints\inception_v3_google-0cc3c7bd.pth'

# Build InceptionV3 without fetching weights; they are loaded from the local
# checkpoint below.
# NOTE(review): newer torchvision releases deprecate `pretrained=` in favour of
# `weights=None` - confirm the installed version before switching.
model = models.inception_v3(pretrained=False)  # Do not load the weights automatically

# map_location keeps every tensor on the CPU regardless of the device the
# checkpoint was saved from, matching the CPU-only setup used for YOLO above.
checkpoint = torch.load(checkpoint_path, map_location='cpu')

# Load the checkpoint into the model
model.load_state_dict(checkpoint)

# Set model to evaluation mode
model.eval()

# Initialize video capture
video_path = r'C:\Users\ahmad\Downloads\3623819-hd_1920_1080_25fp.mp4'
cap = cv2.VideoCapture(video_path)

# cv2.VideoCapture does not raise when the file is missing or unreadable; it
# just reports isOpened() == False. Fail loudly so the processing loop does not
# silently do nothing.
if not cap.isOpened():
    raise IOError(f"Could not open video file: {video_path}")

# Initialize variables
tracked_people = {}  # Maps person ID -> (features, frame_count at last match)
person_id = 1  # Next ID to hand out; IDs start from 1
frame_count = 0  # Number of frames read so far
max_distance = 50  # Max distance for spatial matching
max_feature_similarity = 0.8  # Minimum similarity threshold for matching

def extract_features(image):
    """Describe a person crop with the output vector of the module-level ``model``.

    The crop is converted from OpenCV's BGR channel order to RGB, resized to
    299x299, rearranged to channels-first, scaled to [0, 1] and passed through
    ``model`` with gradients disabled.

    Args:
        image: BGR image array (height x width x 3), e.g. a person crop sliced
            out of a frame returned by ``cv2.VideoCapture.read``.

    Returns:
        1-D numpy.ndarray: the flattened model output for this crop.

    Raises:
        ValueError: If ``image`` is None or contains no pixels. Without this
            guard an empty crop reaches ``cv2.resize`` and fails with an
            opaque OpenCV assertion error.
    """
    if image is None or image.size == 0:
        raise ValueError("extract_features() received an empty image crop")

    # OpenCV delivers BGR pixels, while torchvision's ImageNet models are
    # trained on RGB input; swap the channel order before inference.
    rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    resized = cv2.resize(rgb, (299, 299))  # Resize to match InceptionV3 input size
    chw = np.ascontiguousarray(np.transpose(resized, (2, 0, 1)))  # Convert to CHW format

    # NOTE(review): pixels are only scaled to [0, 1]; no mean/std normalization
    # is applied. Confirm against the torchvision InceptionV3 documentation
    # whether this checkpoint expects additional input normalization.
    batch = torch.from_numpy(chw).float().div(255.0).unsqueeze(0)  # Add batch dimension

    # Pass the image through the model
    with torch.no_grad():
        features = model(batch)
    return features.cpu().numpy().flatten()  # Flatten the feature vector for comparison

def get_person_id(features):
    """Find the tracked person whose stored features best match ``features``.

    Every entry of the module-level ``tracked_people`` dict is scored with
    cosine similarity; the first entry with the strictly highest positive
    score wins.

    Returns:
        (best_id, best_score): the winning ID and its similarity, or
        ``(None, 0)`` when nothing is tracked or no score is above zero.
    """
    best_id, best_score = None, 0
    for candidate_id, entry in tracked_people.items():
        stored_features, _ = entry
        score = cosine_similarity([features], [stored_features])[0][0]
        if score > best_score:
            best_id, best_score = candidate_id, score
    return best_id, best_score

# Main processing loop: detect people in each frame, assign every detection an
# ID based on appearance similarity to the previous frame's tracks, then draw
# the labelled boxes. The try/finally guarantees the capture and the preview
# window are released even if detection or feature extraction raises.
try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        frame_count += 1
        # Detect people using YOLOv8
        results = yolo_model(frame)
        boxes = results[0].boxes
        person_boxes = boxes[boxes.cls == 0]  # Filter for people class (class ID 0)

        current_detections = []
        for box in person_boxes:
            x1, y1, x2, y2 = map(int, box.xyxy[0])  # Bounding box coordinates
            person_crop = frame[y1:y2, x1:x2]  # Crop person region for feature extraction
            if person_crop.size == 0:
                # A box can collapse to zero width/height once its corners are
                # truncated to ints; there is nothing to extract features from.
                continue
            center_x = (x1 + x2) // 2
            center_y = (y1 + y2) // 2
            confidence = float(box.conf[0])  # plain float for formatting

            # Extract features for each detected person using the custom model
            features = extract_features(person_crop)
            current_detections.append((center_x, center_y, x1, y1, x2, y2, confidence, features))

        new_tracked_people = {}
        assignments = []  # (track_id, x1, y1, x2, y2, confidence), used for drawing
        for center_x, center_y, x1, y1, x2, y2, confidence, features in current_detections:
            matched_id, similarity = get_person_id(features)

            # Accept the match only if it is strong enough AND the ID has not
            # already been claimed by another detection in this frame. Re-using
            # a claimed ID would overwrite that detection's entry in the dict.
            is_match = (
                matched_id is not None
                and similarity > max_feature_similarity
                and matched_id not in new_tracked_people
            )
            if is_match:
                assigned_id = matched_id
            else:
                # Assign a new ID if no usable match is found
                assigned_id = person_id
                person_id += 1

            new_tracked_people[assigned_id] = (features, frame_count)
            assignments.append((assigned_id, x1, y1, x2, y2, confidence))

        # Update tracked people with new detections
        tracked_people = new_tracked_people

        # Draw tracking results on the frame. Each assignment carries the box it
        # was made for, so IDs are never re-associated by a second similarity
        # search (which could label the wrong box or skip detections).
        for track_id, x1, y1, x2, y2, confidence in assignments:
            cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
            label = f'ID: {track_id} Conf: {confidence:.2f}'
            cv2.putText(frame, label, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

        # Display the resulting frame with tracking
        cv2.imshow('Tracked People', frame)

        # Break the loop if 'q' is pressed
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Release the video capture and destroy all windows
    cap.release()
    cv2.destroyAllWindows()





0: 384x640 (no detections), 382.1ms
Speed: 3.0ms preprocess, 382.1ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 16 persons, 3 handbags, 353.1ms
Speed: 2.0ms preprocess, 353.1ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 16 persons, 1 backpack, 4 handbags, 1 suitcase, 314.1ms
Speed: 3.0ms preprocess, 314.1ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 16 persons, 1 backpack, 2 handbags, 330.1ms
Speed: 2.0ms preprocess, 330.1ms inference, 0.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 15 persons, 1 backpack, 3 handbags, 301.1ms
Speed: 1.0ms preprocess, 301.1ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 14 persons, 4 handbags, 371.1ms
Speed: 2.0ms preprocess, 371.1ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 14 persons, 1 backpack, 2 handbags, 365.1ms
Speed: 2.0ms preprocess, 365.1ms inference, 1.0ms p