In [None]:
import cv2
import torch
import torch.nn as nn
import torchvision.transforms as transforms
from torchvision.models import inception_v3
import numpy as np
from scipy.spatial.distance import cosine
from ultralytics import YOLO

# Load YOLOv8 model (force CPU usage)
# The detector is created once at module level and reused by
# detect_persons() for every frame.
yolo_model = YOLO("yolov8m.pt")
yolo_model.to("cpu")  # Explicitly move YOLOv8 to CPU

# Define OSNet model using Inception V3
class OSNet(nn.Module):
    """Appearance-embedding network backed by torchvision's Inception V3.

    Despite its name, this class wraps a pretrained Inception V3 rather than
    an OSNet architecture. ``forward`` returns the backbone's own output
    unchanged, so the embedding is whatever the backbone's final layer
    produces (with the stock torchvision head these are the ImageNet class
    scores, not the 2048-d pooled features).
    """

    def __init__(self):
        super().__init__()
        # Pretrained backbone; inputs are expected to be already normalised,
        # hence transform_input=False.
        self.model = inception_v3(pretrained=True, transform_input=False)
        # NOTE(review): this 2048 -> 512 projection is constructed but never
        # applied in forward(). It is kept so the module's attributes and
        # state_dict layout stay unchanged for existing callers/checkpoints.
        self.fc = nn.Linear(2048, 512)

    def forward(self, x):
        """Return the backbone output for a batch of images.

        Args:
            x: Image batch tensor fed straight into the Inception V3 backbone.

        Returns:
            The backbone's main output tensor. When the backbone returns a
            tuple (torchvision's Inception V3 does so in training mode, where
            it also emits auxiliary logits), only the first element -- the
            main output -- is returned so callers always get a single tensor.
        """
        output = self.model(x)
        if isinstance(output, tuple):
            # Drop the auxiliary head's output; only the main branch is used.
            output = output[0]
        return output

# Build the embedding model on the CPU and switch it to inference mode.
device = torch.device("cpu")
osnet = OSNet()
osnet.to(device)
osnet.eval()

# Preprocessing applied to every crop before it reaches the embedding model:
# array -> PIL image -> 299x299 resize -> float tensor -> per-channel
# normalisation with the mean/std values listed below.
transform = transforms.Compose(
    [
        transforms.ToPILImage(),
        transforms.Resize((299, 299)),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)

# Extract features from the image using the OSNet model
def extract_features(img, model):
    """Run ``model`` on one image and return its output as a flat vector.

    Args:
        img: Image accepted by the module-level ``transform`` pipeline.
        model: Callable network that maps a 1-image batch to a tensor.

    Returns:
        A 1-D NumPy array holding the flattened model output.
    """
    batch = transform(img)
    batch = batch.unsqueeze(0)  # add the batch dimension the model expects
    batch = batch.to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        output = model(batch)

    as_numpy = output.cpu().numpy()
    return as_numpy.flatten()

# Compare features using cosine similarity
def match_features(feature1, feature2):
    """Return the cosine similarity of two feature vectors.

    Args:
        feature1: First 1-D feature vector.
        feature2: Second 1-D feature vector.

    Returns:
        ``1 - cosine_distance(feature1, feature2)``. If either vector is all
        zeros the similarity is undefined (division by a zero norm), so 0.0
        is returned instead of propagating NaN or raising.
    """
    vec_a = np.asarray(feature1)
    vec_b = np.asarray(feature2)
    # A zero-norm vector has no direction; treat it as "no similarity".
    if not vec_a.any() or not vec_b.any():
        return 0.0
    return 1 - cosine(vec_a, vec_b)

# Person detection using YOLOv8
def detect_persons(frame):
    """Detect people in a frame with the module-level YOLOv8 model.

    Args:
        frame: Image passed directly to ``yolo_model.predict``.

    Returns:
        A list of ``(x, y, w, h)`` integer tuples, one per detection whose
        class index is 0 (the "person" class in the COCO dataset).
    """
    prediction = yolo_model.predict(frame, conf=0.5, iou=0.5, device="cpu")[0]

    person_boxes = []
    for det in prediction.boxes:
        # Skip everything that is not a person.
        if not (det.cls == 0):
            continue
        left, top, right, bottom = (int(v) for v in det.xyxy[0])
        person_boxes.append((left, top, right - left, bottom - top))
    return person_boxes

# Video processing
def process_video(video_path, similarity_threshold=0.8):
    """Detect people in a video, assign persistent IDs and display the result.

    Every frame is run through ``detect_persons``; each detection is embedded
    with ``extract_features`` and compared against the stored embedding of
    every known track using ``match_features``. The annotated frame is shown
    in a window named "Video"; pressing ``q`` stops playback.

    Args:
        video_path: Path (or other source) accepted by ``cv2.VideoCapture``.
        similarity_threshold: Minimum cosine similarity for a detection to be
            considered the same person as an existing track.
    """
    cap = cv2.VideoCapture(video_path)
    tracker_data = {}  # track id -> {'features': most recent embedding}
    person_id = 0  # last ID that was handed out

    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            frame_h, frame_w = frame.shape[:2]
            # IDs already used in this frame: one person cannot appear twice.
            ids_in_frame = set()

            for x, y, w, h in detect_persons(frame):
                # Clamp the box to the frame so slicing never wraps around or
                # yields an empty crop (which the transform cannot resize).
                x1, y1 = max(x, 0), max(y, 0)
                x2, y2 = min(x + w, frame_w), min(y + h, frame_h)
                if x2 <= x1 or y2 <= y1:
                    continue

                # OpenCV delivers BGR frames, while the preprocessing pipeline
                # and pretrained weights assume RGB channel order.
                person_crop = cv2.cvtColor(frame[y1:y2, x1:x2], cv2.COLOR_BGR2RGB)
                features = extract_features(person_crop, osnet)

                # Pick the most similar free track above the threshold rather
                # than the first one that happens to exceed it.
                matched_id = None
                best_similarity = similarity_threshold
                for track_id, track_data in tracker_data.items():
                    if track_id in ids_in_frame:
                        continue
                    similarity = match_features(features, track_data['features'])
                    print(f"Matching ID {track_id}: Similarity = {similarity}")  # Debugging line
                    if similarity > best_similarity:
                        best_similarity = similarity
                        matched_id = track_id

                if matched_id is None:
                    # No track is close enough: start a new one.
                    person_id += 1
                    matched_id = person_id
                    tracker_data[person_id] = {'features': features}
                else:
                    # Keep the track's appearance up to date.
                    tracker_data[matched_id]['features'] = features
                ids_in_frame.add(matched_id)

                # Draw bounding box and ID on the frame
                cv2.rectangle(frame, (x, y), (x+w, y+h), (0, 255, 0), 2)
                cv2.putText(frame, f'ID: {matched_id}', (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0), 2)

            # Show the frame with bounding boxes and IDs
            cv2.imshow("Video", frame)
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        # Always free the capture and close the window, even on errors.
        cap.release()
        cv2.destroyAllWindows()

# Run on video
# NOTE: this runs as soon as the cell/module is executed, and the path is a
# machine-specific absolute Windows path -- point it at a local video file.
video_path = r"C:\Users\ahmad\Downloads\4586095-hd_1920_1080_30fps.mp4"
process_video(video_path)




0: 384x640 3 persons, 3 cars, 5 traffic lights, 314.1ms
Speed: 3.0ms preprocess, 314.1ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)
Matching ID 1: Distance = 34.05538558959961
Matching ID 1: Distance = 44.083580017089844
Matching ID 2: Distance = 43.37911605834961

0: 384x640 3 persons, 3 cars, 5 traffic lights, 299.1ms
Speed: 3.0ms preprocess, 299.1ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)
Matching ID 1: Distance = 5.323475360870361
Matching ID 2: Distance = 33.99664306640625
Matching ID 3: Distance = 44.64763641357422
Matching ID 1: Distance = 43.31195831298828
Matching ID 2: Distance = 42.842376708984375
Matching ID 3: Distance = 5.604003429412842
Matching ID 4: Distance = 43.94363784790039
Matching ID 1: Distance = 32.45579147338867
Matching ID 2: Distance = 12.060968399047852
Matching ID 3: Distance = 40.968841552734375
Matching ID 4: Distance = 32.37553787231445
Matching ID 5: Distance = 40.381046295166016

0: 384x640 4 persons, 3 

: 

In [1]:
import cv2
import torch
import torch.nn as nn
import torchvision.transforms as transforms
from torchvision import models
import numpy as np
from scipy.spatial.distance import cosine
from ultralytics import YOLO

# YOLOv8 Model for Person Detection
# Created once at module level and reused by detect_persons() for every frame.
yolo_model = YOLO("yolov8m.pt")
yolo_model.to("cpu")  # Explicitly move YOLOv8 to CPU

# ResNet backbone for Re-ID
class ReIDModel(nn.Module):
    """ResNet-50 whose final layer is swapped for a 512-unit linear head."""

    def __init__(self):
        super().__init__()
        backbone = models.resnet50(pretrained=True)
        # The replacement head is constructed fresh here, so unlike the rest
        # of the backbone its weights do not come from the pretrained model.
        head_inputs = backbone.fc.in_features
        backbone.fc = nn.Linear(head_inputs, 512)
        self.model = backbone

    def forward(self, x):
        """Return the backbone output (512 values per image) for batch ``x``."""
        embedding = self.model(x)
        return embedding

# Build the Re-ID model on the CPU and put it in inference mode.
device = torch.device("cpu")
reid_model = ReIDModel().to(device).eval()

# Preprocessing applied to every crop before it reaches the Re-ID model:
# array -> PIL image -> 224x224 resize -> float tensor -> per-channel
# normalisation with the mean/std values listed below.
transform = transforms.Compose(
    [
        transforms.ToPILImage(),
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)

# Extract features from the image using the Re-ID model
def extract_features(img, model):
    """Embed a single image with ``model`` and return a 1-D NumPy vector.

    Args:
        img: Image accepted by the module-level ``transform`` pipeline.
        model: Callable network applied to the resulting 1-image batch.

    Returns:
        The model output moved to the CPU and flattened to one dimension.
    """
    single = transform(img)
    model_input = single.unsqueeze(0).to(device)  # shape gains a batch axis
    with torch.no_grad():  # inference only
        embedding = model(model_input)
    embedding = embedding.cpu()
    return embedding.numpy().flatten()

# Match features using cosine similarity
def match_features(feature1, feature2):
    """Return the cosine similarity between two feature vectors.

    Args:
        feature1: First 1-D feature vector.
        feature2: Second 1-D feature vector.

    Returns:
        One minus SciPy's cosine distance of the two vectors. When either
        vector contains only zeros its norm is zero and the similarity is
        undefined; 0.0 is returned in that case instead of NaN or an error.
    """
    first = np.asarray(feature1)
    second = np.asarray(feature2)
    if not (first.any() and second.any()):
        # Zero-norm input: avoid the division by zero inside cosine().
        return 0.0
    return 1 - cosine(first, second)

# Person detection using YOLOv8
def detect_persons(frame):
    """Return bounding boxes of all people YOLOv8 finds in ``frame``.

    Args:
        frame: Image passed directly to ``yolo_model.predict``.

    Returns:
        List of ``(x, y, w, h)`` integer tuples for detections of class 0,
        which is "person" in the COCO dataset.
    """

    def to_xywh(box):
        # Convert corner coordinates to top-left corner plus size.
        x1, y1, x2, y2 = map(int, box.xyxy[0])
        return (x1, y1, x2 - x1, y2 - y1)

    results = yolo_model.predict(frame, conf=0.5, iou=0.5, device="cpu")
    boxes = results[0].boxes
    return [to_xywh(box) for box in boxes if box.cls == 0]

# Video processing
def process_video(video_path, similarity_threshold=0.8):
    """Track people through a video by appearance and display labelled frames.

    For each frame, people are located with ``detect_persons``, embedded with
    ``extract_features`` (using ``reid_model``) and matched against the stored
    embeddings of known tracks via ``match_features``. Frames are shown in a
    window named "Video"; pressing ``q`` stops playback.

    Args:
        video_path: Path (or other source) accepted by ``cv2.VideoCapture``.
        similarity_threshold: Minimum cosine similarity for a detection to be
            assigned to an existing track instead of starting a new one.
    """
    cap = cv2.VideoCapture(video_path)
    tracker_data = {}  # track id -> {'features': most recent embedding}
    person_id = 0  # last ID that was handed out

    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            frame_h, frame_w = frame.shape[:2]
            # Track IDs already assigned in this frame; an ID is used at most
            # once per frame because one person cannot be in two places.
            ids_in_frame = set()

            for x, y, w, h in detect_persons(frame):
                # Restrict the box to the image so the slice is never empty
                # and negative indices never wrap around.
                x1, y1 = max(x, 0), max(y, 0)
                x2, y2 = min(x + w, frame_w), min(y + h, frame_h)
                if x2 <= x1 or y2 <= y1:
                    continue

                # Frames from OpenCV are BGR; the preprocessing pipeline and
                # pretrained weights assume RGB channel order.
                person_crop = cv2.cvtColor(frame[y1:y2, x1:x2], cv2.COLOR_BGR2RGB)

                # Extract features for the detected person
                features = extract_features(person_crop, reid_model)

                # Choose the best-scoring free track above the threshold, not
                # merely the first one that exceeds it.
                matched_id = None
                best_similarity = similarity_threshold
                for track_id, track_data in tracker_data.items():
                    if track_id in ids_in_frame:
                        continue
                    similarity = match_features(features, track_data['features'])
                    if similarity > best_similarity:
                        best_similarity = similarity
                        matched_id = track_id

                if matched_id is None:
                    # Nothing similar enough: register a new person.
                    person_id += 1
                    matched_id = person_id
                    tracker_data[person_id] = {'features': features}
                else:
                    # Refresh the stored appearance of the matched track.
                    tracker_data[matched_id]['features'] = features
                ids_in_frame.add(matched_id)

                cv2.rectangle(frame, (x, y), (x+w, y+h), (0, 255, 0), 2)
                cv2.putText(frame, f'ID: {matched_id}', (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0), 2)

            cv2.imshow("Video", frame)
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        # Release the capture and close the window even if a frame fails.
        cap.release()
        cv2.destroyAllWindows()

# Run on video
# NOTE: this runs as soon as the cell/module is executed, and the path is a
# machine-specific absolute Windows path -- point it at a local video file.
video_path = r"C:\Users\ahmad\Downloads\4586095-hd_1920_1080_30fps.mp4"
process_video(video_path)




0: 384x640 3 persons, 3 cars, 5 traffic lights, 309.1ms
Speed: 5.0ms preprocess, 309.1ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 3 persons, 3 cars, 5 traffic lights, 297.1ms
Speed: 3.0ms preprocess, 297.1ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 4 persons, 3 cars, 5 traffic lights, 297.1ms
Speed: 3.0ms preprocess, 297.1ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 3 persons, 3 cars, 5 traffic lights, 295.1ms
Speed: 2.0ms preprocess, 295.1ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 4 persons, 2 cars, 5 traffic lights, 297.1ms
Speed: 3.0ms preprocess, 297.1ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 4 persons, 2 cars, 5 traffic lights, 297.1ms
Speed: 1.0ms preprocess, 297.1ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 4 persons, 2 cars, 5 traffic lights, 289.1ms
Speed: 2.0ms 