In [39]:
import cv2
import torch
import torch.nn as nn
from ultralytics import YOLO
from scipy.spatial.distance import cosine
import torchvision.transforms as transforms
from torchvision.models import inception_v3

yolo_model = YOLO(r"C:\Users\ahmad\Downloads\yolo11l.pt")
yolo_model.to("cpu")

class OSNet(nn.Module):
    def __init__(self):
        super(OSNet, self).__init__()
        self.model = inception_v3(pretrained=True, transform_input=False)
        self.fc = nn.Linear(2048, 512) 

    def forward(self, x):
        features = self.model(x)
        return features 

device = torch.device("cpu")
osnet = OSNet().to(device)
osnet.eval()

transform = transforms.Compose([transforms.ToPILImage(),transforms.Resize((299, 299)), transforms.ToTensor(),transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),])

def extract_features(img, model):
    img_tensor = transform(img).unsqueeze(0).to(device)
    with torch.no_grad():
        features = model(img_tensor)
    return features.cpu().numpy().flatten()

def match_features(feature1, feature2):
    return 1 - cosine(feature1, feature2)

def detect_persons(frame):
    results = yolo_model.predict(frame, conf=0.7, iou=0.5, device="cpu") 
    detections = []
    for box in results[0].boxes:  
        if box.cls == 0:
            x1, y1, x2, y2 = map(int, box.xyxy[0]) 
            detections.append((x1, y1, x2 - x1, y2 - y1))
    return detections

def process_video(video_path):
    cap = cv2.VideoCapture(video_path)
    tracker_data = {}
    person_id = 0 

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        
        detected_boxes = detect_persons(frame)

        for bbox in detected_boxes:
            x, y, w, h = bbox
            person_crop = frame[y:y+h, x:x+w]

            features = extract_features(person_crop, osnet)

            matched_id = None
            for track_id, track_data in tracker_data.items():
                similarity = match_features(features, track_data['features'])
                print(f"Matching ID {track_id}: Similarity = {similarity}") 
                if similarity > 0.65:
                    matched_id = track_id
                    tracker_data[track_id]['features'] = features
                    break

            if matched_id is None:
                person_id += 1
                matched_id = person_id
                tracker_data[person_id] = {'features': features}

            cv2.rectangle(frame, (x, y), (x+w, y+h), (0, 255, 0), 2)
            cv2.putText(frame, f'ID: {matched_id}', (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0), 2)

        cv2.imshow("Video", frame)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

    cap.release()
    cv2.destroyAllWindows()

video_path = r"C:\Users\ahmad\Downloads\video1.mp4"
process_video(video_path)


0: 480x640 3 persons, 1 car, 542.1ms
Speed: 2.0ms preprocess, 542.1ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)
Matching ID 1: Similarity = 0.6465841147008542
Matching ID 1: Similarity = 0.7495136627018525

0: 480x640 3 persons, 1 car, 550.1ms
Speed: 3.0ms preprocess, 550.1ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)
Matching ID 1: Similarity = 0.7687685572419942
Matching ID 1: Similarity = 0.7128140462458415
Matching ID 1: Similarity = 0.5351544862996493
Matching ID 2: Similarity = 0.5949777320317653

0: 480x640 3 persons, 1 car, 553.2ms
Speed: 3.0ms preprocess, 553.2ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)
Matching ID 1: Similarity = 0.6753133258045403
Matching ID 1: Similarity = 0.6967765772381191
Matching ID 1: Similarity = 0.7209458115616997

0: 480x640 3 persons, 1 car, 508.1ms
Speed: 2.0ms preprocess, 508.1ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)
Matching ID 1: Similarity = 0.577