**OS-NET**

In [None]:
import cv2
import torch
import torch.nn as nn
from ultralytics import YOLO
from scipy.spatial.distance import cosine
import torchvision.transforms as transforms
from torchvision.models import inception_v3

# Load the YOLO detector from a local weights file and place it on the CPU.
yolo_model = YOLO(r"C:\Users\ahmad\Downloads\yolo11l.pt")
yolo_model.to("cpu")

class OSNet(nn.Module):
    """Feature extractor that wraps torchvision's pretrained Inception v3.

    Despite the class name, the backbone constructed here is
    ``inception_v3``. A 2048 -> 512 linear layer is registered as ``fc`` but
    is not applied in ``forward``; the module's output is exactly what the
    wrapped Inception model returns.
    """

    def __init__(self):
        super().__init__()
        # Pretrained backbone, created with ``transform_input`` disabled.
        self.model = inception_v3(pretrained=True, transform_input=False)
        # Registered on the module, but forward() never calls it.
        self.fc = nn.Linear(2048, 512)

    def forward(self, x):
        """Pass ``x`` through the Inception backbone and return its output."""
        return self.model(x)

# All tensors and the feature extractor in this cell live on the CPU.
device = torch.device("cpu")

# Build the feature extractor, move it to the device, and switch to eval mode.
osnet = OSNet().to(device)
osnet.eval()

# Preprocessing applied to every crop before it is fed to the extractor:
# array -> PIL image -> 299x299 resize -> tensor -> per-channel normalization.
transform = transforms.Compose(
    [
        transforms.ToPILImage(),
        transforms.Resize((299, 299)),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)

def extract_features(img, model):
    """Return a flat NumPy feature vector for ``img`` computed by ``model``.

    ``img`` is run through the module-level ``transform``, given a leading
    batch dimension, and moved to ``device``. The forward pass runs with
    gradient tracking disabled, and the result is flattened to 1-D.
    """
    batch = transform(img)
    batch = batch.unsqueeze(0)
    batch = batch.to(device)

    with torch.no_grad():
        output = model(batch)

    return output.cpu().numpy().flatten()

def match_features(feature1, feature2):
    """Return the cosine similarity between two feature vectors.

    SciPy's ``cosine`` computes a distance, so the similarity is obtained by
    subtracting that distance from one.
    """
    distance = cosine(feature1, feature2)
    return 1 - distance

def detect_persons(frame):
    """Run YOLO on ``frame`` and return person boxes as ``(x, y, w, h)`` tuples.

    Prediction runs on the CPU with a confidence threshold of 0.8 and an IoU
    threshold of 0.5. Only boxes whose class index equals 0 are kept (this
    code treats class 0 as "person"). Corner coordinates are converted to
    integers and then to top-left / width / height form.
    """
    prediction = yolo_model.predict(frame, conf=0.8, iou=0.5, device="cpu")

    person_boxes = []
    for candidate in prediction[0].boxes:
        if not candidate.cls == 0:
            continue
        left, top, right, bottom = (int(value) for value in candidate.xyxy[0])
        person_boxes.append((left, top, right - left, bottom - top))
    return person_boxes

def process_video(video_path, similarity_threshold=0.8):
    """Detect people in a video, assign appearance-based IDs, and display them.

    For every frame, persons are detected with ``detect_persons``, each crop
    is embedded with ``extract_features`` and compared against the stored
    feature vector of every known ID using ``match_features``. The detection
    takes the ID with the highest similarity above ``similarity_threshold``;
    if there is none, a new ID is created. Annotated frames are shown in an
    OpenCV window until the video ends or ``q`` is pressed.

    Args:
        video_path: Path of the video file to open with OpenCV.
        similarity_threshold: Minimum cosine similarity (exclusive) for a
            detection to be assigned to an existing ID.
    """
    cap = cv2.VideoCapture(video_path)
    tracker_data = {}
    person_id = 0

    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            # IDs already handed out in this frame: two detections in the
            # same frame must never share an ID.
            ids_in_frame = set()

            for x, y, w, h in detect_persons(frame):
                person_crop = frame[y:y+h, x:x+w]
                if person_crop.size == 0:
                    # A degenerate box yields an empty crop that the
                    # preprocessing pipeline cannot handle.
                    continue

                # OpenCV decodes frames as BGR, while the ImageNet-normalised
                # backbone expects RGB channel order.
                rgb_crop = cv2.cvtColor(person_crop, cv2.COLOR_BGR2RGB)
                features = extract_features(rgb_crop, osnet)

                # Pick the best-scoring free ID rather than the first one
                # that happens to clear the threshold.
                matched_id = None
                best_similarity = similarity_threshold
                for track_id, track_data in tracker_data.items():
                    if track_id in ids_in_frame:
                        continue
                    similarity = match_features(features, track_data['features'])
                    print(f"Matching ID {track_id}: Similarity = {similarity}")
                    if similarity > best_similarity:
                        best_similarity = similarity
                        matched_id = track_id

                if matched_id is None:
                    person_id += 1
                    matched_id = person_id
                    tracker_data[matched_id] = {'features': features}
                else:
                    # Keep the gallery entry current with the latest view.
                    tracker_data[matched_id]['features'] = features
                ids_in_frame.add(matched_id)

                cv2.rectangle(frame, (x, y), (x+w, y+h), (0, 255, 0), 2)
                cv2.putText(frame, f'ID: {matched_id}', (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0), 2)

            cv2.imshow("Video", frame)
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        # Release the capture and close windows even if a frame fails.
        cap.release()
        cv2.destroyAllWindows()

# Run the appearance-based ID pipeline on a local sample video.
video_path = r"C:\Users\ahmad\Downloads\8083128-hd_1920_1080_30fps_2.mp4"
process_video(video_path)


0: 384x640 1 person, 397.1ms
Speed: 2.0ms preprocess, 397.1ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 390.1ms
Speed: 3.0ms preprocess, 390.1ms inference, 0.0ms postprocess per image at shape (1, 3, 384, 640)
Matching ID 1: Similarity = 0.9812989181691499
Matching ID 1: Similarity = 0.7193835984144735

0: 384x640 2 persons, 382.1ms
Speed: 2.0ms preprocess, 382.1ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)
Matching ID 1: Similarity = 0.9560489167428217
Matching ID 1: Similarity = 0.7003166955053087
Matching ID 2: Similarity = 0.9254144548353281

0: 384x640 2 persons, 378.1ms
Speed: 2.0ms preprocess, 378.1ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)
Matching ID 1: Similarity = 0.9775922378778
Matching ID 1: Similarity = 0.684698821465283
Matching ID 2: Similarity = 0.9506132302286836

0: 384x640 2 persons, 388.1ms
Speed: 3.0ms preprocess, 388.1ms inference, 1.0ms postprocess per image at shape (

In [None]:
import cv2
import torch
from scipy.spatial.distance import cosine
from ultralytics import YOLO
import numpy as np
from transformers import SamProcessor, SamModel
import torchvision.transforms as transforms

# Initialize the YOLO detector from the local yolo11l weights file
yolo_model = YOLO(r"C:\Users\ahmad\Downloads\yolo11l.pt")
yolo_model.to("cpu")  # Use CPU for processing

# Load the SAM (Segment Anything Model) processor and model from the
# "facebook/sam-vit-huge" checkpoint
processor = SamProcessor.from_pretrained("facebook/sam-vit-huge")
sam_model = SamModel.from_pretrained("facebook/sam-vit-huge")

# Function to calculate IoU (Intersection over Union)
def calculate_iou(box1, box2):
    """Return the intersection-over-union of two ``(x, y, w, h)`` boxes.

    Each box is given by its top-left corner plus width and height. The
    result is 0 when the union area is not positive.
    """
    ax, ay, aw, ah = box1
    bx, by, bw, bh = box2

    # Edges of the overlap rectangle; right/bottom may fall before left/top
    # when the boxes are disjoint, which the clamping below turns into 0.
    left = max(ax, bx)
    top = max(ay, by)
    right = min(ax + aw, bx + bw)
    bottom = min(ay + ah, by + bh)

    overlap_w = max(0, right - left)
    overlap_h = max(0, bottom - top)
    inter_area = overlap_w * overlap_h

    union_area = aw * ah + bw * bh - inter_area
    if union_area > 0:
        return inter_area / union_area
    return 0

# Function to get features using SAM model (Segment Anything Model)
def extract_features_sam(frame, bbox):
    """Return a SAM-based appearance descriptor for the ``bbox`` region of ``frame``.

    The region is cropped, converted from OpenCV's BGR order to RGB, and run
    through the SAM image encoder. The encoder's embedding map is averaged
    over its spatial dimensions to give one fixed-length vector per crop.

    The SAM model output exposes ``pred_masks`` and ``iou_scores`` but no
    ``logits`` field, so the image embeddings are used as the descriptor
    instead of a segmentation mask.

    Args:
        frame: Image array as read by OpenCV (BGR channel order).
        bbox: ``(x, y, w, h)`` box in pixel coordinates of ``frame``.

    Returns:
        A 1-D NumPy array with one value per embedding channel.
    """
    x, y, w, h = bbox
    person_crop = frame[y:y+h, x:x+w]

    # The processor treats array input as RGB; OpenCV frames are BGR.
    person_crop = cv2.cvtColor(person_crop, cv2.COLOR_BGR2RGB)

    # Convert the cropped image to the tensor format SAM expects
    inputs = processor(images=person_crop, return_tensors="pt")

    # Run only the image encoder; no prompt or mask decoding is needed
    # to obtain an appearance descriptor.
    with torch.no_grad():
        embeddings = sam_model.get_image_embeddings(inputs["pixel_values"])

    # Embeddings are expected as (batch, channels, height, width);
    # average over the two spatial axes and drop the batch axis.
    pooled = embeddings.mean(dim=(2, 3)).squeeze(0)
    return pooled.cpu().numpy().flatten()

# Function to match features using cosine similarity
def match_features(feature1, feature2):
    """Compute cosine similarity as one minus SciPy's cosine distance."""
    cosine_distance = cosine(feature1, feature2)
    similarity = 1 - cosine_distance
    return similarity

# Detect persons using YOLO
def detect_persons(frame):
    """Run YOLO on ``frame`` and return person boxes as ``(x, y, w, h)`` tuples.

    Prediction uses a confidence threshold of 0.8 and an IoU threshold of
    0.5. The device is passed explicitly because ``predict`` selects its own
    device when none is given, so moving the model to the CPU beforehand is
    not enough to keep inference there.

    Args:
        frame: Image array to run detection on.

    Returns:
        A list of ``(x, y, w, h)`` integer tuples, one per detection whose
        class index is 0 (treated as "person" by this code).
    """
    results = yolo_model.predict(frame, conf=0.8, iou=0.5, device="cpu")
    detections = []
    for box in results[0].boxes:
        if box.cls == 0:
            x1, y1, x2, y2 = map(int, box.xyxy[0])
            detections.append((x1, y1, x2 - x1, y2 - y1))
    return detections

# Main function for tracking and re-identification
def track_and_reid(video_path, timeout=5, iou_threshold=0.3):
    """Track people through a video by box overlap and display their IDs.

    For each frame, persons are detected with ``detect_persons`` and each
    detection is associated with the existing track whose last box overlaps
    it most (IoU strictly above ``iou_threshold``). A track can absorb at
    most one detection per frame; unmatched detections start new tracks.
    A SAM descriptor from ``extract_features_sam`` is stored with every
    track but is not used for matching here.

    Args:
        video_path: Path of the video file to open with OpenCV.
        timeout: Seconds after which a track that has not been matched is
            dropped. Measured with OpenCV's tick counter (wall-clock time),
            so slow per-frame processing shortens a track's life in frames.
        iou_threshold: Minimum IoU (exclusive) for a detection to be
            assigned to an existing track.
    """
    cap = cv2.VideoCapture(video_path)
    tracker_data = {}  # Track ID -> {"bbox", "last_seen", "features"}
    next_id = 0  # ID counter for new persons

    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            detections = detect_persons(frame)
            # Tracks still available in this frame; each is removed once it
            # has been matched so two detections cannot share one ID.
            unmatched_ids = set(tracker_data.keys())

            for det in detections:
                x, y, w, h = det
                if w <= 0 or h <= 0:
                    # Degenerate box: nothing to crop or track.
                    continue

                # Find the free track with the highest IoU above the threshold
                best_id = None
                max_iou = iou_threshold
                for track_id, tracker in tracker_data.items():
                    if track_id not in unmatched_ids:
                        continue
                    iou = calculate_iou(tracker["bbox"], det)
                    if iou > max_iou:
                        max_iou = iou
                        best_id = track_id

                features = extract_features_sam(frame, det)

                if best_id is None:
                    # No overlapping track: start a new one
                    best_id = next_id
                    next_id += 1
                else:
                    unmatched_ids.discard(best_id)

                tracker_data[best_id] = {
                    "bbox": det,
                    "last_seen": cv2.getTickCount(),
                    "features": features,
                }

                # Draw tracking information
                cv2.rectangle(frame, (x, y), (x + w, y + h), (0, 255, 0), 2)
                cv2.putText(frame, f"ID: {best_id}", (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

            # Remove tracks that have not been matched within the timeout
            current_time = cv2.getTickCount()
            for track_id in list(tracker_data.keys()):
                if (current_time - tracker_data[track_id]["last_seen"]) / cv2.getTickFrequency() > timeout:
                    del tracker_data[track_id]

            # Show the output frame
            cv2.imshow("YOLO Tracking with SAM", frame)
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        # Release the capture and close windows even if a frame fails.
        cap.release()
        cv2.destroyAllWindows()

# Process the video
# Run the IoU-based tracking pipeline on a local sample video.
video_path = r"C:\Users\ahmad\Downloads\8083128-hd_1920_1080_30fps_2.mp4"
track_and_reid(video_path)
