In [None]:
import cv2
import torch
import torch.nn as nn
import torchvision.transforms as transforms
from torchvision.models import inception_v3
import numpy as np
from scipy.spatial.distance import cosine
from ultralytics import YOLO

# Load YOLOv8 model (force CPU usage)
yolo_model = YOLO("yolov8m.pt")
yolo_model.to("cpu")  # Explicitly move YOLOv8 to CPU

# Define OSNet model using Inception V3
class OSNet(nn.Module):
    def __init__(self):
        super(OSNet, self).__init__()
        self.model = inception_v3(pretrained=True, transform_input=False)  # Inception V3 model
        self.fc = nn.Linear(2048, 512)  # Output layer (2048 is the final feature size from Inception V3)

    def forward(self, x):
        # Inception V3 uses a specific resize (299x299) and has auxiliary outputs which we ignore here
        features = self.model(x)
        return features  # Directly return the raw feature output

# Initialize OSNet model
device = torch.device("cpu")  # Force usage of CPU
osnet = OSNet().to(device)
osnet.eval()

# Transformation pipeline for Inception V3 input size
transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((299, 299)),  # Resize to 299x299 for Inception V3
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),  # Standard normalization
])

# Extract features from the image using the OSNet model
def extract_features(img, model):
    img_tensor = transform(img).unsqueeze(0).to(device)  # Transform and add batch dimension
    with torch.no_grad():
        features = model(img_tensor)  # Extract features from the model
    return features.cpu().numpy().flatten()  # Ensure 1D output

# Compare features using cosine similarity
def match_features(feature1, feature2):
    return 1 - cosine(feature1, feature2)

# Person detection using YOLOv8
def detect_persons(frame):
    results = yolo_model.predict(frame, conf=0.5, iou=0.5, device="cpu")  # Force YOLO inference on CPU
    detections = []
    for box in results[0].boxes:  # Loop through detected objects
        if box.cls == 0:  # Class 0 corresponds to "person" in COCO dataset
            x1, y1, x2, y2 = map(int, box.xyxy[0])  # Convert coordinates to integers
            detections.append((x1, y1, x2 - x1, y2 - y1))  # Format as (x, y, w, h)
    return detections

# Video processing
def process_video(video_path):
    cap = cv2.VideoCapture(video_path)
    tracker_data = {}  # Store tracked persons' features
    person_id = 0  # ID counter for new persons

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        
        detected_boxes = detect_persons(frame)

        for bbox in detected_boxes:
            x, y, w, h = bbox
            person_crop = frame[y:y+h, x:x+w]

            features = extract_features(person_crop, osnet)

            matched_id = None
            # Compare features with existing trackers
            for track_id, track_data in tracker_data.items():
                similarity = match_features(features, track_data['features'])
                print(f"Matching ID {track_id}: Similarity = {similarity}")  # Debugging line
                if similarity > 0.8:
                    matched_id = track_id
                    tracker_data[track_id]['features'] = features  # Update features
                    break

            if matched_id is None:
                person_id += 1
                matched_id = person_id
                tracker_data[person_id] = {'features': features}

            cv2.rectangle(frame, (x, y), (x+w, y+h), (0, 255, 0), 2)
            cv2.putText(frame, f'ID: {matched_id}', (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0), 2)

        cv2.imshow("Video", frame)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

    cap.release()
    cv2.destroyAllWindows()

video_path = r"C:\Users\ahmad\Downloads\4586095-hd_1920_1080_30fps.mp4"
process_video(video_path)




0: 384x640 3 persons, 3 cars, 5 traffic lights, 312.1ms
Speed: 2.0ms preprocess, 312.1ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)
Matching ID 1: Similarity = 0.5275757696325553
Matching ID 1: Similarity = 0.27224008912434106
Matching ID 2: Similarity = 0.3031424817955738

0: 384x640 3 persons, 3 cars, 5 traffic lights, 308.1ms
Speed: 3.0ms preprocess, 308.1ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)
Matching ID 1: Similarity = 0.9886939333145748
Matching ID 1: Similarity = 0.2949858161057233
Matching ID 2: Similarity = 0.3264739203519944
Matching ID 3: Similarity = 0.9893197321533458
Matching ID 1: Similarity = 0.5787287197212305
Matching ID 2: Similarity = 0.9411860125432686

0: 384x640 4 persons, 3 cars, 5 traffic lights, 294.1ms
Speed: 2.0ms preprocess, 294.1ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)
Matching ID 1: Similarity = 0.9774682045176266
Matching ID 1: Similarity = 0.28194555145334
Matching ID 2: Simi

In [18]:
import cv2
import torch
import torch.nn as nn
import torchvision.transforms as transforms
from torchvision.models import inception_v3
import numpy as np
from scipy.signal import correlate2d
from ultralytics import YOLO

# Load YOLOv8 model (force CPU usage)
yolo_model = YOLO("yolov8m.pt")
yolo_model.to("cpu")  # Explicitly move YOLOv8 to CPU

# Define OSNet model using Inception V3
class OSNet(nn.Module):
    def __init__(self):
        super(OSNet, self).__init__()
        self.model = inception_v3(pretrained=True, transform_input=False)  # Inception V3 model
        self.fc = nn.Linear(2048, 512)  # Output layer (2048 is the final feature size from Inception V3)

    def forward(self, x):
        # Inception V3 uses a specific resize (299x299) and has auxiliary outputs which we ignore here
        features = self.model(x)
        return features  # Directly return the raw feature output

# Initialize OSNet model
device = torch.device("cpu")  # Force usage of CPU
osnet = OSNet().to(device)
osnet.eval()

# Transformation pipeline for Inception V3 input size
transform = transforms.Compose([ 
    transforms.ToPILImage(),
    transforms.Resize((299, 299)),  # Resize to 299x299 for Inception V3
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),  # Standard normalization
])

# Extract features from the image using the OSNet model
def extract_features(img, model):
    img_tensor = transform(img).unsqueeze(0).to(device)  # Transform and add batch dimension
    with torch.no_grad():
        features = model(img_tensor)  # Extract features from the model
    return features.cpu().numpy().flatten()  # Ensure 1D output

# Compare features using Normalized Cross-Correlation (NCC)
def match_features_correlation(feature1, feature2):
    # Normalize the features (subtract mean, divide by standard deviation)
    feature1_norm = (feature1 - np.mean(feature1)) / (np.std(feature1) + 1e-5)
    feature2_norm = (feature2 - np.mean(feature2)) / (np.std(feature2) + 1e-5)
    
    # Calculate correlation (cross-correlation)
    correlation = np.corrcoef(feature1_norm, feature2_norm)[0, 1]
    return correlation

# Person detection using YOLOv8
def detect_persons(frame):
    results = yolo_model.predict(frame, conf=0.5, iou=0.5, device="cpu")  # Force YOLO inference on CPU
    detections = []
    for box in results[0].boxes:  # Loop through detected objects
        if box.cls == 0:  # Class 0 corresponds to "person" in COCO dataset
            x1, y1, x2, y2 = map(int, box.xyxy[0])  # Convert coordinates to integers
            detections.append((x1, y1, x2 - x1, y2 - y1))  # Format as (x, y, w, h)
    return detections

# Video processing
def process_video(video_path):
    cap = cv2.VideoCapture(video_path)
    tracker_data = {}  # Store tracked persons' features
    person_id = 0  # ID counter for new persons

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        
        detected_boxes = detect_persons(frame)

        for bbox in detected_boxes:
            x, y, w, h = bbox
            person_crop = frame[y:y+h, x:x+w]

            features = extract_features(person_crop, osnet)

            matched_id = None
            # Compare features with existing trackers using correlation
            for track_id, track_data in tracker_data.items():
                similarity = match_features_correlation(features, track_data['features'])
                print(f"Matching ID {track_id}: Similarity = {similarity}")  # Debugging line
                if similarity > 0.8:
                    matched_id = track_id
                    tracker_data[track_id]['features'] = features  # Update features
                    break

            if matched_id is None:
                person_id += 1
                matched_id = person_id
                tracker_data[person_id] = {'features': features}

            cv2.rectangle(frame, (x, y), (x+w, y+h), (0, 255, 0), 2)
            cv2.putText(frame, f'ID: {matched_id}', (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0), 2)

        cv2.imshow("Video", frame)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

    cap.release()
    cv2.destroyAllWindows()

video_path = r"C:\Users\ahmad\Downloads\8083128-hd_1920_1080_30fps.mp4"
process_video(video_path)



0: 384x640 4 persons, 347.1ms
Speed: 3.0ms preprocess, 347.1ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)
Matching ID 1: Similarity = 0.7381089710517096
Matching ID 1: Similarity = 0.7381218852244316
Matching ID 2: Similarity = 0.7484478595861829
Matching ID 1: Similarity = 0.7042548764062082
Matching ID 2: Similarity = 0.7294132935272977
Matching ID 3: Similarity = 0.7425904106128217

0: 384x640 4 persons, 303.1ms
Speed: 3.0ms preprocess, 303.1ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)
Matching ID 1: Similarity = 0.9620114580355524
Matching ID 1: Similarity = 0.7208361595080224
Matching ID 2: Similarity = 0.9638718784507068
Matching ID 1: Similarity = 0.7227448814967775
Matching ID 2: Similarity = 0.7023083096323562
Matching ID 3: Similarity = 0.9323469983293345
Matching ID 1: Similarity = 0.6684326734468385
Matching ID 2: Similarity = 0.6990753501493674
Matching ID 3: Similarity = 0.7291323704554217
Matching ID 4: Similarity = 0.9576233

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

# Contrastive Loss Function
class ContrastiveLoss(nn.Module):
    def __init__(self, margin=1.0):
        super(ContrastiveLoss, self).__init__()
        self.margin = margin

    def forward(self, anchor, positive, label):
        # Compute the Euclidean distance between the anchor and positive samples
        euclidean_distance = F.pairwise_distance(anchor, positive, 2)
        
        # Contrastive Loss Calculation
        loss = torch.mean((label) * torch.pow(euclidean_distance, 2) + 
                          (1 - label) * torch.pow(torch.clamp(self.margin - euclidean_distance, min=0.0), 2))
        return loss

# Example of how to use Contrastive Loss in the video processing pipeline
def process_video_with_contrastive_loss(video_path, reid_model, contrastive_loss):
    cap = cv2.VideoCapture(video_path)
    tracker_data = {}
    person_id = 0
    
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        
        detected_boxes = detect_persons(frame, yolo_model)

        for bbox in detected_boxes:
            x, y, w, h = bbox
            person_crop = frame[y:y+h, x:x+w]
            features = extract_features(person_crop, reid_model)
            
            # Example for generating label and pair samples (using random values for the example)
            label = torch.tensor([1])  # Assume label 1 for similar (positive pair) and 0 for dissimilar (negative pair)
            positive = tracker_data.get(person_id, {}).get('features', features)  # Positive is same as anchor for simplicity
            negative = np.random.rand(features.shape[0])  # Replace with an actual negative sample for meaningful training
            
            # Contrastive Loss computation
            loss = contrastive_loss(torch.tensor(features), torch.tensor(positive), label)
            print(f"Contrastive Loss: {loss.item()}")
            
            # Re-identification and tracker update
            matched_id = None
            for track_id, track_data in tracker_data.items():
                similarity = match_features(features, track_data['features'])
                if similarity > 0.8:
                    matched_id = track_id
                    tracker_data[track_id]['features'] = features
                    break

            if matched_id is None:
                person_id += 1
                matched_id = person_id
                tracker_data[person_id] = {'features': features}

            cv2.rectangle(frame, (x, y), (x+w, y+h), (0, 255, 0), 2)
            cv2.putText(frame, f'ID: {matched_id}', (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0), 2)

        cv2.imshow("Video", frame)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

    cap.release()
    cv2.destroyAllWindows()

# Initialize and run the video processing with Contrastive Loss
video_path = r"C:\Users\ahmad\Downloads\3623819-hd_1920_1080_25fp.mp4"
contrastive_loss = ContrastiveLoss(margin=1.0)
reid_model = ReIDModel().to(device)  # Ensure model is on CPU explicitly
yolo_model = YOLO("yolov8m.pt").to(device)  # Ensure YOLO is also on CPU
process_video_with_contrastive_loss(video_path, reid_model, contrastive_loss)



0: 384x640 (no detections), 319.1ms
Speed: 2.0ms preprocess, 319.1ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 16 persons, 3 handbags, 386.1ms
Speed: 3.0ms preprocess, 386.1ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)
Extracted Features Shape: (512,)
Contrastive Loss: 5.11999775909544e-10
Extracted Features Shape: (512,)
Contrastive Loss: 0.32147401571273804
Extracted Features Shape: (512,)
Contrastive Loss: 0.3047580122947693
Extracted Features Shape: (512,)
Contrastive Loss: 0.36198222637176514
Extracted Features Shape: (512,)
Contrastive Loss: 0.3536691963672638
Extracted Features Shape: (512,)
Contrastive Loss: 0.3286042809486389
Extracted Features Shape: (512,)
Contrastive Loss: 0.30674466490745544
Extracted Features Shape: (512,)
Contrastive Loss: 0.2862894535064697
Extracted Features Shape: (512,)
Contrastive Loss: 0.2847078740596771
Extracted Features Shape: (512,)
Contrastive Loss: 0.29607081413269043
Extracted Feature