**OS-NET**

In [2]:
import cv2
import torch
import torch.nn as nn
from ultralytics import YOLO
from scipy.spatial.distance import cosine
import torchvision.transforms as transforms
from torchvision.models import inception_v3

yolo_model = YOLO("yolov8m.pt")
yolo_model.to("cpu")

class OSNet(nn.Module):
    def __init__(self):
        super(OSNet, self).__init__()
        self.model = inception_v3(pretrained=True, transform_input=False)
        self.fc = nn.Linear(2048, 512) 

    def forward(self, x):
        features = self.model(x)
        return features 

device = torch.device("cpu")
osnet = OSNet().to(device)
osnet.eval()

transform = transforms.Compose([transforms.ToPILImage(),transforms.Resize((299, 299)), transforms.ToTensor(),transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),])

def extract_features(img, model):
    img_tensor = transform(img).unsqueeze(0).to(device)
    with torch.no_grad():
        features = model(img_tensor)
    return features.cpu().numpy().flatten()

def match_features(feature1, feature2):
    return 1 - cosine(feature1, feature2)

def detect_persons(frame):
    results = yolo_model.predict(frame, conf=0.8, iou=0.6, device="cpu") 
    detections = []
    for box in results[0].boxes:  
        if box.cls == 0:
            x1, y1, x2, y2 = map(int, box.xyxy[0]) 
            detections.append((x1, y1, x2 - x1, y2 - y1))
    return detections

def process_video(video_path):
    cap = cv2.VideoCapture(video_path)
    tracker_data = {}
    person_id = 0 

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        
        detected_boxes = detect_persons(frame)

        for bbox in detected_boxes:
            x, y, w, h = bbox
            person_crop = frame[y:y+h, x:x+w]

            features = extract_features(person_crop, osnet)

            matched_id = None
            for track_id, track_data in tracker_data.items():
                similarity = match_features(features, track_data['features'])
                print(f"Matching ID {track_id}: Similarity = {similarity}") 
                if similarity > 0.80:
                    matched_id = track_id
                    tracker_data[track_id]['features'] = features
                    break

            if matched_id is None:
                person_id += 1
                matched_id = person_id
                tracker_data[person_id] = {'features': features}

            cv2.rectangle(frame, (x, y), (x+w, y+h), (0, 255, 0), 2)
            cv2.putText(frame, f'ID: {matched_id}', (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0), 2)

        cv2.imshow("Video", frame)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

    cap.release()
    cv2.destroyAllWindows()

video_path = r"C:\Users\ahmad\Downloads\VIDEO-2024-11-24-19-56-25.mp4"
process_video(video_path)


0: 480x640 3 persons, 1 car, 700.9ms
Speed: 0.0ms preprocess, 700.9ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)
Matching ID 1: Similarity = 0.644119381904602
Matching ID 1: Similarity = 0.6408997774124146
Matching ID 2: Similarity = 0.7582831978797913

0: 480x640 3 persons, 1 car, 741.6ms
Speed: 4.0ms preprocess, 741.6ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)
Matching ID 1: Similarity = 0.9265842437744141
Matching ID 1: Similarity = 0.7150264382362366
Matching ID 2: Similarity = 0.78499436378479
Matching ID 3: Similarity = 0.8253814578056335
Matching ID 1: Similarity = 0.5434474349021912
Matching ID 2: Similarity = 0.8715530633926392

0: 480x640 3 persons, 1 car, 633.1ms
Speed: 3.8ms preprocess, 633.1ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)
Matching ID 1: Similarity = 0.6818168759346008
Matching ID 2: Similarity = 0.6690707802772522
Matching ID 3: Similarity = 0.930153489112854
Matching ID 1: Similarity = 0.80

In [13]:
import cv2
import torch
import torch.nn as nn
from ultralytics import YOLO
from scipy.spatial.distance import cosine
import torchvision.transforms as transforms
from torchvision.models import inception_v3
from collections import defaultdict

yolo_model = YOLO("yolov8m.pt")
yolo_model.to("cpu")

class OSNet(nn.Module):
    def __init__(self):
        super(OSNet, self).__init__()
        self.model = inception_v3(pretrained=True, transform_input=False)
        self.fc = nn.Linear(2048, 512)

    def forward(self, x):
        features = self.model(x)
        return features

device = torch.device("cpu")
osnet = OSNet().to(device)
osnet.eval()

transform = transforms.Compose([transforms.ToPILImage(),transforms.Resize((299, 299)),transforms.ToTensor(),transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),])

def extract_features(img, model):
    img_tensor = transform(img).unsqueeze(0).to(device)
    with torch.no_grad():
        features = model(img_tensor)
    return features.cpu().numpy().flatten()

def match_features(feature1, feature2):
    return 1 - cosine(feature1, feature2)

def iou(box1, box2):
    x1, y1, w1, h1 = box1
    x2, y2, w2, h2 = box2

    xi1 = max(x1, x2)
    yi1 = max(y1, y2)
    xi2 = min(x1 + w1, x2 + w2)
    yi2 = min(y1 + h1, y2 + h2)

    inter_area = max(0, xi2 - xi1) * max(0, yi2 - yi1)
    box1_area = w1 * h1
    box2_area = w2 * h2

    union_area = box1_area + box2_area - inter_area
    return inter_area / union_area if union_area > 0 else 0

def detect_persons(frame):
    results = yolo_model.predict(frame, conf=0.8, iou=0.6, device="cpu")
    detections = []
    for box in results[0].boxes:
        if box.cls == 0:
            x1, y1, x2, y2 = map(int, box.xyxy[0])
            detections.append((x1, y1, x2 - x1, y2 - y1))
    return detections

def process_video(video_path):
    cap = cv2.VideoCapture(video_path)
    tracker_data = {}
    person_id = 0

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        detected_boxes = detect_persons(frame)
        new_tracks = defaultdict(dict)

        for bbox in detected_boxes:
            x, y, w, h = bbox
            person_crop = frame[y:y+h, x:x+w]

            features = extract_features(person_crop, osnet)

            best_match_id = None
            best_match_score = 0.0

            for track_id, track_data in tracker_data.items():
                iou_score = iou(bbox, track_data['bbox'])
                feature_score = match_features(features, track_data['features'])

                combined_score = 0.6 * feature_score + 0.4 * iou_score  # Weighted similarity
                if combined_score > best_match_score and combined_score > 0.5:
                    best_match_score = combined_score
                    best_match_id = track_id

            if best_match_id is not None:
                new_tracks[best_match_id]['bbox'] = bbox
                new_tracks[best_match_id]['features'] = features
            else:
                person_id += 1
                new_tracks[person_id]['bbox'] = bbox
                new_tracks[person_id]['features'] = features

        tracker_data = new_tracks

        for track_id, track_data in tracker_data.items():
            x, y, w, h = track_data['bbox']
            cv2.rectangle(frame, (x, y), (x+w, y+h), (0, 255, 0), 2)
            cv2.putText(frame, f'ID: {track_id}', (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0), 2)

        cv2.imshow("Video", frame)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

    cap.release()
    cv2.destroyAllWindows()

video_path = r"C:\Users\ahmad\Downloads\VIDEO-2024-11-24-19-56-25.mp4"
process_video(video_path)




0: 480x640 3 persons, 1 car, 733.1ms
Speed: 7.5ms preprocess, 733.1ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 3 persons, 1 car, 870.9ms
Speed: 5.0ms preprocess, 870.9ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 3 persons, 1 car, 698.7ms
Speed: 5.2ms preprocess, 698.7ms inference, 2.4ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 3 persons, 1 car, 800.4ms
Speed: 5.0ms preprocess, 800.4ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 3 persons, 1 car, 714.6ms
Speed: 4.4ms preprocess, 714.6ms inference, 2.5ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 3 persons, 1 car, 694.1ms
Speed: 5.4ms preprocess, 694.1ms inference, 2.2ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 3 persons, 1 car, 692.7ms
Speed: 4.4ms preprocess, 692.7ms inference, 2.1ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 3 persons, 1 car, 713.9ms
Speed: 5.5

In [None]:
from ultralytics import YOLO
import cv2

# Load YOLOv8 model (pre-trained)
model = YOLO('yolov8m.pt')  # Use 'yolov8m' for medium model

# Open the video file
input_video = r"C:\Users\ahmad\Downloads\VIDEO-2024-11-24-19-56-25.mp4"
cap = cv2.VideoCapture(input_video)


# Get video properties
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = int(cap.get(cv2.CAP_PROP_FPS))

# Define video writer
out = cv2.VideoWriter('output_video.mp4', cv2.VideoWriter_fourcc(*'mp4v'), fps, (width, height))

# Loop through video frames
while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break
    
    # Run YOLOv8 inference
    results = model.track(frame, conf=0.5, classes=[0], persist=True)  # class=0 for person
    
    # Visualize results on frame
    annotated_frame = results[0].plot()  # Annotate frame with bounding boxes
    
    # Write frame to output video
    out.write(annotated_frame)

    # Show frame (optional)
    cv2.imshow('YOLOv8 Tracking', annotated_frame)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

# Release resources
cap.release()
out.release()
cv2.destroyAllWindows()

width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = int(cap.get(cv2.CAP_PROP_FPS))

# Define video writer
out = cv2.VideoWriter('output_video.mp4', cv2.VideoWriter_fourcc(*'mp4v'), fps, (width, height))

# Loop through video frames
while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break
    
    # Run YOLOv8 inference
    results = model.track(frame, conf=0.5, classes=[0], persist=True)  # class=0 for person
    
    # Visualize results on frame
    annotated_frame = results[0].plot()  # Annotate frame with bounding boxes
    
    # Write frame to output video
    out.write(annotated_frame)

    # Show frame (optional)
    cv2.imshow('YOLOv8 Tracking', annotated_frame)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

# Release resources
cap.release()
out.release()
cv2.destroyAllWindows()


0: 480x640 3 persons, 671.9ms
Speed: 15.6ms preprocess, 671.9ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 3 persons, 718.7ms
Speed: 0.0ms preprocess, 718.7ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 3 persons, 671.9ms
Speed: 0.0ms preprocess, 671.9ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 3 persons, 738.8ms
Speed: 0.0ms preprocess, 738.8ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 3 persons, 687.2ms
Speed: 16.9ms preprocess, 687.2ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 3 persons, 653.2ms
Speed: 18.9ms preprocess, 653.2ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 3 persons, 773.9ms
Speed: 0.0ms preprocess, 773.9ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 3 persons, 899.3ms
Speed: 0.0ms preprocess, 899.3ms inference, 15.6ms postprocess 