# **OS-NET**

In [None]:
import cv2
import torch
import torch.nn as nn
from ultralytics import YOLO
from scipy.spatial.distance import cosine
import torchvision.transforms as transforms
from torchvision.models import inception_v3

yolo_model = YOLO("yolov8m.pt")
yolo_model.to("cpu")

class OSNet(nn.Module):
    def __init__(self):
        super(OSNet, self).__init__()
        self.model = inception_v3(pretrained=True, transform_input=False)
        self.fc = nn.Linear(2048, 512) 

    def forward(self, x):
        features = self.model(x)
        return features 

device = torch.device("cpu")
osnet = OSNet().to(device)
osnet.eval()

transform = transforms.Compose([transforms.ToPILImage(),transforms.Resize((299, 299)), transforms.ToTensor(),transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),])

def extract_features(img, model):
    img_tensor = transform(img).unsqueeze(0).to(device)
    with torch.no_grad():
        features = model(img_tensor)
    return features.cpu().numpy().flatten()

def match_features(feature1, feature2):
    return 1 - cosine(feature1, feature2)

def detect_persons(frame):
    results = yolo_model.predict(frame, conf=0.8, iou=0.6, device="cpu") 
    detections = []
    for box in results[0].boxes:  
        if box.cls == 2:
            x1, y1, x2, y2 = map(int, box.xyxy[0]) 
            detections.append((x1, y1, x2 - x1, y2 - y1))
    return detections

def process_video(video_path):
    cap = cv2.VideoCapture(video_path)
    tracker_data = {}
    person_id = 0 

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        
        detected_boxes = detect_persons(frame)

        for bbox in detected_boxes:
            x, y, w, h = bbox
            person_crop = frame[y:y+h, x:x+w]

            features = extract_features(person_crop, osnet)

            matched_id = None
            for track_id, track_data in tracker_data.items():
                similarity = match_features(features, track_data['features'])
                print(f"Matching ID {track_id}: Similarity = {similarity}") 
                if similarity > 0.80:
                    matched_id = track_id
                    tracker_data[track_id]['features'] = features
                    break

            if matched_id is None:
                person_id += 1
                matched_id = person_id
                tracker_data[person_id] = {'features': features}

            cv2.rectangle(frame, (x, y), (x+w, y+h), (0, 255, 0), 2)
            cv2.putText(frame, f'ID: {matched_id}', (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0), 2)

        cv2.imshow("Video", frame)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

    cap.release()
    cv2.destroyAllWindows()

video_path = "test2.m4.mp4"
process_video(video_path)

same as  above just save result in mp4

In [None]:
import cv2
import torch
import torch.nn as nn
from ultralytics import YOLO
from scipy.spatial.distance import cosine
import torchvision.transforms as transforms
from torchvision.models import inception_v3

device = torch.device("cpu")

# Initialize YOLO model with CPU-only inference
yolo_model = YOLO("yolov8m.pt").to(device)

# OSNet model definition
class OSNet(nn.Module):
    def __init__(self):
        super(OSNet, self).__init__()
        self.model = inception_v3(pretrained=True, transform_input=False)
        self.model.fc = nn.Linear(2048, 512)  # Modify final layer to output 512 features

    def forward(self, x):
        features = self.model(x)
        return features

# Initialize OSNet model on CPU
osnet = OSNet().to(device)
osnet.eval()

# Define image transformation
transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((299, 299)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
])

# Extract features using the OSNet model
def extract_features(img, model):
    img_tensor = transform(img).unsqueeze(0).to(device)  # Ensure tensor is on CPU
    with torch.no_grad():
        features = model(img_tensor)
    return features.cpu().numpy().flatten()  # Convert to NumPy array

# Match features using cosine similarity
def match_features(feature1, feature2):
    return 1 - cosine(feature1, feature2)

# Detect persons in a frame using YOLO
def detect_persons(frame):
    results = yolo_model.predict(frame, conf=0.8, iou=0.6, device="cpu")  # Ensure inference is CPU-only
    detections = []
    for box in results[0].boxes:
        if box.cls == 2:  # Assuming class 2 corresponds to "person" in the model
            x1, y1, x2, y2 = map(int, box.xyxy[0])
            detections.append((x1, y1, x2 - x1, y2 - y1))
    return detections

# Process video and save the output
def process_video(video_path, output_path):
    cap = cv2.VideoCapture(video_path)
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = int(cap.get(cv2.CAP_PROP_FPS))

    out = cv2.VideoWriter(output_path, fourcc, fps, (frame_width, frame_height))
    tracker_data = {}
    person_id = 0

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        detected_boxes = detect_persons(frame)  # Detect persons in the frame

        for bbox in detected_boxes:
            x, y, w, h = bbox
            person_crop = frame[y:y+h, x:x+w]

            features = extract_features(person_crop, osnet)  # Extract features for the cropped person

            matched_id = None
            for track_id, track_data in tracker_data.items():
                similarity = match_features(features, track_data['features'])
                print(f"Matching ID {track_id}: Similarity = {similarity}")
                if similarity > 0.80:
                    matched_id = track_id
                    tracker_data[track_id]['features'] = features
                    break

            if matched_id is None:
                person_id += 1
                matched_id = person_id
                tracker_data[person_id] = {'features': features}

            # Draw bounding box and ID on the frame
            cv2.rectangle(frame, (x, y), (x+w, y+h), (0, 255, 0), 2)
            cv2.putText(frame, f'ID: {matched_id}', (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0), 2)

        out.write(frame)  # Write the processed frame to the output video

    cap.release()
    out.release()
    cv2.destroyAllWindows()

# Paths for input and output videos
video_path = "test2.m4.mp4"
output_path = "output.mp4"
process_video(video_path, output_path)




0: 384x640 6 cars, 315.1ms
Speed: 5.0ms preprocess, 315.1ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)
Matching ID 1: Similarity = 0.7918569056291481
Matching ID 1: Similarity = 0.7991349020039156
Matching ID 2: Similarity = 0.8114870974196381
Matching ID 1: Similarity = 0.7929887593668815
Matching ID 2: Similarity = 0.8279606328990157
Matching ID 1: Similarity = 0.7129517316766603
Matching ID 2: Similarity = 0.7438894886867433
Matching ID 1: Similarity = 0.7160907578052724
Matching ID 2: Similarity = 0.7591492783270819
Matching ID 3: Similarity = 0.8298995363542961

0: 384x640 5 cars, 297.1ms
Speed: 4.0ms preprocess, 297.1ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)
Matching ID 1: Similarity = 0.9899177323847947
Matching ID 1: Similarity = 0.794071473779699
Matching ID 2: Similarity = 0.9803947485544247
Matching ID 1: Similarity = 0.7749943582892944
Matching ID 2: Similarity = 0.7761885543873903
Matching ID 3: Similarity = 0.72957178694364

: 

# **YOLO Approach**

In [1]:
from ultralytics import YOLO
import cv2

model = YOLO('yolo11s.pt', task='segment')  # Use 'yolov8m' for medium model

input_video = "test.mp4"
cap = cv2.VideoCapture(input_video)


width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = int(cap.get(cv2.CAP_PROP_FPS))

out = cv2.VideoWriter('output_video.mp4', cv2.VideoWriter_fourcc(*'mp4v'), fps, (width, height))

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break

    results = model.track(frame, conf=0.7, classes=[2], persist=True)
    annotated_frame = results[0].plot()  # Annotate frame with bounding boxes
    out.write(annotated_frame)

    cv2.imshow('YOLOv8 Tracking', annotated_frame)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

# cap.release()
# out.release()
# cv2.destroyAllWindows()

width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = int(cap.get(cv2.CAP_PROP_FPS))

out = cv2.VideoWriter('output_video.mp4', cv2.VideoWriter_fourcc(*'mp4v'), fps, (width, height))

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break
    
    results = model.track(frame, conf=0.5, classes=[2], persist=True)
    annotated_frame = results[0].plot() 
    
    out.write(annotated_frame)

    cv2.imshow('YOLOv8 Tracking', annotated_frame)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cap.release()
out.release()
cv2.destroyAllWindows()


0: 384x640 6 cars, 49.0ms
Speed: 3.0ms preprocess, 49.0ms inference, 1332.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 6 cars, 17.0ms
Speed: 2.0ms preprocess, 17.0ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 cars, 15.0ms
Speed: 2.0ms preprocess, 15.0ms inference, 3.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 6 cars, 17.0ms
Speed: 2.0ms preprocess, 17.0ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 6 cars, 15.0ms
Speed: 3.0ms preprocess, 15.0ms inference, 3.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 cars, 15.0ms
Speed: 3.0ms preprocess, 15.0ms inference, 4.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 6 cars, 15.0ms
Speed: 3.0ms preprocess, 15.0ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 8 cars, 16.0ms
Speed: 2.0ms preprocess, 16.0ms inference, 3.0ms postprocess per image at shape (1, 3, 384, 640)

0: 3

# **Manual**

In [None]:
import cv2
import torch
import numpy as np
from tqdm import tqdm
from ultralytics import YOLO
from torchvision import models, transforms
from scipy.spatial.distance import cosine
from torchvision.transforms.functional import pad


yolo_model = YOLO("yolov8n.pt")

resnet = models.inception_v3(pretrained=True)
resnet.eval()

# def resize_with_aspect_ratio(image, target_size):
#     w, h = image.size
#     target_h, target_w = target_size

#     scale = min(target_w / w, target_h / h)
#     new_w, new_h = int(w * scale), int(h * scale)

#     resized_image = image.resize((new_w, new_h))

#     delta_w = target_w - new_w
#     delta_h = target_h - new_h
#     padding = (delta_w // 2, delta_h // 2, delta_w - delta_w // 2, delta_h - delta_h // 2)
#     padded_image = pad(resized_image, padding, fill=(0, 0, 0))

#     return padded_image

# transform = transforms.Compose([transforms.ToPILImage(),transforms.Lambda(lambda img: resize_with_aspect_ratio(img, (299, 299))),transforms.ToTensor(),transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])])
transform = transforms.Compose([transforms.ToPILImage(),transforms.Resize((224, 224)),transforms.ToTensor(),transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])])

car_registry = {}

def extract_features(image):
    tensor = transform(image).unsqueeze(0)
    with torch.no_grad():
        features = resnet(tensor).squeeze().numpy()
    return features / np.linalg.norm(features)

def cosine_similarity(vec1, vec2):
    return 1 - cosine(vec1, vec2)

# Function to handle ReID
def reid_cars(detections, car_registry, frame):
    new_registry = {}
    features_list = []

    for det in detections:
        x1, y1, x2, y2, conf, cls = det
        car_crop = frame[int(y1):int(y2), int(x1):int(x2)]
        car_features = extract_features(car_crop)
        features_list.append((car_features, (x1, y1, x2, y2)))

    for car_features, bbox in features_list:
        matched_id = None
        max_similarity = 0.5  # Threshold for similarity

        for car_id, data in car_registry.items():
            similarity = cosine_similarity(car_features, data["features"])
            if similarity > max_similarity:
                matched_id = car_id
                max_similarity = similarity

        if matched_id:
            new_registry[matched_id] = {"features": car_features, "bbox": bbox}
        else:
            new_id = len(car_registry) + len(new_registry) + 1
            new_registry[new_id] = {"features": car_features, "bbox": bbox}

    return new_registry

def process_video(video_path, output_path):
    cap = cv2.VideoCapture(video_path)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))  # Total number of frames in the video
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = None

    global car_registry

    # Count objects in the first frame
    first_frame_processed = False
    total_objects = 0

    # Initialize progress bar
    with tqdm(total=total_frames, desc="Processing Video", unit="frame") as pbar:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            # Run YOLOv8 for detection
            results = yolo_model(frame)

            # Parse YOLO detection results
            detections = results[0].boxes  # Bounding box objects
            car_detections = []
            for i in range(len(detections)):
                box = detections[i]
                cls = box.cls.cpu().numpy().item()  # Class ID
                conf = box.conf.cpu().numpy().item()  # Confidence
                x1, y1, x2, y2 = box.xyxy.cpu().numpy().astype(int).flatten()  # Bounding box
                if cls == 2:  # Filter for cars
                    car_detections.append([x1, y1, x2, y2, conf, cls])

            # Count total objects in the first frame
            if not first_frame_processed:
                total_objects = len(car_detections)
                print(f"Total objects detected in the first frame: {total_objects}")
                first_frame_processed = True

            # ReID processing
            car_registry = reid_cars(car_detections, car_registry, frame)

            # Draw detections and IDs
            for car_id, data in car_registry.items():
                x1, y1, x2, y2 = map(int, data["bbox"])
                cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
                cv2.putText(frame, f"Car {car_id}", (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 2)

            # Initialize video writer if not already done
            if out is None:
                h, w, _ = frame.shape
                out = cv2.VideoWriter(output_path, fourcc, 30, (w, h))

            out.write(frame)

            # Update progress bar
            pbar.update(1)

    cap.release()
    if out:
        out.release()

    print("Processing complete. Video saved at:", output_path)


process_video("test.mp4", "output_video.mp4")

In [20]:
import cv2
import torch
import numpy as np
from ultralytics import YOLO
from torchvision import models, transforms
from scipy.spatial.distance import cosine

# Load YOLOv8 model
yolo_model = YOLO("yolo11s.pt")  # Replace with your YOLO model path

# Load ResNet for feature extraction
resnet = models.inception_v3(pretrained=True)
resnet.eval()

# Define transformation for ResNet input
transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Initialize global car registry
car_registry = {}

# Feature extraction function
def extract_features(image):
    tensor = transform(image).unsqueeze(0)
    with torch.no_grad():
        features = resnet(tensor).squeeze().numpy()
    return features / np.linalg.norm(features)  # Normalize the feature vector

# Function to calculate cosine similarity
def cosine_similarity(vec1, vec2):
    return 1 - cosine(vec1, vec2)

# Function to count cars and manage IDs
def how_many(detections, frame, car_registry):
    new_registry = {}
    car_features = []

    for det in detections:
        x1, y1, x2, y2, conf, cls = det
        car_crop = frame[int(y1):int(y2), int(x1):int(x2)]
        features = extract_features(car_crop)
        car_features.append((features, (x1, y1, x2, y2)))

    for features, bbox in car_features:
        matched_id = None
        max_similarity = 0.7  # Cosine similarity threshold

        for car_id, data in car_registry.items():
            similarity = cosine_similarity(features, data["features"])
            if similarity > max_similarity:
                matched_id = car_id
                max_similarity = similarity

        if matched_id:
            new_registry[matched_id] = {"features": features, "bbox": bbox}
        else:
            new_id = len(car_registry) + len(new_registry) + 1
            new_registry[new_id] = {"features": features, "bbox": bbox}

    return new_registry

# Function to resize frame to target resolution
def resize_frame(frame, target_width, target_height):
    h, w, _ = frame.shape
    scale = min(target_width / w, target_height / h)
    new_w = int(w * scale)
    new_h = int(h * scale)
    return cv2.resize(frame, (new_w, new_h))

# Main processing function
def process_video(video_path, output_path, target_resolution=(1920, 1080)):
    cap = cv2.VideoCapture(video_path)
    fourcc = cv2.VideoWriter_fourcc(*'XVID')  
    out = None
    global car_registry

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        results = yolo_model(frame)

        detections = results[0].boxes
        car_detections = []
        for i in range(len(detections)):
            box = detections[i]
            cls = box.cls.cpu().numpy().item()
            conf = box.conf.cpu().numpy().item()
            x1, y1, x2, y2 = box.xyxy.cpu().numpy().astype(int).flatten()
            if cls == 2:  # Filter for cars
                car_detections.append([x1, y1, x2, y2, conf, cls])

        # Update car registry
        car_registry = how_many(car_detections, frame, car_registry)

        for car_id, data in car_registry.items():
            x1, y1, x2, y2 = map(int, data["bbox"])
            cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
            cv2.putText(frame, f"Car {car_id}", (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 2)

        frame_resized = resize_frame(frame, *target_resolution)

        if out is None:
            h, w, _ = frame_resized.shape
            out = cv2.VideoWriter(output_path, fourcc, 30, (w, h))

        out.write(frame_resized)

        cv2.imshow("ReID Cars", frame_resized)
        if cv2.waitKey(1) & 0xFF == ord('q'):  # Press 'q' to quit
            break

    cap.release()
    if out:
        out.release()
    cv2.destroyAllWindows()
    print("Processing complete. Video saved at:", output_path)

process_video("test.mp4", "output_video.mp4", target_resolution=(1920, 1080))  # Adjust resolution as needed


0: 384x640 7 cars, 2 trucks, 16.0ms
Speed: 2.0ms preprocess, 16.0ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 8 cars, 16.0ms
Speed: 2.0ms preprocess, 16.0ms inference, 3.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 8 cars, 2 trucks, 19.0ms
Speed: 2.0ms preprocess, 19.0ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 9 cars, 1 truck, 17.0ms
Speed: 2.0ms preprocess, 17.0ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 10 cars, 1 truck, 1 traffic light, 18.0ms
Speed: 2.0ms preprocess, 18.0ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 9 cars, 1 truck, 1 traffic light, 16.0ms
Speed: 2.0ms preprocess, 16.0ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 9 cars, 1 truck, 1 traffic light, 17.0ms
Speed: 2.0ms preprocess, 17.0ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 9 cars, 1 tr

In [None]:
import cv2
import torch
import numpy as np
from ultralytics import YOLO
import torch.nn.functional as F
from scipy.spatial.distance import cosine
from torchvision import models, transforms

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

yolo_model = YOLO("yolov8m.pt")
yolo_model.to(device)

resnet = models.efficientnet_b4(pretrained=True)
resnet = resnet.to(device)
resnet.eval()

transform = transforms.Compose([transforms.ToPILImage(),transforms.Resize((960, 640)),transforms.ToTensor(),transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])])

car_registry = {}

def extract_features(image):
    tensor = transform(image).unsqueeze(0).to(device)
    with torch.no_grad():
        features = resnet(tensor)
    features = features.squeeze()
    features = features / features.norm(p=2) 
    return features


def cosine_similarity_gpu(vec1, vec2, scale_factor=1.0):
    return F.cosine_similarity(vec1.unsqueeze(0), vec2.unsqueeze(0), dim=1)


def how_many(detections, frame, car_registry):
    new_registry = {}
    car_features = []

    for det in detections:
        x1, y1, x2, y2, conf, cls = det
        car_crop = frame[int(y1):int(y2), int(x1):int(x2)]
        features = extract_features(car_crop)
        car_features.append((features, (x1, y1, x2, y2)))

    for features, bbox in car_features:
        matched_id = None
        max_similarity = 0.60

        for car_id, data in car_registry.items():
            similarity = cosine_similarity_gpu(features, data["features"])
            if similarity > max_similarity:
                matched_id = car_id
                max_similarity = similarity

        if matched_id:
            new_registry[matched_id] = {"features": features, "bbox": bbox}
        else:
            new_id = len(car_registry) + len(new_registry) + 1
            new_registry[new_id] = {"features": features, "bbox": bbox}

    return new_registry

def resize_frame(frame, target_width, target_height):
    h, w, _ = frame.shape
    scale = min(target_width / w, target_height / h)
    new_w = int(w * scale)
    new_h = int(h * scale)
    return cv2.resize(frame, (new_w, new_h))

def process_video(video_path, output_path, target_resolution=(1920, 1080)):
    cap = cv2.VideoCapture(video_path)
    fourcc = cv2.VideoWriter_fourcc(*'XVID')  
    out = None
    global car_registry

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        results = yolo_model(frame, imgsz=1504)

        detections = results[0].boxes
        car_detections = []
        for i in range(len(detections)):
            box = detections[i]
            cls = box.cls.cpu().numpy().item()
            conf = box.conf.cpu().numpy().item()
            x1, y1, x2, y2 = box.xyxy.cpu().numpy().astype(int).flatten()
            if cls == 2:
                car_detections.append([x1, y1, x2, y2, conf, cls])

        car_registry = how_many(car_detections, frame, car_registry)

        for car_id, data in car_registry.items():
            x1, y1, x2, y2 = map(int, data["bbox"])
            cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
            cv2.putText(frame, f"Car {car_id}", (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 2)

        frame_resized = resize_frame(frame, *target_resolution)

        if out is None:
            h, w, _ = frame_resized.shape
            out = cv2.VideoWriter(output_path, fourcc, 30, (w, h))

        out.write(frame_resized)

        cv2.imshow("ReID Cars", frame_resized)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

    cap.release()
    if out:
        out.release()
    cv2.destroyAllWindows()
    print("Processing complete. Video saved at:", output_path)

if __name__ == "__main__":
    input_video_path = "test2.m4.mp4"
    output_video_path = "output_video.mp4"
    process_video(input_video_path, output_video_path, target_resolution=(1920, 1080))


0: 864x1504 2 persons, 8 cars, 1 truck, 1 traffic light, 64.0ms
Speed: 10.0ms preprocess, 64.0ms inference, 2.0ms postprocess per image at shape (1, 3, 864, 1504)

0: 864x1504 2 persons, 8 cars, 1 truck, 1 traffic light, 49.0ms
Speed: 12.0ms preprocess, 49.0ms inference, 3.0ms postprocess per image at shape (1, 3, 864, 1504)

0: 864x1504 3 persons, 8 cars, 1 truck, 1 traffic light, 47.0ms
Speed: 11.0ms preprocess, 47.0ms inference, 3.0ms postprocess per image at shape (1, 3, 864, 1504)

0: 864x1504 3 persons, 8 cars, 1 truck, 1 traffic light, 48.0ms
Speed: 9.0ms preprocess, 48.0ms inference, 2.0ms postprocess per image at shape (1, 3, 864, 1504)

0: 864x1504 4 persons, 8 cars, 1 truck, 1 traffic light, 47.0ms
Speed: 13.0ms preprocess, 47.0ms inference, 2.0ms postprocess per image at shape (1, 3, 864, 1504)

0: 864x1504 2 persons, 10 cars, 1 traffic light, 46.0ms
Speed: 10.0ms preprocess, 46.0ms inference, 3.0ms postprocess per image at shape (1, 3, 864, 1504)

0: 864x1504 3 persons, 9