In [6]:
import cv2
from ultralytics import YOLO
import torch

In [7]:
# Training-run directory for each model variant, relative to the notebook's
# working directory. Every entry uses the same './runs/detect/...' form so a
# path never accidentally points at the filesystem root.
metrics_files = {
    'YOLO-n': './runs/detect/train6',
    'YOLO-l': './runs/detect/train15',
    'YOLO-x': './runs/detect/train_yolo_x',
    'RT-DETR-l': './runs/detect/train14',
    'RT-DETR-x': './runs/detect/train_rtdetr_x',
}


In [8]:
# Load the YOLO-n checkpoint stored as weights/best.pt inside its run directory.
yolo11n_weights = f'./{metrics_files["YOLO-n"]}/weights/best.pt'
yolo11n_model = YOLO(yolo11n_weights)

In [18]:
def generate_inference_video(model, input_path, output_path, imgsz=384):
    """Run a detection model on every frame of a video and save an annotated copy.

    Each frame is resized to ``imgsz`` x ``imgsz`` (stretched, aspect ratio is
    not preserved), passed to ``model.predict``, and the predicted boxes are
    scaled back to the source resolution before being drawn, together with a
    "<class name> <confidence>" label, on the original frame.

    Args:
        model: Object exposing ``predict(frame, imgsz=..., device=...)`` whose
            first result has ``boxes.xyxy``, ``boxes.conf`` and ``boxes.cls``,
            plus a ``names`` mapping from class id to label.
        input_path: Path of the video to read.
        output_path: Path of the annotated video to write ('mp4v' codec).
        imgsz: Side length, in pixels, of the square inference input.

    Raises:
        IOError: If the input video cannot be opened or the output video
            cannot be created.
    """
    fallback_fps = 30.0  # used only when the container reports no usable rate
    box_color = (0, 255, 0)  # green, in OpenCV's BGR channel order

    cap = cv2.VideoCapture(input_path)
    out = None
    try:
        if not cap.isOpened():
            raise IOError(f"Could not open input video: {input_path}")

        # Keep the frame rate as a float: truncating e.g. 29.97 to 29 would
        # change the playback speed of the written video.
        fps = cap.get(cv2.CAP_PROP_FPS)
        if not fps > 0:  # also catches NaN
            fps = fallback_fps
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')  # Codec for output video

        out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
        if not out.isOpened():
            raise IOError(f"Could not create output video: {output_path}")

        # Loop-invariant values: inference device and the factors that map
        # box coordinates from the resized frame back to the original frame.
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        scale_x = width / imgsz
        scale_y = height / imgsz

        frame_count = 0
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                print("End of video or failed to read frame.")
                break

            resized_frame = cv2.resize(frame, (imgsz, imgsz))
            results = model.predict(resized_frame, imgsz=imgsz, device=device)
            detections = results[0].boxes

            for box, conf, cls in zip(detections.xyxy, detections.conf,
                                      detections.cls):
                x1, y1, x2, y2 = box.tolist()
                x1, x2 = int(x1 * scale_x), int(x2 * scale_x)
                y1, y2 = int(y1 * scale_y), int(y2 * scale_y)

                label = f"{model.names[int(cls)]} {float(conf):.2f}"
                cv2.rectangle(frame, (x1, y1), (x2, y2), box_color,
                              thickness=3)
                # Clamp the text baseline so labels of boxes touching the top
                # edge are not placed above the frame.
                cv2.putText(frame,
                            label, (x1, max(0, y1 - 10)),
                            cv2.FONT_HERSHEY_SIMPLEX,
                            fontScale=1.4,
                            color=box_color,
                            thickness=3)

            out.write(frame)
            frame_count += 1
            print(f"Processed frame {frame_count}")
    finally:
        # Always release the handles, even if reading or inference raised.
        cap.release()
        if out is not None:
            out.release()

    print(f"Processed video saved to: {output_path}")

In [19]:
# Annotate the test clip with the YOLO-n model and write the result to disk.
generate_inference_video(
    model=yolo11n_model,
    input_path='test_inference.mp4',
    output_path='output_video.mp4',
)


0: 384x384 (no detections), 7.6ms
Speed: 9.6ms preprocess, 7.6ms inference, 0.5ms postprocess per image at shape (1, 3, 384, 384)
Processed frame 1

0: 384x384 (no detections), 7.2ms
Speed: 0.7ms preprocess, 7.2ms inference, 0.7ms postprocess per image at shape (1, 3, 384, 384)
Processed frame 2

0: 384x384 1 Letter: Y, 7.1ms
Speed: 0.7ms preprocess, 7.1ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 384)
Processed frame 3

0: 384x384 1 Letter: L, 1 Letter: Y, 7.1ms
Speed: 0.6ms preprocess, 7.1ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 384)
Processed frame 4

0: 384x384 1 Letter: Y, 7.2ms
Speed: 0.7ms preprocess, 7.2ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 384)
Processed frame 5

0: 384x384 1 Letter: Y, 7.1ms
Speed: 0.7ms preprocess, 7.1ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 384)
Processed frame 6

0: 384x384 1 Letter: Y, 7.1ms
Speed: 0.6ms preprocess, 7.1ms inference, 1.0ms postprocess per image at shape

In [20]:
# Load the YOLO-l checkpoint stored as weights/best.pt inside its run directory.
yolo11l_weights = f'./{metrics_files["YOLO-l"]}/weights/best.pt'
yolo11l_model = YOLO(yolo11l_weights)

In [22]:
# Annotate the same test clip with the YOLO-l model, writing to a separate file.
generate_inference_video(
    model=yolo11l_model,
    input_path='test_inference.mp4',
    output_path='output_video_l.mp4',
)


0: 384x384 1 Letter: L, 11.7ms
Speed: 0.5ms preprocess, 11.7ms inference, 0.9ms postprocess per image at shape (1, 3, 384, 384)
Processed frame 1

0: 384x384 1 Letter: L, 12.3ms
Speed: 0.6ms preprocess, 12.3ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 384)
Processed frame 2

0: 384x384 1 Letter: L, 12.1ms
Speed: 0.5ms preprocess, 12.1ms inference, 0.9ms postprocess per image at shape (1, 3, 384, 384)
Processed frame 3

0: 384x384 1 Letter: L, 12.0ms
Speed: 0.6ms preprocess, 12.0ms inference, 0.9ms postprocess per image at shape (1, 3, 384, 384)
Processed frame 4

0: 384x384 1 Letter: L, 12.1ms
Speed: 0.6ms preprocess, 12.1ms inference, 0.9ms postprocess per image at shape (1, 3, 384, 384)
Processed frame 5

0: 384x384 1 Letter: L, 12.1ms
Speed: 0.6ms preprocess, 12.1ms inference, 0.9ms postprocess per image at shape (1, 3, 384, 384)
Processed frame 6

0: 384x384 1 Letter: L, 12.0ms
Speed: 0.6ms preprocess, 12.0ms inference, 0.9ms postprocess per image at shape (1, 3,