In [7]:
import os
import sys
import cv2
import mss
import time
import torch
import numpy as np
from pathlib import Path
from ultralytics import YOLO

sys.path.append(str(Path.cwd().parent))
from utils.file_dialog_utils import pick_file

# Report which accelerator PyTorch can see before loading the model.
cuda_available = torch.cuda.is_available()
print("CUDA available:", cuda_available)
if cuda_available:
    print("GPU:", torch.cuda.get_device_name(0))

# Ask the user for the weights file through the project's file-dialog helper.
MODEL_PATH = pick_file("Select a YOLO model file", [("YOLO model files", "*.pt")])

# Fail early with a readable message instead of letting YOLO() choke on an
# empty path.
# NOTE(review): pick_file is defined outside this notebook; presumably it
# returns an empty value ("" or None) when the dialog is cancelled - confirm.
if not MODEL_PATH:
    raise FileNotFoundError("No YOLO model file was selected.")

model = YOLO(MODEL_PATH)

print("Loaded model:", MODEL_PATH)
print("Classes:", model.names)

CUDA available: True
GPU: NVIDIA GeForce RTX 3060
Loaded model: C:/Users/Gabriel/Documents/Dissertation/Code/models/yolo/RD2_Model.pt
Classes: {0: 'ball'}


In [8]:
# ---------------------------------------------------------------------------
# Tunable settings for the live screen-detection cell
# ---------------------------------------------------------------------------

# --- Inference (forwarded to model.predict) --------------------------------
# Minimum confidence a detection needs to be kept.
CONF_THRES = 0.40
# IoU threshold used by non-maximum suppression.
IOU_THRES = 0.70
# 0 selects the first CUDA GPU; set to "cpu" to run without a GPU.
DEVICE = 0

# --- Capture ---------------------------------------------------------------
# Draw an FPS counter on frames that went through inference.
SHOW_FPS = True
# Run inference on every Nth captured frame (1 = all, 2 = every other, ...).
FRAME_STRIDE = 1
# Index into mss's monitor list; monitors[1] is usually the primary monitor.
MONITOR_INDEX = 1

# --- Display ---------------------------------------------------------------
# Scale factor applied only to the preview image; inference still runs on the
# full-resolution capture.
DISPLAY_SCALE = 0.60
# Title of the preview window.
WINDOW_NAME = "Live YOLO Screen Detection (Press Q to Quit)"

# --- Recording -------------------------------------------------------------
# Set to True to save the annotated frames to RECORD_PATH.
RECORD = False
RECORD_PATH = "outputs/live_inference_screen_recording.mp4"
# Frame rate written into the recorded video.
RECORD_FPS = 30.0
# FourCC code of the recording codec (mp4v, XVID, etc.).
RECORD_CODEC = "mp4v"

In [9]:
# Live screen-capture inference loop.
#
# Grabs the selected monitor with mss, runs YOLO on every FRAME_STRIDE-th frame,
# shows the (optionally downscaled) annotated frame in an OpenCV window and,
# when RECORD is set, writes the full-resolution annotated frame to RECORD_PATH.
# Press Q in the window to stop. Cleanup (video writer + window) runs in a
# single `finally`, so it also happens on errors and kernel interrupts.

# A stride below 1 would make the modulo test in the loop meaningless
# (and 0 would raise ZeroDivisionError), so reject it up front.
if FRAME_STRIDE < 1:
    raise ValueError(f"FRAME_STRIDE must be >= 1, got {FRAME_STRIDE}")

# Create a resizable window and set a starting size
cv2.namedWindow(WINDOW_NAME, cv2.WINDOW_NORMAL)
cv2.resizeWindow(WINDOW_NAME, 1280, 720)  # initial size (change as desired)

video_writer = None
try:
    with mss.mss() as sct:
        monitor = sct.monitors[MONITOR_INDEX]  # full screen capture
        print("Capturing monitor:", monitor)

        # Initialize video writer if recording is enabled
        if RECORD:
            # dirname is "" for a bare file name; makedirs("") would raise.
            record_dir = os.path.dirname(RECORD_PATH)
            if record_dir:
                os.makedirs(record_dir, exist_ok=True)

            # Size the writer from a real grab rather than the monitor dict:
            # the two can differ on scaled/HiDPI displays, and VideoWriter
            # drops frames whose size does not match the size it was opened with.
            height, width = np.array(sct.grab(monitor)).shape[:2]
            fourcc = cv2.VideoWriter_fourcc(*RECORD_CODEC)
            video_writer = cv2.VideoWriter(RECORD_PATH, fourcc, RECORD_FPS, (width, height))

            # Without this check a bad codec/path yields an empty file silently.
            if not video_writer.isOpened():
                video_writer.release()
                video_writer = None
                raise RuntimeError(
                    f"Could not open video writer for {RECORD_PATH} "
                    f"(codec {RECORD_CODEC!r}, {width}x{height})"
                )
            print(f"Recording enabled: {RECORD_PATH} ({width}x{height} @ {RECORD_FPS} FPS)")

        frame_count = 0
        last_fps_time = time.time()

        while True:
            # Screen grab (BGRA) converted to the BGR layout used below
            img = np.array(sct.grab(monitor))
            frame = cv2.cvtColor(img, cv2.COLOR_BGRA2BGR)

            frame_count += 1

            # If skipping frames, just show the raw screen without inference
            if frame_count % FRAME_STRIDE != 0:
                display = frame
            else:
                # YOLO inference on full-resolution frame
                results = model.predict(
                    source=frame,
                    conf=CONF_THRES,
                    iou=IOU_THRES,
                    device=DEVICE,
                    verbose=False
                )

                r = results[0]
                display = r.plot()  # draws boxes/labels on a copy

                # FPS overlay (time between consecutive inference frames)
                if SHOW_FPS:
                    now = time.time()
                    fps = 1.0 / max(now - last_fps_time, 1e-6)  # guard against a zero interval
                    last_fps_time = now
                    cv2.putText(
                        display,
                        f"FPS: {fps:.1f}",
                        (20, 40),
                        cv2.FONT_HERSHEY_SIMPLEX,
                        1,
                        (0, 255, 0),
                        2
                    )

            # Write the full-resolution frame to video if recording.
            # Note: frames are stamped at RECORD_FPS regardless of how fast
            # this loop actually runs, so playback speed may not be real time.
            if video_writer is not None:
                video_writer.write(display)

            # Downscale for display only (inference already ran on the full frame)
            if DISPLAY_SCALE != 1.0:
                h, w = display.shape[:2]
                display = cv2.resize(
                    display,
                    (int(w * DISPLAY_SCALE), int(h * DISPLAY_SCALE)),
                    interpolation=cv2.INTER_AREA
                )

            # Show window (resizable)
            cv2.imshow(WINDOW_NAME, display)

            # Quit on Q
            key = cv2.waitKey(1) & 0xFF
            if key in (ord('q'), ord('Q')):
                break

finally:
    # Release video writer if it was initialized
    if video_writer is not None:
        video_writer.release()
        print(f"Recording saved to: {RECORD_PATH}")

    # Always close the preview window, including after an error or interrupt.
    cv2.destroyAllWindows()

Capturing monitor: {'left': 0, 'top': 0, 'width': 1920, 'height': 1080}
