In [1]:
import cv2
import torch
from ultralytics import YOLO

# Fine-tuned checkpoint to load; point this at a different best.pt to swap models.
WEIGHTS_PATH = "runs/segment/train5/weights/best.pt"
model = YOLO(WEIGHTS_PATH)

# Choose the inference backend in priority order:
# Apple-silicon MPS first, then NVIDIA CUDA, and finally plain CPU.
device = (
    "mps" if torch.backends.mps.is_available()
    else "cuda" if torch.cuda.is_available()
    else "cpu"
)
print(f"Using device: {device}")

# Open webcam (0 = default cam, or use video file path instead)
cap = cv2.VideoCapture(0)

if not cap.isOpened():
    # Free the (failed) capture handle before aborting so nothing is left
    # dangling in the kernel.
    cap.release()
    # Raise instead of calling exit(): inside a notebook exit() asks the
    # kernel to shut down (discarding all state) rather than just stopping
    # this cell, and in a plain script it depends on the `site` helper.
    raise RuntimeError("Error: Could not access webcam")

# Read frames from the capture, run the model on each one, and show the
# annotated result until no frame is returned or the user presses 'q'.
# The try/finally guarantees the camera handle and the OpenCV window are
# released even when inference raises or the kernel is interrupted; without
# it the webcam can stay locked until the kernel is restarted.
try:
    while True:
        ret, frame = cap.read()
        if not ret:
            # No frame was delivered; stop streaming.
            break

        # Run inference on the frame and take the first entry of the
        # returned results (a single image was passed in).
        results = model(frame, device=device)[0]

        # Overlay detections (masks + boxes + labels)
        annotated_frame = results.plot()

        # Show the frame
        cv2.imshow("Vision2Clean AI - Waste Segmentation", annotated_frame)

        # Exit when 'q' is pressed; only the low byte of the key code is
        # compared against ord('q').
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    cap.release()
    cv2.destroyAllWindows()

Using device: cuda

0: 480x640 1 Paper, 78.7ms
Speed: 4.5ms preprocess, 78.7ms inference, 71.6ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 Cardboard, 1 Paper, 7.8ms
Speed: 1.2ms preprocess, 7.8ms inference, 3.3ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 Cardboard, 1 Paper, 9.1ms
Speed: 0.9ms preprocess, 9.1ms inference, 2.7ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 Cardboard, 1 Paper, 8.1ms
Speed: 0.9ms preprocess, 8.1ms inference, 2.7ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 Cardboard, 8.3ms
Speed: 1.0ms preprocess, 8.3ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 Cardboard, 8.0ms
Speed: 0.8ms preprocess, 8.0ms inference, 2.1ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 Cardboard, 8.1ms
Speed: 0.9ms preprocess, 8.1ms inference, 2.4ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 Cardboard, 8.8ms
Speed: 0.8ms preprocess, 8.8ms infe