In [2]:
from ultralytics import YOLO
import cv2
import cvzone
import math
import time

cap = cv2.VideoCapture(0)  # For Webcam
cap.set(3, 1280)
cap.set(4, 720)

model = YOLO("../Yolo-Weights/yolov8l.pt")

classNames = ["person", "bicycle", "car", "motorbike", "aeroplane", "bus", "train", "truck", "boat",
              "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat",
              "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella",
              "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat",
              "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup",
              "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange", "broccoli",
              "carrot", "hot dog", "pizza", "donut", "cake", "chair", "sofa", "pottedplant", "bed",
              "diningtable", "toilet", "tvmonitor", "laptop", "mouse", "remote", "keyboard", "cell phone",
              "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors",
              "teddy bear", "hair drier", "toothbrush"
              ]

prev_frame_time = 0
new_frame_time = 0

while True:
    new_frame_time = time.time()
    
    # Capture frame-by-frame
    ret, frame = cap.read()
    
    # Check if frame is valid
    if not ret:
        print("Error: Failed to capture frame")
        break
    
    # Perform object detection
    results = model(frame, stream=True)
    
    for r in results:
        boxes = r.boxes
        for box in boxes:
            # Bounding Box
            x1, y1, x2, y2 = box.xyxy[0]
            x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2)
            w, h = x2 - x1, y2 - y1
            cvzone.cornerRect(frame, (x1, y1, w, h))
            # Confidence
            conf = math.ceil((box.conf[0] * 100)) / 100
            # Class Name
            cls = int(box.cls[0])
 
            cvzone.putTextRect(frame, f'{classNames[cls]} {conf}', (max(0, x1), max(35, y1)), scale=1, thickness=1)
    
    fps = 1 / (new_frame_time - prev_frame_time)
    prev_frame_time = new_frame_time
    print(fps)
    
    # Display the resulting frame
    cv2.imshow('Frame', frame)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

# Release the capture
cap.release()
cv2.destroyAllWindows()


0: 384x640 1 person, 1321.2ms
Speed: 9.6ms preprocess, 1321.2ms inference, 15.4ms postprocess per image at shape (1, 3, 384, 640)
5.842031814512847e-10

0: 384x640 1 person, 1056.6ms
Speed: 3.2ms preprocess, 1056.6ms inference, 1.6ms postprocess per image at shape (1, 3, 384, 640)
0.3593410023173844

0: 384x640 1 person, 810.7ms
Speed: 2.6ms preprocess, 810.7ms inference, 2.4ms postprocess per image at shape (1, 3, 384, 640)
0.8851810870307999

0: 384x640 1 person, 786.7ms
Speed: 2.5ms preprocess, 786.7ms inference, 0.9ms postprocess per image at shape (1, 3, 384, 640)
1.1399132598123547

0: 384x640 1 person, 803.8ms
Speed: 2.3ms preprocess, 803.8ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)
1.1920499385544785

0: 384x640 1 person, 1129.9ms
Speed: 3.3ms preprocess, 1129.9ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)
1.1768291998590377

0: 384x640 1 person, 876.0ms
Speed: 4.1ms preprocess, 876.0ms inference, 2.6ms postprocess per image at sha

KeyboardInterrupt: 

: 