In [1]:
import cv2
from collections import deque, Counter
from ultralytics import YOLO # pyright: ignore[reportMissingImports]
import numpy as np
import time

# Load YOLOv8 medium for better accuracy
model = YOLO("yolov8m.pt")  # yolov8l.pt if GPU is available

# Animal keywords; each one is compared against a full class name of the model
animal_keywords = [
    "dog","cat","bird","horse","sheep","cow","elephant","bear","zebra","giraffe",
    "lion","tiger","monkey","kangaroo","rabbit","hamster","mouse","turtle","fish",
    "whale","dolphin","deer","fox","wolf","bat","squirrel","camel","rhinoceros",
    "hippopotamus","penguin","parrot","chicken","duck","goose","pig","buffalo",
    "goat","donkey","owl","falcon","peacock","crab","lobster","octopus"
]

# Labels spelled like an animal keyword that are not animals for this model.
# NOTE: assumes the stock COCO label set, where "mouse" is the computer
# peripheral; drop the entry if a custom model uses "mouse" for the animal.
non_animal_labels = {"mouse"}

all_classes = list(model.names.values())

# Match whole class names rather than substrings: a substring test also tags
# "hot dog" (dog), "teddy bear" (bear), "baseball bat" (bat) and "bowl" (owl)
# as animals, which then get drawn and counted as animal detections.
_animal_label_set = set(animal_keywords) - non_animal_labels
animal_classes = {cls for cls in all_classes if cls.lower() in _animal_label_set}

# Temporal smoothing: bounded window of the most recent frame-level labels
history_length = 10
frame_history = deque([], history_length)
detected_label = "None"

# Capture device 0 (webcam)
cap = cv2.VideoCapture(0)

# Boxes from the latest inference pass, redrawn on frames where inference is skipped
last_boxes = list()

# Frame-rate bookkeeping: frames seen so far, inference stride, desired FPS
frame_count, skip_inference, target_fps = 0, 2, 30
prev_time = time.time()

# Main capture / inference / display loop.
# Runs until the camera stops delivering frames or the user presses 'q'.
# The try/finally guarantees that the camera and the display window are
# released even when the loop is interrupted (e.g. KeyboardInterrupt when the
# cell is stopped) or raises; the exception itself still propagates.
try:
    while True:
        ret, frame = cap.read()
        if not ret:
            break

        frame_count += 1
        curr_time = time.time()
        elapsed = curr_time - prev_time
        prev_time = curr_time
        # Instantaneous FPS of the previous loop iteration
        fps = 1 / elapsed if elapsed > 0 else 30

        # Adaptive skip for inference: run the model less often when the loop
        # is slow, more often when it is fast. Stride stays within [1, 5].
        if fps < target_fps - 5:
            skip_inference = min(skip_inference + 1, 5)
        elif fps > target_fps + 5:
            skip_inference = max(skip_inference - 1, 1)

        h, w = frame.shape[:2]
        # Moderate resolution for speed: longest side becomes 640 px
        scale = 640 / max(h, w)
        resized_frame = cv2.resize(frame, (int(w * scale), int(h * scale)))

        # Run inference every few frames
        if frame_count % skip_inference == 0:
            results = model(resized_frame, imgsz=640, device='cpu')
            last_boxes = []
            persons_detected = 0
            animals_detected = 0

            for r in results[0].boxes:
                cls_id = int(r.cls[0])
                label = model.names[cls_id]
                conf = float(r.conf[0])

                if conf < 0.4:
                    continue

                # Only persons and animals are tracked; everything else is ignored
                if label == "person":
                    persons_detected += 1
                    color = (0, 255, 0)  # Green
                elif label in animal_classes:
                    animals_detected += 1
                    color = (0, 0, 255)  # Red
                else:
                    continue

                # Scale box corners from the resized image back to the original frame
                coords = r.xyxy[0].cpu().numpy()
                x1, y1, x2, y2 = (int(c / scale) for c in coords[:4])

                # Save boxes for skipped frames
                last_boxes.append((x1, y1, x2, y2, label, conf, color))

            # Frame-level label
            current_label = []
            if persons_detected > 0:
                current_label.append("Person")
            if animals_detected > 0:
                current_label.append("Animal")
            if not current_label:
                current_label.append("None")
            frame_history.append(", ".join(current_label))

        # Draw boxes from last inference (also on frames where inference was skipped)
        for x1, y1, x2, y2, label, _conf, color in last_boxes:
            cv2.rectangle(frame, (x1, y1), (x2, y2), color, 2)
            # Keep the caption inside the frame when the box touches the top edge
            caption_y = max(y1 - 10, 20)
            cv2.putText(frame, label, (x1, caption_y),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.7, color, 2)

        # Smooth label: most frequent frame-level label in the history window
        if frame_history:
            detected_label = Counter(frame_history).most_common(1)[0][0]

        # Display overall label and FPS
        cv2.putText(frame, f"Detected: {detected_label}", (30, 40),
                    cv2.FONT_HERSHEY_SIMPLEX, 1.2, (255, 255, 0), 3)
        cv2.putText(frame, f"FPS: {int(fps)} Skip: {skip_inference}", (30, 80),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 255, 255), 2)

        cv2.imshow("Smooth Person vs Animal Detection", frame)

        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Always free the capture device and close the window
    cap.release()
    cv2.destroyAllWindows()



0: 480x640 2 persons, 1 cat, 1 dog, 347.9ms
Speed: 27.8ms preprocess, 347.9ms inference, 15.3ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 2 persons, 254.0ms
Speed: 3.0ms preprocess, 254.0ms inference, 0.9ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 2 persons, 1 bottle, 244.9ms
Speed: 1.8ms preprocess, 244.9ms inference, 1.1ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 2 persons, 245.6ms
Speed: 2.4ms preprocess, 245.6ms inference, 0.9ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 2 persons, 1 bottle, 245.0ms
Speed: 2.2ms preprocess, 245.0ms inference, 1.2ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 2 persons, 1 suitcase, 1 bottle, 243.6ms
Speed: 1.5ms preprocess, 243.6ms inference, 1.1ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 2 persons, 1 suitcase, 1 bottle, 237.2ms
Speed: 1.6ms preprocess, 237.2ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 2 per

KeyboardInterrupt: 