In [None]:
#!/usr/bin/env python3
"""
Detect a person waving using:
  1) YOLOv5m for person detection
  2) MobileNetV2+LSTM for waving inference on an 8-frame clip
Uses your MacBook’s built-in webcam instead of RealSense.
Overlays the latest wave probability in the bottom-left corner.
"""

import os
# suppress TF deprecation warnings
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

import torch
import tensorflow as tf
import cv2
import numpy as np
from collections import deque

# Weights file handed to torch.hub as the custom YOLOv5 checkpoint.
YOLO_MODEL_PATH = 'weights/yolov5m.pt'
# Keras model file loaded for wave inference on a clip of ROIs.
WAVE_MODEL_PATH = 'weights/wave_sequence_model_final.h5'
# Assigned to the detector's `conf` attribute and also used to filter detections.
CONF_THRESHOLD  = 0.5
# Capture size requested from the webcam (the driver's actual size is not checked).
FRAME_WIDTH     = 640
FRAME_HEIGHT    = 480
# Number of person crops buffered before one wave prediction is made.
CLIP_LENGTH     = 8
# Each person crop is resized to ROI_SIZE x ROI_SIZE pixels before buffering.
ROI_SIZE        = 224

def load_models():
    """Load the person detector and the wave classifier from disk.

    The detector is fetched through ``torch.hub`` from the
    ``ultralytics/yolov5`` repository using the custom weights at
    ``YOLO_MODEL_PATH``, and its ``conf`` attribute is set to
    ``CONF_THRESHOLD``. The wave model is read from ``WAVE_MODEL_PATH`` with
    Keras.

    Returns:
        A ``(detector, wave_model)`` pair.
    """
    yolo = torch.hub.load('ultralytics/yolov5', 'custom', path=YOLO_MODEL_PATH)
    yolo.conf = CONF_THRESHOLD
    return yolo, tf.keras.models.load_model(WAVE_MODEL_PATH)

def init_camera():
    """Open camera index 0 and request the configured capture size.

    Returns:
        The opened ``cv2.VideoCapture``.

    Raises:
        RuntimeError: If the capture device reports that it is not opened.
    """
    camera = cv2.VideoCapture(0, cv2.CAP_ANY)
    requested = (
        (cv2.CAP_PROP_FRAME_WIDTH, FRAME_WIDTH),
        (cv2.CAP_PROP_FRAME_HEIGHT, FRAME_HEIGHT),
    )
    for prop, value in requested:
        # The return value is ignored: the driver may keep another size.
        camera.set(prop, value)
    if camera.isOpened():
        return camera
    raise RuntimeError("Could not open webcam")

def detect_person_box(detector, frame):
    """Return the box of the most confident class-0 (person) detection.

    Args:
        detector: Callable detection model; it is invoked with an RGB image
            and its result must expose ``xyxy`` rows laid out as
            ``(x1, y1, x2, y2, confidence, class)``.
        frame: BGR image as delivered by OpenCV.

    Returns:
        A tuple ``(x1, y1, x2, y2)`` of ints, or ``None`` when no detection
        has class 0 and confidence of at least ``CONF_THRESHOLD``.
    """
    img = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    results = detector(img, size=640)
    dets = results.xyxy[0].cpu().numpy()
    persons = [d for d in dets if int(d[5]) == 0 and d[4] >= CONF_THRESHOLD]
    if not persons:
        return None
    best = max(persons, key=lambda d: d[4])
    # Return a real tuple rather than a one-shot `map` iterator, so callers
    # can test it, index it, or unpack it more than once.
    return tuple(int(v) for v in best[:4])

def main():
    """Run the webcam loop: detect a person, classify waving, display result.

    Each frame is passed to the person detector. When a person is found, the
    crop is converted to RGB, resized to ``ROI_SIZE`` and buffered; once
    ``CLIP_LENGTH`` crops are collected they are run through the wave model
    and the buffer is emptied. The buffer is also emptied whenever the person
    is lost or the crop is empty, so a clip never spans a gap. The latest
    wave probability is drawn in the bottom-left corner of every frame.

    The loop ends when ``q`` is pressed in the preview window, or when the
    camera fails to deliver frames for a sustained period. The camera and
    all windows are released on exit, including on exceptions.
    """
    import time  # local import: only needed for the failed-read back-off

    person_detector, wave_model = load_models()
    print("Models loaded.")
    cap = init_camera()
    print("Webcam initialized.")

    roi_buffer = deque(maxlen=CLIP_LENGTH)
    preprocess_input = tf.keras.applications.mobilenet_v2.preprocess_input

    latest_prob = 0.0

    # Tolerate transient read failures (about 5 s in total) but do not spin
    # forever on a camera that has gone away.
    max_failed_reads = 100
    failed_read_delay = 0.05
    failed_reads = 0

    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                failed_reads += 1
                if failed_reads >= max_failed_reads:
                    print("Webcam stopped delivering frames.")
                    break
                time.sleep(failed_read_delay)
                continue
            failed_reads = 0

            box = detect_person_box(person_detector, frame)

            # draw bounding box and label if person detected
            if box is not None:
                x1, y1, x2, y2 = box
                cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
                label = "Person"

                # crop and buffer
                crop_bgr = frame[y1:y2, x1:x2]
                if crop_bgr.size == 0:
                    roi_buffer.clear()
                else:
                    crop_rgb = cv2.cvtColor(crop_bgr, cv2.COLOR_BGR2RGB)
                    roi = cv2.resize(crop_rgb, (ROI_SIZE, ROI_SIZE))
                    roi_buffer.append(roi)

                    if len(roi_buffer) == CLIP_LENGTH:
                        clip = np.stack(roi_buffer, axis=0).astype('float32')
                        clip = preprocess_input(clip)
                        # verbose=0 keeps Keras from printing a progress bar
                        # for every single clip.
                        prediction = wave_model.predict(clip[None, ...], verbose=0)
                        latest_prob = float(prediction[0, 0])
                        if latest_prob >= 0.5:
                            # ASCII only: OpenCV's Hershey fonts cannot draw
                            # characters such as a bullet.
                            label = "Person - Waving"
                        # Start a fresh, non-overlapping clip.
                        roi_buffer.clear()

                cv2.putText(frame, label, (x1, max(y1 - 10, 20)),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0), 2)
            else:
                roi_buffer.clear()

            # overlay latest wave probability at bottom-left; use the real
            # frame height because the camera may ignore the requested size
            prob_text = f"Wave prob: {latest_prob:.2f}"
            cv2.putText(frame, prob_text, (10, frame.shape[0] - 10),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0), 2)

            cv2.imshow('Webcam', frame)
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break

    finally:
        cap.release()
        cv2.destroyAllWindows()
        print("Webcam released, exiting.")

# Run the webcam loop only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

Using cache found in /Users/braeden/.cache/torch/hub/ultralytics_yolov5_master
YOLOv5 🚀 2025-5-2 Python-3.9.21 torch-2.6.0 CPU

Fusing layers... 
YOLOv5m_v6 summary: 290 layers, 21172173 parameters, 0 gradients
Adding AutoShape... 


Models loaded.
Webcam initialized.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 69ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 69ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 70ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 76ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 74ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 107ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━

KeyboardInterrupt: 

: 