##### Copyright 2023 The MediaPipe Authors. All Rights Reserved.

# Face Detection with MediaPipe Tasks

This notebook shows you how to use the MediaPipe Tasks Python API to detect faces in images.

## Preparation

Let's start with installing MediaPipe.

Then download an off-the-shelf model. Check out the [MediaPipe documentation](https://developers.google.com/mediapipe/solutions/vision/face_detector#models) for more face detection models that you can use.

In [1]:
!wget -q -O detector.tflite https://storage.googleapis.com/mediapipe-models/face_detector/blaze_face_short_range/float16/1/blaze_face_short_range.tflite

## Visualization utilities

To better demonstrate the Face Detector API, we have created a set of visualization tools that will be used in this colab. These will draw a bounding box around detected faces, as well as markers over certain detected points on the faces.

In [2]:
from typing import Tuple, Union, List
import math
import cv2
import numpy as np
import mediapipe as mp
from mediapipe.tasks import python
from mediapipe.tasks.python import vision
import os
import urllib.request
import time

# Styling constants read by visualize() when drawing detection results.
MARGIN = 10  # pixels; label offset from the bounding-box origin (x and y)
ROW_SIZE = 10  # pixels; extra vertical offset added to the label position
FONT_SIZE = 1  # font scale passed to cv2.putText
FONT_THICKNESS = 1  # stroke thickness passed to cv2.putText
TEXT_COLOR = (255, 0, 0)  # red on an RGB image; used for boxes and label text


def _normalized_to_pixel_coordinates(
    normalized_x: float, normalized_y: float, image_width: int,
    image_height: int) -> Union[None, Tuple[int, int]]:
  """Converts normalized value pair to pixel coordinates."""

  # Checks if the float value is between 0 and 1.
  def is_valid_normalized_value(value: float) -> bool:
    return (value > 0 or math.isclose(0, value)) and (value < 1 or math.isclose(1, value))

  if not (is_valid_normalized_value(normalized_x) and
          is_valid_normalized_value(normalized_y)):
    # TODO: Draw coordinates even if it's outside of the image bounds.
    return None
  x_px = min(math.floor(normalized_x * image_width), image_width - 1)
  y_px = min(math.floor(normalized_y * image_height), image_height - 1)
  return x_px, y_px


def visualize(
    image,
    detection_result
) -> np.ndarray:
  """Draws bounding boxes, keypoints and score labels on a copy of the image.

  Args:
    image: The input image as a (height, width, channels) array. Colors used
      here are written as RGB triples.
    detection_result: Object whose `detections` each provide `bounding_box`,
      `keypoints` and `categories`.
  Returns:
    A copy of `image` with the annotations drawn; the input is not modified.
  """
  annotated_image = image.copy()
  height, width, _ = image.shape
  keypoint_color, keypoint_radius, keypoint_thickness = (0, 255, 0), 10, 2

  for detection in detection_result.detections:
    # Draw bounding_box
    bbox = detection.bounding_box
    start_point = bbox.origin_x, bbox.origin_y
    end_point = bbox.origin_x + bbox.width, bbox.origin_y + bbox.height
    cv2.rectangle(annotated_image, start_point, end_point, TEXT_COLOR, 3)

    # Draw keypoints
    for keypoint in detection.keypoints:
      keypoint_px = _normalized_to_pixel_coordinates(keypoint.x, keypoint.y,
                                                     width, height)
      if keypoint_px is None:
        # The keypoint lies outside the image; there is nothing to draw.
        continue
      # cv2.circle takes (image, center, radius, color, thickness).
      cv2.circle(annotated_image, keypoint_px, keypoint_radius,
                 keypoint_color, keypoint_thickness)

    # Draw label and score
    category = detection.categories[0]
    category_name = category.category_name
    category_name = '' if category_name is None else category_name
    probability = round(category.score, 2)
    result_text = f'{category_name} ({probability})'
    text_location = (MARGIN + bbox.origin_x,
                     MARGIN + ROW_SIZE + bbox.origin_y)
    cv2.putText(annotated_image, result_text, text_location, cv2.FONT_HERSHEY_PLAIN,
                FONT_SIZE, TEXT_COLOR, FONT_THICKNESS)

  return annotated_image

## Running inference and visualizing the results

The final step is to run face detection on your selected image. This involves creating your FaceDetector object, loading your image, running detection, and finally, the optional step of displaying the image with visualizations.

You can check out the [MediaPipe documentation](https://developers.google.com/mediapipe/solutions/vision/face_detector/python) to learn more about configuration options that this solution supports.

## Video Face Detection

Now let's run face detection on video. The `process_video` function below reads the input frame by frame, runs the face detector on each frame, and optionally writes the annotated frames to an output video file.


In [3]:
# Video processing function
def process_video(video_path=0, output_path=None):
    """
    Run face detection on every frame of a video source.

    Frames are read until the source stops delivering them, so a live
    camera source keeps running until a read fails.

    NOTE(review): this function uses a module-level `detector` object
    (anything exposing `detect(mp_image)`); no visible cell creates it, so it
    must be defined before calling this function.

    Args:
        video_path: Path to input video file, or a camera index. The default
            0 is passed straight to cv2.VideoCapture.
        output_path: Path to save output video (optional)
    """
    # Initialize video capture
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        cap.release()
        print(f"Could not open video source: {video_path}")
        return

    out = None
    frame_count = 0

    try:
        # Some sources report 0 FPS; a writer created with 0 FPS yields an
        # unplayable file, so fall back to a nominal rate.
        fps = cap.get(cv2.CAP_PROP_FPS) or 30
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

        print(f"Video properties: {width}x{height} @ {fps:.1f} FPS")

        # Setup video writer if output path is provided
        if output_path:
            fourcc = cv2.VideoWriter_fourcc(*'mp4v')
            out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

        while True:
            ret, frame = cap.read()
            if not ret:
                break

            # Convert BGR to RGB for MediaPipe
            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

            # Create MediaPipe Image object
            mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=rgb_frame)

            # Detect faces
            detection_result = detector.detect(mp_image)

            # Annotate the original BGR frame so it can be written as-is
            annotated_frame = visualize(frame, detection_result)

            # Save frame if output path is provided
            if out is not None:
                out.write(annotated_frame)

            frame_count += 1

            # Log progress every 10th frame to keep the output short
            if frame_count % 10 == 0:
                print(f"Processed {frame_count} frames")
    finally:
        # Release resources even if detection or drawing raised
        cap.release()
        if out is not None:
            out.release()

    print(f"Video processing completed. Total frames: {frame_count}")


In [4]:
# Input for the face-detection driver cell: set to a video file path to
# process that file; the value 0 skips file processing.
VIDEO_FILE = 0

In [5]:
# Run face detection on VIDEO_FILE when one was configured; with the
# placeholder value 0 only a message is printed.
if VIDEO_FILE == 0:
    print("Processing live video")
else:
    print("Processing video for face detection...")

    # Annotated frames are written next to the notebook
    output_video_path = "output_with_faces.mp4"
    process_video(VIDEO_FILE, output_video_path)

    print(f"Output video saved as: {output_video_path}")


Processing live video


## Eye Gaze Detection

Now let's implement eye gaze detection to determine whether the person is looking at the screen/camera. We'll use the MediaPipe Face Landmarker task to obtain the face mesh landmarks (including the eye contours and iris centers) and estimate the gaze direction from them.


In [6]:
# Download the Face Landmarker model bundle (saved as face_landmarker.task)
!wget -q -O face_landmarker.task https://storage.googleapis.com/mediapipe-models/face_landmarker/face_landmarker/float16/1/face_landmarker.task


In [7]:
# Eye landmark indices into the face landmark list.
# NOTE(review): "left"/"right" here appear to follow the image (viewer) side;
# MediaPipe's own tables label the 33/133 contour as the subject's right eye.
# Confirm the intended convention.
#
# Each list starts at one eye corner (position 0) and holds the opposite
# corner at position 8; the corner indices match the *_OUTER / *_INNER
# constants below.
# "Left" eye contour (corners 33 and 133)
LEFT_EYE_INDICES = [33, 7, 163, 144, 145, 153, 154, 155, 133, 173, 157, 158, 159, 160, 161, 246]
# "Right" eye contour (corners 362 and 263)
RIGHT_EYE_INDICES = [362, 382, 381, 380, 374, 373, 390, 249, 263, 466, 388, 387, 386, 385, 384, 398]

# Key eye points for gaze calculation: the *_CENTER values are passed as the
# eye-center (iris) index and the *_INNER / *_OUTER values as the two
# horizontal eye corners.
LEFT_EYE_CENTER = 468
RIGHT_EYE_CENTER = 473
LEFT_EYE_INNER = 133
LEFT_EYE_OUTER = 33
RIGHT_EYE_INNER = 362
RIGHT_EYE_OUTER = 263


In [8]:
def calculate_gaze_ratio(eye_landmarks: List[Tuple[float, float]], 
                        p1_idx: int, 
                        p2_idx: int,
                        center_idx: int) -> float:
    """
    Locate the eye center horizontally between the two eye corners.

    The corners may be given in either order; the one with the smaller x is
    treated as the left end of the eye.

    Args:
        eye_landmarks: List of all face landmark coordinates.
        p1_idx: Index of the first horizontal eye corner.
        p2_idx: Index of the second horizontal eye corner.
        center_idx: Index of the eye center (iris).
    Returns:
        Gaze ratio clipped to [0.0, 1.0] (0.0 = looking left, 1.0 = looking
        right, 0.5 = looking straight). 0.5 is also returned when both
        corners share the same x.
    """
    first_x = np.array(eye_landmarks[p1_idx])[0]
    second_x = np.array(eye_landmarks[p2_idx])[0]
    iris_x = np.array(eye_landmarks[center_idx])[0]

    # Order the corners along the x axis
    if first_x < second_x:
        left_x, right_x = first_x, second_x
    else:
        left_x, right_x = second_x, first_x

    span = right_x - left_x
    if span == 0:
        # Degenerate eye width: report a centered gaze
        return 0.5

    # The iris can sit slightly outside the corners, hence the clipping
    return np.clip((iris_x - left_x) / span, 0.0, 1.0)

In [9]:
def calculate_eye_aspect_ratio(eye_landmarks: List[Tuple[float, float]]) -> float:
    """
    Calculate Eye Aspect Ratio (EAR) to detect if eyes are open.

    Two input layouts are supported:

    * 16 or more points, ordered like LEFT_EYE_INDICES / RIGHT_EYE_INDICES:
      one corner at position 0, seven points along one eyelid, the opposite
      corner at position 8, then seven points along the other eyelid running
      back towards the first corner. Positions 13/3 and 11/5 then lie
      opposite each other across the eye opening.
    * Fewer than 16 points: the classic six-point layout (corners at 0 and 3,
      one eyelid at 1-2, the other at 4-5).

    Args:
        eye_landmarks: List of eye landmark coordinates
    Returns:
        EAR value (lower values indicate closed eyes). A neutral default of
        0.3 is returned when the corners are missing or coincide.
    """
    default_ear = 0.3
    point_count = len(eye_landmarks)

    if point_count >= 16:
        # Contour layout: treating positions 0-5 as a six-point eye here
        # would compare points on the same eyelid with each other.
        corner_a, corner_b = 0, 8
        lid_pairs = ((13, 3), (11, 5))
    else:
        corner_a, corner_b = 0, 3
        lid_pairs = ((1, 4), (1, 5), (2, 4), (2, 5))

    # Without both corners there is no eye width to normalize by
    if point_count <= max(corner_a, corner_b):
        return default_ear

    horizontal_dist = np.linalg.norm(
        np.array(eye_landmarks[corner_a]) - np.array(eye_landmarks[corner_b])
    )
    if horizontal_dist == 0:
        return default_ear  # Default value if no horizontal distance

    # Distances across the eye opening, skipping pairs the input lacks
    vertical_distances = [
        np.linalg.norm(np.array(eye_landmarks[a]) - np.array(eye_landmarks[b]))
        for a, b in lid_pairs
        if a < point_count and b < point_count
    ]
    avg_vertical = np.mean(vertical_distances) if vertical_distances else 0.0

    return float(avg_vertical / horizontal_dist)


In [10]:
def detect_attention_status(face_landmarks) -> dict:
    """
    Detect if person is looking at screen/camera based on eye gaze.

    Only the first face in `face_landmarks` is evaluated.

    Args:
        face_landmarks: MediaPipe face landmarks (one landmark list per face;
            each landmark exposes `.x` and `.y`).
    Returns:
        Dictionary with the same keys on every path:
        "looking_at_screen" (bool), "confidence" (float in [0, 1]),
        "gaze_direction" ("no_face_detected", "eyes_closed", "looking_left",
        "looking_right" or "looking_straight"), "gaze_ratio" (float, or None
        when not computed), "ear" (float, or None when no face was found),
        "eyes_open" (bool) and "student_status" (display string).
    """
    status_looking = "Student looking at the screen"
    status_not_looking = "Student not looking at the screen"
    ear_open_threshold = 0.20     # average EAR above this counts as eyes open
    min_screen_confidence = 0.90  # i.e. gaze ratio within 0.05 of center

    if not face_landmarks:
        return {
            "looking_at_screen": False,
            "confidence": 0.0,
            "gaze_direction": "no_face_detected",
            "gaze_ratio": None,
            "ear": None,
            "eyes_open": False,
            "student_status": status_not_looking
        }

    landmarks = face_landmarks[0]
    all_landmarks = [(lm.x, lm.y) for lm in landmarks]

    # Extract eye landmarks for EAR calculation
    left_eye_landmarks = [all_landmarks[i] for i in LEFT_EYE_INDICES]
    right_eye_landmarks = [all_landmarks[i] for i in RIGHT_EYE_INDICES]

    left_ear = calculate_eye_aspect_ratio(left_eye_landmarks)
    right_ear = calculate_eye_aspect_ratio(right_eye_landmarks)
    avg_ear = float((left_ear + right_ear) / 2.0)

    eyes_open = avg_ear > ear_open_threshold

    if not eyes_open:
        # Gaze is not estimated for closed eyes
        return {
            "looking_at_screen": False,
            "confidence": 0.0,
            "gaze_direction": "eyes_closed",
            "gaze_ratio": None,
            "ear": avg_ear,
            "eyes_open": False,
            "student_status": status_not_looking
        }

    # calculate_gaze_ratio orders the two corners by x itself, so the
    # inner/outer order does not need to be swapped between the eyes.
    left_gaze_ratio = calculate_gaze_ratio(
        all_landmarks, LEFT_EYE_INNER, LEFT_EYE_OUTER, LEFT_EYE_CENTER
    )
    right_gaze_ratio = calculate_gaze_ratio(
        all_landmarks, RIGHT_EYE_INNER, RIGHT_EYE_OUTER, RIGHT_EYE_CENTER
    )

    avg_gaze_ratio = float((left_gaze_ratio + right_gaze_ratio) / 2.0)

    # Coarse direction label
    if avg_gaze_ratio < 0.4:
        gaze_direction = "looking_left"
    elif avg_gaze_ratio > 0.6:
        gaze_direction = "looking_right"
    else:
        gaze_direction = "looking_straight"

    # Confidence is 1.0 at center (0.5) and 0.0 at the edges (0 or 1)
    center_distance = abs(avg_gaze_ratio - 0.5)
    confidence = max(0.0, 1 - center_distance * 2)

    # Stricter than the "looking_straight" band above: a frame can be
    # labelled straight and still not count as looking at the screen.
    looking_at_screen = confidence >= min_screen_confidence

    return {
        "looking_at_screen": looking_at_screen,
        "confidence": confidence,
        "gaze_direction": gaze_direction,
        "gaze_ratio": avg_gaze_ratio,
        "ear": avg_ear,
        "eyes_open": eyes_open,
        "student_status": status_looking if looking_at_screen else status_not_looking
    }

In [11]:
def visualize_attention(image, face_landmarks, attention_status):
    """
    Visualize eye gaze and attention status on image.

    Args:
        image: Input image as a (height, width, channels) array. Callers in
            this notebook pass OpenCV frames, so the color comments below
            assume BGR channel order.
        face_landmarks: MediaPipe face landmarks (one landmark list per face;
            only the first face is drawn). May be empty.
        attention_status: Dictionary with attention metrics; the keys
            "looking_at_screen", "confidence", "gaze_direction" and "ear" are
            read when a face is present.
    Returns:
        Annotated copy of the image; the input is not modified.
    """
    annotated_image = image.copy()
    height, width, _ = image.shape

    if not face_landmarks:
        # Always show a status line, even when no face is detected
        texts = ["Student not looking at the screen"]
        color = (0, 0, 255)  # Red
    else:
        landmarks = face_landmarks[0]
        landmark_count = len(landmarks)

        # Draw eye landmarks
        for idx in LEFT_EYE_INDICES + RIGHT_EYE_INDICES:
            if idx < landmark_count:
                x = int(landmarks[idx].x * width)
                y = int(landmarks[idx].y * height)
                cv2.circle(annotated_image, (x, y), 2, (0, 255, 0), -1)

        # Draw eye centers, with the same bounds check as the contour points
        for idx in (LEFT_EYE_CENTER, RIGHT_EYE_CENTER):
            if idx < landmark_count:
                center_x = int(landmarks[idx].x * width)
                center_y = int(landmarks[idx].y * height)
                cv2.circle(annotated_image, (center_x, center_y), 5, (255, 0, 0), -1)

        # Prepare status text
        looking = attention_status['looking_at_screen']
        texts = [
            f"Looking at screen: {'YES' if looking else 'NO'}",
            f"Confidence: {attention_status['confidence']:.2f}",
            f"Gaze: {attention_status['gaze_direction']}",
            f"EAR: {attention_status['ear']:.3f}",
        ]

        # Green when attentive, red otherwise
        color = (0, 255, 0) if looking else (0, 0, 255)

    # Draw text with background
    font = cv2.FONT_HERSHEY_SIMPLEX
    font_scale = 1.2
    thickness = 3

    y_offset = 30

    for text in texts:
        # Get text size for background rectangle
        (text_width, text_height), _baseline = cv2.getTextSize(text, font, font_scale, thickness)

        # Draw background rectangle
        cv2.rectangle(annotated_image,
                      (10, y_offset - text_height - 5),
                      (10 + text_width, y_offset + 5),
                      (0, 0, 0), -1)

        # Draw text
        cv2.putText(annotated_image, text, (10, y_offset), font, font_scale, color, thickness)
        y_offset += 35

    return annotated_image


In [12]:
# Initialize Face Landmarker from the downloaded model bundle, tracking a
# single face and requesting blendshapes and transformation matrices.
# NOTE(review): no running_mode is passed here, while the video and webcam
# functions call face_landmarker.detect_for_video(). A later cell rebinds
# `face_landmarker` with running_mode=vision.RunningMode.VIDEO, replacing
# this instance. Confirm whether this cell is still needed.
base_options = python.BaseOptions(model_asset_path='face_landmarker.task')
options = vision.FaceLandmarkerOptions(
    base_options=base_options,
    output_face_blendshapes=True,
    output_facial_transformation_matrixes=True,
    num_faces=1
)
face_landmarker = vision.FaceLandmarker.create_from_options(options)


I0000 00:00:1759224976.374987 43223734 gl_context.cc:369] GL version: 2.1 (2.1 Metal - 89.4), renderer: Apple M4
W0000 00:00:1759224976.375534 43223734 face_landmarker_graph.cc:174] Sets FaceBlendshapesGraph acceleration to xnnpack by default.
INFO: Created TensorFlow Lite XNNPACK delegate for CPU.
W0000 00:00:1759224976.378823 43225873 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1759224976.384721 43225878 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


In [13]:
# Real-time eye gaze detection for video
def process_video_with_attention(video_path, output_path=None):
    """
    Process video file for eye gaze detection and attention monitoring.

    Each frame is run through the module-level `face_landmarker`, classified
    with detect_attention_status(), annotated, shown in an OpenCV window and
    optionally written to `output_path`. Pressing ESC stops early.

    NOTE(review): timestamps restart at 0 on every call while
    `face_landmarker` is shared module state. MediaPipe's video mode
    presumably expects increasing timestamps per landmarker instance, so
    confirm before running more than one video (or the webcam loop and then
    a video) on the same landmarker.

    Args:
        video_path: Path to input video file
        output_path: Path to save output video (optional)
    Returns:
        Dictionary of frame counters: "total_frames",
        "looking_at_screen_frames", "eyes_closed_frames" and
        "looking_away_frames" (frames without a detected face are counted
        as looking away).
    """
    attention_stats = {
        "total_frames": 0,
        "looking_at_screen_frames": 0,
        "eyes_closed_frames": 0,
        "looking_away_frames": 0
    }

    # Initialize video capture
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        cap.release()
        print(f"Could not open video source: {video_path}")
        print("No frames processed.")
        return attention_stats

    writer = None
    frame_count = 0
    last_timestamp_ms = 0

    try:
        # Get video properties (some sources report 0 FPS)
        fps = cap.get(cv2.CAP_PROP_FPS) or 30
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

        print(f"Video properties: {width}x{height} @ {fps:.1f} FPS")

        # Setup video writer if output path is provided
        if output_path:
            fourcc = cv2.VideoWriter_fourcc(*'mp4v')
            writer = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

        while True:
            ret, frame = cap.read()
            if not ret:
                break

            # Convert BGR to RGB for MediaPipe
            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

            # Create MediaPipe Image object
            mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=rgb_frame)

            # Derive a strictly increasing timestamp from the frame index
            if fps > 0:
                frame_timestamp_ms = int(frame_count * (1000.0 / fps))
            else:
                frame_timestamp_ms = last_timestamp_ms + 33
            if frame_timestamp_ms <= last_timestamp_ms:
                frame_timestamp_ms = last_timestamp_ms + 1
            last_timestamp_ms = frame_timestamp_ms

            # Detect face landmarks
            detection_result = face_landmarker.detect_for_video(mp_image, frame_timestamp_ms)

            # Detect attention status
            attention_status = detect_attention_status(detection_result.face_landmarks)

            # Update statistics
            attention_stats["total_frames"] += 1
            if attention_status["gaze_direction"] == "eyes_closed":
                attention_stats["eyes_closed_frames"] += 1
            elif attention_status["looking_at_screen"]:
                attention_stats["looking_at_screen_frames"] += 1
            else:
                attention_stats["looking_away_frames"] += 1

            # Visualize results
            annotated_frame = visualize_attention(frame, detection_result.face_landmarks, attention_status)

            # Display frame (local)
            cv2.imshow('Attention Detection (Video)', annotated_frame)

            # Save frame if output path is provided
            if writer is not None:
                writer.write(annotated_frame)

            frame_count += 1

            # Log stats every 10th frame
            if frame_count % 10 == 0:
                print(f"Processed {frame_count} frames")
                print(f"Attention stats: {attention_stats}")

            # Handle key events
            if cv2.waitKey(1) & 0xFF == 27:  # ESC to quit
                break
    finally:
        # Release resources even if detection or drawing raised
        cap.release()
        if writer is not None:
            writer.release()
        cv2.destroyAllWindows()

    total_frames = attention_stats["total_frames"]
    if total_frames == 0:
        print("No frames processed.")
        return attention_stats

    # Print final statistics
    print(f"\nVideo processing completed. Total frames: {frame_count}")
    print(f"Final attention statistics:")
    print(f"- Looking at screen: {attention_stats['looking_at_screen_frames']} frames ({attention_stats['looking_at_screen_frames']/total_frames*100:.1f}%)")
    print(f"- Looking away: {attention_stats['looking_away_frames']} frames ({attention_stats['looking_away_frames']/total_frames*100:.1f}%)")
    print(f"- Eyes closed: {attention_stats['eyes_closed_frames']} frames ({attention_stats['eyes_closed_frames']/total_frames*100:.1f}%)")

    return attention_stats


In [14]:
# Re-create the module-level `face_landmarker` in VIDEO running mode,
# downloading the model bundle first when it is not present locally.
MODEL_URL = "https://storage.googleapis.com/mediapipe-models/face_landmarker/face_landmarker/float16/1/face_landmarker.task"
MODEL_PATH = "face_landmarker.task"
if not os.path.exists(MODEL_PATH):
    print("Downloading face_landmarker.task ...")
    urllib.request.urlretrieve(MODEL_URL, MODEL_PATH)
    print("Download complete.")

# NOTE(review): BaseOptions is built with only model_asset_path, so no GPU
# delegate is explicitly requested anywhere in this cell. The "GPU" wording
# in the messages below therefore looks inaccurate. Confirm.
base_options = python.BaseOptions(model_asset_path=MODEL_PATH)

# First attempt: full options (blendshapes and transformation matrices).
# On any exception, retry with a reduced option set that omits those outputs.
try:
    options = vision.FaceLandmarkerOptions(
        base_options=base_options,
        running_mode=vision.RunningMode.VIDEO,
        num_faces=1,
        output_face_blendshapes=True,
        output_facial_transformation_matrixes=True
    )
    face_landmarker = vision.FaceLandmarker.create_from_options(options)
    print("FaceLandmarker initialized (GPU delegate if available).")
except Exception as e:
    # Broad catch: any construction failure triggers the reduced-options retry
    print("GPU delegate not available or failed, falling back to CPU:", e)
    options = vision.FaceLandmarkerOptions(
        base_options=base_options,
        running_mode=vision.RunningMode.VIDEO,
        num_faces=1
    )
    face_landmarker = vision.FaceLandmarker.create_from_options(options)
    print("FaceLandmarker initialized on CPU.")


FaceLandmarker initialized (GPU delegate if available).


I0000 00:00:1759224976.401042 43223734 gl_context.cc:369] GL version: 2.1 (2.1 Metal - 89.4), renderer: Apple M4
W0000 00:00:1759224976.401241 43223734 face_landmarker_graph.cc:174] Sets FaceBlendshapesGraph acceleration to xnnpack by default.
W0000 00:00:1759224976.404621 43225879 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1759224976.410414 43225882 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


In [None]:
# Real-time webcam streaming (local)
def _open_camera(preferred=(0, 1, 2), backend=cv2.CAP_AVFOUNDATION):
    """
    Open the first camera that can be opened.

    Every index is tried with `backend` first. If none opens, the indices are
    tried again with cv2.CAP_ANY so that OpenCV can pick a backend that
    exists on the current platform (the default backend is macOS-specific).

    Args:
        preferred: Camera indices to try, in order; duplicates are tried once.
        backend: OpenCV capture backend to try first.
    Returns:
        Tuple of (opened cv2.VideoCapture, index that was opened).
    Raises:
        RuntimeError: If no index could be opened with any backend.
    """
    # dict.fromkeys removes repeated indices while keeping the caller's order
    candidates = tuple(dict.fromkeys(preferred))
    backends = (backend,) if backend == cv2.CAP_ANY else (backend, cv2.CAP_ANY)

    for api in backends:
        for idx in candidates:
            cap = cv2.VideoCapture(idx, api)
            if cap.isOpened():
                return cap, idx
            cap.release()
    raise RuntimeError("No available camera (tried: %s)" % (preferred,))

def run_webcam(device_index: int = 0, save_path: Union[str, None] = None):
    """
    Run attention detection on a live camera until ESC is pressed.

    Each frame is passed to the module-level `face_landmarker`, classified
    with detect_attention_status(), annotated, shown in an OpenCV window and
    optionally recorded.

    Args:
        device_index: Camera index to try first; indices 0, 1 and 2 are used
            as fallbacks.
        save_path: Path of the recording to write, or None to disable
            recording.
    Raises:
        RuntimeError: If no camera could be opened.
    """
    try:
        # _open_camera only returns captures that are already opened
        cap, device_index = _open_camera((device_index, 0, 1, 2))
    except RuntimeError as e:
        raise RuntimeError(f"Cannot open camera index {device_index}. {e}") from e

    writer = None
    last_timestamp_ms = 0

    try:
        fps = cap.get(cv2.CAP_PROP_FPS) or 30
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        print(f"Camera opened: {width}x{height} @ ~{fps:.1f} FPS")

        # Optional recorder
        if save_path:
            fourcc = cv2.VideoWriter_fourcc(*'mp4v')
            writer = cv2.VideoWriter(save_path, fourcc, fps, (width, height))
            print(f"Recording to: {save_path}")

        while True:
            ret, frame = cap.read()
            if not ret:
                break

            rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=rgb)

            # Wall-clock based timestamp, forced to be strictly increasing
            timestamp_ms = int(time.monotonic() * 1000)
            if timestamp_ms <= last_timestamp_ms:
                timestamp_ms = last_timestamp_ms + 1
            last_timestamp_ms = timestamp_ms

            result = face_landmarker.detect_for_video(mp_image, timestamp_ms)

            attention_status = detect_attention_status(result.face_landmarks)
            annotated = visualize_attention(frame, result.face_landmarks, attention_status)

            cv2.imshow('Attention Detection (Local)', annotated)
            if writer is not None:
                writer.write(annotated)

            if cv2.waitKey(1) & 0xFF == 27:  # ESC to quit
                break
    finally:
        # Single cleanup path for normal exit, ESC and exceptions
        cap.release()
        if writer is not None:
            writer.release()
        cv2.destroyAllWindows()

# Start the webcam loop on camera index 0 and record the annotated frames;
# the loop runs until ESC is pressed or a frame read fails.
run_webcam(0, 'attn_recording.mp4')  # pass None to disable recording


Camera opened: 1920x1080 @ ~15.0 FPS
Recording to: attn_recording.mp4
