# In [1]:
import cv2
import mediapipe as mp
import numpy as np
import time
import threading
import win32com.client

# -------------------------------------------------
# Text-to-speech setup.
# One SAPI voice object is created here at startup. speak_async() calls it
# from a background thread so that speaking never freezes the video loop.
# -------------------------------------------------
speaker = win32com.client.Dispatch("SAPI.SpVoice")

def speak_async(text):
    """Speak ``text`` aloud without blocking the caller.

    Every call starts a fresh daemon thread that invokes the module-level
    ``speaker`` there, so the camera loop stays real-time while the sentence
    is spoken. Because the thread is a daemon it does not keep the program
    alive at exit.
    """
    # NOTE(review): the voice object is created on the main thread but used
    # from worker threads -- confirm this is safe for COM on the target setup.
    def _say():
        speaker.Speak(text)

    worker = threading.Thread(target=_say, daemon=True)
    worker.start()

# -------------------------------------------------
# Basic geometry helpers.
# We only need distance between two points.
# -------------------------------------------------
def dist(p1, p2):
    """Return the Euclidean distance between points ``p1`` and ``p2``."""
    first = np.array(p1)
    second = np.array(p2)
    offset = first - second
    return np.linalg.norm(offset)

# -------------------------------------------------
# Eye Aspect Ratio.
# Lower value means eyes are more closed.
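# EAR = (A + B) / (2 * C), where A and B are the two vertical eyelid
# distances and C is the horizontal corner-to-corner distance.
# This is the six-landmark EAR formulation from the eye-blink detection
# literature (cf. Soukupova & Cech, 2016).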
# -------------------------------------------------
def eye_aspect_ratio(eye):
    """Return the eye aspect ratio for a sequence of six eye points.

    Points 0 and 3 form the horizontal pair; points (1, 5) and (2, 4) form
    the two vertical pairs. The result is the sum of the two vertical
    distances divided by twice the horizontal distance.
    """
    # NOTE(review): a zero horizontal distance is not guarded against here.
    vertical_sum = dist(eye[1], eye[5]) + dist(eye[2], eye[4])
    width = dist(eye[0], eye[3])
    return vertical_sum / (2.0 * width)

# -------------------------------------------------
# Mouth Aspect Ratio.
# Because it is a ratio, it is scale-independent: it works for faces both near to and far from the camera.
# Higher value means mouth is more open.
# -------------------------------------------------
def mouth_aspect_ratio(landmarks):
    """Return the mouth opening height divided by the mouth width.

    Only four entries of ``landmarks`` are read: indices 13 and 14 give the
    vertical gap, indices 78 and 308 give the horizontal span (MediaPipe lip
    landmark indices, per the original author).
    """
    gap = dist(landmarks[13], landmarks[14])
    width = dist(landmarks[78], landmarks[308])
    return gap / width

# -------------------------------------------------
# Draws a semi-transparent panel on the screen.
# Used for metrics and system info boxes.
# -------------------------------------------------
def draw_panel(frame, x1, y1, x2, y2, color=(50,60,70), alpha=0.6):
    """Blend a filled rectangle onto ``frame`` in place.

    A copy of the frame gets an opaque rectangle from (x1, y1) to (x2, y2) in
    ``color``; that copy is then mixed back into ``frame`` with weight
    ``alpha`` (and ``1 - alpha`` for the untouched frame), which makes the
    panel look semi-transparent.
    """
    top_left = (x1, y1)
    bottom_right = (x2, y2)

    tinted = frame.copy()
    cv2.rectangle(tinted, top_left, bottom_right, color, thickness=cv2.FILLED)

    # The last argument is the destination: the blend is written into frame.
    cv2.addWeighted(tinted, alpha, frame, 1 - alpha, 0, frame)

# -------------------------------------------------
# MediaPipe Face Mesh initialization.
# max_num_faces=1: track a single face only, which saves CPU
# (this is a driver monitor, not a crowd tracker).
# refine_landmarks=True: enables iris landmarks and gives better accuracy
# around the eyes.
# -------------------------------------------------
mp_face = mp.solutions.face_mesh
face_mesh = mp_face.FaceMesh(
    max_num_faces=1,
    refine_landmarks=True
)
# Face-mesh landmark indices for each eye, in the order eye_aspect_ratio()
# expects: positions 0 and 3 are the horizontal pair, positions (1, 5) and
# (2, 4) are the two vertical pairs.
LEFT_EYE  = [33,160,158,133,153,144]
RIGHT_EYE = [362,385,387,263,373,380]

# Thresholds decided empirically.
# EAR below EYE_THRESH is treated as "eyes closed";
# MAR above MOUTH_THRESH is treated as "mouth open".
EYE_THRESH = 0.23
MOUTH_THRESH = 0.40

# -------------------------------------------------
# Alert timing (all values are in seconds).
# ALERT_COOLDOWN = 5
# After an alert is spoken, wait 5 seconds before the same kind of alert may
# be spoken again. Without a cooldown the alert would be re-triggered on
# every frame (30-60 times per second) and the system would be unusable.
# Why 5 seconds? 2 seconds is too frequent, 10 seconds is too late,
# 5 seconds is a balance between the two.
# -------------------------------------------------
ALERT_COOLDOWN = 5
DROWSY_TIME_LIMIT = 1.5 # seconds the eyes must stay closed before a drowsiness alert
YAWN_TIME_LIMIT = 1.5 # seconds the mouth must stay open to be counted as a yawn


# -------------------------------------------------
# Runtime state variables.
# These change continuously while the program runs.
# -------------------------------------------------
eye_closed_start = None # Time when the eyes first closed; None while they are open.
mouth_open_start = None # Time when the mouth first opened (set together with mouth_open).

mouth_open = False # Flag that prevents multiple yawn counts for one yawn.
yawn_count = 0 # Yawns counted since the last spoken yawn alert.
yawn_state = "Not Yawning" # Label shown in the metrics panel.
drowsy_level = 0 # 0-100; shown as a percentage and as a bar.

last_drowsy_alert = 0 # Time of the last spoken drowsiness alert (0 = none yet).
last_yawn_alert = 0 # Time of the last spoken yawn alert (0 = none yet).

# -------------------------------------------------
# On-screen display.
# Draws both info panels, the drowsiness bar and the FPS counter.
# -------------------------------------------------
def _draw_hud(frame, ear, yawn_state, yawn_count, face_count, drowsy_level, fps):
    """Render the metrics panel, system-info panel and FPS counter on ``frame``.

    The frame is modified in place. Every displayed value is passed in
    explicitly, so this helper does not read the loop's module-level state.
    """
    font = cv2.FONT_HERSHEY_SIMPLEX
    # Colours are BGR, like the captured frame.
    white = (255,255,255)
    yellow = (0,255,255)
    green = (0,255,0)
    red = (0,0,255)
    frame_width = frame.shape[1]

    # Left panel: metrics.
    draw_panel(frame, 10, 10, 350, 190)
    for text, origin, scale, color in (
        ("METRICS", (20,35), 0.7, white),
        (f"EAR: {ear:.2f}", (20,70), 0.6, yellow),
        (f"Yawn State: {yawn_state}", (20,105), 0.6, yellow),
        (f"Yawn Count: {yawn_count}", (20,140), 0.6, yellow),
    ):
        cv2.putText(frame, text, origin, font, scale, color, 2)

    # Right panel: system info.
    draw_panel(frame, 360, 10, 760, 190)
    for text, origin, scale, color in (
        ("SYSTEM INFO", (390,35), 0.7, white),
        (f"Faces: {face_count}", (390,70), 0.6, yellow),
        (f"Drowsiness: {drowsy_level}%", (390,105), 0.6, yellow),
    ):
        cv2.putText(frame, text, origin, font, scale, color, 2)

    # Drowsiness bar: 3 pixels per percent, red from 70 % upwards.
    bar_color = red if drowsy_level >= 70 else green
    cv2.rectangle(frame, (390,135),
                  (390 + int(drowsy_level * 3), 155),
                  bar_color, -1)

    cv2.putText(frame, f"FPS: {fps}", (frame_width-100,30),
                font, 0.7, green, 2)


# A wide-open mouth only counts towards a yawn while the EAR is below this.
YAWN_EYE_THRESH = 0.30
# Number of counted yawns that triggers the "frequent yawning" alert.
YAWN_ALERT_COUNT = 4

# -------------------------------------------------
# Start webcam.
# -------------------------------------------------
cap = cv2.VideoCapture(0)
if not cap.isOpened():
    # The loop below exits on its first failed read; say why instead of
    # ending silently.
    print("Error: could not open webcam (device 0).")
prev_time = time.time()

# -------------------------------------------------
# Main loop.
# The try/finally guarantees that the camera is released and the window is
# closed even if the loop is interrupted or raises.
# -------------------------------------------------
try:
    while True:
        ret, frame = cap.read()
        if not ret:
            break

        # Mirror view for natural interaction
        frame = cv2.flip(frame, 1)
        h, w, _ = frame.shape

        # FPS calculation; the 1e-6 floor prevents division by zero.
        now = time.time()
        fps = int(1 / max(now - prev_time, 1e-6))
        prev_time = now

        # Convert frame to RGB for MediaPipe
        rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = face_mesh.process(rgb)

        # Values shown when no face is detected in this frame.
        ear = 0.0
        mar = 0.0
        face_count = 0

        if results.multi_face_landmarks:
            face_count = 1
            face = results.multi_face_landmarks[0]

            # MediaPipe gives normalized values (0-1); OpenCV needs pixel
            # coordinates, so scale by the frame size.
            landmarks = [(int(l.x * w), int(l.y * h)) for l in face.landmark]

            left_eye = [landmarks[i] for i in LEFT_EYE]
            right_eye = [landmarks[i] for i in RIGHT_EYE]

            ear = (eye_aspect_ratio(left_eye) + eye_aspect_ratio(right_eye)) / 2
            mar = mouth_aspect_ratio(landmarks)

            # ---------------------------
            # Drowsiness detection
            # ---------------------------
            if ear < EYE_THRESH:
                if eye_closed_start is None:
                    eye_closed_start = now
                # The level rises by 60 points per second of continuous
                # closure and is capped at 100.
                drowsy_level = min(100, int((now - eye_closed_start) * 60))
            else:
                eye_closed_start = None
                # Decays by 5 points per processed frame once the eyes open.
                drowsy_level = max(0, drowsy_level - 5)

            # ---------------------------
            # Yawn detection
            # A yawn is counted when the mouth closes again after having
            # been open for at least YAWN_TIME_LIMIT seconds.
            # ---------------------------
            if mar > MOUTH_THRESH and ear < YAWN_EYE_THRESH:
                if not mouth_open:
                    mouth_open = True
                    mouth_open_start = now
                    yawn_state = "Yawning"
            elif mouth_open:
                if now - mouth_open_start >= YAWN_TIME_LIMIT:
                    yawn_count += 1
                mouth_open = False
                yawn_state = "Not Yawning"

            # Face bounding box
            xs = [p[0] for p in landmarks]
            ys = [p[1] for p in landmarks]
            cv2.rectangle(frame, (min(xs), min(ys)), (max(xs), max(ys)), (0,255,0), 2)

        # -------------------------------------------------
        # Alert handling with cooldown protection.
        # NOTE: this runs on every frame, including frames without a detected
        # face, and then relies on the last known eye/yawn state.
        # -------------------------------------------------
        if eye_closed_start is not None and (now - eye_closed_start) >= DROWSY_TIME_LIMIT:
            if now - last_drowsy_alert > ALERT_COOLDOWN:
                speak_async("Drowsiness detected. Stay alert.")
                last_drowsy_alert = now

        if yawn_count >= YAWN_ALERT_COUNT:
            if now - last_yawn_alert > ALERT_COOLDOWN:
                speak_async("Frequent yawning detected.")
                last_yawn_alert = now
                yawn_count = 0

        # -------------------------------------------------
        # User interface drawing.
        # -------------------------------------------------
        _draw_hud(frame, ear, yawn_state, yawn_count, face_count, drowsy_level, fps)

        cv2.imshow("Driver Monitoring System", frame)

        # Press 'q' to quit.
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    cap.release()
    cv2.destroyAllWindows()