## Import

In [1]:
import cv2
import mediapipe as mp
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from ultralytics import YOLO
import threading
import time
from scipy.spatial import distance

## Glasses Detection

In [2]:
# Load the YOLO detector from the local weights file 'glasses_v2_640.pt'.
# Later cells call it as `model(image_or_frame, conf=0.4)` and map class id 0 to
# 'face_with_glasses' and class id 1 to 'face_without_glasses'.
model = YOLO('glasses_v2_640.pt', task='detect')

In [57]:
# Run the detector once per sample image with a confidence threshold of 0.4:
# result1 belongs to image_03.jpg, result2 to image_06.jpg.
result1, result2 = [
    model(image_path, conf=0.4)
    for image_path in ("image_03.jpg", "image_06.jpg")
]

# results = model("image_06.jpg", conf=0.4)
# annotated_image = results[0].plot()

# cv2.imshow('glasses_detection', annotated_image)
# cv2.waitKey(0)
# cv2.destroyAllWindows()



image 1/1 c:\Users\Leo\Desktop\AI\fairness_demonstator\image_03.jpg: 384x640 1 face_with_glasses, 1642.4ms
Speed: 8.1ms preprocess, 1642.4ms inference, 3.3ms postprocess per image at shape (1, 3, 384, 640)

image 1/1 c:\Users\Leo\Desktop\AI\fairness_demonstator\image_06.jpg: 384x640 1 face_without_glasses, 1045.8ms
Speed: 13.7ms preprocess, 1045.8ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)


In [10]:
# Live webcam demo: run the glasses detector on every frame, print the label of
# each detection and show the annotated frame until 'q' is pressed.
cap = cv2.VideoCapture(0)

if not cap.isOpened():
    cap.release()
    # exit() is an interactive-shell helper that ends the whole session; raising
    # fails only this cell and keeps the loaded model and other state alive.
    raise RuntimeError("Fehler beim Öffnen der Kamera")

# Class id -> label printed for each detection; any other id prints "no_object".
class_labels = {0: "face_with_glasses", 1: "face_without_glasses"}

try:
    while True:
        ret, frame = cap.read()
        if not ret:
            print("Fehler beim Erfassen des Frames")
            break

        results = model(frame, conf=0.4)

        detected_objects = results[0].boxes.cls
        if len(detected_objects) == 0:
            print("no_detection")
        else:
            for detected_object in detected_objects:
                print(class_labels.get(detected_object.item(), "no_object"))

        annotated_frame = results[0].plot()

        cv2.imshow('YOLOv8 Predictions', annotated_frame)

        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Always free the camera and close the window, even if inference raises or
    # the kernel is interrupted.
    cap.release()
    cv2.destroyAllWindows()


0: 480x640 1 face_without_glasses, 1178.9ms
Speed: 6.0ms preprocess, 1178.9ms inference, 5.0ms postprocess per image at shape (1, 3, 480, 640)
face_without_glasses

0: 480x640 1 face_without_glasses, 1763.6ms
Speed: 14.1ms preprocess, 1763.6ms inference, 2.5ms postprocess per image at shape (1, 3, 480, 640)
face_without_glasses

0: 480x640 2 face_without_glassess, 2246.6ms
Speed: 11.0ms preprocess, 2246.6ms inference, 3.0ms postprocess per image at shape (1, 3, 480, 640)
face_without_glasses
face_without_glasses

0: 480x640 1 face_without_glasses, 1283.3ms
Speed: 31.5ms preprocess, 1283.3ms inference, 2.3ms postprocess per image at shape (1, 3, 480, 640)
face_without_glasses

0: 480x640 1 face_without_glasses, 1030.1ms
Speed: 2.0ms preprocess, 1030.1ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)
face_without_glasses

0: 480x640 2 face_without_glassess, 722.2ms
Speed: 4.0ms preprocess, 722.2ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)
face_wi

## Blink Detection:
When a person is drowsy, the number of eye blinks per minute decreases (source: https://core.ac.uk/download/pdf/328811514.pdf).

In [3]:
def calculate_ear(eye_landmarks):
    """Return the eye aspect ratio (EAR) of one eye.

    EAR = (||P2 - P6|| + ||P3 - P5||) / (2 * ||P1 - P4||)

    Args:
        eye_landmarks: Sequence of at least six landmarks ordered P1..P6, each
            exposing ``.x`` and ``.y`` attributes.

    Returns:
        The EAR. When the two eye corners P1 and P4 coincide the ratio is
        undefined; ``float("inf")`` is returned in that case so that a
        ``ear < threshold`` check never treats the degenerate frame as a blink
        and no NumPy divide-by-zero warning is emitted.
    """
    points = [np.array([landmark.x, landmark.y], dtype=float) for landmark in eye_landmarks]

    # Two vertical eyelid distances.
    vertical_a = np.linalg.norm(points[1] - points[5])
    vertical_b = np.linalg.norm(points[2] - points[4])
    # Horizontal distance between the eye corners.
    horizontal = np.linalg.norm(points[0] - points[3])

    if horizontal == 0:
        return float("inf")
    return (vertical_a + vertical_b) / (2.0 * horizontal)

In [23]:
# Hard-coded stand-ins for the detector's label. The blink-detection cell below
# reads detected_object1 to choose its EAR threshold; detected_object2 is not
# read by any cell visible in this notebook.
detected_object1 = 'face_without_glasses'
detected_object2 = 'face_with_glasses'

In [20]:
# Blink detection on the webcam stream: MediaPipe Face Mesh supplies the eye
# landmarks, calculate_ear() turns them into an eye aspect ratio (EAR), and an
# EAR below `treshhold` is treated as a closed eye.
mp_face_mesh = mp.solutions.face_mesh
mp_drawing = mp.solutions.drawing_utils

face_mesh = mp_face_mesh.FaceMesh(
    max_num_faces=1,
    refine_landmarks=True,
    min_detection_confidence=0.5,
    min_tracking_confidence=0.5
)

LEFT_EYE = [33, 160, 158, 133, 153, 144]  # P1=33, P2=160, P3=158, P4=133, P5=153, P6=144
RIGHT_EYE = [362, 385, 387, 263, 373, 380]  # P1=362, P2=385, P3=387, P4=263, P5=373, P6=380

# The EAR threshold depends on the (hard-coded) glasses label.
if detected_object1 == 'face_without_glasses':
    treshhold = 0.25
elif detected_object1 == 'face_with_glasses':
    treshhold = 0.005
else:
    treshhold = 0.05

cap = cv2.VideoCapture(0)

if not cap.isOpened():
    cap.release()
    # Fail loudly instead of silently leaving the loop on the first read.
    raise RuntimeError("Fehler beim Öffnen der Kamera")

blink_count = 0
last_blink_time = time.time()
no_blink_start_time = time.time()
eye_closed = False  # True while the EAR stays below the threshold

try:
    while True:
        ret, frame = cap.read()
        if not ret:
            break

        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = face_mesh.process(rgb_frame)

        if results.multi_face_landmarks:
            for face_landmarks in results.multi_face_landmarks:
                left_eye_landmarks = [face_landmarks.landmark[idx] for idx in LEFT_EYE]
                right_eye_landmarks = [face_landmarks.landmark[idx] for idx in RIGHT_EYE]

                left_ear = calculate_ear(left_eye_landmarks)
                right_ear = calculate_ear(right_eye_landmarks)
                ear = (left_ear + right_ear) / 2.0

                # Draw the twelve eye landmarks, scaled to pixel coordinates.
                for idx in LEFT_EYE + RIGHT_EYE:
                    x = int(face_landmarks.landmark[idx].x * frame.shape[1])
                    y = int(face_landmarks.landmark[idx].y * frame.shape[0])
                    cv2.circle(frame, (x, y), 1, (0, 255, 0), -1)

                if ear < treshhold:  # threshold for blinking
                    cv2.putText(frame, "Blinking", (50, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
                    # Count one blink per open -> closed transition, not one
                    # per frame in which the eye happens to be closed.
                    if not eye_closed:
                        blink_count += 1
                    eye_closed = True
                    last_blink_time = time.time()
                    no_blink_start_time = time.time()  # Reset the no blink timer
                else:
                    eye_closed = False

        # Check if 10 seconds have passed without blinking
        if time.time() - last_blink_time > 10:
            cv2.putText(frame, "Not allowed to drive", (50, 100), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)

        cv2.imshow('Eye Landmarks', frame)

        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Always free the camera and close the window, even on an exception or a
    # kernel interrupt.
    cap.release()
    cv2.destroyAllWindows()

## Blink+glasses Detection:

In [21]:
# Combined demo: on every frame the YOLO model decides whether glasses are worn,
# that label selects the EAR threshold, and MediaPipe Face Mesh landmarks are
# then used to detect a closed eye.
mp_face_mesh = mp.solutions.face_mesh
mp_drawing = mp.solutions.drawing_utils

face_mesh = mp_face_mesh.FaceMesh(
    max_num_faces=1,
    refine_landmarks=True,
    min_detection_confidence=0.5,
    min_tracking_confidence=0.5
)

LEFT_EYE = [33, 160, 158, 133, 153, 144]  # P1=33, P2=160, P3=158, P4=133, P5=153, P6=144
RIGHT_EYE = [362, 385, 387, 263, 373, 380]  # P1=362, P2=385, P3=387, P4=263, P5=373, P6=380

cap = cv2.VideoCapture(0)

if not cap.isOpened():
    cap.release()
    # exit() is an interactive-shell helper that ends the whole session; raising
    # fails only this cell and keeps the loaded model and other state alive.
    raise RuntimeError("Fehler beim Öffnen der Kamera")

detected_object = None
blink_count = 0
last_blink_time = time.time()
no_blink_start_time = time.time()
eye_closed = False  # True while the EAR stays below the threshold

# Class id -> label; any other id is reported as 'no_object'.
class_labels = {0: 'face_with_glasses', 1: 'face_without_glasses'}

try:
    while True:
        ret, frame = cap.read()
        if not ret:
            print("Fehler beim Erfassen des Frames")
            break

        detections = model(frame, conf=0.4)
        detected_objects = detections[0].boxes.cls
        if len(detected_objects) == 0:
            detected_object = 'no_detection'
        else:
            # As before, the last detection in the list decides the label.
            detected_object = class_labels.get(detected_objects[-1].item(), 'no_object')

        # Choose the threshold on every frame, including frames without a
        # detection; previously `treshhold` stayed undefined until the first
        # detection, which raised NameError on a fresh kernel.
        if detected_object == 'face_without_glasses':
            treshhold = 0.25
        elif detected_object == 'face_with_glasses':
            treshhold = 0.005
        else:
            treshhold = 0.05

        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = face_mesh.process(rgb_frame)

        if results.multi_face_landmarks:
            for face_landmarks in results.multi_face_landmarks:
                left_eye_landmarks = [face_landmarks.landmark[idx] for idx in LEFT_EYE]
                right_eye_landmarks = [face_landmarks.landmark[idx] for idx in RIGHT_EYE]

                left_ear = calculate_ear(left_eye_landmarks)
                right_ear = calculate_ear(right_eye_landmarks)
                ear = (left_ear + right_ear) / 2.0

                # Draw the twelve eye landmarks, scaled to pixel coordinates.
                for idx in LEFT_EYE + RIGHT_EYE:
                    x = int(face_landmarks.landmark[idx].x * frame.shape[1])
                    y = int(face_landmarks.landmark[idx].y * frame.shape[0])
                    cv2.circle(frame, (x, y), 1, (0, 255, 0), -1)

                if ear < treshhold:
                    cv2.putText(frame, "Blinking", (50, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
                    # Count one blink per open -> closed transition, not one
                    # per frame in which the eye happens to be closed.
                    if not eye_closed:
                        blink_count += 1
                    eye_closed = True
                    last_blink_time = time.time()
                    no_blink_start_time = time.time()
                else:
                    eye_closed = False

        # Warn once 10 seconds have passed without a closed-eye frame.
        if time.time() - last_blink_time > 10:
            cv2.putText(frame, "Not allowed to drive", (50, 100), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)

        cv2.imshow('YOLOv8 Predictions and Eye Landmarks', frame)

        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Always free the camera and close the window, even on an exception or a
    # kernel interrupt.
    cap.release()
    cv2.destroyAllWindows()


0: 480x640 (no detections), 1073.7ms
Speed: 31.0ms preprocess, 1073.7ms inference, 5.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 1157.1ms
Speed: 3.0ms preprocess, 1157.1ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 1033.8ms
Speed: 3.0ms preprocess, 1033.8ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 1187.5ms
Speed: 3.6ms preprocess, 1187.5ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 1256.2ms
Speed: 3.0ms preprocess, 1256.2ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 779.0ms
Speed: 3.6ms preprocess, 779.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 731.1ms
Speed: 3.0ms preprocess, 731.1ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 764.2ms
Speed: 

## Higher FPS

In [None]:
def calculate_ear(eye_landmarks):
    """Return the eye aspect ratio (EAR) of one eye.

    EAR = (||P2 - P6|| + ||P3 - P5||) / (2 * ||P1 - P4||)

    Args:
        eye_landmarks: Sequence of at least six ``(x, y)`` points ordered P1..P6.

    Returns:
        The EAR as a float. When the eye corners P1 and P4 coincide the ratio
        is undefined; ``float("inf")`` is returned instead of raising
        ZeroDivisionError, so a ``ear < threshold`` check never treats the
        degenerate frame as a blink.
    """
    def _distance(p, q):
        # Euclidean distance between two (x, y) points.
        return ((p[0] - q[0]) ** 2 + (p[1] - q[1]) ** 2) ** 0.5

    vertical_a = _distance(eye_landmarks[1], eye_landmarks[5])
    vertical_b = _distance(eye_landmarks[2], eye_landmarks[4])
    horizontal = _distance(eye_landmarks[0], eye_landmarks[3])

    if horizontal == 0:
        return float("inf")
    return (vertical_a + vertical_b) / (2.0 * horizontal)

# Module-level state used by detect_glasses(), detect_blinks() and
# process_frame() defined further down in this cell.
mp_face_mesh = mp.solutions.face_mesh
mp_drawing = mp.solutions.drawing_utils

face_mesh = mp_face_mesh.FaceMesh(
    max_num_faces=1,
    refine_landmarks=True,
    min_detection_confidence=0.5,
    min_tracking_confidence=0.5
)

LEFT_EYE = [33, 160, 158, 133, 153, 144]  # P1=33, P2=160, P3=158, P4=133, P5=153, P6=144
RIGHT_EYE = [362, 385, 387, 263, 373, 380]  # P1=362, P2=385, P3=387, P4=263, P5=373, P6=380

cap = cv2.VideoCapture(0)

if not cap.isOpened():
    cap.release()
    # exit() is an interactive-shell helper that ends the whole session; raising
    # fails only this cell and keeps the loaded model and other state alive.
    raise RuntimeError("Fehler beim Öffnen der Kamera")

detected_object = None
blink_count = 0
last_blink_time = time.time()
no_blink_start_time = time.time()
last_glasses_check_time = time.time()
# process_frame() re-runs the glasses detector once more than this many seconds
# have passed since the previous check.
glasses_check_interval = 5
# EAR threshold in effect until the first glasses check overrides it.
treshhold = 0.15

# def detect_glasses(frame):
#     results = model(frame, conf=0.4)
#     detected_objects = results[0].boxes.cls
#     if len(detected_objects) == 0:
#         return 'no_detection'
#     else:
#         for detected_object in detected_objects:
#             if detected_object.item() == 0:
#                 return 'face_with_glasses'
#             elif detected_object.item() == 1:
#                 return 'face_without_glasses'
#             else:
#                 return 'no_object'

def detect_glasses(frame):
    """Label the detection with the largest bounding box in ``frame``.

    Runs the module-level YOLO ``model`` with a confidence threshold of 0.4.

    Args:
        frame: Image passed straight to ``model``.

    Returns:
        'no_detection' when the model returns no boxes; otherwise
        'face_with_glasses' for class 0, 'face_without_glasses' for class 1,
        and 'no_object' for any other class or when no box was selected.
    """
    boxes = model(frame, conf=0.4)[0].boxes
    if len(boxes) == 0:
        return 'no_detection'

    # Keep the box covering the largest area.
    largest_box, largest_area = None, 0
    for candidate in boxes:
        corners = candidate.xyxy[0]
        candidate_area = (corners[2] - corners[0]) * (corners[3] - corners[1])
        if candidate_area > largest_area:
            largest_area, largest_box = candidate_area, candidate

    if not largest_box:
        return 'no_object'

    labels = {0: 'face_with_glasses', 1: 'face_without_glasses'}
    return labels.get(largest_box.cls.item(), 'no_object')

def detect_blinks(frame):
    """Detect a closed eye in ``frame`` and draw the status text onto it.

    Uses the module-level ``face_mesh`` to obtain eye landmarks, scales them by
    the frame width/height and compares the mean eye aspect ratio of both eyes
    against the global ``treshhold``.

    Side effects:
        * Draws "Blinking" on ``frame`` while the eye is closed.
        * Increments the global ``blink_count`` once per open -> closed
          transition (previously it grew on every closed frame).
        * Updates the global ``last_blink_time`` on every closed frame.
        * Draws "Not allowed to drive" when more than 10 seconds have passed
          since ``last_blink_time``.

    Args:
        frame: BGR image; modified in place.
    """
    global blink_count, last_blink_time, treshhold

    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    results = face_mesh.process(rgb_frame)

    if results.multi_face_landmarks:
        frame_height, frame_width = frame.shape[:2]
        for face_landmarks in results.multi_face_landmarks:
            left_eye_landmarks = [(face_landmarks.landmark[idx].x * frame_width,
                                   face_landmarks.landmark[idx].y * frame_height) for idx in LEFT_EYE]
            right_eye_landmarks = [(face_landmarks.landmark[idx].x * frame_width,
                                    face_landmarks.landmark[idx].y * frame_height) for idx in RIGHT_EYE]

            left_ear = calculate_ear(left_eye_landmarks)
            right_ear = calculate_ear(right_eye_landmarks)
            ear = (left_ear + right_ear) / 2.0

            eye_closed = ear < treshhold
            if eye_closed:
                cv2.putText(frame, "Blinking", (50, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
                # The previous eye state lives on the function object so that a
                # blink spanning several frames is counted once.
                if not getattr(detect_blinks, "_eye_closed", False):
                    blink_count += 1
                last_blink_time = time.time()
            detect_blinks._eye_closed = eye_closed

    if time.time() - last_blink_time > 10:
        cv2.putText(frame, "Not allowed to drive", (50, 100), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)

def process_frame():
    """Capture frames from ``cap`` until a read fails or 'q' is pressed.

    For every frame the blink detector annotates the image, which is then
    shown in a window. Whenever more than ``glasses_check_interval`` seconds
    have passed since the last check, the glasses detector runs and the global
    EAR threshold ``treshhold`` is updated from its label.
    """
    global last_glasses_check_time, treshhold

    # EAR threshold per glasses label; every other label falls back to 0.15.
    thresholds = {'face_with_glasses': 0.05, 'face_without_glasses': 0.20}

    while True:
        ret, frame = cap.read()
        if not ret:
            print("Fehler beim Erfassen des Frames")
            break

        # Run the glasses detector on the untouched frame, i.e. before
        # detect_blinks() draws its overlay text onto it.
        current_time = time.time()
        if current_time - last_glasses_check_time > glasses_check_interval:
            print("Glasses Detection")
            last_glasses_check_time = current_time
            treshhold = thresholds.get(detect_glasses(frame), 0.15)

        detect_blinks(frame)

        cv2.imshow('YOLOv8 Predictions and Eye Landmarks', frame)

        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

# Run the capture loop in a worker thread and wait for it to finish.
# NOTE(review): start() followed directly by join() serialises the work, so the
# thread adds no concurrency; calling process_frame() directly may be simpler.
processing_thread = threading.Thread(target=process_frame)
processing_thread.start()
try:
    processing_thread.join()
finally:
    # Release the camera and close the window even if join() is interrupted
    # (e.g. by a kernel interrupt).
    cap.release()
    cv2.destroyAllWindows()

Glasses Detection

0: 480x640 1 face_without_glasses, 668.0ms
Speed: 15.7ms preprocess, 668.0ms inference, 15.0ms postprocess per image at shape (1, 3, 480, 640)
Glasses Detection

0: 480x640 1 face_without_glasses, 608.2ms
Speed: 0.0ms preprocess, 608.2ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)
Glasses Detection

0: 480x640 1 face_with_glasses, 619.8ms
Speed: 3.1ms preprocess, 619.8ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)
Glasses Detection

0: 480x640 1 face_with_glasses, 618.0ms
Speed: 0.0ms preprocess, 618.0ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)
Glasses Detection

0: 480x640 1 face_with_glasses, 615.5ms
Speed: 0.0ms preprocess, 615.5ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)
Glasses Detection

0: 480x640 1 face_with_glasses, 722.0ms
Speed: 0.0ms preprocess, 722.0ms inference, 1.1ms postprocess per image at shape (1, 3, 480, 640)
Glasses Detection

0: 480x640 1 face_with_glasses,

In [None]:
def calculate_ear(eye_landmarks):
    """Return the eye aspect ratio (EAR) of one eye.

    EAR = (||P2 - P6|| + ||P3 - P5||) / (2 * ||P1 - P4||)

    Args:
        eye_landmarks: Sequence of at least six ``(x, y)`` points ordered P1..P6.

    Returns:
        The EAR as a float. When the eye corners P1 and P4 coincide the ratio
        is undefined; ``float("inf")`` is returned instead of raising
        ZeroDivisionError, so a ``ear < threshold`` check never treats the
        degenerate frame as a blink.
    """
    def _distance(p, q):
        # Euclidean distance between two (x, y) points.
        return ((p[0] - q[0]) ** 2 + (p[1] - q[1]) ** 2) ** 0.5

    vertical_a = _distance(eye_landmarks[1], eye_landmarks[5])
    vertical_b = _distance(eye_landmarks[2], eye_landmarks[4])
    horizontal = _distance(eye_landmarks[0], eye_landmarks[3])

    if horizontal == 0:
        return float("inf")
    return (vertical_a + vertical_b) / (2.0 * horizontal)

# Module-level state used by detect_glasses(), detect_blinks() and
# process_frame() defined further down in this cell.
mp_face_mesh = mp.solutions.face_mesh
mp_drawing = mp.solutions.drawing_utils

# Up to two faces are tracked; detect_blinks() picks the one with the largest
# landmark bounding box.
face_mesh = mp_face_mesh.FaceMesh(
    max_num_faces=2, 
    refine_landmarks=True,
    min_detection_confidence=0.5,
    min_tracking_confidence=0.5
)

LEFT_EYE = [33, 160, 158, 133, 153, 144]  # P1=33, P2=160, P3=158, P4=133, P5=153, P6=144
RIGHT_EYE = [362, 385, 387, 263, 373, 380]  # P1=362, P2=385, P3=387, P4=263, P5=373, P6=380

cap = cv2.VideoCapture(0)

if not cap.isOpened():
    cap.release()
    # exit() is an interactive-shell helper that ends the whole session; raising
    # fails only this cell and keeps the loaded model and other state alive.
    raise RuntimeError("Fehler beim Öffnen der Kamera")

detected_object = None
blink_count = 0
last_blink_time = time.time()
no_blink_start_time = time.time()
last_glasses_check_time = time.time()
# process_frame() re-runs the glasses detector once more than this many seconds
# have passed since the previous check.
glasses_check_interval = 5
# EAR threshold in effect until the first glasses check overrides it.
treshhold = 0.15

def detect_glasses(frame):
    """Label the detection with the largest bounding box in ``frame``.

    Runs the module-level YOLO ``model`` with a confidence threshold of 0.4.

    Args:
        frame: Image passed straight to ``model``.

    Returns:
        'no_detection' when the model returns no boxes; otherwise
        'face_with_glasses' for class 0, 'face_without_glasses' for class 1,
        and 'no_object' for any other class or when no box was selected.
    """
    boxes = model(frame, conf=0.4)[0].boxes
    if len(boxes) == 0:
        return 'no_detection'

    # Keep the box covering the largest area.
    largest_box, largest_area = None, 0
    for candidate in boxes:
        corners = candidate.xyxy[0]
        candidate_area = (corners[2] - corners[0]) * (corners[3] - corners[1])
        if candidate_area > largest_area:
            largest_area, largest_box = candidate_area, candidate

    if not largest_box:
        return 'no_object'

    labels = {0: 'face_with_glasses', 1: 'face_without_glasses'}
    return labels.get(largest_box.cls.item(), 'no_object')

def detect_blinks(frame):
    """Detect a closed eye on the largest face in ``frame`` and annotate it.

    Uses the module-level ``face_mesh``; among the returned faces the one whose
    landmark bounding box has the largest area is evaluated. Its eye landmarks
    are scaled by the frame width/height and the mean eye aspect ratio of both
    eyes is compared against the global ``treshhold``.

    Side effects:
        * Draws "Blinking" on ``frame`` while the eye is closed.
        * Increments the global ``blink_count`` once per open -> closed
          transition (previously it grew on every closed frame).
        * Updates the global ``last_blink_time`` on every closed frame.
        * Draws "Not allowed to drive" when more than 10 seconds have passed
          since ``last_blink_time``.

    Args:
        frame: BGR image; modified in place.
    """
    global blink_count, last_blink_time, treshhold

    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    results = face_mesh.process(rgb_frame)

    if results.multi_face_landmarks:
        closest_face_landmarks = None
        max_area = 0

        for face_landmarks in results.multi_face_landmarks:
            # Collect the coordinates once instead of scanning the landmarks
            # four times.
            xs = [landmark.x for landmark in face_landmarks.landmark]
            ys = [landmark.y for landmark in face_landmarks.landmark]
            area = (max(xs) - min(xs)) * (max(ys) - min(ys))

            if area > max_area:
                max_area = area
                closest_face_landmarks = face_landmarks

        # Explicit None check: do not rely on the truthiness of the landmark
        # container.
        if closest_face_landmarks is not None:
            frame_height, frame_width = frame.shape[:2]
            left_eye_landmarks = [(closest_face_landmarks.landmark[idx].x * frame_width,
                                   closest_face_landmarks.landmark[idx].y * frame_height) for idx in LEFT_EYE]
            right_eye_landmarks = [(closest_face_landmarks.landmark[idx].x * frame_width,
                                    closest_face_landmarks.landmark[idx].y * frame_height) for idx in RIGHT_EYE]

            left_ear = calculate_ear(left_eye_landmarks)
            right_ear = calculate_ear(right_eye_landmarks)
            ear = (left_ear + right_ear) / 2.0

            eye_closed = ear < treshhold
            if eye_closed:
                cv2.putText(frame, "Blinking", (50, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
                # The previous eye state lives on the function object so that a
                # blink spanning several frames is counted once.
                if not getattr(detect_blinks, "_eye_closed", False):
                    blink_count += 1
                last_blink_time = time.time()
            detect_blinks._eye_closed = eye_closed

    if time.time() - last_blink_time > 10:
        cv2.putText(frame, "Not allowed to drive", (50, 100), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)

def process_frame():
    """Capture frames from ``cap`` until a read fails or 'q' is pressed.

    For every frame the blink detector annotates the image, which is then
    shown in a window. Whenever more than ``glasses_check_interval`` seconds
    have passed since the last check, the glasses detector runs and the global
    EAR threshold ``treshhold`` is updated from its label.
    """
    global last_glasses_check_time, treshhold

    # EAR threshold per glasses label; every other label falls back to 0.15.
    thresholds = {'face_with_glasses': 0.01, 'face_without_glasses': 0.20}

    while True:
        ret, frame = cap.read()
        if not ret:
            print("Fehler beim Erfassen des Frames")
            break

        # Run the glasses detector on the untouched frame, i.e. before
        # detect_blinks() draws its overlay text onto it.
        current_time = time.time()
        if current_time - last_glasses_check_time > glasses_check_interval:
            print("Glasses Detection")
            last_glasses_check_time = current_time
            treshhold = thresholds.get(detect_glasses(frame), 0.15)

        detect_blinks(frame)

        cv2.imshow('YOLOv8 Predictions and Eye Landmarks', frame)

        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

# Run the capture loop in a worker thread and wait for it to finish.
# NOTE(review): start() followed directly by join() serialises the work, so the
# thread adds no concurrency; calling process_frame() directly may be simpler.
processing_thread = threading.Thread(target=process_frame)
processing_thread.start()
try:
    processing_thread.join()
finally:
    # Release the camera and close the window even if join() is interrupted
    # (e.g. by a kernel interrupt).
    cap.release()
    cv2.destroyAllWindows()



Glasses Detection

0: 480x640 1 face_without_glasses, 815.2ms
Speed: 18.7ms preprocess, 815.2ms inference, 24.8ms postprocess per image at shape (1, 3, 480, 640)
Glasses Detection

0: 480x640 1 face_without_glasses, 725.0ms
Speed: 3.2ms preprocess, 725.0ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)
Glasses Detection

0: 480x640 1 face_without_glasses, 602.7ms
Speed: 0.0ms preprocess, 602.7ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)
Glasses Detection

0: 480x640 1 face_without_glasses, 633.4ms
Speed: 0.0ms preprocess, 633.4ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)
Glasses Detection

0: 480x640 1 face_without_glasses, 639.7ms
Speed: 0.0ms preprocess, 639.7ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)
Glasses Detection

0: 480x640 (no detections), 607.1ms
Speed: 0.0ms preprocess, 607.1ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)
Glasses Detection

0: 480x640 (no detections)

In [None]:
def calculate_ear(eye_landmarks):
    """Return the eye aspect ratio (EAR) of one eye.

    EAR = (||P2 - P6|| + ||P3 - P5||) / (2 * ||P1 - P4||)

    Args:
        eye_landmarks: Sequence of at least six ``(x, y)`` points ordered P1..P6.

    Returns:
        The EAR as a float. When the eye corners P1 and P4 coincide the ratio
        is undefined; ``float("inf")`` is returned instead of raising
        ZeroDivisionError, so a ``ear < threshold`` check never treats the
        degenerate frame as a blink.
    """
    def _distance(p, q):
        # Euclidean distance between two (x, y) points.
        return ((p[0] - q[0]) ** 2 + (p[1] - q[1]) ** 2) ** 0.5

    vertical_a = _distance(eye_landmarks[1], eye_landmarks[5])
    vertical_b = _distance(eye_landmarks[2], eye_landmarks[4])
    horizontal = _distance(eye_landmarks[0], eye_landmarks[3])

    if horizontal == 0:
        return float("inf")
    return (vertical_a + vertical_b) / (2.0 * horizontal)

# Module-level state for the blink/glasses pipeline.
mp_face_mesh = mp.solutions.face_mesh
mp_drawing = mp.solutions.drawing_utils

face_mesh = mp_face_mesh.FaceMesh(
    max_num_faces=2,
    refine_landmarks=True,
    min_detection_confidence=0.5,
    min_tracking_confidence=0.5
)

LEFT_EYE = [33, 160, 158, 133, 153, 144]  # P1=33, P2=160, P3=158, P4=133, P5=153, P6=144
RIGHT_EYE = [362, 385, 387, 263, 373, 380]  # P1=362, P2=385, P3=387, P4=263, P5=373, P6=380

cap = cv2.VideoCapture(0)

if not cap.isOpened():
    cap.release()
    # exit() is an interactive-shell helper that ends the whole session; raising
    # fails only this cell and keeps the loaded model and other state alive.
    raise RuntimeError("Fehler beim Öffnen der Kamera")

detected_object = None
blink_count = 0
last_blink_time = time.time()
no_blink_start_time = time.time()
last_glasses_check_time = time.time()
# NOTE(review): presumably seconds between glasses checks, as in the other
# cells' process_frame(); this cell does not define a consumer itself.
glasses_check_interval = 5
treshhold = 0.15

def detect_glasses(frame):
    """Label the detection with the largest bounding box in ``frame``.

    Runs the module-level YOLO ``model`` with a confidence threshold of 0.4.

    Args:
        frame: Image passed straight to ``model``.

    Returns:
        'no_detection' when the model returns no boxes; otherwise
        'face_with_glasses' for class 0, 'face_without_glasses' for class 1,
        and 'no_object' for any other class or when no box was selected.
    """
    boxes = model(frame, conf=0.4)[0].boxes
    if len(boxes) == 0:
        return 'no_detection'

    # Keep the box covering the largest area.
    largest_box, largest_area = None, 0
    for candidate in boxes:
        corners = candidate.xyxy[0]
        candidate_area = (corners[2] - corners[0]) * (corners[3] - corners[1])
        if candidate_area > largest_area:
            largest_area, largest_box = candidate_area, candidate

    if not largest_box:
        return 'no_object'

    labels = {0: 'face_with_glasses', 1: 'face_without_glasses'}
    return labels.get(largest_box.cls.item(), 'no_object')


# Run the capture loop in a worker thread and wait for it to finish.
# NOTE(review): this cell does not define process_frame(); it relies on a
# definition left in the kernel by a previously executed cell.
processing_thread = threading.Thread(target=process_frame)
processing_thread.start()
try:
    processing_thread.join()
finally:
    # Release the camera and close the window even if join() is interrupted
    # (e.g. by a kernel interrupt).
    cap.release()
    cv2.destroyAllWindows()

## Demonstrator save output

In [None]:
import cv2
import time
import threading
import mediapipe as mp

def calculate_ear(eye_landmarks):
    """Return the eye aspect ratio (EAR) of one eye.

    EAR = (||P2 - P6|| + ||P3 - P5||) / (2 * ||P1 - P4||)

    Args:
        eye_landmarks: Sequence of at least six ``(x, y)`` points ordered P1..P6.

    Returns:
        The EAR as a float. When the eye corners P1 and P4 coincide the ratio
        is undefined; ``float("inf")`` is returned instead of raising
        ZeroDivisionError, so a ``ear < threshold`` check never treats the
        degenerate frame as a blink.
    """
    def _distance(p, q):
        # Euclidean distance between two (x, y) points.
        return ((p[0] - q[0]) ** 2 + (p[1] - q[1]) ** 2) ** 0.5

    vertical_a = _distance(eye_landmarks[1], eye_landmarks[5])
    vertical_b = _distance(eye_landmarks[2], eye_landmarks[4])
    horizontal = _distance(eye_landmarks[0], eye_landmarks[3])

    if horizontal == 0:
        return float("inf")
    return (vertical_a + vertical_b) / (2.0 * horizontal)

# Module-level state used by detect_glasses(), detect_blinks() and
# process_frame() defined further down in this cell.
mp_face_mesh = mp.solutions.face_mesh
mp_drawing = mp.solutions.drawing_utils

# Up to two faces are tracked; detect_blinks() picks the one with the largest
# landmark bounding box.
face_mesh = mp_face_mesh.FaceMesh(
    max_num_faces=2, 
    refine_landmarks=True,
    min_detection_confidence=0.5,
    min_tracking_confidence=0.5
)

LEFT_EYE = [33, 160, 158, 133, 153, 144]  # P1=33, P2=160, P3=158, P4=133, P5=153, P6=144
RIGHT_EYE = [362, 385, 387, 263, 373, 380]  # P1=362, P2=385, P3=387, P4=263, P5=373, P6=380

cap = cv2.VideoCapture(0)

if not cap.isOpened():
    cap.release()
    # exit() is an interactive-shell helper that ends the whole session; raising
    # fails only this cell and keeps the loaded model and other state alive.
    raise RuntimeError("Fehler beim Öffnen der Kamera")

# Writer for the annotated frames: XVID-encoded 'output.avi' at a nominal
# 20 fps, sized to the capture's reported frame dimensions.
frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
out = cv2.VideoWriter('output.avi', cv2.VideoWriter_fourcc(*'XVID'), 20, (frame_width, frame_height))

detected_object = None
blink_count = 0
last_blink_time = time.time()
no_blink_start_time = time.time()
last_glasses_check_time = time.time()
# process_frame() re-runs the glasses detector once more than this many seconds
# have passed since the previous check.
glasses_check_interval = 5
# EAR threshold in effect until the first glasses check overrides it.
treshhold = 0.15

def detect_glasses(frame):
    """Label the detection with the largest bounding box in ``frame``.

    Runs the module-level YOLO ``model`` with a confidence threshold of 0.4.

    Args:
        frame: Image passed straight to ``model``.

    Returns:
        'no_detection' when the model returns no boxes; otherwise
        'face_with_glasses' for class 0, 'face_without_glasses' for class 1,
        and 'no_object' for any other class or when no box was selected.
    """
    boxes = model(frame, conf=0.4)[0].boxes
    if len(boxes) == 0:
        return 'no_detection'

    # Keep the box covering the largest area.
    largest_box, largest_area = None, 0
    for candidate in boxes:
        corners = candidate.xyxy[0]
        candidate_area = (corners[2] - corners[0]) * (corners[3] - corners[1])
        if candidate_area > largest_area:
            largest_area, largest_box = candidate_area, candidate

    if not largest_box:
        return 'no_object'

    labels = {0: 'face_with_glasses', 1: 'face_without_glasses'}
    return labels.get(largest_box.cls.item(), 'no_object')

def detect_blinks(frame):
    """Detect a closed eye on the largest face in ``frame`` and annotate it.

    Uses the module-level ``face_mesh``; among the returned faces the one whose
    landmark bounding box has the largest area is evaluated. Its eye landmarks
    are scaled by the frame width/height and the mean eye aspect ratio of both
    eyes is compared against the global ``treshhold``.

    Side effects:
        * Draws "Blinking" on ``frame`` while the eye is closed.
        * Increments the global ``blink_count`` once per open -> closed
          transition (previously it grew on every closed frame).
        * Updates the global ``last_blink_time`` on every closed frame.
        * Draws "Not allowed to drive" when more than 10 seconds have passed
          since ``last_blink_time``.

    Args:
        frame: BGR image; modified in place.
    """
    global blink_count, last_blink_time, treshhold

    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    results = face_mesh.process(rgb_frame)

    if results.multi_face_landmarks:
        closest_face_landmarks = None
        max_area = 0

        for face_landmarks in results.multi_face_landmarks:
            # Collect the coordinates once instead of scanning the landmarks
            # four times.
            xs = [landmark.x for landmark in face_landmarks.landmark]
            ys = [landmark.y for landmark in face_landmarks.landmark]
            area = (max(xs) - min(xs)) * (max(ys) - min(ys))

            if area > max_area:
                max_area = area
                closest_face_landmarks = face_landmarks

        # Explicit None check: do not rely on the truthiness of the landmark
        # container.
        if closest_face_landmarks is not None:
            frame_height, frame_width = frame.shape[:2]
            left_eye_landmarks = [(closest_face_landmarks.landmark[idx].x * frame_width,
                                   closest_face_landmarks.landmark[idx].y * frame_height) for idx in LEFT_EYE]
            right_eye_landmarks = [(closest_face_landmarks.landmark[idx].x * frame_width,
                                    closest_face_landmarks.landmark[idx].y * frame_height) for idx in RIGHT_EYE]

            left_ear = calculate_ear(left_eye_landmarks)
            right_ear = calculate_ear(right_eye_landmarks)
            ear = (left_ear + right_ear) / 2.0

            eye_closed = ear < treshhold
            if eye_closed:
                cv2.putText(frame, "Blinking", (50, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
                # The previous eye state lives on the function object so that a
                # blink spanning several frames is counted once.
                if not getattr(detect_blinks, "_eye_closed", False):
                    blink_count += 1
                last_blink_time = time.time()
            detect_blinks._eye_closed = eye_closed

    if time.time() - last_blink_time > 10:
        cv2.putText(frame, "Not allowed to drive", (50, 100), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)

def process_frame():
    """Capture frames from ``cap`` until a read fails or 'q' is pressed.

    For every frame the blink detector annotates the image, which is then
    shown in a window and appended to the video writer ``out``. Whenever more
    than ``glasses_check_interval`` seconds have passed since the last check,
    the glasses detector runs and the global EAR threshold ``treshhold`` is
    updated from its label.
    """
    global last_glasses_check_time, treshhold

    # EAR threshold per glasses label; every other label falls back to 0.15.
    thresholds = {'face_with_glasses': 0.05, 'face_without_glasses': 0.20}

    while True:
        ret, frame = cap.read()
        if not ret:
            print("Fehler beim Erfassen des Frames")
            break

        # Run the glasses detector on the untouched frame, i.e. before
        # detect_blinks() draws its overlay text onto it.
        current_time = time.time()
        if current_time - last_glasses_check_time > glasses_check_interval:
            print("Glasses Detection")
            last_glasses_check_time = current_time
            treshhold = thresholds.get(detect_glasses(frame), 0.15)

        detect_blinks(frame)

        cv2.imshow('YOLOv8 Predictions and Eye Landmarks', frame)
        out.write(frame)

        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

# Run the capture loop in a worker thread and wait for it to finish.
# NOTE(review): start() followed directly by join() serialises the work, so the
# thread adds no concurrency; calling process_frame() directly may be simpler.
processing_thread = threading.Thread(target=process_frame)
processing_thread.start()
try:
    processing_thread.join()
finally:
    # Release the camera, finalise the video file and close the window even if
    # join() is interrupted (e.g. by a kernel interrupt).
    cap.release()
    out.release()
    cv2.destroyAllWindows()



Glasses Detection

0: 480x640 1 face_without_glasses, 886.3ms
Speed: 15.6ms preprocess, 886.3ms inference, 22.8ms postprocess per image at shape (1, 3, 480, 640)
Glasses Detection

0: 480x640 1 face_without_glasses, 616.1ms
Speed: 3.2ms preprocess, 616.1ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)
Glasses Detection

0: 480x640 (no detections), 619.3ms
Speed: 2.0ms preprocess, 619.3ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)
Glasses Detection

0: 480x640 1 face_with_glasses, 579.4ms
Speed: 1.2ms preprocess, 579.4ms inference, 7.0ms postprocess per image at shape (1, 3, 480, 640)
Glasses Detection

0: 480x640 (no detections), 634.1ms
Speed: 2.9ms preprocess, 634.1ms inference, 1.4ms postprocess per image at shape (1, 3, 480, 640)
Glasses Detection

0: 480x640 1 face_with_glasses, 605.0ms
Speed: 3.0ms preprocess, 605.0ms inference, 6.0ms postprocess per image at shape (1, 3, 480, 640)
Glasses Detection

0: 480x640 1 face_with_glasses, 644.5ms