In [1]:
import platform
import torch

print("Python version: ",platform.python_version())
print("PyTotch:", torch.__version__)
print("torch available: ", torch.cuda.is_available())
print("DEVICE: ", torch.cuda.get_device_name(torch.cuda.current_device()))

Python version:  3.9.19
PyTotch: 2.3.1+cu118
torch available:  True
DEVICE:  NVIDIA GeForce GTX 1650 with Max-Q Design


In [1]:
import cv2
import torch
from torchvision import models, transforms
from PIL import Image
import numpy as np
import torch.nn as nn
import os
import time
import mediapipe as mp
from typing import Tuple, Union, Optional
import math
from IPython.display import display
import ipywidgets as widgets

os.environ["OPENCV_VIDEOIO_MSMF_ENABLE_HW_TRANSFORMS"] = "0"
cv2.ocl.setUseOpenCL(False)

# Configuration
CONFIG = {
    'model_path': 'models/resnet18-e_20-d_10k.pth',
    'image_size': (224, 224),
    'class_names': ['close', 'open'],
    'num_classes': 2,
}

# Constants
EYE_VERTICAL_OFFSET = 35
EYE_HORIZONTAL_OFFSET = 35

# Define the device to use
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

def load_model(model_path: str, num_classes: int) -> nn.Module:
    """Load and prepare the model for inference."""
    model = models.resnet18(pretrained=False)
    num_features = model.fc.in_features
    model.fc = nn.Linear(num_features, num_classes)
    model.load_state_dict(torch.load(model_path, map_location=device))
    return model.to(device).eval()

def load_face_detector():
    """Load the face detector model."""
    mp_face_detection = mp.solutions.face_detection
    return mp_face_detection.FaceDetection(min_detection_confidence=0.5)

# Load models
model = load_model(CONFIG['model_path'], CONFIG['num_classes'])

# Image preprocessing
def apply_clahe(image):
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8,8))
    equalized = clahe.apply(gray)
    return cv2.cvtColor(equalized, cv2.COLOR_GRAY2BGR)

def adjust_gamma(image, gamma=1.0):
    inv_gamma = 1.0 / gamma
    table = np.array([((i / 255.0) ** inv_gamma) * 255
                      for i in np.arange(0, 256)]).astype("uint8")
    return cv2.LUT(image, table)

def normalize_image(image):
    return cv2.normalize(image, None, 0, 255, cv2.NORM_MINMAX)

def preprocess_image(image):
    # Apply a combination of techniques
    image = normalize_image(image)
    image = apply_clahe(image)
    image = adjust_gamma(image, 1.2)  # Slightly increase brightness
    return image

# Define the transformation for input images
transform = transforms.Compose([
    transforms.Resize(CONFIG['image_size']),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

def eyes_detection(frame: np.ndarray, face_detection) -> Tuple[Optional[np.ndarray], Optional[np.ndarray]]:
    """Detect eyes in the given frame."""
    image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    results = face_detection.process(image)
    
    height, width, _ = frame.shape
    
    lefteye_img = None
    righteye_img = None
    eyes_coor = None
    
    if results.detections:
        for detection in results.detections:
            bbox = detection.location_data.relative_bounding_box
            x, y, w, h = int(bbox.xmin * width), int(bbox.ymin * height), int(bbox.width * width), int(bbox.height * height)
            
            # Estimate eye positions (you may need to adjust these ratios)
            left_eye_x = int(x + w * 0.3)
            right_eye_x = int(x + w * 0.7)
            eye_y = int(y + h * 0.1)
            
            # eyes_coor = [(left_X1, left_X2), (right_X1, right_X2), (Y1, Y2)]
            eyes_coor = [(left_eye_x - EYE_HORIZONTAL_OFFSET, left_eye_x + EYE_HORIZONTAL_OFFSET), 
                         (right_eye_x - EYE_HORIZONTAL_OFFSET, right_eye_x + EYE_HORIZONTAL_OFFSET),
                         (eye_y - EYE_VERTICAL_OFFSET, eye_y + EYE_VERTICAL_OFFSET)]
            
            lefteye_img = frame[eyes_coor[2][0]:eyes_coor[2][1], eyes_coor[0][0]:eyes_coor[0][1]]
                                
            righteye_img = frame[eyes_coor[2][0]:eyes_coor[2][1], eyes_coor[1][0]:eyes_coor[1][1]]
            
            break  # Assume only one face for simplicity
    
    return lefteye_img, righteye_img, eyes_coor

def predict_frame(frame: np.ndarray, face_detection) -> Tuple[int, Optional[np.ndarray], Optional[np.ndarray]]:
    """Predict the state of eyes in the given frame."""
    preprocessed_frame = preprocess_image(frame)
    
    left_frame, right_frame, eyes_coor = eyes_detection(preprocessed_frame, face_detection)

    if left_frame is None or right_frame is None:
        return "Eyes not detected", None, None, None

    # Further preprocess the eye images
    left_frame = preprocess_image(left_frame)
    right_frame = preprocess_image(right_frame)

    left_img = transform(Image.fromarray(left_frame)).unsqueeze(0).to(device)
    right_img = transform(Image.fromarray(right_frame)).unsqueeze(0).to(device)

    with torch.no_grad():
        outputs_left = model(left_img)
        outputs_right = model(right_img)
        _, predicted_left = torch.max(outputs_left, 1)
        _, predicted_right = torch.max(outputs_right, 1)

    result = predicted_left.item() + predicted_right.item()

    return result, left_frame, right_frame, eyes_coor

# Create widgets for displaying results
result_text = widgets.Label(style={'font-size': '30px', 'color': 'red'})
computeSpeed_text = widgets.Label(style={'font-size': '30px'})
inputFPS_text = widgets.Label(style={'font-size': '30px'})
left_eye_image = widgets.Image(format='png')
right_eye_image = widgets.Image(format='png')
origin_image = widgets.Image(format='png', width=320, height=240)

right_eye_widget = widgets.VBox([
    widgets.Label(value='  Right Eye', layout=widgets.Layout(align_items='center')),
    right_eye_image
])

left_eye_widget = widgets.VBox([
    widgets.Label(value='  Left Eye', layout=widgets.Layout(align_items='center')),
    left_eye_image
])

origin_image_widget = widgets.VBox([
    widgets.Label(value='    Camera frame', layout=widgets.Layout(align_items='center')),
    origin_image
])

output_widget = widgets.HBox([
    widgets.HBox([widgets.VBox([result_text, computeSpeed_text, inputFPS_text]),
                  widgets.Label(value=' '),
                widgets.HBox([left_eye_widget, right_eye_widget, origin_image_widget])
    ])
], layout=widgets.Layout(align_items='center'))

def main():
    
    cap = cv2.VideoCapture(0, cv2.CAP_V4L)
    cap.set(cv2.CAP_PROP_FOURCC, cv2.VideoWriter_fourcc('M', 'J', 'P', 'G'))
    cap.set(cv2.CAP_PROP_FRAME_WIDTH, 640)
    cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 480)
    cap.set(cv2.CAP_PROP_FPS, 15.0)
    
    
    # cap = cv2.VideoCapture(0)
    
    if not cap.isOpened():
        print("Cannot open camera")
        return
    
    
    display(output_widget)
    
    start_time = time.time()

    eyes_states_his = []
    eyes_states = 1 # 1 for safe, 0 for danger
    history_len = 5
    
    face_detection = load_face_detector()  # Create face detection object once
    
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                continue

            end_time = time.time()
            compute_time = end_time - start_time
            start_time = end_time
            
            num_opened_eyes, left_img, right_img, eyes_coor = predict_frame(frame, face_detection)
            
            if not eyes_coor is None:
                cv2.rectangle(frame, (eyes_coor[0][0], eyes_coor[2][0]), (eyes_coor[0][1], eyes_coor[2][1]), (255, 0, 0), 2, cv2.LINE_AA)
                cv2.rectangle(frame, (eyes_coor[1][0], eyes_coor[2][0]), (eyes_coor[1][1], eyes_coor[2][1]), (255, 0, 0), 2, cv2.LINE_AA)
            _, frame_buffer = cv2.imencode('.png', frame)
            origin_image.value = frame_buffer.tostring()
            
            
            
            if len(eyes_states_his) == history_len:
                eyes_states_his = eyes_states_his[1:]
                eyes_states_his.append(0 if num_opened_eyes == 0 else 1)
                eyes_states = 0 if (eyes_states_his[0] == 0 and eyes_states_his[1] == 0 and eyes_states_his[2] == 0) or \
                                    (eyes_states_his[1] == 0 and eyes_states_his[2] == 0 and eyes_states_his[3] == 0) or \
                                    (eyes_states_his[2] == 0 and eyes_states_his[3] == 0 and eyes_states_his[4] == 0) else 1
            else:
                eyes_states_his.append(0 if num_opened_eyes ==0 else 1)
            
            fps = int(cap.get(5))
            inputFPS_text.value = f'FPS: {fps}'
            result_text.value = f'Predicted: {num_opened_eyes} eyes opened!'
            computeSpeed_text.value = f'Compute Time: {compute_time:.3f} s'
            
            if left_img is not None:
                if eyes_states:
                    cv2.rectangle(left_img, (0, 0), (EYE_VERTICAL_OFFSET * 2, EYE_HORIZONTAL_OFFSET * 2), (0, 255, 0), 5, cv2.LINE_AA)
                else:
                    cv2.rectangle(left_img, (0, 0), (EYE_VERTICAL_OFFSET * 2, EYE_HORIZONTAL_OFFSET * 2), (0, 0, 255), 5, cv2.LINE_AA)
                
                _, left_buffer = cv2.imencode('.png', left_img)
                left_eye_image.value = left_buffer.tostring()

            if right_img is not None:
                if eyes_states:
                    cv2.rectangle(right_img, (0, 0), (EYE_VERTICAL_OFFSET * 2, EYE_HORIZONTAL_OFFSET * 2), (0, 255, 0), 5, cv2.LINE_AA)
                else:
                    cv2.rectangle(right_img, (0, 0), (EYE_VERTICAL_OFFSET * 2, EYE_HORIZONTAL_OFFSET * 2), (0, 0, 255), 5, cv2.LINE_AA)
                    
                _, right_buffer = cv2.imencode('.png', right_img)
                right_eye_image.value = right_buffer.tostring()
            
            time.sleep(1/30)
            if cv2.waitKey(10) & 0xFF == ord('q'):
                break
    except KeyboardInterrupt:
        pass
    
    finally:
        cap.release()

cuda




In [4]:
if __name__ == "__main__":
    main()

HBox(children=(HBox(children=(VBox(children=(Label(value='Predicted: 2 eyes opened!'), Label(value='Compute Ti…

I0000 00:00:1722923297.204185    6205 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1722923297.240261    6317 gl_context.cc:357] GL version: 3.2 (OpenGL ES 3.2 NVIDIA 535.183.01), renderer: NVIDIA GeForce GTX 1650 with Max-Q Design/PCIe/SSE2
W0000 00:00:1722923297.243499    6312 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
  origin_image.value = frame_buffer.tostring()
  left_eye_image.value = left_buffer.tostring()
  right_eye_image.value = right_buffer.tostring()
