### Import packages

In [1]:
import cv2
import torch
from torchvision import models, transforms
from PIL import Image
import numpy as np
import torch.nn as nn
import os
import time
import mediapipe as mp
from mediapipe.tasks import python
from mediapipe.tasks.python import vision
from typing import Tuple, Union
import math
from IPython.display import display, clear_output
import ipywidgets as widgets

### Load model

In [2]:
# Directory that holds the model files, anchored at the current working directory.
model_dir = os.getcwd() + "/models"

# Label for each classifier output index (0: close, 1: open).
class_names = ["close", "open"]

In [3]:
# Run inference on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Rebuild the architecture the checkpoint expects: a ResNet-18 whose final
# fully connected layer is replaced by a `num_classes`-way linear head.
# `weights=None` is the current spelling of the deprecated `pretrained=False`
# (no pretrained weights are downloaded; the checkpoint below supplies them).
model = models.resnet18(weights=None)
# Derive the head size from the label list so the two cannot drift apart.
num_classes = len(class_names)
num_features = model.fc.in_features
model.fc = nn.Linear(num_features, num_classes)

# Load the trained weights directly onto the selected device.
model.load_state_dict(torch.load(f'{model_dir}/resnet18-e_20-d_10k.pth', map_location=device))
model = model.to(device)
model.eval()  # Set the model to evaluation mode



ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

### Image preprocessing

#### Face detection

In [4]:
# Preprocessing pipeline for model inputs: resize to the 224x224 input size,
# convert to a tensor, then normalize each channel (same normalization as training).
_NORMALIZE_MEAN = [0.485, 0.456, 0.406]
_NORMALIZE_STD = [0.229, 0.224, 0.225]

transform = transforms.Compose(
    [
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=_NORMALIZE_MEAN, std=_NORMALIZE_STD),
    ]
)

In [5]:
def _normalized_to_pixel_coordinates(
    normalized_x: float, normalized_y: float, image_width: int,
    image_height: int) -> Union[None, Tuple[int, int]]:
  """Converts normalized value pair to pixel coordinates."""

  # Checks if the float value is between 0 and 1.
  def is_valid_normalized_value(value: float) -> bool:
    return (value > 0 or math.isclose(0, value)) and (value < 1 or
                                                      math.isclose(1, value))

  if not (is_valid_normalized_value(normalized_x) and
          is_valid_normalized_value(normalized_y)):
    # TODO: Draw coordinates even if it's outside of the image bounds.
    return None
  x_px = min(math.floor(normalized_x * image_width), image_width - 1)
  y_px = min(math.floor(normalized_y * image_height), image_height - 1)
  return x_px, y_px

In [6]:
# Build the MediaPipe face detector from the TFLite model at
# models/detector.tflite (a relative path).
base_options = python.BaseOptions(
    model_asset_path='models/detector.tflite',
)
options = vision.FaceDetectorOptions(
    base_options=base_options,
)
detector = vision.FaceDetector.create_from_options(options)

I0000 00:00:1721460776.273489   35376 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1721460776.289668   35419 gl_context.cc:357] GL version: 3.2 (OpenGL ES 3.2 NVIDIA 535.183.01), renderer: NVIDIA GeForce GTX 1650 with Max-Q Design/PCIe/SSE2
INFO: Created TensorFlow Lite XNNPACK delegate for CPU.
W0000 00:00:1721460776.320294   35422 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


In [7]:
def _crop_eye(image: np.ndarray, center_px: Tuple[int, int],
              up: int = 40, down: int = 30, half_width: int = 35) -> np.ndarray:
    """Cut the region around an eye keypoint out of `image`, clamped to its bounds.

    Args:
        image: Array indexed as image[row, column, ...].
        center_px: (x, y) pixel position of the eye keypoint.
        up: Number of rows taken above the keypoint.
        down: Number of rows taken below the keypoint.
        half_width: Number of columns taken on each side of the keypoint.

    Returns:
        A copy of the clamped region. It is smaller than
        (up + down, 2 * half_width) when the keypoint is close to a border.
    """
    x, y = center_px
    rows, cols = image.shape[:2]
    # Clamp the start indices at 0: NumPy reads a negative start as "count from
    # the end", which would yield an empty or wrapped-around crop.
    top, bottom = max(y - up, 0), min(y + down, rows)
    left, right = max(x - half_width, 0), min(x + half_width, cols)
    return image[top:bottom, left:right].copy()


def eyes_detection(frame) -> Tuple[Union[None, np.ndarray], Union[None, np.ndarray]]:
    """Detect faces in `frame` and crop the regions around both eye keypoints.

    Args:
        frame: H x W x 3 uint8 image. Assumed to be in OpenCV's BGR channel
            order, as delivered by `cv2.VideoCapture.read()` in this notebook.

    Returns:
        (lefteye_img, righteye_img). Both are None when no face is detected.
        If a keypoint cannot be mapped to pixel coordinates, the function
        returns immediately with None in that eye's slot. The crops are taken
        from `frame` itself, so they keep its channel order.
    """
    # The detector input is declared as SRGB, so reverse the channel axis
    # (BGR -> RGB) for detection only; mp.Image needs a contiguous array.
    rgb_frame = np.ascontiguousarray(frame[:, :, ::-1])
    image = mp.Image(image_format=mp.ImageFormat.SRGB, data=rgb_frame)
    detection_result = detector.detect(image)

    height, width = frame.shape[:2]
    lefteye_img = None
    righteye_img = None

    # Every detection overwrites the previous crops, so with several faces the
    # last detection wins.
    for detection in detection_result.detections:
        # keypoints[0] is treated as the right eye, keypoints[1] as the left eye.
        keypoint_px_right = _normalized_to_pixel_coordinates(
            detection.keypoints[0].x, detection.keypoints[0].y, width, height)
        if keypoint_px_right is None:
            return lefteye_img, None
        righteye_img = _crop_eye(frame, keypoint_px_right)

        keypoint_px_left = _normalized_to_pixel_coordinates(
            detection.keypoints[1].x, detection.keypoints[1].y, width, height)
        if keypoint_px_left is None:
            return None, righteye_img
        lefteye_img = _crop_eye(frame, keypoint_px_left)

    return lefteye_img, righteye_img

In [8]:
def predict_frame(frame):
    """Classify the open/closed state of both eyes found in `frame`.

    Args:
        frame: Image passed unchanged to `eyes_detection`.

    Returns:
        (result, left_img, right_img). `result` is one of "Eyes not detected",
        "Close 2 eyes", "Close 1 eye" or "Open 2 eyes". When both eyes were
        found, `left_img` / `right_img` are the preprocessed (normalized)
        input tensors of shape (1, C, H, W) on `device`; otherwise both are None.
    """
    # Detect eyes in the frame
    left_crop, right_crop = eyes_detection(frame)

    # Both eyes are required for a prediction.
    if left_crop is None or right_crop is None:
        return "Eyes not detected", None, None

    # Preprocess both crops and stack them into a single (2, C, H, W) batch so
    # the model runs one forward pass per frame instead of two.
    batch = torch.stack([
        transform(Image.fromarray(left_crop)),
        transform(Image.fromarray(right_crop)),
    ]).to(device)

    # Perform inference; index 0 of the batch is the left eye, index 1 the right.
    with torch.no_grad():
        _, predicted = torch.max(model(batch), 1)

    # Class index 1 means "open", so the sum counts the open eyes.
    open_eyes = int(predicted.sum().item())
    if open_eyes == 0:
        result = "Close 2 eyes"
    elif open_eyes == 1:
        result = "Close 1 eye"
    else:
        result = "Open 2 eyes"

    # Slice (rather than index) so each returned tensor keeps a batch dimension of 1.
    return result, batch[0:1], batch[1:2]

### Main code

In [9]:
# Camera settings: device index and requested capture resolution.
camera_index = 2
width = 640
height = 480

# Open the camera through the V4L2 backend and request MJPG frames at the
# configured resolution.
cap = cv2.VideoCapture(camera_index, cv2.CAP_V4L2)
cap.set(cv2.CAP_PROP_FOURCC, cv2.VideoWriter_fourcc('M', 'J', 'P', 'G'))
cap.set(cv2.CAP_PROP_FRAME_WIDTH, width)
cap.set(cv2.CAP_PROP_FRAME_HEIGHT, height)

if not cap.isOpened():
    # Raise instead of calling exit(): in a notebook exit() shuts the kernel
    # down, whereas an exception stops this cell (and "Run All") with a clear
    # message and leaves the session usable.
    cap.release()
    raise RuntimeError("Error: Could not open the camera.")

In [10]:
# Create widgets for displaying results.
# Label styles use the trait names `font_size` / `text_color`; CSS-style keys
# such as 'font-size' / 'color' are not LabelStyle traits and have no effect.
result_text = widgets.Label(style={'font_size': '30px', 'text_color': 'red'})
computeSpeed_text = widgets.Label(style={'font_size': '30px'})
image_widget = widgets.Image(format='jpeg', width=640, height=480)
left_eye_image = widgets.Image(format='jpeg')
right_eye_image = widgets.Image(format='jpeg')


def _titled_eye_box(title, eye_image):
    """Stack a caption label on top of an eye preview image."""
    caption = widgets.Label(value=title, layout=widgets.Layout(align_items='center'))
    return widgets.VBox([caption, eye_image])


right_eye_widget = _titled_eye_box('Right Eye', right_eye_image)
left_eye_widget = _titled_eye_box('Left Eye', left_eye_image)

# Layout: camera frame on the left; on the right the prediction and timing
# labels above the two eye previews.
output_widget = widgets.HBox([
    image_widget,
    widgets.VBox([
        widgets.VBox([result_text, computeSpeed_text]),
        widgets.HBox([left_eye_widget, right_eye_widget]),
    ]),
])

In [11]:
# Display the widgets
display(output_widget)

# The eye tensors returned by predict_frame are normalized; the mean/std of the
# Normalize step are needed to undo that before a tensor can be shown as an image.
_normalize_step = next(t for t in transform.transforms if isinstance(t, transforms.Normalize))
_norm_mean = torch.as_tensor(_normalize_step.mean, dtype=torch.float32).view(-1, 1, 1)
_norm_std = torch.as_tensor(_normalize_step.std, dtype=torch.float32).view(-1, 1, 1)


def _eye_tensor_to_jpeg(eye_tensor):
    """Turn a normalized (1, C, H, W) eye tensor into JPEG bytes for display.

    Returns None when JPEG encoding fails.
    """
    chw = eye_tensor.squeeze(0).detach().cpu()
    # Undo the normalization and clamp to [0, 1]; scaling the normalized values
    # directly would wrap around when cast to uint8.
    pixels = (chw * _norm_std + _norm_mean).clamp(0, 1)
    hwc = (pixels.permute(1, 2, 0).numpy() * 255).round().astype(np.uint8)
    # The channel order is whatever the crop had; crops of cv2 frames are
    # presumably still BGR here, which is what imencode expects.
    ok, buffer = cv2.imencode('.jpeg', np.ascontiguousarray(hwc))
    return buffer.tobytes() if ok else None


# Runs until a frame cannot be read or the kernel is interrupted
# (KeyboardInterrupt). No OpenCV window exists in the notebook, so a
# cv2.waitKey-based 'q' shortcut could never fire and is not used.
start_time = time.time()
try:
    while True:
        # Capture frame-by-frame
        ret, frame = cap.read()

        if not ret:
            print("Error: Could not read frame.")
            break

        # Only frames with three dimensions are processed; others are skipped.
        if len(frame.shape) != 3:
            continue

        # Predict the class of the frame
        predicted_class, left_img, right_img = predict_frame(frame)

        # Time elapsed since the previous iteration reached this point; it
        # includes the 0.1 s sleep at the end of the loop.
        end_time = time.time()
        compute_time = end_time - start_time
        start_time = end_time

        # Update widget content
        result_text.value = f'Predicted: {predicted_class}'
        computeSpeed_text.value = f'Compute Time: {compute_time:.3f} s'

        encoded, frame_buffer = cv2.imencode('.jpeg', frame)
        if encoded:
            image_widget.value = frame_buffer.tobytes()

        # Refresh each eye preview; when eyes were not detected the previous
        # preview stays on screen.
        for eye_tensor, eye_widget in ((left_img, left_eye_image),
                                       (right_img, right_eye_image)):
            if eye_tensor is None:
                continue
            jpeg_bytes = _eye_tensor_to_jpeg(eye_tensor)
            if jpeg_bytes is not None:
                eye_widget.value = jpeg_bytes

        # Throttle the loop.
        time.sleep(0.1)
except KeyboardInterrupt:
    pass

# When everything is done, release the capture
finally:
    cap.release()

HBox(children=(Image(value=b'', format='jpeg', height='480', width='640'), VBox(children=(VBox(children=(Label…

