In [1]:
import numpy as np
import cv2 as cv
import os

In [3]:
# Sample clip used in the first part of this notebook (path is relative to the notebook).
video_path = "../data/raw/test_video2.mp4"
# Open the clip for reading. NOTE(review): the result is not checked with video.isOpened().
video = cv.VideoCapture(video_path)

In [4]:
# Query the basic stream properties from the capture object.
# Width, height and frame count are cast to int; fps is kept as the float OpenCV returns.
width = int(video.get(cv.CAP_PROP_FRAME_WIDTH))
height = int(video.get(cv.CAP_PROP_FRAME_HEIGHT))
fps = video.get(cv.CAP_PROP_FPS)
frame_count = int(video.get(cv.CAP_PROP_FRAME_COUNT))

In [5]:
width, height, fps, frame_count

(1280, 720, 28.726484264353843, 382)

In [None]:
# Grab the first and the last frame of the clip and save both as PNG files.

# seek to the first frame and decode it
video.set(cv.CAP_PROP_POS_FRAMES, 0)
check, first_frame = video.read()
# fail early with a readable message instead of a cryptic cv.imwrite error on None
assert check, "Could not read the first frame"

# seek to the last frame (index = frame count - 1) and decode it
video.set(cv.CAP_PROP_POS_FRAMES, int(video.get(cv.CAP_PROP_FRAME_COUNT))-1)
check, last_frame = video.read()
# the reported frame count may not be exact for every file, so this read can fail
assert check, "Could not read the last frame"

# save the frames
cv.imwrite('first_frame.png', first_frame)

cv.imwrite('last_frame.png', last_frame)


True

In [None]:
# Close any OpenCV windows (none are opened in the cells shown above, so this is a safety net).
cv.destroyAllWindows()

# Release the capture handle opened earlier in the notebook.
video.release()

## Testing the `extract_frames` function

In [12]:
def extract_frames(video_path, output_path):
    """
    Save one frame per second of a video as JPEG files.

    input:
        video_path - path to the video.
        output_path - directory where the frames are saved (created if missing).

    output:
        - returns a list of (frame_path, timestamp_in_seconds) tuples, one per
          saved frame, in chronological order.

    raises:
        IOError - if the video cannot be opened.
        ValueError - if the video reports no usable FPS, so its duration
                     cannot be computed.
    """
    result = []
    vid = cv.VideoCapture(video_path)

    try:
        # check we got the video; raise instead of calling exit(), which does not
        # reliably stop execution inside a notebook kernel
        if not vid.isOpened():
            raise IOError(f"Cannot open video: {video_path}")

        # let's calculate the total duration of the video (in milliseconds)
        fps = vid.get(cv.CAP_PROP_FPS)
        total_frame = vid.get(cv.CAP_PROP_FRAME_COUNT)
        if not fps or fps <= 0:
            raise ValueError(f"Video reports an invalid FPS ({fps}): {video_path}")
        vid_duration = (total_frame / fps) * 1000

        # cv.imwrite does not create missing folders, so make sure the target exists
        if output_path:
            os.makedirs(output_path, exist_ok=True)

        # start at 0 millisecs and sample one frame every 1000 ms
        current_time_ms = 0
        frame_cnt = 0

        while current_time_ms < vid_duration:
            # jump to specific time
            vid.set(cv.CAP_PROP_POS_MSEC, current_time_ms)

            # read the frame; stop if nothing could be decoded at this position
            # (the duration is derived from metadata and may overshoot the real end)
            ret, frame = vid.read()
            if not ret:
                break

            # timestamp of this frame in seconds (float, so file names look like 1_ts_0.0.jpg)
            time_frame = current_time_ms / 1000

            # save the frame
            path = f"{output_path}/{frame_cnt+1}_ts_{time_frame}.jpg"
            cv.imwrite(path, frame)

            frame_cnt += 1
            current_time_ms += 1000

            result.append((path, time_frame))
    finally:
        # always give the capture handle back, even when an error is raised
        vid.release()

    return result

In [13]:
# Run the extractor on the sample clip; frames are written under ../data/processed
# and `result` holds the (path, timestamp) pairs it returns.
result = extract_frames(video_path="../data/raw/test_video2.mp4", output_path="../data/processed")

## Testing the face_detector functionality

In [2]:
from deepface import DeepFace
import mediapipe as mp

2025-12-05 08:27:44.466933: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1764923264.667904   38074 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1764923264.735735   38074 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-12-05 08:27:45.368412: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
# Build the MediaPipe FaceMesh object that get_face_data (defined below) reads as a global.
# It is configured for still images and at most one face per image.
# NOTE(review): refine_landmarks=True is presumably what makes the iris landmark
# (index 468) read further down available — confirm against the MediaPipe docs.
mp_face_mesh = mp.solutions.face_mesh
face_mesh = mp_face_mesh.FaceMesh(
    static_image_mode=True,
    max_num_faces=1,
    refine_landmarks=True,
    min_detection_confidence=0.5
)

MESA: error: ZINK: failed to choose pdev
I0000 00:00:1764905921.796921    1095 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1764905921.970025    1825 gl_context.cc:369] GL version: 3.1 (OpenGL ES 3.1 Mesa 24.2.8-1ubuntu1~24.04.1), renderer: D3D12 (AMD Radeon (TM) Graphics)


INFO: Created TensorFlow Lite XNNPACK delegate for CPU.
W0000 00:00:1764905922.011011    1815 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1764905922.080261    1823 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


In [None]:
# cv.imread() requires a file path; load the first extracted frame
# (the same file that is passed to get_face_data further down).
img = cv.imread("../data/processed/1_ts_0.0.jpg")

In [8]:
def get_face_data(image_path, pad: float = 0.2):
    """
    Detect a face in an image and return a padded crop plus its mesh landmarks.

    Relies on the notebook-level `face_mesh` object to find the landmarks.

    Input:
        image_path: Path to image
        pad: Margin added on every side of the landmark bounding box, as a
             fraction of the box width/height (default 0.2, i.e. 20%).
    Output:
        - face_crop: The cropped numpy array, in the channel order cv.imread
          returns (ready for DeepFace)
        - landmarks: The raw mesh points (ready for Gaze Math)
        (None, None) is returned when the image cannot be loaded, no face is
        found, or the padded box does not overlap the image.
    """

    img = cv.imread(image_path)
    # cv.imread signals a missing/unreadable file by returning None
    if img is None:
        print(f"Error: Could not load {image_path}")
        return None, None

    h, w, _ = img.shape

    # convert to rgb before handing the image to face_mesh
    img_rgb = cv.cvtColor(img, cv.COLOR_BGR2RGB)
    results = face_mesh.process(img_rgb)

    # no face found
    if not results.multi_face_landmarks:
        return None, None

    # Assuming one face
    landmarks = results.multi_face_landmarks[0].landmark

    # part-A calculating the bounding box from mesh
    # landmark coordinates are scaled by the image size to get pixel positions
    x_coords = [p.x for p in landmarks]
    y_coords = [p.y for p in landmarks]

    x_min = int(min(x_coords) * w)
    x_max = int(max(x_coords) * w)
    y_min = int(min(y_coords) * h)
    y_max = int(max(y_coords) * h)

    # padding is proportional to the face size, controlled by `pad`
    pad_x = int((x_max - x_min) * pad)
    pad_y = int((y_max - y_min) * pad)

    # applying padding with boundary checks
    crop_x1 = max(0, x_min - pad_x)
    crop_y1 = max(0, y_min - pad_y)
    crop_x2 = min(w, x_max + pad_x)
    crop_y2 = min(h, y_max + pad_y)

    face_crop = img[crop_y1:crop_y2, crop_x1:crop_x2]

    # an empty crop cannot be shown, saved or analysed downstream, so report
    # it the same way as "no face"
    if face_crop.size == 0:
        return None, None

    return face_crop, landmarks
    

In [9]:
# Try the helper on the first extracted frame with 20% padding around the face.
face_crop, lm = get_face_data(image_path="../data/processed/1_ts_0.0.jpg", pad=0.2)

W0000 00:00:1764905947.306013    1814 landmark_projection_calculator.cc:186] Using NORM_RECT without IMAGE_DIMENSIONS is only supported for the square ROI. Provide IMAGE_DIMENSIONS or use PROJECTION_MATRIX.


In [15]:
# Test on one of your saved frames
test_path = "../data/processed/2_ts_1.0.jpg"
crop_path = "../data/processed/cropped/2_ts_1.0_crop.jpg"

crop, landmarks = get_face_data(image_path=test_path, pad=0.2)

if crop is not None:
    # Show the crop (This is what DeepFace will see)
    cv.imshow("DeepFace Input (Padded)", crop)
    print(f"Success! Cropped shape: {crop.shape}")

    # save it: cv.imwrite returns False instead of raising when the folder is
    # missing, so create the folder first and check the result
    os.makedirs(os.path.dirname(crop_path), exist_ok=True)
    if not cv.imwrite(crop_path, crop):
        print(f"Warning: could not write {crop_path}")

    # Verify we still have landmarks for Gaze Logic
    print(f"Landmark 468 (Iris) X: {landmarks[468].x}")

    # keep the window open until a key is pressed, then close it
    cv.waitKey(0)
    cv.destroyAllWindows()
else:
    print("No face detected.")

Success! Cropped shape: (379, 332, 3)
Landmark 468 (Iris) X: 0.48172643780708313


In [None]:
lm

## Now, let's try to pass the cropped image to DeepFace

In [3]:
# Run DeepFace on the crop saved above; only the 'emotion' action is requested.
results = DeepFace.analyze(img_path="../data/processed/cropped/2_ts_1.0_crop.jpg", actions=['emotion'])

2025-12-05 03:59:44.798557: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


25-12-05 03:59:45 - facial_expression_model_weights.h5 will be downloaded...


Downloading...
From: https://github.com/serengil/deepface_models/releases/download/v1.0/facial_expression_model_weights.h5
To: /home/adi_6200_/.deepface/weights/facial_expression_model_weights.h5
100%|██████████| 5.98M/5.98M [00:08<00:00, 673kB/s] 


In [4]:
results

[{'emotion': {'angry': 0.17388486303389072,
   'disgust': 9.764790753763464e-07,
   'fear': 1.5524779446423054,
   'happy': 0.006974869756959379,
   'sad': 2.214517258107662,
   'surprise': 2.6190579660578805e-05,
   'neutral': 96.05211615562439},
  'dominant_emotion': 'neutral',
  'region': {'x': 28,
   'y': 34,
   'w': 275,
   'h': 275,
   'left_eye': (195, 131),
   'right_eye': (99, 142)},
  'face_confidence': 0.94}]

## Let's try a different approach for facial-expression analysis

In [146]:
# Image analysed with the FaceLandmarker detector and in the gaze section below.
image_path = "../data/processed/down.jpg"

In [85]:
import mediapipe as mp
from mediapipe.tasks import python
from mediapipe.tasks.python import vision

In [86]:
# Path to the FaceLandmarker model file (downloaded separately, e.g. with wget).
model_path = "../models/face_landmarker.task"

# Tell MediaPipe where the model (.task) file is located.
base_options = python.BaseOptions(model_asset_path=model_path)

# Configure the face landmarker: single-image mode, at most one face,
# with blendshape scores included in the result.
options = vision.FaceLandmarkerOptions(
    base_options=base_options,
    output_face_blendshapes=True,
    # output_face_landmarks=True,
    num_faces=1,
    min_face_detection_confidence=0.5,
    running_mode=vision.RunningMode.IMAGE
)

# Create the detector from the options above.
detector = vision.FaceLandmarker.create_from_options(options)

I0000 00:00:1764924946.220389   38074 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1764924946.688062   41824 gl_context.cc:369] GL version: 3.1 (OpenGL ES 3.1 Mesa 24.2.8-1ubuntu1~24.04.1), renderer: D3D12 (AMD Radeon (TM) Graphics)
W0000 00:00:1764924946.775033   38074 face_landmarker_graph.cc:174] Sets FaceBlendshapesGraph acceleration to xnnpack by default.
W0000 00:00:1764924946.970160   41829 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1764924947.160325   41836 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


In [147]:
# Load the image file into a MediaPipe Image for the detector.
image = mp.Image.create_from_file(image_path)

In [148]:
# Run the face landmarker on the image.
# NOTE: this rebinds `results`, which held the DeepFace output earlier in the notebook.
results = detector.detect(image)

In [129]:
# Names of the blendshape categories for the first face in the result
# (assumes at least one face was detected, otherwise the [0] lookup fails).
categories = [category.category_name for category in results.face_blendshapes[0]]

In [None]:
categories

In [16]:
len(categories)

52

## Gaze Calculation

In [149]:
# Landmarks of the first face in the result (assumes at least one face was detected).
face_lm = results.face_landmarks[0]

In [150]:
type(face_lm), len(face_lm)

(list, 478)

In [151]:
def draw_landmarks_on_image(rgb_image, detection_result):
    """
    Render the detected face landmarks on a copy of an image.

    Every landmark is marked with a small dot in colour (0, 255, 0); the
    landmarks used for the gaze-ratio math (irises, eye corners, eyelids) are
    then over-drawn with a larger dot in colour (255, 0, 0).

    Inputs:
      rgb_image: numpy image array of shape (height, width, 3); left untouched
      detection_result: FaceLandmarker result whose `face_landmarks` is read
    Output:
      annotated_image: copy of `rgb_image` with the dots drawn on it
    """
    faces = detection_result.face_landmarks

    # Landmark indices highlighted for the gaze-ratio math
    highlighted = (
        468, 473,   # iris centres
        263, 362,   # left eye, horizontal corners
        33, 133,    # right eye, horizontal corners
        386, 374,   # left eye, top / bottom lid
        159, 145,   # right eye, top / bottom lid
    )

    # Work on a copy so the caller's array is never modified
    canvas = np.copy(rgb_image)
    height, width, _ = canvas.shape

    def to_pixel(point):
        # landmark coordinates are scaled by the image size to get pixel positions
        return (int(point.x * width), int(point.y * height))

    # Usually there is just one face
    for face in faces:
        # pass 1: every landmark as a tiny filled dot
        for point in face:
            cv.circle(canvas, to_pixel(point), 1, (0, 255, 0), -1)

        # pass 2: the gaze landmarks as larger filled dots on top
        for index in highlighted:
            cv.circle(canvas, to_pixel(face[index]), 3, (255, 0, 0), -1)

    return canvas

In [152]:
# Load the same image with OpenCV for drawing.
# NOTE(review): get_face_data converts cv.imread output with COLOR_BGR2RGB, i.e. this
# array is BGR, while draw_landmarks_on_image names its input `rgb_image`.
img = cv.imread(image_path)

In [153]:
# cv.imread gives BGR, but draw_landmarks_on_image expects RGB and uses RGB colour
# tuples (red = (255, 0, 0)); convert before drawing so the dots get the intended colours
img_rgb = cv.cvtColor(img, cv.COLOR_BGR2RGB)
annotated_rgb = draw_landmarks_on_image(rgb_image=img_rgb, detection_result=results)
# convert back to BGR because cv.imshow / cv.imwrite interpret arrays as BGR
u_img = cv.cvtColor(annotated_rgb, cv.COLOR_RGB2BGR)

In [154]:
# Show the annotated image in an OpenCV window, wait for a key press, then close the window.
cv.imshow(winname="oyee", mat=u_img)
cv.waitKey(0)
cv.destroyAllWindows()

In [155]:
# Face-landmark indices of the eye key points used for the gaze calculation.
# NOTE(review): the same values are defined again as locals inside calculate_gaze_ratios.

# RIGHT EYE (User's Right)
R_IRIS_CENTER = 468
R_INNER_CORNER = 133  # Toward nose
R_OUTER_CORNER = 33   # Toward ear
R_TOP_LID = 159
R_BOTTOM_LID = 145

# LEFT EYE (User's Left)
L_IRIS_CENTER = 473
L_INNER_CORNER = 362  # Toward nose
L_OUTER_CORNER = 263  # Toward ear
L_TOP_LID = 386
L_BOTTOM_LID = 374

In [156]:
face_lm

[NormalizedLandmark(x=0.5549406409263611, y=0.6634749174118042, z=-0.019677728414535522, visibility=0.0, presence=0.0),
 NormalizedLandmark(x=0.5534313321113586, y=0.6182544231414795, z=-0.03889309987425804, visibility=0.0, presence=0.0),
 NormalizedLandmark(x=0.5534477829933167, y=0.6314840316772461, z=-0.019542589783668518, visibility=0.0, presence=0.0),
 NormalizedLandmark(x=0.5461621880531311, y=0.5775415301322937, z=-0.030993977561593056, visibility=0.0, presence=0.0),
 NormalizedLandmark(x=0.5533217787742615, y=0.6057403683662415, z=-0.04176728054881096, visibility=0.0, presence=0.0),
 NormalizedLandmark(x=0.5529879331588745, y=0.5892707705497742, z=-0.03947670757770538, visibility=0.0, presence=0.0),
 NormalizedLandmark(x=0.5521971583366394, y=0.5489587783813477, z=-0.02182210236787796, visibility=0.0, presence=0.0),
 NormalizedLandmark(x=0.49769240617752075, y=0.5439732074737549, z=0.004208231344819069, visibility=0.0, presence=0.0),
 NormalizedLandmark(x=0.5511660575866699, y=

In [157]:
def get_coordinates(landmarks: list, idx: int) -> tuple:
    """Return the (x, y) pair of the landmark stored at position `idx`.

    Inputs:
        landmarks: indexable collection of objects exposing `.x` and `.y`.
        idx: position of the wanted landmark.
    Output:
        tuple (x, y) taken unchanged from that landmark.
    """
    point = landmarks[idx]
    return point.x, point.y

In [158]:
# Quick check of the helper: (x, y) of the right iris centre landmark.
r_iris_center_coor = get_coordinates(landmarks=face_lm, idx=R_IRIS_CENTER)

In [159]:
r_iris_center_coor

(0.5114676356315613, 0.5419827699661255)

In [160]:
import math
def euclidean_distance(coord1: tuple, coord2: tuple) -> float:
    """Return the straight-line distance between two points.

    Only the first two components (x, y) of each point are used.
    """
    delta_x = coord1[0] - coord2[0]
    delta_y = coord1[1] - coord2[1]
    return math.sqrt(delta_x**2 + delta_y**2)

In [161]:
def calculate_gaze_ratios(landmarks: list) -> tuple:
    """
    Compute horizontal and vertical gaze ratios from face landmarks.

    For each eye the eye width (inner corner to outer corner) is the
    normaliser. The horizontal value is the iris-to-corner distance over that
    width (inner corner for the right eye, outer corner for the left eye). The
    vertical value is the iris y offset from the mid-height of the two corners
    over that width. Both eyes are averaged.

    Inputs:
        landmarks: landmark collection readable by get_coordinates; indices
                   33, 133, 263, 362, 468 and 473 are looked up.
    Output:
        returns: (horizontal gaze ratio, vertical gaze ratio)
    """
    # landmark indices: iris centre, inner corner (toward nose), outer corner (toward ear)
    right_iris_idx, right_inner_idx, right_outer_idx = 468, 133, 33   # user's right eye
    left_iris_idx, left_inner_idx, left_outer_idx = 473, 362, 263     # user's left eye

    # --- horizontal, right eye: iris -> inner corner, relative to eye width
    right_iris = get_coordinates(landmarks=landmarks, idx=right_iris_idx)
    right_inner = get_coordinates(landmarks=landmarks, idx=right_inner_idx)
    right_outer = get_coordinates(landmarks=landmarks, idx=right_outer_idx)
    right_iris_to_inner = euclidean_distance(coord1=right_iris, coord2=right_inner)
    right_width = euclidean_distance(coord1=right_inner, coord2=right_outer)
    horizontal_right = right_iris_to_inner / right_width

    # --- horizontal, left eye: iris -> outer corner, relative to eye width
    left_iris = get_coordinates(landmarks=landmarks, idx=left_iris_idx)
    left_inner = get_coordinates(landmarks=landmarks, idx=left_inner_idx)
    left_outer = get_coordinates(landmarks=landmarks, idx=left_outer_idx)
    left_iris_to_outer = euclidean_distance(coord1=left_iris, coord2=left_outer)
    left_width = euclidean_distance(coord1=left_inner, coord2=left_outer)
    horizontal_left = left_iris_to_outer / left_width

    horizontal = (horizontal_right + horizontal_left) / 2

    # --- vertical: iris y offset from the mid-height of the corners, per eye
    right_mid_y = (right_inner[1] + right_outer[1]) / 2
    vertical_right = (right_iris[1] - right_mid_y) / right_width

    left_mid_y = (left_inner[1] + left_outer[1]) / 2
    vertical_left = (left_iris[1] - left_mid_y) / left_width

    vertical = (vertical_right + vertical_left) / 2

    return (horizontal, vertical)

In [162]:
# Gaze ratios (horizontal, vertical) for the landmarks of the face detected above.
ratios = calculate_gaze_ratios(landmarks=face_lm)

In [163]:
ratios

(0.4983644305399181, 0.0010534110315543852)