In [1]:
%%capture
# Install the MediaPipe package (imported below as `mediapipe`); the %%capture
# cell magic on the first line hides the installer's output.
!pip install mediapipe

In [2]:
import numpy as np
import os
import cv2
import math
import glob
import random
import scipy.io as sio
from math import cos, sin
from pathlib import Path
import pandas as pd
import mediapipe
import warnings

In [3]:
# Suppress every Python warning for the rest of the session.
warnings.filterwarnings('ignore')
# Presumably Colab's stand-in for cv2.imshow.
# NOTE(review): cv2_imshow is not called in any visible cell — confirm it is still needed.
from google.colab.patches import cv2_imshow

In [None]:
if os.path.isfile('/content/AFLW2000-3D.zip') == False:
    !gdown --id 1fP3zvSCYjll_o_m7S12nvQLZ9MnsEoap
    !unzip /content/AFLW2000-3D.zip

In [46]:
# Extract X_points, Y_points, labels from images
# For every annotated sample (one .mat file per image) run MediaPipe FaceMesh on
# the matching .jpg. When a face is found, keep the landmark x/y coordinates and
# the first three entries of the annotation's "Pose_Para" row (unpacked later in
# the notebook as pitch, yaw, roll).
x_points = []
y_points = []
labels = []
file_names = sorted([Path(f).stem for f in glob.glob("AFLW2000/*.mat")])
faceModule = mediapipe.solutions.face_mesh

# Build the FaceMesh graph once instead of once per image: with
# static_image_mode=True every call to process() is handled independently,
# so reusing the instance gives the same landmarks at a fraction of the cost.
with faceModule.FaceMesh(static_image_mode=True) as faces:
    for filename in file_names:
        image = cv2.imread('AFLW2000/' + filename + '.jpg')
        if image is None:
            # cv2.imread returns None for a missing or unreadable file;
            # skip it rather than crash inside cvtColor.
            continue

        results = faces.process(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
        if not results.multi_face_landmarks:
            # No face detected: the sample contributes neither features nor a label.
            continue

        # Only the first detected face is used.
        face = results.multi_face_landmarks[0]
        x_points.append(np.array([landmark.x for landmark in face.landmark]))
        y_points.append(np.array([landmark.y for landmark in face.landmark]))

        mat_file = sio.loadmat('AFLW2000/' + filename + '.mat')
        labels.append(mat_file["Pose_Para"][0][:3])

# Stack the per-image rows into 2-D arrays (one row per image with a detected face).
x_points = np.array(x_points)
y_points = np.array(y_points)
labels = np.array(labels)

In [52]:
# Normalizing data
# Step 1: translate every face so that landmark 5 becomes the origin.
# Indexing with [5] (a list) keeps the column 2-D so it broadcasts per row.
x_center_point = x_points - x_points[:, [5]]
y_center_point = y_points - y_points[:, [5]]

# Step 2: scale each face by its own largest absolute offset on that axis,
# which puts every coordinate in the range [-1, 1].
final_x = x_center_point / np.abs(x_center_point).max(axis=1, keepdims=True)
final_y = y_center_point / np.abs(y_center_point).max(axis=1, keepdims=True)

# Features
# One row per face: all normalized x values followed by all normalized y values.
feature = np.concatenate((final_x, final_y), axis=1)

In [74]:
# Splitting data
# Hold out 20% of the samples for validation; random_state pins the shuffle
# so the split is the same on every run.
from sklearn.model_selection import train_test_split
(
    features_train,
    features_validation,
    labels_train,
    labels_validation,
) = train_test_split(feature, labels, test_size=0.2, random_state=40)

In [75]:
# Use the SVR model for data
# SVR predicts a single target, so MultiOutputRegressor wraps it and fits one
# regressor per label column.
from sklearn.svm import SVR
from sklearn.multioutput import MultiOutputRegressor
svr_params = dict(kernel='rbf', C=3, gamma=0.01, degree=3, epsilon=0.005)
model_svr = MultiOutputRegressor(SVR(**svr_params))
model_svr.fit(features_train, labels_train)

In [76]:
# score() on a scikit-learn regressor returns the coefficient of determination
# (R^2, higher is better, 1.0 is perfect) — it is neither an accuracy nor an
# error, so the labels name the metric explicitly.
print("Train R^2: ", model_svr.score(features_train, labels_train))
print("Validation R^2: ", model_svr.score(features_validation, labels_validation))

Train acc:  0.44659287276185106
Validation Error:  0.8567949882037263


In [56]:
def draw_axis(img, pitch, yaw, roll, tdx=None, tdy=None, size=100):
    """Draw the three head-pose axes on ``img`` and return it.

    The image is modified in place via ``cv2.line``.

    Args:
        img: Image to draw on; must expose ``shape`` (height, width, ...).
        pitch, yaw, roll: Rotation angles in radians (they are passed straight
            to ``math.cos`` / ``math.sin``). ``yaw`` is negated before use.
        tdx, tdy: Pixel coordinates of the axes' origin. Each one defaults
            independently to the image centre on its axis when left as None.
        size: Length of every axis in pixels.

    Returns:
        The same ``img`` object, with the axes drawn.
    """
    yaw = -yaw

    # Default each origin coordinate on its own, so a caller that supplies
    # only one of tdx/tdy does not have it silently replaced by the centre.
    if tdx is None or tdy is None:
        height, width = img.shape[:2]
        if tdx is None:
            tdx = width / 2
        if tdy is None:
            tdy = height / 2

    # X-Axis pointing to right. drawn in red
    x1 = size * (cos(yaw) * cos(roll)) + tdx
    y1 = size * (cos(pitch) * sin(roll) + cos(roll) * sin(pitch) * sin(yaw)) + tdy

    # Y-Axis | drawn in green
    x2 = size * (-cos(yaw) * sin(roll)) + tdx
    y2 = size * (cos(pitch) * cos(roll) - sin(pitch) * sin(yaw) * sin(roll)) + tdy

    # Z-Axis (out of the screen) drawn in blue
    x3 = size * (sin(yaw)) + tdx
    y3 = size * (-cos(yaw) * sin(pitch)) + tdy

    origin = (int(tdx), int(tdy))
    # Colour tuples are in OpenCV's BGR order.
    cv2.line(img, origin, (int(x1), int(y1)), (0, 0, 255), 3)
    cv2.line(img, origin, (int(x2), int(y2)), (0, 255, 0), 3)
    cv2.line(img, origin, (int(x3), int(y3)), (255, 0, 0), 2)

    return img

In [64]:
def landmarks(image):
    """Detect a face in ``image`` and build the model's feature vector.

    Only the first detected face is used, matching how the training features
    were extracted; additional faces are ignored so the feature vector always
    has the length the model was trained on.

    Args:
        image: BGR image array; its ``shape`` gives (height, width, ...).

    Returns:
        A tuple ``(features, x_features, y_features)``:
          * ``features``: 1-D array of the normalized x values followed by
            the normalized y values (centred on landmark 5, each axis
            divided by its largest absolute offset).
          * ``x_features`` / ``y_features``: landmark coordinates in pixels.
        When no face is detected all three are empty lists.
    """
    with faceModule.FaceMesh(static_image_mode=True) as faces:
        results = faces.process(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))

    if not results.multi_face_landmarks:
        return [], [], []

    face = results.multi_face_landmarks[0]
    height, width = image.shape[:2]

    # MediaPipe reports coordinates relative to the image size; convert to pixels.
    x_features = np.array([landmark.x * width for landmark in face.landmark])
    y_features = np.array([landmark.y * height for landmark in face.landmark])

    # Same normalization as the training data: landmark 5 is the origin and
    # each axis is scaled by its largest absolute offset.
    x_center = x_features - x_features[5]
    y_center = y_features - y_features[5]
    final_x = x_center / np.max(np.abs(x_center))
    final_y = y_center / np.max(np.abs(y_center))

    features = np.hstack([final_x, final_y])
    return features, x_features, y_features

In [65]:
def predict_pose(image, model):
    """Predict the head pose for the face found in ``image``.

    Args:
        image: BGR image array, or None.
        model: Fitted estimator exposing ``predict``; it is called with a
            single-row 2-D feature array.

    Returns:
        ``(prediction, x_features, y_features)`` where ``prediction`` is
        whatever ``model.predict`` returns for the one-row input and the
        other two are the landmark pixel coordinates from ``landmarks``.
        ``(None, None, None)`` when the image is missing/empty or no face
        is detected.
    """
    # Validate the image before running detection: the landmark extractor
    # cannot handle a missing or empty frame.
    if image is None or image.size == 0:
        return None, None, None

    img_features, x_features, y_features = landmarks(image)
    if len(img_features) == 0:
        return None, None, None

    # The model expects a 2-D (n_samples, n_features) array; this is one sample.
    return model.predict(img_features.reshape(1, -1)), x_features, y_features

In [90]:
# Read the input video frame by frame, draw the predicted head-pose axes on
# every frame where a face is found, and write all frames to a new video.
video_path = '/content/WhatsApp Video 2024-03-03 at 9.05.53 PM.mp4'
cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
    # Fail loudly instead of silently producing an empty output file.
    raise IOError(f"Could not open video: {video_path}")

frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
# Keep the source frame rate so the output plays at the original speed;
# fall back to 30 when the container does not report one (get() returns 0).
fps = cap.get(cv2.CAP_PROP_FPS) or 30

# 'mp4v' matches the .mp4 container (XVID is an AVI codec tag).
out = cv2.VideoWriter(
    'new_axis_video.mp4',
    cv2.VideoWriter_fourcc(*'mp4v'),
    fps,
    (frame_width, frame_height),
)

try:
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        # predict_pose always returns a 3-tuple; pose is None when no face was found.
        pose, x_features, y_features = predict_pose(frame, model_svr)
        if pose is not None:
            pitch, yaw, roll = pose[0]
            # Anchor the axes at landmark 5, the same point used as the
            # origin when normalizing the features.
            draw_axis(frame, pitch, yaw, roll,
                      tdx=x_features[5], tdy=y_features[5], size=100)
        out.write(frame)
finally:
    # Release both handles even if a frame fails, so the output file is finalized.
    cap.release()
    out.release()