In [None]:

# install mediapipe library
%pip install mediapipe

%pip install tpot

In [1]:
from tqdm import tqdm_notebook as tqdm

**Importing Needed Libraries**

In [2]:
import numpy as np
import os,cv2,math,glob,random
import scipy.io as sio
from math import cos, sin
from pathlib import Path
import mediapipe
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import cross_val_score,GridSearchCV
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error
#from google.colab.patches import cv2_imshow # only when you use google colab

**Download and load Data**


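*   Data is the AFLW2000-3D dataset: 2000 face images (`.jpg`)
*   Every image comes with a MATLAB (`.mat`) file of the same name, which holds its labels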



-This is the link if you want to download the data directly:
http://www.cbsr.ia.ac.cn/users/xiangyuzhu/projects/3DDFA/Database/AFLW2000-3D.zip

-Or you can download it from Google Drive using the file id, which Google Drive assigns to every file stored on it
(you can find the id of your file in the place of the * characters in a URL such as: https://drive.google.com/file/d/**********************/view?usp=sharing)

**Preparing Data for Training by these steps**

*   picking X_points
*   picking Y_points
*   picking labels
*   recording which files MediaPipe was able to detect a face in

-For simplicity, we will take only the X and Y points from the data, ignoring the Z points, and use these points to train the ML model

-Don't worry, it will be explained when we reach this part of the code

In [3]:
# Per-image accumulators that the landmark-extraction cell fills in.
X_points = []        # landmark x coordinates, one entry per detected image
Y_points = []        # landmark y coordinates, one entry per detected image
labels = []          # pose labels read from each image's .mat file
detected_files = []  # names of the images in which MediaPipe found a face

# The data folder holds 2000 images plus one MATLAB (.mat) file per image,
# named exactly like the image it belongs to.
# Collect those shared base names (without extension), sorted for a stable order.
file_names = sorted([Path(f).stem for f in glob.glob("AFLW2000/*.mat")])

# Alternatively, the same list can be built from the images themselves:
# file_names = sorted([Path(f).stem for f in glob.glob("AFLW2000/*.jpg")])
#
# NOTE: the explanations above are real comments on purpose. A bare string
# literal is an expression, so a trailing one gets echoed as the cell's output,
# and a backslash path inside a normal string is parsed as an escape sequence.


' \nor you can use this line insted\nfile_names = sorted([Path(f).stem for f in glob.glob("..\\AFLW2000*.jpg")])\n'

In [4]:
len(file_names) # number of .mat annotation files found (the dataset ships one per image)

2000

Using MediaPipe we can generate the landmarks of the faces.

This means we can use it to extract the features from the images.


In [None]:
# First, let's try opening an image using cv2.
imagetest1 = cv2.imread('AFLW2000/image00053.jpg')  # read the image as a pixel array

# cv2.imread does not raise for a missing or unreadable file, it returns None,
# which would only surface later as an obscure error inside cv2.imshow.
if imagetest1 is None:
    raise FileNotFoundError("Could not read image: AFLW2000/image00053.jpg")

cv2.imshow('image test 1', imagetest1)  # display the image in a window
cv2.waitKey(0)  # wait for a key press before continuing

# closing all open windows
cv2.destroyAllWindows()


In [None]:
'''
 note: Any photo is only array of pixels and the pixel have 3 values express the color from 0 to 255
 i will show you this array

'''
# Display the raw pixel array that cv2.imread returned for the first test image.
imagetest1

In [None]:
# The first two numbers of the shape are the image size in pixels
# (first along the Y axis, then along the X axis).
imagetest1.shape

In [None]:
# A second image, to compare the number of pixels with the first one.
imagetest2 = cv2.imread('robert_downey_jr.jpg')

# cv2.imread returns None (instead of raising) when the file cannot be read;
# fail here with a clear message rather than inside cv2.imshow.
if imagetest2 is None:
    raise FileNotFoundError("Could not read image: robert_downey_jr.jpg")

cv2.imshow("image test 2", imagetest2)
cv2.waitKey(0)  # wait for a key press before continuing
cv2.destroyAllWindows()


In [None]:
# Display the raw pixel array of the second test image.
imagetest2

In [None]:
# Shape of the second test image, to compare against imagetest1.shape.
imagetest2.shape

In [None]:
# Obviously, the image that has more pixels is the bigger one
# Note: the first number is the size along the Y axis and the second is the size along the X axis

In [6]:
# face_mesh is the MediaPipe sub-module that exposes the functions needed for
# face detection and landmark estimation.
faceModule = mediapipe.solutions.face_mesh

# Start from fresh accumulators so that this cell can be re-run safely: its
# results are converted to NumPy arrays at the end, and arrays have no
# .append(), so appending to the previous run's results would fail.
detected_files = []    # images in which MediaPipe found a face
unreadable_files = []  # images cv2 could not load
x_rows = []            # per-image landmark x coordinates (pixels)
y_rows = []            # per-image landmark y coordinates (pixels)
label_rows = []        # per-image (pitch, yaw, roll)

# A single FaceMesh object serves the whole loop; building a new one for every
# image only repeats the model set-up. The `with` statement still guarantees
# that its resources are freed once the loop is done.
with faceModule.FaceMesh(static_image_mode=True) as faces:

    # looping over the file names to load the images and their corresponding mat file
    for filename in tqdm(file_names):

        # loading the image; cv2.imread returns None when the file cannot be read
        image = cv2.imread('AFLW2000/' + filename + '.jpg')
        if image is None:
            unreadable_files.append(filename)
            continue

        # Detect the face and generate its landmarks.
        # note: cv2 loads images in BGR order, so the image is converted to
        # RGB before it is handed to MediaPipe.
        results = faces.process(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))

        # multi_face_landmarks is None when no face could be detected;
        # such images are simply not used.
        if results.multi_face_landmarks is None:
            continue

        # landmarks of the (first) detected face
        face = results.multi_face_landmarks[0]

        # The landmark x and y values are scaled to the image width and height,
        # so multiply them back to get pixel positions. Only x and y are kept.
        height, width = image.shape[:2]
        X = np.array([int(landmark.x * width) for landmark in face.landmark])
        Y = np.array([int(landmark.y * height) for landmark in face.landmark])

        # The mat file is loaded as a dict; the first 3 entries of its
        # "Pose_Para" row are the three angles (pitch, yaw, roll) = the labels.
        mat_file = sio.loadmat('AFLW2000/' + filename + '.mat')
        pose_para = mat_file["Pose_Para"][0][:3]

        # Append everything together so that features, labels and file names
        # always stay aligned row by row.
        detected_files.append(filename)
        x_rows.append(X)
        y_rows.append(Y)
        label_rows.append(pose_para)

if unreadable_files:
    print("Skipped unreadable images:", len(unreadable_files))

# converting features and labels to 2D arrays (one row per detected image)
X_points = np.array(x_rows)
Y_points = np.array(y_rows)
labels = np.array(label_rows)

# the first label (pitch)
pitch_label = labels[:,0]
# the second label (yaw)
yaw_label = labels[:,1]
# the third label (roll)
roll_label = labels[:,2]
print(X_points.shape)
print(Y_points.shape)
print(labels.shape)

  0%|          | 0/2000 [00:00<?, ?it/s]

(1853, 468)
(1853, 468)
(1853, 3)


In [7]:
# Number of images in which MediaPipe detected a face.
len(detected_files)

1853

Now that we have the data (features and labels), let's preprocess it



**Preprocessing the data**


In [12]:
# Landmark indices used by the normalisation:
#   99         -> the point every face is centred on
#   10 and 171 -> the two points whose distance defines the scale of a face

# Translate every face so that landmark 99 becomes the origin.
Center_X = X_points - X_points[:, [99]]
Center_Y = Y_points - Y_points[:, [99]]

# Coordinates of the two reference landmarks, one value per image.
X_10, Y_10 = X_points[:, 10], Y_points[:, 10]
X_171, Y_171 = X_points[:, 171], Y_points[:, 171]

# Euclidean distance between landmark 10 and landmark 171, as a column vector
# so that it divides every landmark of the matching row.
offsets = np.vstack((X_10 - X_171, Y_10 - Y_171))
distance = np.linalg.norm(offsets, axis=0)[:, np.newaxis]

# Bring all faces to the same scale.
Norm_X, Norm_Y = Center_X / distance, Center_Y / distance

### if you want to choose specific columns from the data, select them here
Final_X, Final_Y = Norm_X, Norm_Y

for _part in (Final_X, Final_Y):
    print(_part.shape)

# concatenating the X and Y points side by side to form the complete features
features = np.concatenate((Final_X, Final_Y), axis=1)
print(features.shape)

(1853, 468)
(1853, 468)
(1853, 936)


In [30]:
import pandas as pd
# Wrap the feature matrix in a DataFrame and write it to features.csv
# (index=True also writes the row index as a column).
featuresDF=pd.DataFrame(features)
featuresDF.to_csv('features.csv',index=True)

**pitch ML model using Tpot**

In [31]:
# One single-column DataFrame per pose angle, built from the label vectors.
pitch_DF = pd.DataFrame(pitch_label)
yaw_DF = pd.DataFrame(yaw_label)
roll_DF = pd.DataFrame(roll_label)

# Save each of them to its own CSV file, without the row index.
for _csv_name, _angle_DF in (
    ('pitch.csv', pitch_DF),
    ('yaw.csv', yaw_DF),
    ('roll.csv', roll_DF),
):
    _angle_DF.to_csv(_csv_name, index=False)

In [53]:
# Display the pitch labels.
pitch_DF

Unnamed: 0,0
0,-0.399231
1,0.470065
2,-0.184650
3,-0.175379
4,-0.026812
...,...
1848,-0.306358
1849,-0.367547
1850,-0.156035
1851,-0.197102


In [54]:
# Display the yaw labels.
yaw_DF

Unnamed: 0,0
0,0.018227
1,1.189533
2,0.881137
3,0.299208
4,0.011965
...,...
1848,-0.283822
1849,-0.429723
1850,0.567114
1851,-0.070430


In [55]:
# Display the roll labels.
roll_DF

Unnamed: 0,0
0,0.085676
1,0.300959
2,-0.236852
3,-0.373374
4,-0.220662
...,...
1848,0.038554
1849,0.122791
1850,-0.108536
1851,0.105118


Starting with the ML models

In [34]:
# splitting the data: 20% of the rows are held out for testing, and the same
# seed is used for each of the three targets
from sklearn.model_selection import train_test_split

_split_options = {'test_size': 0.2, 'random_state': 20}

(x_train_pitch, x_test_pitch,
 y_train_pitch, y_test_pitch) = train_test_split(featuresDF, pitch_DF, **_split_options)

(x_train_yaw, x_test_yaw,
 y_train_yaw, y_test_yaw) = train_test_split(featuresDF, yaw_DF, **_split_options)

(x_train_roll, x_test_roll,
 y_train_roll, y_test_roll) = train_test_split(featuresDF, roll_DF, **_split_options)

In [36]:
# Hyper-parameter grid for the SVR grid searches: kernel type and C.
svr_parameters = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'C': [0.01, 0.1, 1, 10, 100],
}

# grid search pitch
svr = SVR()
svr_gs_pitch = GridSearchCV(estimator=svr, param_grid=svr_parameters)
svr_gs_pitch.fit(x_train_pitch, y_train_pitch)

# keep the best estimator found by the search
svr_winner_pitch = svr_gs_pitch.best_estimator_
print("Pitch Winner Model: ", svr_winner_pitch)

# mean absolute error on the training split, then on the held-out split
for _caption, _inputs, _targets in (
    ("Train Error: ", x_train_pitch, y_train_pitch),
    ("Validation Error: ", x_test_pitch, y_test_pitch),
):
    print(_caption, mean_absolute_error(svr_winner_pitch.predict(_inputs), _targets))

Pitch Winner Model:  SVR(C=100, kernel='poly')
Train Error:  0.09068371104077763
Validation Error:  0.09771809279218029


In [37]:
# grid search yaw
svr = SVR()
svr_gs_yaw = GridSearchCV(estimator=svr, param_grid=svr_parameters)
svr_gs_yaw.fit(x_train_yaw, y_train_yaw)

# keep the best estimator found by the search
svr_winner_yaw = svr_gs_yaw.best_estimator_
print("Yaw Winner Model: ", svr_winner_yaw)

# mean absolute error of the selected model on both splits
_yaw_train_pred = svr_winner_yaw.predict(x_train_yaw)
_yaw_test_pred = svr_winner_yaw.predict(x_test_yaw)
print("Train Error: ", mean_absolute_error(_yaw_train_pred, y_train_yaw))
print("Validation Error: ", mean_absolute_error(_yaw_test_pred, y_test_yaw))

Yaw Winner Model:  SVR(C=100, kernel='poly')
Train Error:  0.06833417314046158
Validation Error:  0.07244099823750216


In [38]:
# grid search roll
svr = SVR()
svr_gs_roll = GridSearchCV(estimator = svr,param_grid = svr_parameters)
svr_gs_roll.fit(x_train_roll, y_train_roll)

# keep the best estimator found by the search
svr_winner_roll = svr_gs_roll.best_estimator_
# This cell reports the roll model, so the message must say "Roll", not "Yaw".
print("Roll Winner Model: ",svr_winner_roll)

# mean absolute error on the training split and on the held-out split;
# arguments are given in scikit-learn's (y_true, y_pred) order
print("Train Error: ",mean_absolute_error(y_train_roll, svr_winner_roll.predict(x_train_roll)))
print("Validation Error: ",mean_absolute_error(y_test_roll, svr_winner_roll.predict(x_test_roll)))

Yaw Winner Model:  SVR(C=10)
Train Error:  0.07640400253374444
Validation Error:  0.07844118236957526


Draw the axis

In [58]:
def _as_float(value):
    """Return `value` as a plain Python float.

    Accepts Python/NumPy scalars as well as arrays that hold exactly one
    element (for example the shape-(1,) result of ``model.predict``). Letting
    ``math.cos`` or ``int`` convert such an array implicitly is deprecated for
    arrays with ndim > 0 since NumPy 1.25, so the conversion is done explicitly.

    Raises:
        ValueError: if `value` holds more than one element.
    """
    return float(np.asarray(value).item())


def draw_axis(img, pitch,yaw,roll, tdx=None, tdy=None, size = 100):
    """Draw the three head-pose axes on `img` and return it.

    Args:
        img: image array to draw on; it is modified in place.
        pitch, yaw, roll: pose angles in radians (they are passed to
            ``math.cos``/``math.sin``). Scalars or one-element arrays.
        tdx, tdy: pixel position of the axes' origin. When either one is
            None, the centre of the image is used.
        size: length factor of the drawn axes, in pixels.

    Returns:
        The same `img` object, with the X axis drawn in red, the Y axis in
        green and the Z axis in blue (colours are given in BGR order).
    """
    pitch = _as_float(pitch)
    yaw = -_as_float(yaw)
    roll = _as_float(roll)

    if tdx is not None and tdy is not None:
        tdx = _as_float(tdx)
        tdy = _as_float(tdy)
    else:
        # no origin given: fall back to the centre of the image
        height, width = img.shape[:2]
        tdx = width / 2
        tdy = height / 2

    # X-Axis pointing to right. drawn in red
    x1 = size * (cos(yaw) * cos(roll)) + tdx
    y1 = size * (cos(pitch) * sin(roll) + cos(roll) * sin(pitch) * sin(yaw)) + tdy

    # Y-Axis | drawn in green
    #        v
    x2 = size * (-cos(yaw) * sin(roll)) + tdx
    y2 = size * (cos(pitch) * cos(roll) - sin(pitch) * sin(yaw) * sin(roll)) + tdy

    # Z-Axis (out of the screen) drawn in blue
    x3 = size * (sin(yaw)) + tdx
    y3 = size * (-cos(yaw) * sin(pitch)) + tdy

    origin = (int(tdx), int(tdy))
    cv2.line(img, origin, (int(x1),int(y1)),(0,0,255),3)
    cv2.line(img, origin, (int(x2),int(y2)),(0,255,0),3)
    cv2.line(img, origin, (int(x3),int(y3)),(255,0,0),2)

    return img

Draw axis on image

In [96]:
def plot_axis_on_image(random_file = 'image00053'):
    """Show an AFLW2000 image with its labelled pose axes drawn on it.

    Args:
        random_file: base name (no extension) of the image / .mat pair inside
            the ``AFLW2000`` folder.

    Raises:
        FileNotFoundError: if the image cannot be read.
    """
    image_path = 'AFLW2000/'+random_file+'.jpg'
    immage = cv2.imread(image_path)
    # cv2.imread returns None instead of raising when the file cannot be read
    if immage is None:
        raise FileNotFoundError("Could not read image: " + image_path)

    # the first three values of "Pose_Para" are pitch, yaw and roll
    mat_file = sio.loadmat('AFLW2000/'+random_file+'.mat')
    pitch_label, yaw_label, roll_label = mat_file["Pose_Para"][0][:3]

    # draw_axis draws on the image in place and returns that same image, so
    # showing its result once is enough.
    cv2.imshow("show image",draw_axis(immage,pitch_label,yaw_label,roll_label))
    cv2.waitKey(0)
    cv2.destroyAllWindows()


In [118]:
def draw_axis_and_points(random_file = 'image00053'):
    """Show an AFLW2000 image with its face landmarks and labelled pose axes.

    The landmarks come from MediaPipe FaceMesh; the pose angles are read from
    the .mat file that belongs to `random_file`. Nothing is shown when no face
    is detected.

    Args:
        random_file: base name (no extension) of the image / .mat pair inside
            the ``AFLW2000`` folder.

    Raises:
        FileNotFoundError: if the image cannot be read.
    """
    image_path = 'AFLW2000/'+random_file+'.jpg'
    image = cv2.imread(image_path)
    # cv2.imread returns None instead of raising when the file cannot be read
    if image is None:
        raise FileNotFoundError("Could not read image: " + image_path)

    faceModule = mediapipe.solutions.face_mesh
    with faceModule.FaceMesh(static_image_mode=True) as faces:
        results = faces.process(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
        if results.multi_face_landmarks is None:
            return

        # note: the x and y values are scaled to the image width and height,
        # so multiply them back to get their actual position in the image
        height, width = image.shape[:2]
        for face in results.multi_face_landmarks:
            for landmark in face.landmark:
                relative_x = int(landmark.x * width)
                relative_y = int(landmark.y * height)
                cv2.circle(image, (relative_x, relative_y), radius=1, color=(0, 255, 0), thickness=2)

        # Use the labels of THIS image; the notebook-level pitch_label /
        # yaw_label / roll_label variables hold the labels of the whole data set.
        mat_file = sio.loadmat('AFLW2000/'+random_file+'.mat')
        pitch, yaw, roll = mat_file["Pose_Para"][0][:3]

        cv2.imshow("show image",draw_axis(image,pitch,yaw,roll))
        cv2.waitKey(0)
        cv2.destroyAllWindows()

In [None]:
def draw_axis_and_points_pred(random_file = 'image00053', pitch_model=None, yaw_model=None, roll_model=None):
    """Show an AFLW2000 image with its face landmarks and PREDICTED pose axes.

    The landmarks of the first detected face are normalised the same way as
    the training features (centred on landmark 99, scaled by the distance
    between landmarks 10 and 171) and fed to the three regressors. Nothing is
    shown when no face is detected.

    Args:
        random_file: base name (no extension) of the image inside ``AFLW2000``.
        pitch_model, yaw_model, roll_model: fitted regressors with a
            ``predict`` method. When omitted, the notebook-level
            ``svr_winner_pitch`` / ``svr_winner_yaw`` / ``svr_winner_roll``
            models are used.

    Raises:
        FileNotFoundError: if the image cannot be read.
    """
    if pitch_model is None:
        pitch_model = svr_winner_pitch
    if yaw_model is None:
        yaw_model = svr_winner_yaw
    if roll_model is None:
        roll_model = svr_winner_roll

    image_path = 'AFLW2000/'+random_file+'.jpg'
    image = cv2.imread(image_path)
    # cv2.imread returns None instead of raising when the file cannot be read
    if image is None:
        raise FileNotFoundError("Could not read image: " + image_path)

    faceModule = mediapipe.solutions.face_mesh
    with faceModule.FaceMesh(static_image_mode=True) as faces:
        results = faces.process(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
        if results.multi_face_landmarks is None:
            return

        # note: the x and y values are scaled to the image width and height,
        # so multiply them back to get their actual position in the image
        height, width = image.shape[:2]

        # pixel coordinates of the first face: the input of the models
        first_face = results.multi_face_landmarks[0]
        X = np.array([int(landmark.x * width) for landmark in first_face.landmark])
        Y = np.array([int(landmark.y * height) for landmark in first_face.landmark])

        # mark every landmark of every detected face on the image
        for face in results.multi_face_landmarks:
            for landmark in face.landmark:
                relative_x = int(landmark.x * width)
                relative_y = int(landmark.y * height)
                cv2.circle(image, (relative_x, relative_y), radius=1, color=(0, 255, 0), thickness=2)

        # same centring / scaling as the training features
        scale = np.linalg.norm(np.array((X[10], Y[10])) - np.array((X[171], Y[171])))
        features = np.hstack([(X - X[99]) / scale, (Y - Y[99]) / scale]).reshape(1, -1)

        # predict() returns a one-element array for a single sample
        pitch = float(pitch_model.predict(features)[0])
        yaw = float(yaw_model.predict(features)[0])
        roll = float(roll_model.predict(features)[0])

        cv2.imshow("show image",draw_axis(image,pitch,yaw,roll))
        cv2.waitKey(0)
        cv2.destroyAllWindows()

Test plotting the axis and points

In [198]:
# Pick one image and show it twice: first with the pose axis only, then with
# the face landmarks and the pose axis.
random_file = 'image00052'
plot_axis_on_image(random_file)
draw_axis_and_points(random_file)

Function to get the features for a specific image

In [170]:
def get_features(random_file = 'image00053'): # for specific image
    """Compute the normalised landmark features of one AFLW2000 image.

    The landmarks of the first face MediaPipe detects are converted to pixel
    positions, centred on landmark 99 and divided by the distance between
    landmark 10 and landmark 171.

    Args:
        random_file: base name (no extension) of the image inside ``AFLW2000``.

    Returns:
        A tuple ``(features, tdx, tdy)``:
        features is a one-row 2-D array holding all normalised x values
        followed by all normalised y values; tdx and tdy are one-element
        arrays with the pixel position of landmark 99.

    Raises:
        FileNotFoundError: if the image cannot be read.
        ValueError: if no face is detected (previously the function fell
            through and returned None, which broke tuple unpacking in callers).
    """
    image_path = 'AFLW2000/'+random_file+'.jpg'
    image = cv2.imread(image_path)
    # cv2.imread returns None instead of raising when the file cannot be read
    if image is None:
        raise FileNotFoundError("Could not read image: " + image_path)

    faceModule = mediapipe.solutions.face_mesh
    with faceModule.FaceMesh(static_image_mode=True) as faces: # creating object from FaceMesh class
        results = faces.process(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
        if results.multi_face_landmarks is None:
            raise ValueError("No face detected in image: " + image_path)

        face = results.multi_face_landmarks[0]
        # landmark x / y are scaled to the image width / height: back to pixels
        height, width = image.shape[:2]
        X = [int(landmark.x * width) for landmark in face.landmark]
        Y = [int(landmark.y * height) for landmark in face.landmark]

    # one-row matrices, the same layout as the training data
    X_points = np.array(X).reshape(1,-1)
    Y_points = np.array(Y).reshape(1,-1)

    # centering the data around the 99th point
    Center_X = X_points - X_points[:,99].reshape(-1,1)
    Center_Y = Y_points - Y_points[:,99].reshape(-1,1)

    # normalizing the data to be in the same scale by dividing over the distance between point 10 and point 171
    X_171 = X_points[:,171]
    X_10 = X_points[:,10]
    Y_171 = Y_points[:,171]
    Y_10 = Y_points[:,10]

    distance = np.linalg.norm(np.array((X_10,Y_10)) - np.array((X_171,Y_171)),axis = 0).reshape(-1,1)
    Norm_X = Center_X / distance
    Norm_Y = Center_Y / distance

    # pixel position of landmark 99
    tdx=X_points[:,99]
    tdy=Y_points[:,99]

    features = np.hstack([Norm_X,Norm_Y])

    return features,tdx,tdy



Test drawing the predicted axis on an image

In [186]:
random_file = 'image00002'
img_feture,tdx,tdy=get_features(random_file)

# predict() returns a one-element array for a single sample, and tdx / tdy are
# one-element arrays too. Take the scalars explicitly instead of relying on
# the implicit array-to-scalar conversion, which NumPy has deprecated.
pitch_pred=svr_winner_pitch.predict(img_feture)[0]

yaw_pred=svr_winner_yaw.predict(img_feture)[0]

roll_pred=svr_winner_roll.predict(img_feture)[0]

image = cv2.imread('AFLW2000/'+random_file+'.jpg')
# draw the predicted axis with its origin on the landmark returned by get_features
cv2.imshow("show image",draw_axis(image,pitch_pred,yaw_pred,roll_pred,tdx[0],tdy[0]))
cv2.waitKey(0)
cv2.destroyAllWindows()

In [193]:
class Queue:
    """Bounded first-in-first-out queue backed by a Python list.

    Attributes:
        queue: the stored items, oldest first.
        length: number of items currently stored.
        max_size: maximum number of items the queue accepts.
    """

    # defining the constructor
    def __init__(self,max_size):
        self.queue = []
        self.length = 0
        self.max_size = max_size

    # adding values to the queue
    def enqueue(self,x):
        """Append `x`; when the queue is already full, print a message and drop `x`."""
        if self.length < self.max_size:
            self.queue.append(x)
            self.length += 1
        else:
            print("You have reached the maximum size")

    # removing values from the queue
    def dequeue(self):
        """Remove and return the oldest item; print a message and return None when empty."""
        if self.queue:
            removed = self.queue.pop(0)
            self.length -= 1
            return removed
        print("Queue is Empty")
        return None

    # checking if the queue is full
    def IsFull(self):
        """Return True when the queue holds max_size items, otherwise False."""
        # always a real bool (the method used to return None when not full)
        return self.length >= self.max_size

    # printing the queue values
    def print_queue(self):
        """Print every stored item on its own line, oldest first."""
        for item in self.queue:
            print(item)

In [241]:
# Create a VideoCapture object and read from input file
def Create_TestVideo(pitch_model,yaw_model,roll_model,smoothing = False,size = 30,
                     video_path = 'AFLW2000/test.mp4',output_path = 'out.mp4',fps = 20):
    """Draw the predicted head-pose axes on a video and save the result.

    Every frame in which MediaPipe detects a face gets its landmarks
    normalised like the training features (centred on landmark 99, scaled by
    the distance between landmarks 10 and 171), the three angles predicted,
    and the axes drawn at landmark 1. Frames without a detected face are not
    written to the output.

    Args:
        pitch_model, yaw_model, roll_model: fitted regressors with a
            ``predict`` method.
        smoothing: when true, the drawn angles are the mean of the last
            `size` predictions instead of the current prediction.
        size: number of predictions kept for the smoothing.
        video_path: input video file.
        output_path: file the annotated video is written to.
        fps: frame rate of the written video.

    Raises:
        ValueError: if smoothing is requested with `size` < 1.
    """
    if smoothing and size < 1:
        raise ValueError("size must be at least 1 when smoothing is enabled")

    cap = cv2.VideoCapture(video_path)

    # Check if the video opened successfully; there is nothing to process otherwise
    if not cap.isOpened():
        print("Error opening video file")
        cap.release()
        return

    width= int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height= int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

    pitch_queue = Queue(max_size = size)
    yaw_queue = Queue(max_size = size)
    roll_queue = Queue(max_size = size)

    # Frames are written as soon as they are annotated instead of being
    # collected in a list first, so the whole video never has to fit in memory.
    # NOTE(review): the 'DIVX' codec tag is kept from the original code -
    # confirm it suits the chosen output container.
    out = cv2.VideoWriter(output_path,cv2.VideoWriter_fourcc(*'DIVX'), fps, (width,height))
    detected_frames = 0

    # resolved locally so the function does not depend on a notebook-level variable
    face_mesh_module = mediapipe.solutions.face_mesh

    try:
        # one FaceMesh object for the whole video instead of one per frame
        with face_mesh_module.FaceMesh(static_image_mode=True) as face_mesh:
            # Read until video is completed
            while True:
                # Capture frame-by-frame
                ret, frame = cap.read()
                if not ret:
                    break

                # detect the face and generate its landmarks
                results = face_mesh.process(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
                if results.multi_face_landmarks is None:
                    continue

                frame_height, frame_width = frame.shape[:2]
                for face in results.multi_face_landmarks:
                    # landmark x / y are scaled to the frame size: back to pixels
                    X = np.array([int(landmark.x * frame_width) for landmark in face.landmark])
                    Y = np.array([int(landmark.y * frame_height) for landmark in face.landmark])

                    # centering the data around the point 99 and scaling by
                    # the distance between points 171 and 10
                    d = np.linalg.norm(np.array((X[171],Y[171])) - np.array((X[10],Y[10])))
                    X_norm = (X - X[99]) / d
                    Y_norm = (Y - Y[99]) / d
                    points = np.hstack([X_norm,Y_norm]).reshape(1,-1)

                    # predicting the 3 angles; predict() returns a one-element
                    # array for a single sample, so take the scalar
                    pred_pitch = pitch_model.predict(points)[0]
                    pred_yaw = yaw_model.predict(points)[0]
                    pred_roll = roll_model.predict(points)[0]

                    if smoothing:
                        # keep only the last `size` predictions of each angle
                        for angle_queue, prediction in ((pitch_queue, pred_pitch),
                                                        (yaw_queue, pred_yaw),
                                                        (roll_queue, pred_roll)):
                            if angle_queue.IsFull():
                                angle_queue.dequeue()
                            angle_queue.enqueue(prediction)

                        # draw the mean of the stored predictions
                        pitch = sum(pitch_queue.queue)/len(pitch_queue.queue)
                        yaw = sum(yaw_queue.queue)/len(yaw_queue.queue)
                        roll = sum(roll_queue.queue)/len(roll_queue.queue)
                        draw_axis(frame,pitch,yaw,roll,X[1],Y[1])
                    else:
                        draw_axis(frame,pred_pitch,pred_yaw,pred_roll,X[1],Y[1])

                    # writing the result frame to the output video
                    out.write(frame)
                    detected_frames += 1
    finally:
        # release the video resources even if a frame fails half-way
        cap.release()
        out.release()
        # Closes all the frames
        cv2.destroyAllWindows()

    print("Number of Detected Frames = ",detected_frames)

In [242]:
# Annotate the test video with the three selected SVR models, smoothing the
# drawn angles over the last 25 predictions.
Create_TestVideo(svr_winner_pitch,svr_winner_yaw,svr_winner_roll,smoothing =True,size = 25)

Number of Detected Frames =  749
