# In [2]:
import numpy as np
import cv2
import time
import matplotlib.pyplot as plt
import glob
import mediapipe as mp
from constants import LIPS_POSITIONS

# In [1]:
class LandmarkExtractor:
    """Detect hand and face landmarks with MediaPipe and draw them on frames.

    The instance owns one MediaPipe ``Hands`` model and one ``FaceMesh``
    model, both created with ``static_image_mode=False``, plus the drawing
    helpers and drawing specs used by :meth:`drawLandmarks`.
    """

    def __init__(self):
        """Create the hand and face models and the drawing specifications."""
        self.mpHands = mp.solutions.hands  # Load mediapipe hands module
        # The module must be reached through ``self``; there is no bare
        # ``mpHands`` name in scope.
        self.hands = self.mpHands.Hands(  # Initialize hands model
            max_num_hands=2,
            model_complexity=1,
            min_detection_confidence=0.5,
            min_tracking_confidence=0.5,
            static_image_mode=False)

        self.mpFace = mp.solutions.face_mesh  # Load mediapipe face module
        self.faces = self.mpFace.FaceMesh(  # Initialize face model
            min_detection_confidence=0.5,
            min_tracking_confidence=0.5,
            static_image_mode=False)

        # Both drawing attributes refer to the same mediapipe drawing module;
        # they are kept separate so existing callers of either name still work.
        self.mpDrawHands = mp.solutions.drawing_utils
        self.mpDrawFace = mp.solutions.drawing_utils
        self.mp_drawing_styles = mp.solutions.drawing_styles
        # Drawing specification for face landmarks.
        self.mp_drawing_face = self.mpDrawFace.DrawingSpec(
            color=(0, 0, 200), thickness=0, circle_radius=1)
        # Drawing specification for hand landmarks.
        self.mp_drawing_hands = self.mpDrawHands.DrawingSpec(
            color=(255, 0, 0), thickness=0, circle_radius=1)

    def findHands(self, img):
        """Run the hands model on a BGR image and return its raw result."""
        imgRGB = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # Transform to RGB
        return self.hands.process(imgRGB)

    def findFace(self, img):
        """Run the face-mesh model on a BGR image and return its raw result."""
        imgRGB = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # Transform to RGB
        return self.faces.process(imgRGB)

    def __getCoordinates(self, landmarks, index, scale, img_size):
        """Return ``(x, y, z)`` of ``landmarks.landmark[index]``.

        When ``scale`` is truthy, ``x`` is multiplied by ``img_size[0]`` and
        ``y`` by ``img_size[1]``; ``z`` is always returned unchanged.
        """
        point = landmarks.landmark[index]
        x, y, z = point.x, point.y, point.z
        if scale:
            x = x * img_size[0]
            y = y * img_size[1]
        return x, y, z

    def getLipsLandmarks(self, resultsFace, scale=False, img_size=(700, 720)):
        """Return one averaged point per entry of ``LIPS_POSITIONS``.

        Each entry of ``LIPS_POSITIONS`` is treated as a pair of landmark
        indices. For every pair the x and y coordinates of the two landmarks
        are averaged; the z value is taken from the first landmark of the
        pair only. Only the first detected face is used.

        Args:
            resultsFace: Result object returned by :meth:`findFace`.
            scale: If true, multiply x by ``img_size[0]`` and y by
                ``img_size[1]`` before averaging.
            img_size: Scale factors for x and y, used only when ``scale``
                is true.

        Returns:
            A list of ``(avg_x, avg_y, z)`` tuples, or an empty list when
            ``resultsFace`` contains no face landmarks.
        """
        # No face detected: there is nothing to index, so report no points
        # instead of failing on ``None[0]``.
        if not resultsFace.multi_face_landmarks:
            return []

        # The first face is the same for every pair; look it up once.
        first_face = resultsFace.multi_face_landmarks[0]

        list_lips_positions = []
        for cord in LIPS_POSITIONS:
            x1, y1, z1 = self.__getCoordinates(first_face, cord[0], scale, img_size)
            x2, y2, _ = self.__getCoordinates(first_face, cord[1], scale, img_size)

            avg_x = float((x1 + x2) / 2)
            avg_y = float((y1 + y2) / 2)

            # z is intentionally left as the first landmark's value, matching
            # the established output of this method.
            list_lips_positions.append((avg_x, avg_y, z1))
        return list_lips_positions

    def drawLandmarks(self, img, resultsFace, resultsHands):
        """Return a copy of ``img`` with face and hand landmarks drawn on it.

        Args:
            img: Image to annotate; it is copied, never modified in place.
            resultsFace: Result object returned by :meth:`findFace`.
            resultsHands: Result object returned by :meth:`findHands`.

        Returns:
            The annotated copy. If neither result holds landmarks the copy
            is returned unchanged.
        """
        img = img.copy()
        if resultsFace.multi_face_landmarks:
            for face_landmarks in resultsFace.multi_face_landmarks:
                self.mpDrawFace.draw_landmarks(  # Draw face landmarks
                    image=img,
                    landmark_list=face_landmarks,
                    connections=self.mpFace.FACEMESH_CONTOURS,
                    landmark_drawing_spec=self.mp_drawing_face,
                    connection_drawing_spec=self.mp_drawing_styles.get_default_face_mesh_tesselation_style())

        if resultsHands.multi_hand_landmarks:
            for handlms in resultsHands.multi_hand_landmarks:
                # Use the instance attribute; a bare ``mpDrawHands`` is not
                # defined anywhere.
                self.mpDrawHands.draw_landmarks(  # Draw hand landmarks
                    image=img,
                    landmark_list=handlms,
                    connections=self.mpHands.HAND_CONNECTIONS,
                    landmark_drawing_spec=self.mp_drawing_hands)
        return img

# In [None]:
class VideoLoader:
    """Read a video into a list of cropped frames.

    Every other frame is kept and cropped to columns 300..999. When an
    output path is given, a copy of each kept frame with face and hand
    landmarks drawn on it is also written to an mp4 file.
    """

    # Parameters of the annotated output video.
    OUTPUT_FPS = 15
    # NOTE(review): (width, height) passed to cv2.VideoWriter; presumably it
    # must match the cropped frames (700 columns wide) — confirm that source
    # videos are 720 px tall.
    OUTPUT_SIZE = (700, 720)

    def __init__(self):
        """Create the mp4 codec code and the landmark extractor."""
        self.fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        self.landmark_extractor = LandmarkExtractor()

    def loadVideo(self, path, output_path=None):
        """Load the video at ``path`` and return its kept, cropped frames.

        Args:
            path: Path of the video to read.
            output_path: Optional path of an annotated video to write. When
                ``None`` no video is written and no landmark detection is
                run, since the detections are only used for drawing.

        Returns:
            A list with every other frame of the video, each cropped to
            columns 300..999. The list is empty if no frame could be read.
        """
        cap = cv2.VideoCapture(path)
        out = None
        frames = []
        use_frame = True

        try:
            if output_path is not None:
                out = cv2.VideoWriter(
                    output_path, self.fourcc, self.OUTPUT_FPS, self.OUTPUT_SIZE)

            while True:
                ret, frame = cap.read()  # ret is False once no frame is left
                if not ret:
                    break

                if use_frame:  # Only every other frame is processed
                    # Crop the columns, keeping all rows and colour channels.
                    frame = frame[:, 300:1000, :]
                    frames.append(frame)

                    if out is not None:
                        resultsFace = self.landmark_extractor.findFace(frame)
                        resultsHands = self.landmark_extractor.findHands(frame)
                        # drawLandmarks works on its own copy, so the frame
                        # stored in ``frames`` stays unannotated.
                        out.write(self.landmark_extractor.drawLandmarks(
                            frame, resultsFace, resultsHands))

                use_frame = not use_frame
        finally:
            # Release the reader and writer even if processing fails midway.
            cap.release()
            if out is not None:
                out.release()  # Close the writing stream

        return frames