In [2]:
import cv2
from facenet_pytorch import MTCNN
import face_recognition

from pytube import YouTube
import cv2
import numpy as np
import pandas as pd
from keras.models import load_model

class VideoProcessor:
    def __init__(self, yt_link, batch_size=50, skip_frames=1, video_output = False):
        self.yt_link = yt_link
        self.batch_size = batch_size
        self.skip_frames = skip_frames
        self.video_output = video_output
        self.frames_without_faces = 0
        self.video_emotions_by_frame = []
        self.video_info_by_frame = []

        self.load_video_from_youtube()


    def load_video_from_youtube(self):

        # Download the YouTube video and get the highest resolution stream
        yt_video = YouTube(self.yt_link)
        stream = yt_video.streams.get_highest_resolution()

        # Open the video stream using OpenCV
        self.video = cv2.VideoCapture(stream.url)

        # Get available and used resolution - Debugging
        #self.available_resolutions = [streams.resolution for streams in yt_video.streams.filter(type="video", progressive=True)]
        #self.used_resolution = stream.resolution

        # Get the number of frames in the video
        self.frame_count = int(self.video.get(cv2.CAP_PROP_FRAME_COUNT))

        # Get the frame rate of the video
        self.fps = int(self.video.get(cv2.CAP_PROP_FPS))

        # Get the height and width of the video frames
        self.height = int(self.video.get(cv2.CAP_PROP_FRAME_HEIGHT))
        self.width = int(self.video.get(cv2.CAP_PROP_FRAME_WIDTH))

        # Calculate the total number of frames to process after skipping frames
        self.total_frames = len(range(0, self.frame_count, self.skip_frames))

        # Calculate the number of batches required to process all the frames
        self.num_batches = int(np.ceil(self.total_frames / self.batch_size))

    def get_batches(self):
        # Initialize an empty numpy array to hold the frames
        frames = np.empty((self.batch_size, self.height, self.width, 3), np.dtype('uint8'))

        self.frames_read = 0

        # Read the frames in batches and fill up the numpy array
        for batch_start in range(0, self.frame_count, self.batch_size * self.skip_frames):
            batch_end = min(batch_start + (self.batch_size * self.skip_frames), self.frame_count)
            batch_index = 0

            for i in range(batch_start, batch_end):
                ret, frame = self.video.read()
                self.frames_read +=1
                if not ret:
                    break

                if i % self.skip_frames == 0:
                    frames[batch_index] = frame
                    batch_index += 1

            # Resize the numpy array to fit the actual number of frames in the batch
            if batch_index < self.batch_size:
                frames = frames[:batch_index]

            # Yield the current batch of frames
            yield frames

        # Release the video stream
        self.video.release()
    
    def get_overview_df(self, plottable = False):

        frame_info_cols = ['frame', 'person_ID']

        df_emotions = pd.DataFrame(self.video_emotions_by_frame, columns=EmotionDetector.get_emotion_categories())
        df_frame_info = pd.DataFrame(self.video_info_by_frame, columns=frame_info_cols)
        df = pd.concat([df_emotions, df_frame_info], axis=1)
        
        if plottable:
            df = pd.melt(df, id_vars=frame_info_cols, value_vars=EmotionDetector.get_emotion_categories(), var_name='emotion', value_name='probability')
        return df

class FaceDetector:
    def __init__(self, frame_width, frame_height, detection_type="face_recognition", offset = 1.2):
        self.frame_width = frame_width
        self.frame_height = frame_height
        self.detection_type = detection_type
        self.offset = offset

        if self.detection_type == "MTCNN":
            # Initialize the MTCNN face detector
            self.face_detector = MTCNN(keep_all=True, post_process=False, margin=20)
        elif self.detection_type == "haarcascade":
            # Load the Haar Cascade face detector
            self.face_detector = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
        elif self.detection_type == "face_recognition":
            # Package face_detector doesn't require object initialization
            self.face_detector = None
        else: 
            raise ValueError("Choose one of the implemented models, ya cunt!")


    def detect_faces(self, frame):
        if self.detection_type == 'haarcascade':
            frame_preprocessed = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            boxes = self.face_detector.detectMultiScale(frame_preprocessed, scaleFactor=1.3, minNeighbors=3)
            
        elif self.detection_type == 'MTCNN':
            # No preprocessing needed
            boxes, _ = self.face_detector.detect(frame)
            
        elif self.detection_type == 'face_recognition':
            frame_preprocessed = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            # Package face_detector directly return boxes without object initialization
            boxes = face_recognition.face_locations(frame_preprocessed, number_of_times_to_upsample=1)
        
        if (boxes is None) or (len(boxes) == 0):
            return [] 

        # Convert boxes to normalized format
        norm_boxes = self.get_norm_boxes(boxes)

        valid_boxes = []
        for box in norm_boxes: 
            box_rearranged = np.array([box[0],box[3], box[1], box[2]])
            #box_rearranged = [box[0],box[3], box[1], box[2]]     <--- Look again inside maybe
            face_landmark = face_recognition.face_landmarks(frame, [box_rearranged])[0]
            if FaceDetector.valid_landmarks(face_landmark):
                # check for squared
                box = self.augment_box(box, self.frame_width, self.frame_height, self.offset)
                valid_boxes.append(box) 

        return valid_boxes

    def get_norm_boxes(self, boxes):
        """
        Normalize the bounding box coordinates from MTCNN to numpy indexing format.
        Output format: np.array(y_min, y_max, x_min, x_max)
        """
        normalized_box = []

        if self.detection_type == 'haarcascade':
            for box in boxes:
                x, y, w, h = box
                normalized_box.append([y, y+h, x, x+w])
            return np.array(normalized_box)    

        elif self.detection_type == "MTCNN":
            for box in boxes:
                x_min, y_min, x_max, y_max = box.astype(int)
                normalized_box.append([y_min, y_max, x_min, x_max])
            return np.array(normalized_box)

        elif self.detection_type =='face_recognition':
            for box in boxes:
                y_min, x_max, y_max, x_min = box
                normalized_box.append([y_min, y_max, x_min, x_max])
            return np.array(normalized_box)    
    
    @staticmethod
    def valid_landmarks(face_landmark):
        left_eye = face_landmark['left_eye']
        right_eye = face_landmark['right_eye']
        mouth = face_landmark['top_lip'] + face_landmark['bottom_lip']

        left_eye_center = np.mean(left_eye, axis=0)
        right_eye_center = np.mean(right_eye, axis=0)
        mouth_center = np.mean(mouth, axis=0)

        eye_distance = np.linalg.norm(right_eye_center - left_eye_center)
        mouth_to_eye_distance = np.linalg.norm(mouth_center - left_eye_center)

        return mouth_to_eye_distance > eye_distance * 0.8

    @staticmethod
    def augment_box(box, frame_width, frame_height, offset=1.2):
        y_min, y_max, x_min, x_max = box
        box_height = y_max - y_min
        box_width = x_max - x_min

        # Check if result will be too large to fit into frame, return original box if so
        max_side = max(box_height, box_width)
        if max_side * offset > min(frame_width, frame_height):
            return box

        # Calculate middle point of rectangle, redefine corner points from there by multiplying offset
        center_x = (x_min + x_max) // 2
        center_y = (y_min + y_max) // 2
        
        y_min = center_y - max_side * offset / 2
        y_max = center_y + max_side * offset / 2
        x_min = center_x - max_side * offset / 2
        x_max = center_x + max_side * offset / 2

        # Move box back into frame if scaled box lays outside some side
        # As chack for too large box already was done in step 1, we can use elif
        if x_min < 0 or y_min < 0 or x_max > frame_width or y_max > frame_height:
            x_offset, y_offset = 0, 0
            if x_min < 0:
                x_offset = -x_min
            elif x_max > frame_width:
                x_offset = frame_width - x_max
            if y_min < 0:
                y_offset = -y_min
            elif y_max > frame_height:
                y_offset = frame_height - y_max
            y_min += y_offset
            y_max += y_offset
            x_min += x_offset
            x_max += x_offset

        return [int(y_min), int(y_max), int(x_min), int(x_max)]            

class EmotionDetector:
    def __init__(self):
        self.emotion_detector = load_model("../models/emotion_model.hdf5", compile=False)
        self.emotion_categories = self.get_emotion_categories()
    
    def predict(self, frame, box):

        gray_cropped_face = EmotionDetector.crop_face(box, frame)

        prob = self.emotion_detector.predict(gray_cropped_face)[0]  # check for underscore
        return prob
    
    @staticmethod
    def crop_face(box, frame):
        face = frame[box[0]:box[1], box[2]:box[3]]
        face = cv2.cvtColor(face, cv2.COLOR_BGR2GRAY)
        face = cv2.resize(face, (64, 64))
        face = face.astype('float32')/ 255
        face = np.expand_dims(face, axis=-1)
        face = np.expand_dims(face, axis=0)

        return face
    
    @staticmethod
    def get_emotion_categories():
        return ['Angry', 'Disgust', 'Fear', 'Happy', 'Sad', 'Surprise', 'Neutral']
    
class PersonIdentifier:
    def __init__(self, threshold = 0.6):
        self.threshold = threshold
        self.individual_id_counter = 0
        self.known_face_encodings = {}

    def assignID(self, frame, box):
        box_rearranged = np.array([box[0],box[3], box[1], box[2]])
        #box_rearranged = [box[0],box[3], box[1], box[2]]     <--- Look again inside maybe
        face_encoding = face_recognition.face_encodings(frame, [box_rearranged])[0]
        min_distance = float("inf")
        matched_individual_id = None

        # Potential for: median face calculation & Passing the list

        for individual_id, individual_reference_face_encoding in self.known_face_encodings.items():
            distance = face_recognition.face_distance([individual_reference_face_encoding], face_encoding)[0]
            if distance < self.threshold: # Adjust this threshold value based on your requirements
                if distance < min_distance:
                    min_distance = distance
                    matched_individual_id = individual_id
        
        if matched_individual_id is None:
            self.individual_id_counter += 1
            matched_individual_id = self.individual_id_counter
            self.known_face_encodings[self.individual_id_counter] = face_encoding

        return matched_individual_id      



  from .autonotebook import tqdm as notebook_tqdm


In [6]:
#yt_link = 'https://www.youtube.com/watch?v=vtT78TfDfXU'                   # 1 Actor
#yt_link = 'https://www.youtube.com/watch?v=embYkODkzcs'                 # 7 basic emotions
#yt_link = 'https://www.youtube.com/watch?v=m70UInZKJjU'                    # Two persons
yt_link = 'https://www.youtube.com/watch?v=UECCHwh7bZE'

my_test = VideoProcessor(yt_link, skip_frames=50, batch_size=10000, video_output = True)

my_face_detector = FaceDetector(detection_type='face_recognition', 
                                frame_width = my_test.width, 
                                frame_height = my_test.height,
                                offset = 1)
my_emotion_detector = EmotionDetector()
my_person_identifier = PersonIdentifier()

frame_nr = 0

if my_test.video_output == False:
    for idx, batch in enumerate(my_test.get_batches()):
        for frame in batch:
            face_boxes = my_face_detector.detect_faces(frame)
            
            if len(face_boxes) == 0:
                my_test.frames_without_faces += 1
                continue
            
            for box in face_boxes: 
                my_test.video_emotions_by_frame.append(my_emotion_detector.predict(frame, box))
                my_test.video_info_by_frame.append((frame_nr, my_person_identifier.assignID(frame, box)))
            frame_nr +=1

else:
    fourcc = cv2.VideoWriter_fourcc(*'XVID')    
    writer =  cv2.VideoWriter('Output_video_new_Off1.mp4', fourcc, 10, (my_test.width, my_test.height)) 
    
    for idx, batch in enumerate(my_test.get_batches()):
        for frame in batch:
            face_boxes = my_face_detector.detect_faces(frame)
            
            if len(face_boxes) == 0:
                my_test.frames_without_faces += 1
                continue
            
            for box in face_boxes: 
                prob = my_emotion_detector.predict(frame, box)
                character_id = my_person_identifier.assignID(frame, box)

                my_test.video_emotions_by_frame.append(prob)
                my_test.video_info_by_frame.append((frame_nr, character_id))
            
                max_emotion, max_prob = np.argmax(prob), np.max(prob)
                emotion_text = EmotionDetector.get_emotion_categories()[max_emotion]
        
                y_min, y_max, x_min, x_max = box
        
                cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), (0, 255, 0), 2)
                cv2.putText(frame, f"Prob: {max_prob:.1%}", (x_min, y_max + 60), cv2.FONT_HERSHEY_DUPLEX, 1, (0, 0, 255), 1)
                cv2.putText(frame, f"{emotion_text}", (x_min, y_max + 30), cv2.FONT_HERSHEY_DUPLEX, 1, (0, 0, 255), 1)
                cv2.putText(frame, f"ID: {character_id}", (x_min, y_min -20), cv2.FONT_HERSHEY_DUPLEX, 1, (0, 0, 255), 1)
            
            
            writer.write(frame)
            frame_nr +=1
    
    writer.release() 

my_df = my_test.get_overview_df(plottable=False)           

OpenCV: FFMPEG: tag 0x44495658/'XVID' is not supported with codec id 12 and format 'mp4 / MP4 (MPEG-4 Part 14)'
OpenCV: FFMPEG: fallback to use tag 0x7634706d/'mp4v'




2023-03-26 18:26:13.297770: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




In [20]:
my_test.__dict__

{'yt_link': 'https://www.youtube.com/watch?v=UECCHwh7bZE',
 'batch_size': 10000,
 'skip_frames': 25,
 'video_output': True,
 'frames_without_faces': 0,
 'video_emotions_by_frame': [],
 'video_info_by_frame': []}

In [21]:
my_df

Unnamed: 0,Angry,Disgust,Fear,Happy,Sad,Surprise,Neutral,frame,person_ID
0,0.020894,0.000024,0.068198,0.145649,0.072939,0.000987,0.691310,0,1
1,0.062610,0.000774,0.111277,0.144467,0.205002,0.027280,0.448589,0,2
2,0.026876,0.000883,0.021906,0.752188,0.026912,0.004777,0.166458,1,3
3,0.025669,0.000347,0.260382,0.071585,0.135191,0.000511,0.506314,1,1
4,0.041262,0.002269,0.147834,0.048683,0.548283,0.006140,0.205529,1,2
...,...,...,...,...,...,...,...,...,...
248,0.078556,0.003998,0.523000,0.054074,0.138481,0.005457,0.196434,89,46
249,0.051000,0.053365,0.087438,0.580437,0.042181,0.040655,0.144925,89,3
250,0.504933,0.005568,0.028646,0.013052,0.014140,0.372410,0.061250,89,7
251,0.033859,0.000117,0.537065,0.025003,0.105364,0.003690,0.294901,90,6


In [22]:
my_test.get_overview_df(True)

Unnamed: 0,frame,person_ID,emotion,probability
0,0,1,Angry,0.020894
1,0,2,Angry,0.062610
2,1,3,Angry,0.026876
3,1,1,Angry,0.025669
4,1,2,Angry,0.041262
...,...,...,...,...
1766,89,46,Neutral,0.196434
1767,89,3,Neutral,0.144925
1768,89,7,Neutral,0.061250
1769,90,6,Neutral,0.294901


In [9]:
!pip install FER

Collecting FER
  Using cached fer-22.5.0-py3-none-any.whl (1.5 MB)
Collecting tqdm
  Using cached tqdm-4.65.0-py3-none-any.whl (77 kB)
Collecting opencv-contrib-python
  Using cached opencv_contrib_python-4.7.0.72-cp37-abi3-macosx_11_0_arm64.whl (41.0 MB)
Installing collected packages: tqdm, opencv-contrib-python, FER
Successfully installed FER-22.5.0 opencv-contrib-python-4.7.0.72 tqdm-4.65.0


In [10]:
512//2

2