In [1]:
#!pip install facenet_pytorch

In [2]:
import os           

# Check if string is appropriate youtube link
import re
import cv2
import numpy as np
import pandas as pd
from facenet_pytorch import MTCNN
from matplotlib import pyplot as plt
from keras.models import load_model
from pytube import YouTube

import face_recognition

import seaborn as sns
import plotly.express as px

# DEBUG-Mode stops operations when max_emotions were detected and also prints a short summary
DEBUG = True
if DEBUG:
    import time
    debug_params = {
        'max_emotions' : 5000
    }

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def get_path(kind):
    if kind == "youtube":
        #return "https://www.youtube.com/watch?v=vtT78TfDfXU"                   # Random video
        #return 'https://www.youtube.com/watch?v=embYkODkzcs'                 # 7 basic emotions
        #return 'https://www.youtube.com/watch?v=m70UInZKJjU'   
        #return  'https://www.youtube.com/watch?v=cLmCJKT5ssw&ab_channel=TheTonightShowStarringJimmyFallon'                 # Two persons
        #return 'https://www.youtube.com/watch?v=QRnw9f5rbN4'
        return 'https://www.youtube.com/watch?v=VKl41s51JvE&ab_channel=Movieclips'
    
    if kind == "local":
        # adjust individually
        return '/Users/steve/Neue_Fische/face_demo/vids/Video_One_output.mp4'
    if kind == "error_on_purpose":
        return "wrongful path"
    else:
        raise ValueError(f"Passed Argument kind must bei in ['youtube', 'local', 'error_on_purpose'] but was: {kind}")

def youtube_stream(yt_link):
    # Load the video from YouTube
    yt_video = YouTube(yt_link)
    stream = yt_video.streams.get_highest_resolution()    #or highest resolution?
    stream.download()
    return cv2.VideoCapture(stream.default_filename)

def local_stream(local_path):
    return cv2.VideoCapture(local_path)

def get_stream(path):
    # Check if the string is a YouTube link
    if re.match(r'(https?://)?(www\.)?(youtube\.com|youtu\.?be)/.+$', path):
        return youtube_stream(path)
    # Check if the string is a local path
    elif os.path.isfile(path):
        return local_stream(path)
    # Check if the path is a local file path but no file is found
    elif os.path.exists(path):
        raise ValueError(f"File not found at path: {path}")
    # If it's neither a local path nor a YouTube link, raise an error
    else:
        raise ValueError("The input string is neither a local path nor a YouTube link.")
    
def load_emotion_classifier():
    return load_model("../models/emotion_model.hdf5", compile=False)


def print_debug_report(operating_results):    
    print(f'{operating_results["analyzed_emotions"]} faces found in {operating_results["analyzed_frames"]} frames.')
    print(f'{operating_results["frames_without_faces"]} frames had no face detected ({operating_results["frames_without_faces_ratio"]}%).')
    print(f'Stopped operations after around {operating_results["processed_video_time"]} seconds into the video.')
    print(f'Execution time: {operating_results["runtime"]} seconds, processing (roughly) {round(operating_results["processed_video_time"]/operating_results["runtime"],2)} seconds of video per second of execution')

def initialize_face_detector(model_type):
    if model_type == 'haarcascade':
        return cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
    
    elif model_type == "MTCNN":
        return MTCNN(keep_all=True, post_process=False, margin=20)
    
    elif model_type == "face_recognition":
        return None
    
    else:
        raise ValueError("Choose one of the implemented models, ya cunt!")

def preprocess_frame(frame, model_type):
    if model_type == 'haarcascade':
        return cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    
    elif model_type == "MTCNN":
        return frame
    
    elif model_type =='face_recognition':
        return cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

    else: 
        raise ValueError("Choose one of the implemented models, ya cunt!")

    

def get_norm_boxes(boxes, model_type):
    """
    Normalize the bounding box coordinates from MTCNN to numpy indexing format.
    Output format: np.array(y_min, y_max, x_min, x_max)
    """
    normalized_box = []

    if model_type == 'haarcascade':
        for box in boxes:
            x, y, w, h = box
            normalized_box.append([y, y+h, x, x+w])
        return np.array(normalized_box)    

    elif model_type == "MTCNN":
        for box in boxes:
            x_min, y_min, x_max, y_max = box.astype(int)
            normalized_box.append([y_min, y_max, x_min, x_max])
        return np.array(normalized_box)

    elif model_type =='face_recognition':
        for box in boxes:
            top, right, bottom, left = box
            x, y, w, h = left, top, right - left, bottom - top
            normalized_box.append([y, y+h, x, x+w])
        return np.array(normalized_box)    
    
    else: 
        raise ValueError("Choose one of the implemented models, ya cunt!")

    

def detect_faces(frame_pp, face_detector, model_type):

    if model_type == 'haarcascade':
        face_detector = initialize_face_detector(model_type)
        boxes = face_detector.detectMultiScale(frame_pp, scaleFactor=1.3, minNeighbors=3)
        
    elif model_type == 'MTCNN':
        boxes, _ = face_detector.detect(frame_pp)
        
    elif model_type == 'face_recognition':
        boxes = face_recognition.face_locations(frame_pp, number_of_times_to_upsample=1)

    else:
        raise ValueError("Choose one of the implemented models, ya cunt!")
    
    if len(boxes) == 0:
        return [],[]

    # Convert boxes to normalized format
    norm_boxes = get_norm_boxes(boxes, model_type)

    return norm_boxes, boxes


def crop_face(box, frame_pp):

    cropped_face = frame_pp[box[0]:box[1], box[2]:box[3]]
    cropped_face = cv2.cvtColor(cropped_face, cv2.COLOR_RGB2GRAY)
    cropped_face = cv2.resize(cropped_face, (64, 64))
    cropped_face = cropped_face.astype('float32')/ 255
    cropped_face = np.expand_dims(cropped_face, axis=-1)
    cropped_face = np.expand_dims(cropped_face, axis=0)

    return cropped_face

    
def get_ordered_emotions():
    return ['Angry', 'Disgust', 'Fear', 'Happy', 'Sad', 'Surprise', 'Neutral']

def emotions_probability(face, emotion_classifier):#
    prob = emotion_classifier.predict(face)[0]  # check for underscore
    return prob


def valid_landmarks(face_landmark):
    left_eye = face_landmark['left_eye']
    right_eye = face_landmark['right_eye']
    mouth = face_landmark['top_lip'] + face_landmark['bottom_lip']

    left_eye_center = np.mean(left_eye, axis=0)
    right_eye_center = np.mean(right_eye, axis=0)
    mouth_center = np.mean(mouth, axis=0)

    eye_distance = np.linalg.norm(right_eye_center - left_eye_center)
    mouth_to_eye_distance = np.linalg.norm(mouth_center - left_eye_center)

    return mouth_to_eye_distance > eye_distance * 0.8

def resize_frame(frame, new_width):
    original_height, original_width = frame.shape[:2]
    aspect_ratio = float(original_height) / float(original_width)
    new_height = int(new_width * aspect_ratio)
    new_dimensions = (new_width, new_height)
    resized_frame = cv2.resize(frame, new_dimensions, interpolation=cv2.INTER_AREA)
    return resized_frame

def get_overview_df(emotions, frame_info, frame_info_cols):
    assert len(frame_info[0]) == len(frame_info_cols), \
        f"Number of columns in frame_info and number of passed names in frame_info_cols is not the same: {len(frame_info[0])} != {len(frame_info_cols)}."

    df_emotions = pd.DataFrame(emotions, columns=get_ordered_emotions())
    df_frame_info = pd.DataFrame(frame_info, columns=frame_info_cols)
    df_all_info = pd.concat([df_emotions, df_frame_info], axis=1)
    return df_all_info

def get_plottable_df(emotions, frame_info, frame_info_cols):

    assert len(frame_info[0]) == len(frame_info_cols), \
        f"Number of columns in frame_info and number of passed names in frame_info_cols is not the same: {len(frame_info[0])} != {len(frame_info_cols)}."

    df_emotions = pd.DataFrame(emotions, columns=get_ordered_emotions())
    df_frame_info = pd.DataFrame(frame_info, columns=frame_info_cols)
    df_all_info = pd.concat([df_emotions, df_frame_info], axis=1)
    df_plotting = pd.melt(df_all_info, id_vars=frame_info_cols, value_vars=get_ordered_emotions(), var_name='emotion', value_name='probability')
    return df_plotting

def output_video(video, new_width, filename):
    width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT))
    aspect_ratio = float(height) / float(width)
    new_height = int(new_width * aspect_ratio)
    fps = int(video.get(cv2.CAP_PROP_FPS))
    fourcc = cv2.VideoWriter_fourcc(*'XVID')
    
    return cv2.VideoWriter(filename, fourcc, 10, (new_width, new_height))

    

In [4]:
if DEBUG: start_time = time.time()

# Define video path
path = get_path('youtube')

# Set the number of frames to skip
frames_to_skip = 3

# Get Video as cv2.VideoCapture
# Can access Youtube Video or local file
video = get_stream(path)

# Initialize the face detection model
#model_type = "MTCNN"
model_type = "face_recognition"
#model_type = 'haarcascade'

face_detector = initialize_face_detector(model_type)


# Initialize the emotion detection model
emotion_classifier = load_emotion_classifier()

# Initialize lists to store emotions and frame_info
emotions = []
frame_info = []

# Initialize counters
frames_without_faces_counter = 0

# Set the new width for the resized video frames
NEW_WIDTH = 640


Metal device set to: Apple M1

systemMemory: 8.00 GB
maxCacheSize: 2.67 GB



2023-03-24 18:01:39.853813: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-03-24 18:01:39.854170: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [5]:
writer = output_video(video, new_width=NEW_WIDTH, filename='Output_video_3.mp4') # ADDED THIS!!
face_embeddings = {}
individual_id_counter = 0


# Loop through each frame of the video
while True:

    # Read the next frame from the video
    ret, frame = video.read()
    

    # Check if the frame was successfully read
    if not ret or frame is None:
        break

    frame= resize_frame(frame,new_width=NEW_WIDTH)
    # Increment the frame counter
    current_frame_nr = int(video.get(cv2.CAP_PROP_POS_FRAMES))

    # Skip frames based on the frames_to_skip parameter
    if current_frame_nr % frames_to_skip != 0:
        continue

    
    frame_pp = preprocess_frame(frame, model_type) #frame in the format that fits to face_detector    
    norm_boxes, boxes  = detect_faces(frame_pp=frame_pp, face_detector=face_detector, model_type=model_type)

    

    for box, norm_box in zip(boxes,norm_boxes): 
        if len(boxes) == 0:
            frames_without_faces_counter += 1
            continue
        face_landmark = face_recognition.face_landmarks(frame, [box])[0]
        if not valid_landmarks(face_landmark):
            continue

        gray_cropped_face = crop_face(norm_box, frame_pp)

        prob = emotions_probability(gray_cropped_face, emotion_classifier)
        emotions.append(prob)



        max_emotion, max_prob = np.argmax(prob), np.max(prob)
        emotion_labels = ['Angry', 'Disgust', 'Fear', 'Happy', 'Sad', 'Surprise', 'Neutral']
        emotion_text = emotion_labels[max_emotion]



        face_encoding = face_recognition.face_encodings(frame, [box])[0]                  
        min_distance = float("inf")
        matched_individual_id = None

        for individual_id, individual_face_encoding in face_embeddings.items():
            distance = face_recognition.face_distance([individual_face_encoding], face_encoding)[0]
            if distance < 0.60: # Adjust this threshold value based on your requirements
                if distance < min_distance:
                    min_distance = distance
                    matched_individual_id = individual_id

        if matched_individual_id is None:
            individual_id_counter += 1
            matched_individual_id = individual_id_counter
            face_embeddings[individual_id_counter] = face_encoding




        top, right, bottom, left = box
        x, y, w, h = left, top, right-left , bottom - top
        y_min = y
        y_max=y+h
        x_min= x
        x_max = x+w
        cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), (0, 255, 0), 2)
        cv2.putText(frame, f"Prob: {max_prob:.1%}", (x_min, y_max + 60), cv2.FONT_HERSHEY_DUPLEX, 1, (0, 0, 255), 1)
        cv2.putText(frame, f"{emotion_text}", (x_min, y_max + 30), cv2.FONT_HERSHEY_DUPLEX, 1, (0, 0, 255), 1)
        cv2.putText(frame, f"ID: {matched_individual_id}", (x_min, y_min -20), cv2.FONT_HERSHEY_DUPLEX, 1, (0, 0, 255), 1)

                
        frame_info.append(
            (round(video.get(cv2.CAP_PROP_POS_MSEC) / 1000, 2),
                current_frame_nr
                )
        )    


                
    writer.write(frame) # ADDED THIS!!!

    cv2.imshow("Faces found", frame)

    # Wait for Esc key to stop
    if cv2.waitKey(20) == ord('q'):
        break

    if DEBUG:
        # For debugging reasons, we stop when we have 1000 emotion values
        if len(emotions) > debug_params['max_emotions']:
            break

# Release the video and close the window
video.release() 

writer.release() 

# De-allocate any associated memory usage
cv2.destroyAllWindows()

if DEBUG: end_time = time.time()

if DEBUG:
    # When in DEBUG-mode, print some statistics about the faces and emotions detected
    operating_results = {'analyzed_emotions': len(emotions),
                         'analyzed_frames':frame_info[-1][1], 
                         'frames_without_faces':frames_without_faces_counter,
                         'frames_without_faces_ratio': round(100*frames_without_faces_counter/frame_info[-1][1],2),
                         'processed_video_time': round(frame_info[-1][0] / 1000 ,2),
                         'runtime': round(end_time - start_time,2)}
    
    print_debug_report(operating_results)
    
df_plotting = get_plottable_df(emotions, frame_info, frame_info_cols=['pos_sec', 'frame'])

OpenCV: FFMPEG: tag 0x44495658/'XVID' is not supported with codec id 12 and format 'mp4 / MP4 (MPEG-4 Part 14)'
OpenCV: FFMPEG: fallback to use tag 0x7634706d/'mp4v'
2023-03-24 18:01:40.834546: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2023-03-24 18:01:40.976472: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




1   HIToolbox                           0x00000001a6f6c5c8 _ZN15MenuBarInstance22EnsureAutoShowObserverEv + 120
2   HIToolbox                           0x00000001a6f6c188 _ZN15MenuBarInstance14EnableAutoShowEv + 60
3   HIToolbox                           0x00000001a6ed98bc _ZN15MenuBarInstance21UpdateAggregateUIModeE21MenuBarAnimationStylehhh + 1184
4   HIToolbox                           0x00000001a6f6c004 _ZN15MenuBarInstance19SetFullScreenUIModeEjj + 180
5   AppKit                              0x00000001a0d62008 -[NSApplication _setPresentationOptions:instance:flags:] + 956
6   AppKit                              0x00000001a0bf7bb0 -[NSApplication _updateFullScreenPresentationOptionsForInstance:] + 404
7   CoreFoundation                      0x000000019d819570 __CFNOTIFICATIONCENTER_IS_CALLING_OUT_TO_AN_OBSERVER__ + 148
8   CoreFoundation                      0x000000019d8b7054 ___CFXRegistrationPost_block_invoke + 88
9   CoreFoundation                      0x000000019d8b6f9c _CFXRe



1   HIToolbox                           0x00000001a6ef790c _ZN15MenuBarInstance22RemoveAutoShowObserverEv + 44
2   HIToolbox                           0x00000001a6f6cfbc _ZN15MenuBarInstance15DisableAutoShowEv + 36
3   HIToolbox                           0x00000001a6f6d0b0 _ZN15MenuBarInstanceD2Ev + 128
4   HIToolbox                           0x00000001a6f6cee0 _ZN15MenuBarInstance7ReleaseEv + 56
5   AppKit                              0x00000001a10ff5ec -[NSHIPresentationInstance discard] + 228
6   AppKit                              0x00000001a14beb58 -[_NSFullScreenSpace(PresentationInstance) discardPresentationInstance] + 32
7   AppKit                              0x00000001a14bebb0 -[_NSFullScreenSpace(PresentationInstance) activateFullScreenPresentationOptions] + 64
8   AppKit                              0x00000001a1318c20 -[_NSExitFullScreenTransitionController _doSucceededToExitFullScreen] + 40
9   AppKit                              0x00000001a13196f8 __63-[_NSExitFullScreenT

1237 faces found in 4569 frames.
0 frames had no face detected (0.0%).
Stopped operations after around 0.19 seconds into the video.
Execution time: 235.87 seconds, processing (roughly) 0.0 seconds of video per second of execution


cv2.imshow("Preprocessed Frame", rgb_cropped_face)
cv2.waitKey(0)
cv2.destroyAllWindows()