In [1]:
#!pip install facenet_pytorch

In [107]:
import os           

# Check if string is appropriate youtube link
import re
import datetime
import cv2
import numpy as np
import pandas as pd
from facenet_pytorch import MTCNN
from matplotlib import pyplot as plt
from keras.models import load_model
from pytube import YouTube

import face_recognition

import seaborn as sns
import plotly.express as px

SAVE_VIDEO = False
SHOW_PROMPT = False

# DEBUG-Mode stops operations when max_emotions were detected and also prints a short summary
DEBUG = False
if DEBUG:
    import time
    debug_params = {
        'max_emotions' : 1000
    }

In [108]:
def get_path(kind):
    if kind == "youtube":
        return "https://www.youtube.com/watch?v=vtT78TfDfXU"                   # Random video
        #return 'https://www.youtube.com/watch?v=embYkODkzcs'                 # 7 basic emotions
        #return 'https://www.youtube.com/watch?v=m70UInZKJjU'                    # Two persons
    if kind == "local":
        # adjust individually
        return '/Users/steve/Neue_Fische/face_demo/vids/Video_One_output.mp4'
    if kind == "error_on_purpose":
        return "wrongful path"
    else:
        raise ValueError(f"Passed Argument kind must bei in ['youtube', 'local', 'error_on_purpose'] but was: {kind}")

def youtube_stream(yt_link):
    # Load the video from YouTube
    yt_video = YouTube(yt_link)
    stream = yt_video.streams.get_highest_resolution()    #or highest resolution?
    return cv2.VideoCapture(stream.url)

def local_stream(local_path):
    return cv2.VideoCapture(local_path)

def get_stream(path):
    # Check if the string is a YouTube link
    if re.match(r'(https?://)?(www\.)?(youtube\.com|youtu\.?be)/.+$', path):
        return youtube_stream(path)
    # Check if the string is a local path
    elif os.path.isfile(path):
        return local_stream(path)
    # Check if the path is a local file path but no file is found
    elif os.path.exists(path):
        raise ValueError(f"File not found at path: {path}")
    # If it's neither a local path nor a YouTube link, raise an error
    else:
        raise ValueError("The input string is neither a local path nor a YouTube link.")
    
def load_emotion_classifier():
    return load_model("../models/emotion_model.hdf5", compile=False)

def preprocess_face(face, input_face_size):
    face = cv2.cvtColor(face, cv2.COLOR_RGB2GRAY)  # Convert the face to grayscale
    face = cv2.resize(face, (input_face_size[1], input_face_size[0]))  # Swap width and height
    face = face.astype('float32') / 255.0
    face = np.expand_dims(face, axis=-1)  # Add an additional dimension for grayscale channel
    face = np.expand_dims(face, axis=0)
    return face

def print_debug_report(operating_results):    
    print(f'{operating_results["analyzed_emotions"]} faces found in {operating_results["analyzed_frames"]} frames.')
    print(f'{operating_results["frames_without_faces"]} frames had no face detected ({operating_results["frames_without_faces_ratio"]}%).')
    print(f'Stopped operations after around {operating_results["processed_video_time"]} seconds into the video.')
    print(f'Execution time: {operating_results["runtime"]} seconds, processing (roughly) {round(operating_results["processed_video_time"]/operating_results["runtime"],2)} seconds of video per second of execution')

def initialize_face_detector(model_type):
    if model_type == 'haarcascade':
        return cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
    elif model_type == "MTCNN":
        return MTCNN(keep_all=True, post_process=False, margin=20)
    else:
        raise ValueError("By now, only Haarcascade is implemented.")

def preprocess_frame_for_face_detection_haarcascade(frame):
    return cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

def preprocess_frame_for_emotion_detection(frame):
    return cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

def normalize_boxes_mtcnn(boxes):
    """
    Normalize the bounding box coordinates from MTCNN to numpy indexing format.
    Output format: np.array(y_min, y_max, x_min, x_max)
    """
    normalized_boxes = []
    for box in boxes:
        x_min, y_min, x_max, y_max = box.astype(int)
        normalized_boxes.append([y_min, y_max, x_min, x_max])
    return np.array(normalized_boxes)

def normalize_boxes_cv2(boxes):
    """
    Normalize the bounding box coordinates from OpenCV's format to numpy indexing format.
    Output format: np.array(y_min, y_max, x_min, x_max)
    """
    normalized_boxes = []
    for box in boxes:
        x, y, w, h = box
        normalized_boxes.append([y, y+h, x, x+w])
    return np.array(normalized_boxes)

def detect_faces(frame, face_detector, model_type = 'haarcascade'):
    if model_type == 'haarcascade':
        frame_pp = preprocess_frame_for_face_detection_haarcascade(frame)
        boxes =  face_detector.detectMultiScale(frame_pp, scaleFactor = 1.3, minNeighbors = 3)
        if boxes is None:
            return None
        else:
            return normalize_boxes_cv2(boxes)
    elif model_type == "MTCNN":
        # No preprocessing needed for VideoCapture Frame
        boxes, _ = face_detector.detect(frame)
        if boxes is None:
            return None
        else:
            return normalize_boxes_mtcnn(boxes)
        #return [(x['box'][1], x['box'][0] + x['box'][2], x['box'][1] + x['box'][3], x['box'][0]) for x in bounding_boxes]
    #elif model_type == "fast_MTCNN"
    # https://towardsdatascience.com/face-detection-using-mtcnn-a-guide-for-face-extraction-with-a-focus-on-speed-c6d59f82d49
    else:
        raise ValueError("By now, only Haarcascade is implemented.")
    
def get_ordered_emotions():
    return ['Angry', 'Disgust', 'Fear', 'Happy', 'Sad', 'Surprise', 'Neutral']

def emotions_probability(frame, face_location, emotion_classifier):
    frame_pp = preprocess_frame_for_emotion_detection(frame)
    y_min, y_max, x_min, x_max = face_location
    face = frame_pp[y_min:y_max, x_min:x_max]
    face = preprocess_face(face, input_face_size=emotion_classifier.input_shape[1:3])
    prob = emotion_classifier.predict(face)[0]  # check for underscore
    return prob

def output_video(video, filename):
    width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = int(video.get(cv2.CAP_PROP_FPS))
    fourcc = cv2.VideoWriter_fourcc(*'XVID')
    return cv2.VideoWriter(filename, fourcc, 10, (width,height))

def get_overview_df(emotions, frame_info, frame_info_cols):
    assert len(frame_info[0]) == len(frame_info_cols), \
        f"Number of columns in frame_info and number of passed names in frame_info_cols is not the same: {len(frame_info[0])} != {len(frame_info_cols)}."

    df_emotions = pd.DataFrame(emotions, columns=get_ordered_emotions())
    df_frame_info = pd.DataFrame(frame_info, columns=frame_info_cols)
    df_all_info = pd.concat([df_emotions, df_frame_info], axis=1)
    return df_all_info

def get_plottable_df(emotions, frame_info, frame_info_cols):

    assert len(frame_info[0]) == len(frame_info_cols), \
        f"Number of columns in frame_info and number of passed names in frame_info_cols is not the same: {len(frame_info[0])} != {len(frame_info_cols)}."

    df_emotions = pd.DataFrame(emotions, columns=get_ordered_emotions())
    df_frame_info = pd.DataFrame(frame_info, columns=frame_info_cols)
    df_all_info = pd.concat([df_emotions, df_frame_info], axis=1)
    df_plotting = pd.melt(df_all_info, id_vars=frame_info_cols, value_vars=get_ordered_emotions(), var_name='emotion', value_name='probability')
    return df_plotting

In [110]:

yt_link = 'https://www.youtube.com/watch?v=vtT78TfDfXU'                   # 1 Actor
#yt_link = 'https://www.youtube.com/watch?v=embYkODkzcs'                 # 7 basic emotions
#yt_link = 'https://www.youtube.com/watch?v=m70UInZKJjU'                    # Two persons

yt_video = YouTube(yt_link)
stream = yt_video.streams.get_highest_resolution()    #or highest resolution?
video =  cv2.VideoCapture(stream.url)
model_evaluation = {'video':{}, 'models': {}}

# Initialize writer to save the annotated video
if SAVE_VIDEO: writer = output_video(video, filename='outputs/Output_video.mp4')

model_evaluation['video'] = {'available_resolutions': [streams.resolution for streams in yt_video.streams.filter(type="video", progressive=True)],
                             'used_resolution' : stream.resolution,
                             'fps_pytube': stream.fps,
                             'fps_cv2': video.get(cv2.CAP_PROP_FPS),
                             'framecount': video.get(cv2.CAP_PROP_FRAME_COUNT),
                             'length_s': round(video.get(cv2.CAP_PROP_FRAME_COUNT) / video.get(cv2.CAP_PROP_FPS),4)
                             }

for model_type in["MTCNN", "haarcascade"]:

    video =  cv2.VideoCapture(stream.url)

    start_time_initialization = time.time()
    # Initialize the face detection model

    face_detector = initialize_face_detector(model_type)
    end_time_initialization = time.time()

    frame_info = []
    faces_found = 0

    model_evaluation['models'][model_type] = {'model_name': model_type,
                                            'initialization_time': round(end_time_initialization-start_time_initialization,4)
                                            }

    start_time_loop = time.time()
    # Loop through each frame of the video
    while True:

        # Read the next frame from the video
        ret, frame = video.read()

        # Check if the frame was successfully read
        if not ret:
            break

        # Increment the frame counter
        current_frame_nr = int(video.get(cv2.CAP_PROP_POS_FRAMES))
        
        # Find faces within a frame and return list of coordinates of bounding boxes
        face_locations = detect_faces(frame, face_detector, model_type)

        # Check if any faces were found
        if face_locations is None:
            faces_found = 0
        else:
            faces_found = len(face_locations)
                    
        frame_info.append(
            (model_type, round(video.get(cv2.CAP_PROP_POS_MSEC) / 1000, 2), current_frame_nr, faces_found ))    

            #for (y_min, y_max, x_min, x_max) in face_locations:
            #   cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), (0, 255, 0), 2)
            #  cv2.putText(frame, f"Frame {current_frame_nr}", (30,40), cv2.FONT_HERSHEY_SIMPLEX, 1.2, (0, 0, 255), 1)

        if DEBUG:
            # For debugging reasons, we stop when we have 1000 emotion values
            if len(frame_info) > debug_params['max_emotions']:
                break

    end_time_loop = time.time()

    # Release the video and close the window
    video.release()

    frame_info_array = np.array(frame_info)

    model_evaluation['models'][model_type]['frames_analyzed'] = len(frame_info)
    model_evaluation['models'][model_type]['processing_time'] = end_time_loop-start_time_loop
    model_evaluation['models'][model_type]['processed_s_video_per_s_runtime'] = round(model_evaluation['video']['length_s']/model_evaluation['models'][model_type]['processing_time'],4)
    model_evaluation['models'][model_type]['nr_frames_no_face_found']=np.sum(np.equal(frame_info_array[:, 3].astype(int),0))
    model_evaluation['models'][model_type]['ratio_frames_no_face_found']=round(model_evaluation['models'][model_type]['nr_frames_no_face_found']/model_evaluation['models'][model_type]['frames_analyzed'],3)
    model_evaluation['models'][model_type]['nr_frames_one_face_found']=np.sum(np.equal(frame_info_array[:, 3].astype(int),1))
    model_evaluation['models'][model_type]['ratio_frames_one_face_found']=round(model_evaluation['models'][model_type]['nr_frames_one_face_found']/model_evaluation['models'][model_type]['frames_analyzed'],3)
    model_evaluation['models'][model_type]['nr_frames_multiple_faces_found']=np.sum(np.greater(frame_info_array[:, 3].astype(int),1))
    model_evaluation['models'][model_type]['ratio_frames_multiple_faces_found']=round(model_evaluation['models'][model_type]['nr_frames_multiple_faces_found']/model_evaluation['models'][model_type]['frames_analyzed'],3)

model_evaluation


[tls @ 0x177112940] Error in the pull function.
[tls @ 0x177112940] IO error: Connection reset by peer
[NULL @ 0x288075b70] Invalid NAL unit size (546 > 300).
[NULL @ 0x288075b70] missing picture in access unit with size 304
[h264 @ 0x28809d5d0] Invalid NAL unit size (546 > 300).
[h264 @ 0x28809d5d0] Error splitting the input into NAL units.
[mov,mp4,m4a,3gp,3g2,mj2 @ 0x1771184a0] stream 0, offset 0x563d2a: partial file
[mov,mp4,m4a,3gp,3g2,mj2 @ 0x1771184a0] stream 0, offset 0x564d85: partial file


{'video': {'available_resolutions': ['144p', '360p', '720p'],
  'used_resolution': '720p',
  'fps_pytube': 24,
  'fps_cv2': 24.0,
  'framecount': 1785.0,
  'length_s': 74.375},
 'models': {'MTCNN': {'model_name': 'MTCNN',
   'initialization_time': 0.0097,
   'frames_analyzed': 1651,
   'processing_time': 171.2695279121399,
   'processed_s_video_per_s_runtime': 0.4343,
   'nr_frames_no_face_found': 234,
   'ratio_frames_no_face_found': 0.142,
   'nr_frames_one_face_found': 1415,
   'ratio_frames_one_face_found': 0.857,
   'nr_frames_multiple_faces_found': 2,
   'ratio_frames_multiple_faces_found': 0.001},
  'haarcascade': {'model_name': 'haarcascade',
   'initialization_time': 0.0239,
   'frames_analyzed': 1785,
   'processing_time': 22.80318808555603,
   'processed_s_video_per_s_runtime': 3.2616,
   'nr_frames_no_face_found': 619,
   'ratio_frames_no_face_found': 0.347,
   'nr_frames_one_face_found': 1153,
   'ratio_frames_one_face_found': 0.646,
   'nr_frames_multiple_faces_found': 13

In [111]:
frame_info_array

array([['haarcascade', '0.0', '1', '1'],
       ['haarcascade', '0.04', '2', '1'],
       ['haarcascade', '0.08', '3', '1'],
       ...,
       ['haarcascade', '74.25', '1783', '1'],
       ['haarcascade', '74.29', '1784', '1'],
       ['haarcascade', '74.33', '1785', '1']], dtype='<U32')