In [1]:
import face_recognition
import cv2
import numpy as np
from pytube import YouTube
import pandas as pd
from tensorflow.keras.models import load_model
import pickle


def download_video(yt_link):
    """Download a YouTube video and open it with OpenCV.

    Args:
        yt_link: URL of the YouTube video to fetch.

    Returns:
        A ``cv2.VideoCapture`` opened on the downloaded file. The caller is
        responsible for calling ``release()`` on it.

    Raises:
        ValueError: If pytube reports no stream for the link.
        IOError: If OpenCV cannot open the downloaded file.
    """
    yt = YouTube(yt_link)
    stream = yt.streams.first()
    if stream is None:
        raise ValueError(f"No downloadable stream found for {yt_link!r}")

    # Use the path reported by download() instead of rebuilding it from
    # default_filename, so the capture always opens the file just written.
    video_path = stream.download()

    video = cv2.VideoCapture(video_path)
    if not video.isOpened():
        raise IOError(f"OpenCV could not open downloaded video {video_path!r}")
    return video


def init_face_emotion_models(model_path="../models/emotion_model.hdf5"):
    """Load the Haar face detector and the Keras emotion classifier.

    Args:
        model_path: Path to the saved emotion model. Defaults to the location
            this file has always used, so existing callers are unaffected.

    Returns:
        A ``(face_detector, emotion_classifier)`` tuple.

    Raises:
        IOError: If the Haar cascade file could not be loaded.
    """
    cascade_path = cv2.data.haarcascades + 'haarcascade_frontalface_default.xml'
    face_detector = cv2.CascadeClassifier(cascade_path)
    # CascadeClassifier does not raise on a missing/corrupt file; it just
    # stays empty and fails later at detection time, so check up front.
    if face_detector.empty():
        raise IOError(f"Could not load Haar cascade from {cascade_path!r}")

    # compile=False: the model is loaded without compiling it.
    emotion_classifier = load_model(model_path, compile=False)
    return face_detector, emotion_classifier

def preprocess_face(face, input_face_size):
    """Turn an RGB face crop into a single-item batch for the classifier.

    Args:
        face: RGB image array containing the cropped face.
        input_face_size: Target size, indexed as ``[0]`` = height and
            ``[1]`` = width.

    Returns:
        A float32 array of shape ``(1, height, width, 1)`` with values
        scaled by 1/255.
    """
    grayscale = cv2.cvtColor(face, cv2.COLOR_RGB2GRAY)

    # cv2.resize takes its target size as (width, height).
    target_size = (input_face_size[1], input_face_size[0])
    resized = cv2.resize(grayscale, target_size)

    scaled = resized.astype('float32') / 255.0

    # Add the leading batch axis and the trailing channel axis in one step.
    return scaled[np.newaxis, ..., np.newaxis]



def predict_emotion(face, emotion_classifier):
    """Classify one preprocessed face.

    Args:
        face: Input batch passed straight to ``emotion_classifier.predict``;
            only the first row of the prediction is used.
        emotion_classifier: Object exposing a ``predict`` method.

    Returns:
        A ``(index, score)`` tuple: the position of the highest score in the
        first prediction row, and that highest score.
    """
    scores = emotion_classifier.predict(face)[0]
    best_index = np.argmax(scores)
    best_score = np.max(scores)
    return best_index, best_score


def _assign_individual_id(face_encoding, face_embeddings):
    """Return the ID of the first known individual matching the encoding.

    If no stored encoding matches, the encoding is registered in
    ``face_embeddings`` under the next free integer ID and that ID is returned.
    """
    known_ids = list(face_embeddings)
    if known_ids:
        known_encodings = [face_embeddings[known_id] for known_id in known_ids]
        # One vectorised comparison against every stored encoding.
        matches = face_recognition.compare_faces(known_encodings, face_encoding)
        for known_id, is_match in zip(known_ids, matches):
            if is_match:
                return known_id

    new_id = max(known_ids, default=0) + 1
    face_embeddings[new_id] = face_encoding
    return new_id


def _annotate_face(frame, face_location, individual_id, emotion_text, prob, font_scale=0.4):
    """Draw the bounding box, identity, emotion label and score for one face."""
    top, right, bottom, left = face_location
    font = cv2.FONT_HERSHEY_SIMPLEX
    cv2.rectangle(frame, (left, top), (right, bottom), (0, 255, 0), 2)
    cv2.putText(frame, f"Prob: {prob:.1%}", (left, bottom + 40), font, font_scale, (0, 0, 255), 1)
    cv2.putText(frame, f"{emotion_text}", (left, bottom + 20), font, font_scale, (0, 0, 255), 1)
    cv2.putText(frame, f"ID: {individual_id}", (left, top - 10), font, font_scale, (0, 255, 0), 1)


def extract_faces_and_emotions(video, face_detector, emotion_classifier, frame_rate=50):
    """Detect, identify and classify the emotion of every face in a video.

    Each analysed frame is shown in an OpenCV window named ``'Video'`` with
    the detections drawn on it; pressing ``q`` stops processing early.

    Args:
        video: An opened ``cv2.VideoCapture``.
        face_detector: Haar cascade classifier; only used to fill the
            ``'num_faces'`` field of each ``frame_data`` record.
        emotion_classifier: Model exposing ``input_shape`` and ``predict``.
        frame_rate: Target number of analysed frames per second. The number
            of frames skipped after each analysed frame is
            ``int(video_fps / frame_rate)``.

    Returns:
        A 4-tuple ``(list_of_faces, list_of_emotions, face_embeddings,
        frame_data)``. The first, second and fourth are parallel lists with
        one entry per detected face: the preprocessed face batch, the
        classifier's score vector, and a dict with keys ``'frame'``,
        ``'num_faces'``, ``'individual_id'``, ``'emotion'`` and ``'prob'``.
        ``face_embeddings`` maps each individual ID to the first face encoding
        seen for that individual.

    Side effects:
        Sets the module-level ``frames_without_faces_counter`` to the number
        of analysed frames in which no face was found. It is published this
        way, rather than returned, so the 4-tuple return value stays unchanged.
    """
    global frames_without_faces_counter

    emotion_labels = ['Angry', 'Disgust', 'Fear', 'Happy', 'Sad', 'Surprise', 'Neutral']
    scale_factor = 3  # Enlargement applied to the frame before display
    input_face_size = emotion_classifier.input_shape[1:3]

    video_fps = video.get(cv2.CAP_PROP_FPS)
    skip_frames = max(int(video_fps / frame_rate), 0) if frame_rate > 0 else 0

    list_of_faces = []
    list_of_emotions = []
    face_embeddings = {}
    frame_data = []
    no_face_frames = 0

    while True:
        ret, frame = video.read()
        if not ret:
            break

        frame_number = video.get(cv2.CAP_PROP_POS_FRAMES)
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        face_locations = face_recognition.face_locations(rgb_frame)

        if face_locations:
            # The frame read from the capture is BGR, so use the BGR conversion.
            gray_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            faces = face_detector.detectMultiScale(gray_frame, scaleFactor=1.1, minNeighbors=5)

            # One encoding per location, in the same order as face_locations.
            face_encodings = face_recognition.face_encodings(rgb_frame, face_locations)

            for face_location, face_encoding in zip(face_locations, face_encodings):
                top, right, bottom, left = face_location
                crop = rgb_frame[top:bottom, left:right]
                if crop.size == 0:
                    # Degenerate box: nothing to classify.
                    continue

                individual_id = _assign_individual_id(face_encoding, face_embeddings)

                # Run the classifier once and derive both the label and the
                # full score vector from that single prediction.
                face = preprocess_face(crop, input_face_size)
                scores = emotion_classifier.predict(face)[0]
                emotion = np.argmax(scores)
                prob = np.max(scores)

                frame_data.append({
                    'frame': frame_number,
                    'num_faces': len(faces),
                    'individual_id': individual_id,
                    'emotion': emotion,
                    'prob': prob
                })
                list_of_faces.append(face)
                list_of_emotions.append(scores)

                if emotion < len(emotion_labels):
                    emotion_text = emotion_labels[emotion]
                else:
                    emotion_text = str(emotion)
                _annotate_face(frame, face_location, individual_id, emotion_text, prob)
        else:
            no_face_frames += 1

        # Frames without faces are still displayed so that the window keeps
        # updating, 'q' stays responsive and frame skipping stays uniform.
        height, width = frame.shape[:2]
        resized_frame = cv2.resize(frame, (int(width * scale_factor), int(height * scale_factor)))
        cv2.imshow('Video', resized_frame)

        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

        # grab() advances the stream without decoding the skipped frames.
        for _ in range(skip_frames):
            if not video.grab():
                break

    frames_without_faces_counter = no_face_frames
    return list_of_faces, list_of_emotions, face_embeddings, frame_data


def _summarize_individuals(frame_data, list_of_faces, num_emotions):
    """Group detections per individual.

    Returns a dict mapping each individual ID to ``(face, emotion_history)``,
    where ``face`` is the first preprocessed face seen for that individual and
    ``emotion_history`` counts how often each emotion index was predicted.
    """
    summary = {}
    # frame_data and list_of_faces are parallel (one entry per detection), so
    # the face is looked up by detection index, not by video frame number.
    for detection_index, data in enumerate(frame_data):
        individual_id = data['individual_id']
        emotion = int(data['emotion'])
        if individual_id not in summary:
            if detection_index >= len(list_of_faces):
                continue
            summary[individual_id] = (list_of_faces[detection_index][0], [0] * num_emotions)
        if 0 <= emotion < num_emotions:
            summary[individual_id][1][emotion] += 1
    return summary


def _render_individual_card(face, individual_id, emotion_label, emotion_prob, size=320):
    """Build a BGR image showing a face thumbnail with its ID and emotion."""
    plane = face[..., 0] if face.ndim == 3 else face
    # The stored face is float data scaled by 1/255; convert back for display.
    thumbnail = np.clip(plane * 255.0, 0, 255).astype(np.uint8)
    card = cv2.cvtColor(thumbnail, cv2.COLOR_GRAY2BGR)
    card = cv2.resize(card, (size, size), interpolation=cv2.INTER_NEAREST)
    # Black bands above and below the thumbnail hold the text.
    card = cv2.copyMakeBorder(card, 30, 30, 0, 0, cv2.BORDER_CONSTANT, value=(0, 0, 0))
    cv2.putText(card, f"ID: {individual_id}", (5, 20), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)
    cv2.putText(card, f"Emotion: {emotion_label} ({emotion_prob*100:.1f}%)", (5, size + 50),
                cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)
    return card


def main():
    """Run the pipeline: download, analyse, save results and show a summary.

    Writes ``emotions.csv`` (one row of classifier scores per detected face)
    and ``face_embeddings.pickle`` to the current directory, prints detection
    statistics, then shows one window per individual with that individual's
    most frequent emotion until a key is pressed.
    """
    emotion_labels = ['Angry', 'Disgust', 'Fear', 'Happy', 'Sad', 'Surprise', 'Neutral']

    yt_link = "https://www.youtube.com/watch?v=pnFj9w3FGc8&ab_channel=PexBell-FreeStockVideoFootage"
    video = download_video(yt_link)
    try:
        face_detector, emotion_classifier = init_face_emotion_models()
        list_of_faces, list_of_emotions, face_embeddings, frame_data = extract_faces_and_emotions(
            video, face_detector, emotion_classifier)
    finally:
        video.release()

    # Several faces can share one frame, so count distinct frame numbers.
    frames_with_faces = len({data['frame'] for data in frame_data})
    print(f'{len(list_of_faces)} faces found in {frames_with_faces} frames with at least one face.')

    # The no-face count is read from the module-level counter when one has
    # been set; otherwise it falls back to 0 instead of raising NameError.
    frames_without_faces = globals().get('frames_without_faces_counter', 0)
    analysed_frames = frames_with_faces + frames_without_faces
    no_face_pct = round(100 * frames_without_faces / analysed_frames, 2) if analysed_frames else 0.0
    print(f'{frames_without_faces} frames had no face detected ({no_face_pct}%).')

    # Save data to files
    if list_of_emotions:
        emotions = np.vstack(list_of_emotions)
    else:
        # np.vstack rejects an empty list; write an empty table instead.
        emotions = np.empty((0, len(emotion_labels)))
    np.savetxt('emotions.csv', emotions, delimiter=',')

    with open('face_embeddings.pickle', 'wb') as f:
        pickle.dump(face_embeddings, f)

    # Display the emotion detection results
    individual_ids = _summarize_individuals(frame_data, list_of_faces, len(emotion_labels))
    for individual_id, (face, emotion_history) in individual_ids.items():
        total = sum(emotion_history)
        if total == 0:
            continue
        dominant = int(np.argmax(emotion_history))
        emotion_label = emotion_labels[dominant]
        emotion_prob = emotion_history[dominant] / total
        print(f'ID {individual_id}: {emotion_label} ({emotion_prob*100:.1f}% of {total} detections)')
        card = _render_individual_card(face, individual_id, emotion_label, emotion_prob)
        cv2.imshow(f'Individual {individual_id}', card)

    cv2.waitKey(0)
    cv2.destroyAllWindows()


# Run the pipeline only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()



Metal device set to: Apple M1

systemMemory: 8.00 GB
maxCacheSize: 2.67 GB



2023-03-16 20:43:12.084446: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-03-16 20:43:12.084829: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
2023-03-16 20:43:12.802710: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2023-03-16 20:43:12.920662: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
