In [1]:
import numpy as np
import cv2
import os
import mediapipe as mp
import random
import matplotlib.pyplot as plt
import copy
import time
import os
from gtts import gTTS
import pygame
from pygame import mixer
import threading
import pyttsx3

pygame 2.4.0 (SDL 2.26.4, Python 3.9.16)
Hello from the pygame community. https://www.pygame.org/contribute.html


In [2]:
# Short aliases for the MediaPipe sub-modules used by the helper functions below.
mp_holistic = mp.solutions.holistic # Holistic solution: provides the Holistic model class and FACEMESH_TESSELATION
mp_drawing = mp.solutions.drawing_utils # Drawing helpers: draw_landmarks() and DrawingSpec

In [3]:
def mediapipe_detection(input_image, model):
    """Run a MediaPipe model on one BGR frame.

    Args:
        input_image: Frame in BGR channel order (as delivered by OpenCV).
        model: Object exposing ``process(rgb_image)``; the live loop passes a
            ``mp_holistic.Holistic`` instance.

    Returns:
        A ``(bgr_image, detection_results)`` tuple: the frame after a
        BGR -> RGB -> BGR round trip, and whatever ``model.process`` returned.
    """
    # The model is fed RGB, while OpenCV works in BGR.
    rgb_frame = cv2.cvtColor(input_image, cv2.COLOR_BGR2RGB)
    detection_results = model.process(rgb_frame)

    # Hand back a BGR frame so the caller can keep drawing/showing it with OpenCV.
    bgr_frame = cv2.cvtColor(rgb_frame, cv2.COLOR_RGB2BGR)
    return bgr_frame, detection_results

In [4]:
def draw_landmarks(input_image, detection_results):
    """Draw the face-mesh tessellation onto ``input_image`` with default styling.

    Args:
        input_image: Image to draw on (modified in place by MediaPipe's helper).
        detection_results: Detection output exposing ``face_landmarks``.
    """
    mp_drawing.draw_landmarks(
        input_image,
        detection_results.face_landmarks,
        connections=mp_holistic.FACEMESH_TESSELATION,
    )

In [5]:
def draw_styled_landmarks(input_image, detection_results):
    """Draw the face-mesh tessellation with custom point and line styling.

    Landmark points are drawn in (0, 0, 0) with thickness 1 and radius 1;
    connecting lines are drawn in (255, 255, 255) with thickness 1.

    Args:
        input_image: Image to draw on (modified in place by MediaPipe's helper).
        detection_results: Detection output exposing ``face_landmarks``.
    """
    point_style = mp_drawing.DrawingSpec(color=(0,0,0), thickness=1, circle_radius=1)
    line_style = mp_drawing.DrawingSpec(color=(255,255,255), thickness=1)

    mp_drawing.draw_landmarks(
        input_image,
        detection_results.face_landmarks,
        connections=mp_holistic.FACEMESH_TESSELATION,
        landmark_drawing_spec=point_style,
        connection_drawing_spec=line_style,
    )

In [6]:
# Flatten the detected face landmarks into a single feature vector
def extract_keypoints(detection_results):
    """Return the face landmarks as a flat 1-D array of ``x, y, z`` triples.

    Args:
        detection_results: Object exposing ``face_landmarks``; when that
            attribute is truthy its ``landmark`` sequence is read, each item
            providing ``x``, ``y`` and ``z``.

    Returns:
        ``[x0, y0, z0, x1, y1, z1, ...]`` as a NumPy array, or an all-zero
        array of length 468 * 3 when no face landmarks are available.
    """
    face_landmarks = detection_results.face_landmarks
    if not face_landmarks:
        # No face found: zero placeholder so the feature length stays fixed.
        return np.zeros(468*3)

    coordinates = [[point.x, point.y, point.z] for point in face_landmarks.landmark]
    return np.array(coordinates).flatten()

In [7]:
# Folder that holds the exported numpy arrays
DATA_PATH = 'EmotionData'

# Emotion labels to detect; the live loop indexes this array with the
# model's predicted class index, so the order matters.
emotions = np.array([
    'Happy',
    'Sad',
    'Angry',
    'Neutral',
    'Surprised',
])

# NOTE(review): presumably the number of frames per input sequence (the live
# loop keeps the 2 most recent frames) -- confirm against the training code.
length_frames = 2

In [8]:
# Load the saved Keras model from 'My_model_ED.h5' (path is relative to the
# notebook's working directory). The live loop calls model.predict() on it.
# NOTE(review): this import sits mid-notebook; consider moving it to the
# imports cell so all dependencies are visible in one place.
from tensorflow.keras.models import load_model

model = load_model('My_model_ED.h5')

In [12]:
# Initialise pygame's audio mixer, then pygame itself. The matching
# mixer.quit() / pygame.quit() calls run at the end of the capture loop below.
mixer.init()
pygame.init()

def play_text(text):
    """Speak ``text`` with pyttsx3 on a background thread.

    The call returns as soon as the thread has been started, so the caller
    (the video loop) is not blocked while the speech engine is talking.
    Any exception raised while speaking is caught and printed, not raised.

    Args:
        text: The text handed to the speech engine.
    """
    def _speak():
        # Runs on the worker thread; failures are reported, never propagated.
        try:
            tts_engine = pyttsx3.init()
            tts_engine.say(text)
            tts_engine.runAndWait()
        except Exception as e:
            print(f"Error speaking text: {e}")

    threading.Thread(target=_speak).start()

def play_and_delete_audio(unique_filename):
    """Play an audio file through pygame's mixer, then delete the file.

    Blocks until playback has finished. Errors are printed rather than
    raised. The file is deleted whether or not playback succeeded, so a
    failed load/playback does not leave the temporary file behind.

    Args:
        unique_filename: Path of the audio file to play and remove.
    """
    try:
        mixer.music.load(unique_filename)
        mixer.music.play()
        # Poll until the mixer reports that playback is over.
        while mixer.music.get_busy():
            time.sleep(0.1)
        # Release pygame's handle on the file before deleting it; while the
        # stream is still loaded the removal can fail (e.g. on Windows).
        mixer.music.unload()
    except Exception as e:
        print(f"Error in play_and_delete_audio: {e}")
    finally:
        # Always try to clean up the file, even if playback failed.
        try:
            os.remove(unique_filename)
        except FileNotFoundError:
            # Nothing to clean up.
            pass
        except Exception as e:
            print(f"Error in play_and_delete_audio: {e}")

def moving_average(predictions, window_size=3):
    """Smooth a sequence with a trailing moving average.

    For the first ``window_size - 1`` positions the mean of all values seen
    so far is used; from then on the mean of the last ``window_size`` values.

    Args:
        predictions: Sequence of numeric values.
        window_size: Number of trailing values to average.

    Returns:
        A list of means with one entry per input value, or ``predictions``
        itself (unchanged) when it holds fewer than ``window_size`` values.
    """
    total = len(predictions)
    if total < window_size:
        return predictions

    smoothed = []
    # Warm-up: not enough history yet, so average everything up to ``end``.
    for end in range(1, window_size):
        smoothed.append(np.mean(predictions[:end]))
    # Steady state: full trailing window ending just before index ``end``.
    for end in range(window_size, total + 1):
        smoothed.append(np.mean(predictions[end - window_size : end]))
    return smoothed


# Live emotion recognition: grab webcam frames, extract face keypoints,
# classify the most recent frames and announce/display the emotion.
# Press 'q' in the video window to stop.

sequence = []          # most recent keypoint vectors fed to the model
current_emotion = []   # holds at most one label: the last announced emotion
predictions = []       # most recent predicted class indices (<= window_size)

threshold = 0.6        # minimum score of the top class to accept a prediction
window_size = 5        # number of recent predictions used for smoothing

cap = cv2.VideoCapture(0)

cap.set(cv2.CAP_PROP_FRAME_WIDTH, 640)
cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 480)

try:
    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        prev_frame_time = 0
        prediction_frequency = 1   # run the classifier every N-th frame
        frame_counter = 0
        while cap.isOpened():
            ret, input_frame = cap.read()
            if not ret or input_frame is None:
                # Camera unavailable or stream ended: stop instead of passing
                # an empty frame to OpenCV, which would raise.
                break

            input_image, detection_results = mediapipe_detection(input_frame, holistic)
            draw_styled_landmarks(input_image, detection_results)

            # Keep only the newest `length_frames` keypoint vectors.
            keypoints = extract_keypoints(detection_results)
            sequence.append(keypoints)
            sequence = sequence[-length_frames:]

            if len(sequence) == length_frames and frame_counter % prediction_frequency == 0:
                # verbose=0 keeps Keras from printing a progress bar per frame.
                result = model.predict(np.expand_dims(sequence, axis=0), verbose=0)[0]
                predicted_index = int(np.argmax(result))
                predictions.append(predicted_index)
                # Only the last `window_size` entries are ever used; drop the
                # rest so the list does not grow for the whole session.
                predictions = predictions[-window_size:]

                if len(predictions) >= window_size:
                    smoothed_predictions = moving_average(predictions, window_size)
                    predicted_label = emotions[predicted_index]

                    # NOTE(review): this averages class *indices*, and compares
                    # the largest smoothed value with the current class. It is
                    # satisfied when all recent predictions agree, but can also
                    # pass for some mixed windows. Behaviour kept as-is; a
                    # majority/unanimity check may be what was intended.
                    is_stable = np.unique(smoothed_predictions)[-1] == predicted_index
                    is_confident = result[predicted_index] > threshold
                    is_new = not current_emotion or predicted_label != current_emotion[-1]

                    if is_stable and is_confident and is_new:
                        # Remember only the latest emotion and speak it once.
                        current_emotion = [predicted_label]
                        play_text(predicted_label)
            frame_counter += 1

            # Frames-per-second overlay (guard against a zero time delta).
            new_frame_time = time.time()
            elapsed = new_frame_time - prev_frame_time
            fps = 1 / elapsed if elapsed > 0 else 0
            prev_frame_time = new_frame_time
            fps_text = f"FPS: {int(fps)}"
            cv2.putText(input_image, fps_text, (input_frame.shape[1] - 80, 15), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1, cv2.LINE_AA)

            # Display the recognized emotion with a background
            emotion_text = ' '.join(current_emotion)
            (text_width, text_height), _ = cv2.getTextSize(emotion_text, cv2.FONT_HERSHEY_SIMPLEX, 1, 2)
            cv2.rectangle(input_image, (5, 35 - text_height - 10), (5 + text_width + 10, 35 + 10), (0, 20, 0), -1)
            cv2.putText(input_image, emotion_text, (10, 35), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)

            # Show
            cv2.imshow('Face Emotion Recognition', input_image)

            if cv2.waitKey(10) & 0xFF == ord('q'):
                break
finally:
    # Always release audio, camera and window resources, even if the loop
    # above raised, so the webcam is not left locked by the kernel.
    mixer.quit()
    pygame.quit()

    cap.release()
    cv2.destroyAllWindows()