In [1]:
import torch
import cv2
import numpy as np
import mediapipe as mp
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing import image

In [7]:
mp_holistic = mp.solutions.holistic # Holistic model 
mp_drawing = mp.solutions.drawing_utils # Drawing utilities 
mp_face_mesh = mp.solutions.face_mesh

In [9]:
torch.cuda.is_available()

True

In [None]:
# Function to detect the holistic landmarks from an image (face, pose and hands)
def mediapipe_detection(image, model): 
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    image.setflags(write=False)         # Image is no longer writable
    results = model.process(image)      # Make prediction
    image.setflags(write=True)          # Image is now writable
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
    return image, results


In [None]:
# Function to draw the landmarks on the image
def draw_styled_landmarks(image, results):

    mp_drawing.draw_landmarks(image, results.face_landmarks, mp_face_mesh.FACEMESH_CONTOURS,
                              mp_drawing.DrawingSpec(color=(80,110,10), thickness=1, circle_radius=1),  
                              mp_drawing.DrawingSpec(color=(80,256,121), thickness=1, circle_radius=1)
                              )
    # Draw pose connections
    mp_drawing.draw_landmarks(image, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS,
                             mp_drawing.DrawingSpec(color=(80,22,10), thickness=2, circle_radius=4), 
                             mp_drawing.DrawingSpec(color=(80,44,121), thickness=2, circle_radius=2)
                             ) 
    # Draw left hand connections
    mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS, 
                             mp_drawing.DrawingSpec(color=(121,22,76), thickness=2, circle_radius=4), 
                             mp_drawing.DrawingSpec(color=(121,44,250), thickness=2, circle_radius=2)
                             ) 
    # Draw right hand connections  
    mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS, 
                             mp_drawing.DrawingSpec(color=(245,117,66), thickness=2, circle_radius=4), 
                             mp_drawing.DrawingSpec(color=(245,66,230), thickness=2, circle_radius=2)
                             ) 

In [None]:
# Function to extract keypoints from the landmarks and flatten them into a single vector to be used as input to the model
def extract_keypoints(results):
    pose = np.array([[res.x, res.y, res.z, res.visibility] for res in results.pose_landmarks.landmark]).flatten() if results.pose_landmarks else np.zeros(33*4)
    face = np.array([[res.x, res.y, res.z] for res in results.face_landmarks.landmark]).flatten() if results.face_landmarks else np.zeros(468*3)
    lh = np.array([[res.x, res.y, res.z] for res in results.left_hand_landmarks.landmark]).flatten() if results.left_hand_landmarks else np.zeros(21*3)
    rh = np.array([[res.x, res.y, res.z] for res in results.right_hand_landmarks.landmark]).flatten() if results.right_hand_landmarks else np.zeros(21*3)
    return np.concatenate([pose, face, lh, rh]) # flatten arr of points x,y,z visibility values - single vector

**FER2013 dataset**

- 35,887 labeled grayscale images (48x48 pixels) with approximately 5,000 images per category

- emotion categories: 'Angry', 'Disgusted', 'Fear', 'Happy', 'Sad', 'Surprise', 'Neutral'

- original split has 28,709 images for training, 3,589 images for validation, and 3,589 images for testing


In [2]:
# https://www.kaggle.com/datasets/abhisheksingh016/machine-model-for-emotion-detection
model = load_model('./face_model.h5')
model.summary()



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 46, 46, 64)        640       
                                                                 
 batch_normalization (Batch  (None, 46, 46, 64)        256       
 Normalization)                                                  
                                                                 
 activation (Activation)     (None, 46, 46, 64)        0         
                                                                 
 conv2d_1 (Conv2D)           (None, 44, 44, 64)        36928     
                                                                 
 batch_normalization_1 (Bat  (None, 44, 44, 64)        256       
 chNormalization)                                                
                                                                 
 activation_1 (Activation)   (None, 44, 44, 64)        0

**CNN**

- includes: 
    1. **Convolutional Layer** - extract features, by applying kernel/filter (64, 64, 32, 32) to input img output shape (height*width:44*44, 20*20, 18*18, 16*16)
    2. **Batch Normalization** - stabilize learning by normalizing inputs, activations are stable (helps train faster and reliable)
    3. **Activation** - introduces non-linearity to model (ReLU), makes model learn complex patterns
    4. **Max Pooling** - reduces spatial size (height*width) by taking max value, makes network faster
    4. **Dropout** - randomly sets neurons to 0, to prevent overfitting (relying on certain neurons)
    5. **Dense** - fully connected layer, learns high-lvl combinations (final for 7 classes/neurons)
        followed by **activation** (softmax) converting raw scores to probabilities for 7 classes

- training process utilizes an ImageDataGenerator for data augmentation, enhancing the model's ability to generalize to various facial expressions

In [25]:
# actions = np.array(['idontknow', 'areyouok', 'idontunderstand'])
emotions = np.array(['Angry', 'Disgusted', 'Fear', 'Happy', 'Sad', 'Surprise', 'Neutral'])
emotions
# class_names = ['Angry', 'Disgusted', 'Fear', 'Happy', 'Sad', 'Surprise', 'Neutral']

array(['Angry', 'Disgusted', 'Fear', 'Happy', 'Sad', 'Surprise',
       'Neutral'], dtype='<U9')

In [None]:
# f to render probabilities
colors = [(245,117,16),(117,245,16),(16,117,245),(255,0,0),(0,255,0),(0,0,255),(255,255,0)
]

def prob_viz(res, emotions, input_frame, colors):
    output_frame = input_frame.copy()
    for num, prob in enumerate(res):
        cv2.rectangle(output_frame, (0,60+num*40),
                    (int(prob*100), 90+num*40),
                    colors[num], -1
        ) 
        # bar dynamically changes based on probability (longer = higher)
        cv2.putText(
            output_frame, emotions[num],
            (0, 85+num*40), cv2.FONT_HERSHEY_SIMPLEX,
            1, (255,255,255),
            2, cv2.LINE_AA)
        
    return output_frame

In [None]:
def preprocess_frame(frame):
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    gray = cv2.resize(gray, (48,48))
    gray = gray.astype("float32") / 255.0
    gray = gray.reshape(48,48,1)
    return gray

In [None]:
# 1. New detection variables (?????)
sequence = [] # append 50 frames, once got 50 frames, predict
sentence = [] # concatenate history of detection together
predictions = []
threshold = 0.5

cap = cv2.VideoCapture(0)
# Set mediapipe model 
with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
    while cap.isOpened():

        ret, frame = cap.read()

        image, results = mediapipe_detection(frame, holistic)

        draw_styled_landmarks(image, results)
        
        # 2. Prediction logic
        keypoints = extract_keypoints(results)
        sequence.append(keypoints) 
        # sequence = sequence[:50] # but still taking first 50 frames, although added to the end
        if len(sequence) > 50:
            sequence = sequence[-50:]
        
        if len(sequence) == 50:
            processed = preprocess_frame(frame)
            processed = np.expand_dims(processed, axis=0)  # batch dim
            pred = model.predict(processed)
            res = pred[0]
            predictions.append(np.argmax(res))
            
            
        #3. Viz logic
            if np.unique(predictions[-10:])[0]==np.argmax(res): # grab only last 10 preds and only unique
                if res[np.argmax(res)] > threshold: #check if res above threshold 0.5
                    if len(sentence) > 0: 
                        if emotions[np.argmax(res)] != sentence[-1]: #check if current action isn't same as last sentence
                            sentence.append(emotions[np.argmax(res)])
                    else:
                        sentence.append(emotions[np.argmax(res)])

                cv2.putText(image, pred,
                            (x_min, y_min-10),
                            cv2.FONT_HERSHEY_SIMPLEX,
                            1,
                            (0,255,0),
                            2)

        cv2.imshow('OpenCV Feed', image)

        if cv2.waitKey(10) & 0xFF == ord('q'):
            break

cap.release()
cv2.destroyAllWindows()


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 221ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2

KeyboardInterrupt: 