In [34]:
import torch
import cv2
import os
import numpy as np
import mediapipe as mp
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing import image
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical # to one-hot encoding data
from sklearn.model_selection import train_test_split
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.callbacks import TensorBoard

In [35]:
DATA_PATH = os.path.join("./MP_Data_Emotions")
EPOCHS = 30
RANDOM_SEED = np.random.seed(10)
BATCH_SIZE = 32

In [16]:
mp_holistic = mp.solutions.holistic # Holistic model 
mp_drawing = mp.solutions.drawing_utils # Drawing utilities 
mp_face_mesh = mp.solutions.face_mesh

In [17]:
torch.cuda.is_available()

True

In [18]:
# Function to detect the holistic landmarks from an image (face, pose and hands)
def mediapipe_detection(image, model): 
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    image.setflags(write=False)         # Image is no longer writable
    results = model.process(image)      # Make prediction
    image.setflags(write=True)          # Image is now writable
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
    return image, results


In [19]:
# Function to draw the landmarks on the image
def draw_styled_landmarks(image, results):

    mp_drawing.draw_landmarks(image, results.face_landmarks, mp_face_mesh.FACEMESH_CONTOURS,
                              mp_drawing.DrawingSpec(color=(80,110,10), thickness=1, circle_radius=1),  
                              mp_drawing.DrawingSpec(color=(80,256,121), thickness=1, circle_radius=1)
                              )
    # Draw pose connections
    mp_drawing.draw_landmarks(image, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS,
                             mp_drawing.DrawingSpec(color=(80,22,10), thickness=2, circle_radius=4), 
                             mp_drawing.DrawingSpec(color=(80,44,121), thickness=2, circle_radius=2)
                             ) 
    # Draw left hand connections
    mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS, 
                             mp_drawing.DrawingSpec(color=(121,22,76), thickness=2, circle_radius=4), 
                             mp_drawing.DrawingSpec(color=(121,44,250), thickness=2, circle_radius=2)
                             ) 
    # Draw right hand connections  
    mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS, 
                             mp_drawing.DrawingSpec(color=(245,117,66), thickness=2, circle_radius=4), 
                             mp_drawing.DrawingSpec(color=(245,66,230), thickness=2, circle_radius=2)
                             ) 

In [20]:
# Function to extract keypoints from the landmarks and flatten them into a single vector to be used as input to the model
def extract_keypoints(results):
    pose = np.array([[res.x, res.y, res.z, res.visibility] for res in results.pose_landmarks.landmark]).flatten() if results.pose_landmarks else np.zeros(33*4)
    face = np.array([[res.x, res.y, res.z] for res in results.face_landmarks.landmark]).flatten() if results.face_landmarks else np.zeros(468*3)
    lh = np.array([[res.x, res.y, res.z] for res in results.left_hand_landmarks.landmark]).flatten() if results.left_hand_landmarks else np.zeros(21*3)
    rh = np.array([[res.x, res.y, res.z] for res in results.right_hand_landmarks.landmark]).flatten() if results.right_hand_landmarks else np.zeros(21*3)
    return np.concatenate([pose, face, lh, rh]) # flatten arr of points x,y,z visibility values - single vector

In [23]:
emotions = np.array(['Angry', 'Disgust', 'Fear', 'Happy', 'Sad', 'Surprise', 'Neutral'])
label_map = {label:num for num, label in enumerate(emotions)}
# label dictionary to represent each one of our action
label_map

{np.str_('Angry'): 0,
 np.str_('Disgust'): 1,
 np.str_('Fear'): 2,
 np.str_('Happy'): 3,
 np.str_('Sad'): 4,
 np.str_('Surprise'): 5,
 np.str_('Neutral'): 6}

In [24]:
features, labels = [], []

for emotion in emotions:
    emotion_path = os.path.join(DATA_PATH, emotion)
    # List all .npy files in this folder
    files = sorted([f for f in os.listdir(emotion_path) if f.endswith(".npy")])
    
    for file_name in files:
        sample_path = os.path.join(emotion_path, file_name)
        sample = np.load(sample_path)
        features.append(sample)
        labels.append(label_map[emotion])

X = np.array(features).astype("float32")
y = to_categorical(labels)

In [29]:
# Train + temp
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y,
    test_size=0.15,
    stratify=y,
    random_state=42
)

# Val + Test
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp,
    test_size=0.5,
    stratify=y_temp,
    random_state=42
)

print("Train:", X_train.shape)
print("Val:", X_val.shape)
print("Test:", X_test.shape)

Train: (297, 1662)
Val: (26, 1662)
Test: (27, 1662)


**FER2013 dataset**

- 35,887 labeled grayscale images (48x48 pixels) with approximately 5,000 images per category

- emotion categories: 'Angry', 'Disgusted', 'Fear', 'Happy', 'Sad', 'Surprise', 'Neutral'

- original split has 28,709 images for training, 3,589 images for validation, and 3,589 images for testing


In [30]:
# https://www.kaggle.com/datasets/abhisheksingh016/machine-model-for-emotion-detection
base_model = load_model('./face_model.h5')
base_model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


**CNN**

- includes: 
    1. **Convolutional Layer** - extract features, by applying kernel/filter (64, 64, 32, 32) to input img output shape (height*width:44*44, 20*20, 18*18, 16*16)
    2. **Batch Normalization** - stabilize learning by normalizing inputs, activations are stable (helps train faster and reliable)
    3. **Activation** - introduces non-linearity to model (ReLU), makes model learn complex patterns
    4. **Max Pooling** - reduces spatial size (height*width) by taking max value, makes network faster
    4. **Dropout** - randomly sets neurons to 0, to prevent overfitting (relying on certain neurons)
    5. **Dense** - fully connected layer, learns high-lvl combinations (final for 7 classes/neurons)
        followed by **activation** (softmax) converting raw scores to probabilities for 7 classes

- training process utilizes an ImageDataGenerator for data augmentation, enhancing the model's ability to generalize to various facial expressions

In [31]:
for layer in base_model.layers[:-5]:
    layer.trainable = False

In [32]:
base_model.compile(
    optimizer=Adam(learning_rate=1e-4),
    loss="categorical_crossentropy",
    metrics=["accuracy"]
)


In [36]:
from keras.callbacks import EarlyStopping, ReduceLROnPlateau

log_dir = os.path.join('Logs_pretrained')
tb_callback = TensorBoard(log_dir=log_dir)
# # es_callback = EarlyStopping(monitor='val_categorical_accuracy', patience=100, restore_best_weights=True, verbose=1)
rp_callback = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=15, min_lr=1e-5, verbose=1)

In [39]:
print(X_train.shape)  # should be (num_samples, 1662)

(297, 1662)


In [None]:
history = base_model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=[rp_callback, tb_callback]
)



Epoch 1/10


ValueError: Exception encountered when calling Sequential.call().

[1mInvalid input shape for input Tensor("data:0", shape=(None, 1662), dtype=float32). Expected shape (None, 48, 48, 1), but input has incompatible shape (None, 1662)[0m

Arguments received by Sequential.call():
  • inputs=tf.Tensor(shape=(None, 1662), dtype=float32)
  • training=True
  • mask=None
  • kwargs=<class 'inspect._empty'>

In [None]:
# # f to render probabilities
# colors = [(245,117,16),(117,245,16),(16,117,245),(255,0,0),(0,255,0),(0,0,255),(255,255,0)
# ]

# def prob_viz(res, emotions, input_frame, colors):
#     output_frame = input_frame.copy()
#     for num, prob in enumerate(res):
#         cv2.rectangle(output_frame, (0,60+num*40),
#                     (int(prob*100), 90+num*40),
#                     colors[num], -1
#         ) 
#         # bar dynamically changes based on probability (longer = higher)
#         cv2.putText(
#             output_frame, emotions[num],
#             (0, 85+num*40), cv2.FONT_HERSHEY_SIMPLEX,
#             1, (255,255,255),
#             2, cv2.LINE_AA)
        
#     return output_frame

In [None]:
# def preprocess_frame(frame):
#     gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
#     gray = cv2.resize(gray, (48,48))
#     gray = gray.astype("float32") / 255.0
#     gray = gray.reshape(48,48,1)
#     return gray