In [3]:
from google.colab import drive
drive.mount('/content/drive')

import os
import cv2
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, LSTM, Dense, Flatten, TimeDistributed
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, LSTM, Dense, TimeDistributed, Dropout



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [1]:
import cv2
import numpy as np
import mediapipe as mp
import time

# Initialize Mediapipe Face Detection
mp_face_detection = mp.solutions.face_detection
face_detector = mp_face_detection.FaceDetection(min_detection_confidence=0.5)

def extract_faces_from_video(video_path, frame_count=6, face_size=(64, 64), timeout=10):
    """
    Extracts face images from a video using Mediapipe Face Detection.

    Args:
        video_path (str): Path to the video file.
        frame_count (int): Number of frames to extract per video.
        face_size (tuple): Target face size (default: 64x64).
        timeout (int): Maximum seconds to spend on one video.

    Returns:
        NumPy array of `frame_count` face images (64x64x3).
        If no face is detected, returns blank images.
    """
    cap = cv2.VideoCapture(video_path)

    if not cap.isOpened():
        print(f"Error: Cannot open video {video_path}")
        return np.zeros((frame_count, *face_size, 3), dtype=np.uint8)

    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    if total_frames == 0:
        print(f"Skipping {video_path} (No frames found)")
        return np.zeros((frame_count, *face_size, 3), dtype=np.uint8)

    frames = []
    interval = max(1, total_frames // frame_count)
    start_time = time.time()  # Start timeout timer

    for i in range(frame_count):
        if time.time() - start_time > timeout:  # If function takes too long, stop
            print(f"Timeout: Stopping extraction for {video_path}")
            break

        frame_idx = i * interval
        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)
        success, frame = cap.read()

        if not success or frame is None:
            print(f"Skipping frame {frame_idx} in {video_path} (Frame read failed)")
            frames.append(np.zeros((*face_size, 3), dtype=np.uint8))
            continue

        # Convert to RGB for Mediapipe
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = face_detector.process(rgb_frame)

        if results.detections:
            for detection in results.detections:
                bbox = detection.location_data.relative_bounding_box
                h, w, _ = frame.shape
                x, y, w_box, h_box = int(bbox.xmin * w), int(bbox.ymin * h), int(bbox.width * w), int(bbox.height * h)

                x, y = max(0, x), max(0, y)
                face = frame[y:y + h_box, x:x + w_box]
                face = cv2.resize(face, face_size)

                frames.append(face)
                break  # Only take the first detected face
        else:
            frames.append(np.zeros((*face_size, 3), dtype=np.uint8))

    cap.release()

    # Ensure exactly `frame_count` frames
    while len(frames) < frame_count:
        frames.append(np.zeros((*face_size, 3), dtype=np.uint8))

    return np.array(frames)  # Shape: (frame_count, 64, 64, 3)




In [4]:
import glob

dataset_path = "/content/drive/MyDrive/SER_Dataset/Ravdess/Video_speech/"
actors = range(1, 25)  # Actor IDs from 1 to 24

X, y, groups = [], [], []

for actor in actors:
    actor_path = os.path.join(dataset_path, f"Actor_{actor}")
    video_files = glob.glob(os.path.join(actor_path, "*.mp4"))

    for video_file in video_files:
        filename = os.path.basename(video_file)
        parts = filename.split("-")

        if len(parts) != 7:
            continue  # Skip invalid filenames

        modality, vocal_channel, emotion, intensity, statement, repetition, actor_id = parts
        emotion = int(emotion)  # Convert emotion to integer

        # Extract faces from video
        faces = extract_faces_haar(video_file)

        if faces.shape[0] == 6:  # Ensure exactly 6 frames are extracted
            X.append(faces)
            y.append(emotion)
            groups.append(int(actor_id))  # Use Actor ID for LOSO evaluation

X = np.array(X)
y = np.array(y)
groups = np.array(groups)

# Normalize pixel values to [0,1]
X = X / 255.0

print(f"Dataset loaded: {X.shape[0]} samples with shape {X.shape[1:]}")


Dataset loaded: 0 samples with shape ()


In [None]:
'''import os
import numpy as np
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

base_path = "/content/drive/MyDrive/SER_Dataset/Ravdess/Video_Speech"
features, labels, actors = [], [], []

for actor_id in range(1, 25):  # Actor_01 to Actor_24
    actor_path = os.path.join(base_path, f"Actor_{actor_id:02d}")  # Ensures two-digit formatting

    if not os.path.exists(actor_path):
        print(f"Skipping: {actor_path} (Folder not found)")
        continue  # Skip if folder does not exist

    for video_file in os.listdir(actor_path):
        if video_file.endswith(".mp4") and video_file.startswith("02-"):  # Only Video-Modality (02)
            parts = video_file.split("-")
            emotion_label = int(parts[2])  # Extract emotion from filename

            video_path = os.path.join(actor_path, video_file)
            face_frames = extract_faces_from_video(video_path)  # Extract 6 frames per video

            features.append(face_frames)
            labels.append(emotion_label)
            actors.append(actor_id)

features = np.array(features)  # Shape: (num_videos, 6, 64, 64, 3)
labels = np.array(labels)
actors = np.array(actors)

# Encode labels
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(labels)
labels = to_categorical(labels)  # Convert to one-hot encoding

print("Feature extraction complete!")

# Define save paths
save_dir = "/content/drive/MyDrive/SER_Dataset/Processed_Features"
os.makedirs(save_dir, exist_ok=True)  # Create directory if it doesn't exist

# Save extracted features, labels, and actor IDs
np.save(os.path.join(save_dir, "video_features.npy"), features)
np.save(os.path.join(save_dir, "video_labels.npy"), labels)
np.save(os.path.join(save_dir, "video_actors.npy"), actors)

print(f"Features saved successfully in {save_dir}!")'''


In [9]:
import os
import numpy as np

def process_video(video_path, save_dir, actor_id, emotion_label):
    """
    Processes a single video, checks if features exist, and extracts only if needed.

    Args:
        video_path (str): Path to the video file.
        save_dir (str): Directory to save extracted features.
        actor_id (int): ID of the actor.
        emotion_label (int): Emotion label.

    Returns:
        None (Saves extracted features if not already processed).
    """

    video_filename = os.path.basename(video_path).replace(".mp4", "")  # Remove .mp4 extension

    # Define paths for saved features
    feature_save_path = os.path.join(save_dir, f"{video_filename}_features.npy")
    label_save_path = os.path.join(save_dir, f"{video_filename}_label.npy")
    actor_save_path = os.path.join(save_dir, f"{video_filename}_actor.npy")

    # Check if the extracted files already exist
    if os.path.exists(feature_save_path) and os.path.exists(label_save_path) and os.path.exists(actor_save_path):
        print(f"Skipping {video_filename} (Features already exist)")
        return  # Skip processing

    print(f"Processing: {video_filename}")

    try:
        face_frames = extract_faces_from_video(video_path)  # Extract 6 frames per video

        # Ensure extracted frames are valid
        if face_frames is None or len(face_frames) == 0:
            print(f"Warning: No faces detected in {video_filename}. Skipping...")
            return

        # Save the extracted features
        np.save(feature_save_path, face_frames)
        np.save(label_save_path, np.array([emotion_label]))  # Save as array for consistency
        np.save(actor_save_path, np.array([actor_id]))

        print(f"Features extracted & saved for {video_filename}")

    except Exception as e:
        print(f"Error processing {video_filename}: {e}")

In [10]:
base_path = "/content/drive/MyDrive/SER_Dataset/Ravdess/Video_Speech"
save_dir = "/content/drive/MyDrive/SER_Dataset/Processed_Features"
os.makedirs(save_dir, exist_ok=True)  # Ensure save directory exists

for actor_id in range(1, 25):  # Actor_01 to Actor_24
    actor_path = os.path.join(base_path, f"Actor_{actor_id:02d}")  # Ensures two-digit formatting

    if not os.path.exists(actor_path):
        print(f"Skipping: {actor_path} (Folder not found)")
        continue  # Skip if folder does not exist

    for video_file in os.listdir(actor_path):
        if video_file.endswith(".mp4") and video_file.startswith("02-"):  # Only Video-Modality (02)
            parts = video_file.split("-")
            emotion_label = int(parts[2])  # Extract emotion from filename

            video_path = os.path.join(actor_path, video_file)

            # Process the video only if needed
            process_video(video_path, save_dir, actor_id, emotion_label)

print(" All videos processed successfully!")

Skipping 02-01-02-01-01-01-01 (Features already exist)
Skipping 02-01-02-01-01-02-01 (Features already exist)
Skipping 02-01-01-01-02-01-01 (Features already exist)
Skipping 02-01-01-01-01-01-01 (Features already exist)
Skipping 02-01-01-01-02-02-01 (Features already exist)
Skipping 02-01-01-01-01-02-01 (Features already exist)
Skipping 02-01-02-02-01-02-01 (Features already exist)
Skipping 02-01-02-02-01-01-01 (Features already exist)
Skipping 02-01-02-01-02-01-01 (Features already exist)
Skipping 02-01-02-02-02-02-01 (Features already exist)
Skipping 02-01-02-01-02-02-01 (Features already exist)
Skipping 02-01-02-02-02-01-01 (Features already exist)
Skipping 02-01-03-01-02-02-01 (Features already exist)
Skipping 02-01-03-01-02-01-01 (Features already exist)
Skipping 02-01-03-01-01-01-01 (Features already exist)
Skipping 02-01-03-02-01-02-01 (Features already exist)
Skipping 02-01-03-02-01-01-01 (Features already exist)
Skipping 02-01-03-01-01-02-01 (Features already exist)
Skipping 0

In [None]:
'''def build_cnn_lstm_model(input_shape, num_classes):
    model = Sequential([
        TimeDistributed(Conv2D(32, (3, 3), activation='relu', padding='same'), input_shape=input_shape),
        TimeDistributed(Conv2D(64, (3, 3), activation='relu', padding='same')),
        TimeDistributed(Flatten()),
        LSTM(128, return_sequences=False),
        Dense(64, activation='relu'),
        Dense(num_classes, activation='softmax')
    ])

    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

input_shape = (6, 64, 64, 3)  # 6 frames per video
num_classes = labels.shape[1]
model = build_cnn_lstm_model(input_shape, num_classes)
'''

In [11]:
# Define paths
feature_dir = "/content/drive/MyDrive/SER_Dataset/Processed_Features"

# Load extracted features
features, labels = [], []
for file in os.listdir(feature_dir):
    if file.endswith("_features.npy"):
        base_name = file.replace("_features.npy", "")

        feature_path = os.path.join(feature_dir, file)
        label_path = os.path.join(feature_dir, f"{base_name}_label.npy")

        if os.path.exists(label_path):
            feature_array = np.load(feature_path)  # Shape: (6, 64, 64, 3)
            label_array = np.load(label_path)  # Shape: (1,)

            # Ensure valid data
            if feature_array.shape == (6, 64, 64, 3) and label_array.shape == (1,):
                features.append(feature_array)
                labels.append(label_array[0])  # Extract single label value

# Convert to numpy arrays
features = np.array(features)  # Shape: (num_samples, 6, 64, 64, 3)
labels = np.array(labels)  # Shape: (num_samples,)

# Encode labels
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(labels)  # Convert to integer labels
labels = to_categorical(labels)  # Convert to one-hot encoding

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42, stratify=labels)

# Define CNN-LSTM Model
def build_cnn_lstm_model(input_shape, num_classes):
    model = Sequential([
        TimeDistributed(Conv2D(32, (3, 3), activation='relu', padding='same')),  # CNN layer per frame
        TimeDistributed(MaxPooling2D(pool_size=(2, 2))),
        TimeDistributed(Flatten()),  # Flatten CNN output per frame
        LSTM(64, return_sequences=False),  # LSTM processes sequential frames
        Dense(64, activation='relu'),
        Dropout(0.5),
        Dense(num_classes, activation='softmax')  # Emotion classification
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# Model parameters
input_shape = (6, 64, 64, 3)  # 6 frames per video
num_classes = labels.shape[1]

# Build & Train Model
model = build_cnn_lstm_model(input_shape, num_classes)
model.fit(X_train, y_train, epochs=5, batch_size=16, validation_data=(X_test, y_test))

# Save Model
model.save("/content/drive/MyDrive/SER_Dataset/cnn_lstm_model.h5")

print("Model training complete and saved successfully!")

Epoch 1/5
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m98s[0m 1s/step - accuracy: 0.1144 - loss: 2.1582 - val_accuracy: 0.1319 - val_loss: 2.0650
Epoch 2/5
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m92s[0m 1s/step - accuracy: 0.1251 - loss: 2.0960 - val_accuracy: 0.1319 - val_loss: 2.0639
Epoch 3/5
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 1s/step - accuracy: 0.1418 - loss: 2.0858 - val_accuracy: 0.1354 - val_loss: 2.0632
Epoch 4/5
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m140s[0m 1s/step - accuracy: 0.1379 - loss: 2.0724 - val_accuracy: 0.1319 - val_loss: 2.0631
Epoch 5/5
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m94s[0m 1s/step - accuracy: 0.1210 - loss: 2.0732 - val_accuracy: 0.1354 - val_loss: 2.0628




Model training complete and saved successfully!


In [None]:
'''logo = LeaveOneGroupOut()
accuracies = []

for train_idx, test_idx in logo.split(features, labels, groups=actors):
    X_train, X_test = features[train_idx], features[test_idx]
    y_train, y_test = labels[train_idx], labels[test_idx]

    model.fit(X_train, y_train, epochs=2, batch_size=8, verbose=1, validation_data=(X_test, y_test))

    loss, acc = model.evaluate(X_test, y_test)
    accuracies.append(acc)
    print(f"Actor left out: {actors[test_idx][0]} - Accuracy: {acc:.4f}")

print(f"\nFinal LOSO Accuracy: {np.mean(accuracies):.4f}")
'''

In [12]:
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import load_model
from sklearn.metrics import accuracy_score
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

# Define paths
feature_dir = "/content/drive/MyDrive/SER_Dataset/Processed_Features"
model_path = "/content/drive/MyDrive/SER_Dataset/cnn_lstm_model.h5"

# Load extracted features
features, labels, actors = [], [], []
for file in os.listdir(feature_dir):
    if file.endswith("_features.npy"):
        base_name = file.replace("_features.npy", "")

        feature_path = os.path.join(feature_dir, file)
        label_path = os.path.join(feature_dir, f"{base_name}_label.npy")
        actor_path = os.path.join(feature_dir, f"{base_name}_actor.npy")

        if os.path.exists(label_path) and os.path.exists(actor_path):
            feature_array = np.load(feature_path)
            label_array = np.load(label_path)
            actor_array = np.load(actor_path)

            if feature_array.shape == (6, 64, 64, 3) and label_array.shape == (1,) and actor_array.shape == (1,):
                features.append(feature_array)
                labels.append(label_array[0])
                actors.append(actor_array[0])

# Convert to numpy arrays
features = np.array(features)
labels = np.array(labels)
actors = np.array(actors)

# Encode labels
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(labels)
labels = to_categorical(labels)
num_classes = labels.shape[1]

# Unique actors (for LOSO)
unique_actors = np.unique(actors)
total_accuracy = []

# LOSO Evaluation
for actor in unique_actors:
    print(f"\n LOSO Fold: Leaving out Actor {actor}")

    train_idx = actors != actor
    test_idx = actors == actor

    X_train, y_train = features[train_idx], labels[train_idx]
    X_test, y_test = features[test_idx], labels[test_idx]

    model = load_model(model_path)

    y_pred = model.predict(X_test)
    y_pred_classes = np.argmax(y_pred, axis=1)
    y_test_classes = np.argmax(y_test, axis=1)

    accuracy = accuracy_score(y_test_classes, y_pred_classes)
    total_accuracy.append(accuracy)

    print(f"Actor {actor} Test Accuracy: {accuracy:.4f}")

# Compute Final LOSO Accuracy
final_loso_accuracy = np.mean(total_accuracy)
print(f"\n Final LOSO Accuracy: {final_loso_accuracy:.4f}")



 LOSO Fold: Leaving out Actor 1




[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 716ms/step
Actor 1 Test Accuracy: 0.1333

 LOSO Fold: Leaving out Actor 2




[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 973ms/step
Actor 2 Test Accuracy: 0.1333

 LOSO Fold: Leaving out Actor 3




[1m1/2[0m [32m━━━━━━━━━━[0m[37m━━━━━━━━━━[0m [1m0s[0m 777ms/step



[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 814ms/step
Actor 3 Test Accuracy: 0.1333

 LOSO Fold: Leaving out Actor 4




[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 745ms/step
Actor 4 Test Accuracy: 0.1333

 LOSO Fold: Leaving out Actor 5




[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 731ms/step
Actor 5 Test Accuracy: 0.1333

 LOSO Fold: Leaving out Actor 6




[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 615ms/step
Actor 6 Test Accuracy: 0.1333

 LOSO Fold: Leaving out Actor 7




[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 642ms/step
Actor 7 Test Accuracy: 0.1333

 LOSO Fold: Leaving out Actor 8




[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 700ms/step
Actor 8 Test Accuracy: 0.1333

 LOSO Fold: Leaving out Actor 9




[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1s/step
Actor 9 Test Accuracy: 0.1333

 LOSO Fold: Leaving out Actor 10




[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 659ms/step
Actor 10 Test Accuracy: 0.1379

 LOSO Fold: Leaving out Actor 11




[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 675ms/step
Actor 11 Test Accuracy: 0.1333

 LOSO Fold: Leaving out Actor 12




[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 664ms/step
Actor 12 Test Accuracy: 0.1333

 LOSO Fold: Leaving out Actor 13




[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 625ms/step
Actor 13 Test Accuracy: 0.1333

 LOSO Fold: Leaving out Actor 14




[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 663ms/step
Actor 14 Test Accuracy: 0.1333

 LOSO Fold: Leaving out Actor 15




[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 655ms/step
Actor 15 Test Accuracy: 0.1333

 LOSO Fold: Leaving out Actor 16




[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1s/step   
Actor 16 Test Accuracy: 0.1333

 LOSO Fold: Leaving out Actor 17




[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 667ms/step
Actor 17 Test Accuracy: 0.1379

 LOSO Fold: Leaving out Actor 18




[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 631ms/step
Actor 18 Test Accuracy: 0.1333

 LOSO Fold: Leaving out Actor 19




[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 665ms/step
Actor 19 Test Accuracy: 0.1333

 LOSO Fold: Leaving out Actor 20




[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 608ms/step
Actor 20 Test Accuracy: 0.1333

 LOSO Fold: Leaving out Actor 21




[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 655ms/step
Actor 21 Test Accuracy: 0.1333

 LOSO Fold: Leaving out Actor 22




[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 666ms/step
Actor 22 Test Accuracy: 0.1333

 LOSO Fold: Leaving out Actor 23




[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1s/step
Actor 23 Test Accuracy: 0.1333

 LOSO Fold: Leaving out Actor 24




[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 634ms/step
Actor 24 Test Accuracy: 0.1333

 Final LOSO Accuracy: 0.1337
