In [1]:
import os
import glob
import numpy as np
import tensorflow as tf
import cv2
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, BatchNormalization, Dropout, Dense, RNN, LSTMCell
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler, ReduceLROnPlateau
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

In [2]:
def load_data(dataset_path, img_size=(128, 128)):
    """
    Load precomputed mel-spectrogram PNG images, labelled by their parent folder name.

    Every immediate sub-directory of ``dataset_path`` is treated as one class. Each
    ``*.png`` file inside it is read as a grayscale image, resized to ``img_size``
    and scaled from [0, 255] to [0, 1]. Files that OpenCV cannot decode are
    skipped with a warning instead of aborting the whole load.

    Args:
        dataset_path: Directory containing one sub-directory per class.
        img_size: Target size handed to ``cv2.resize``, i.e. ``(width, height)``.

    Returns:
        X: float32 array of shape (num_images, height, width).
        y: array of folder-name labels, one entry per image in ``X``.
    """
    width, height = img_size
    X = []
    y = []
    class_names = sorted(os.listdir(dataset_path))  # Get emotion categories

    for label in class_names:
        class_path = os.path.join(dataset_path, label)
        if not os.path.isdir(class_path):
            continue

        # glob yields files in arbitrary, filesystem-dependent order; sorting keeps
        # the sample order (and any seeded split made from it) reproducible.
        # glob.escape protects folder names that contain glob metacharacters.
        files = sorted(glob.glob(os.path.join(glob.escape(class_path), '*.png')))
        print(f"Found {len(files)} images for class '{label}'.")

        for file in files:
            img = cv2.imread(file, cv2.IMREAD_GRAYSCALE)
            if img is None:
                # cv2.imread signals an unreadable/corrupt file by returning None.
                print(f"Warning: could not read image '{file}', skipping.")
                continue
            img = cv2.resize(img, img_size)
            X.append(img)
            y.append(label)

    if not X:
        # Keep the (samples, height, width) rank even when nothing was loaded.
        return np.empty((0, height, width), dtype=np.float32), np.array(y)

    X = np.array(X, dtype=np.float32) / 255.0  # Normalize pixel values to [0, 1]
    y = np.array(y)
    return X, y


In [3]:
def preprocess_labels(y):
    """
    Turn string labels into one-hot vectors.

    The labels are first mapped to integer class ids with a freshly fitted
    ``LabelEncoder`` and those ids are then expanded with ``to_categorical``.

    Returns:
        A ``(one_hot, encoder)`` pair: the one-hot matrix and the fitted encoder.
    """
    encoder = LabelEncoder()
    one_hot = to_categorical(encoder.fit_transform(y))
    return one_hot, encoder


In [4]:
def build_model(input_shape, num_classes):
    """
    Assemble the Conv1D -> LSTM -> Dense classifier, print its summary and return it.

    Architecture, in order:
      * three Conv1D blocks (1024, 512 and 256 filters, kernel size 5), each
        followed by max-pooling, batch normalization and dropout (0.3);
      * three recurrent layers of 128 LSTM units, each followed by dropout (0.3);
        only the last one collapses the sequence to a single vector;
      * Dense layers of 128, 64 and 32 ReLU units, then a softmax layer with
        ``num_classes`` outputs.

    ``input_shape`` is expected to be (timesteps, features), for example (128, 128).
    """
    conv_settings = dict(kernel_size=5, strides=1, padding='same', activation='relu')
    net = Sequential()

    # Convolutional feature extractor; only the first layer declares the input shape.
    for block_idx, n_filters in enumerate((1024, 512, 256)):
        if block_idx == 0:
            conv = Conv1D(n_filters, input_shape=input_shape, **conv_settings)
        else:
            conv = Conv1D(n_filters, **conv_settings)
        net.add(conv)
        net.add(MaxPooling1D(pool_size=2, strides=2, padding='same'))
        net.add(BatchNormalization())
        net.add(Dropout(0.3))

    # LSTM layers using LSTMCell wrapped in RNN to avoid cuDNN issues on AMD GPUs
    # with DirectML. The final recurrent layer does not return the full sequence.
    for keep_sequence in (True, True, False):
        net.add(RNN(LSTMCell(128), return_sequences=keep_sequence))
        net.add(Dropout(0.3))

    # Fully connected classification head
    for units in (128, 64, 32):
        net.add(Dense(units, activation='relu'))
    net.add(Dense(num_classes, activation='softmax'))

    net.summary()
    return net


In [5]:
def scheduler(epoch, lr):
    """
    Learning-rate schedule: halve ``lr`` at every non-zero multiple of 100 epochs.

    For any other epoch (including epoch 0) the incoming ``lr`` is returned unchanged.
    """
    halve_now = epoch > 0 and epoch % 100 == 0
    if halve_now:
        return lr * 0.5
    return lr


In [6]:
def main(dataset_path='RAVDESS_mel_spectrograms', epochs=2000, batch_size=32,
         test_size=0.2, random_state=42):
    """
    Load the spectrogram dataset, train the Conv1D/LSTM classifier and evaluate it.

    Steps: load images and labels, one-hot encode the labels, make a stratified
    train/validation split, build and compile the model, fit it with early
    stopping and two learning-rate callbacks, then print the validation loss
    and accuracy.

    Args:
        dataset_path: Root folder with one sub-directory of PNG images per class.
        epochs: Maximum number of training epochs.
        batch_size: Mini-batch size used for training.
        test_size: Fraction of the samples held out for validation.
        random_state: Seed for the train/validation split.

    Returns:
        ``(model, le, history)``: the trained model, the fitted label encoder
        (needed to map predicted class indices back to label names) and the
        object returned by ``model.fit``.

    Raises:
        ValueError: If no images were loaded from ``dataset_path``.
    """
    X, y = load_data(dataset_path)
    # X has shape (num_samples, 128, 128). Each sample is treated as a sequence of 128 time steps,
    # each with 128 features.
    if len(X) == 0:
        # Fail early with a clear message instead of a cryptic error further down.
        raise ValueError(
            f"No images found under '{dataset_path}'; expected one sub-directory of "
            "PNG files per class."
        )

    y_onehot, le = preprocess_labels(y)

    # Stratify on the integer class ids so both splits keep the class proportions.
    X_train, X_val, y_train, y_val = train_test_split(
        X, y_onehot, test_size=test_size, random_state=random_state,
        stratify=np.argmax(y_onehot, axis=1)
    )

    input_shape = X_train.shape[1:]  # Expected to be (128, 128)
    num_classes = y_onehot.shape[1]
    model = build_model(input_shape, num_classes)

    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    # Stop after 200 epochs without val_loss improvement and restore the best weights.
    early_stop = EarlyStopping(monitor='val_loss', patience=200, restore_best_weights=True)
    # Apply the epoch-based halving schedule defined by `scheduler`.
    lr_scheduler = LearningRateScheduler(scheduler)
    # Additionally halve the learning rate when val_loss plateaus for 100 epochs.
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=100, min_lr=1e-6)

    history = model.fit(
        X_train, y_train,
        epochs=epochs,
        batch_size=batch_size,
        validation_data=(X_val, y_val),
        callbacks=[early_stop, lr_scheduler, reduce_lr]
    )

    loss, acc = model.evaluate(X_val, y_val)
    print(f"Validation Loss: {loss:.4f}, Validation Accuracy: {acc:.4f}")

    # Hand the artifacts back so the (expensive) training result is not thrown away.
    return model, le, history


In [7]:
# Entry point: run the load/train/evaluate pipeline defined in main() when this
# code is executed as the top-level program.
if __name__ == '__main__':
    main()

Found 192 images for class 'angry'.
Found 192 images for class 'calm'.
Found 192 images for class 'disgust'.
Found 192 images for class 'fearful'.
Found 192 images for class 'happy'.
Found 96 images for class 'neutral'.
Found 192 images for class 'sad'.
Found 192 images for class 'surprised'.
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d (Conv1D)             (None, 128, 1024)         656384    
                                                                 
 max_pooling1d (MaxPooling1D  (None, 64, 1024)         0         
 )                                                               
                                                                 
 batch_normalization (BatchN  (None, 64, 1024)         4096      
 ormalization)                                                   
                                                                 
 dropout (Dropout)        