<a href="https://www.kaggle.com/code/eshtiaqueahmed/emotion-recognition?scriptVersionId=272931981" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
pip install numpy pandas librosa soundfile scikit-learn matplotlib tensorflow

In [None]:
import os
import glob
import numpy as np
import pandas as pd
import librosa
import soundfile as sf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
import tensorflow as tf
from tensorflow.keras import layers, models, callbacks
import matplotlib.pyplot as plt
import itertools



# --------- CONFIG ----------
# Optional (Google Colab only): mount Drive before pointing DATA_PATH at it.
# from google.colab import drive
# drive.mount('/content/drive')


# Root directory that is searched recursively for *.wav files.
DATA_PATH = "/kaggle/input/ravdess-emotional-speech-audio"   # <-- set this to where your WAV files are
SR = 22050                    # sampling rate passed to librosa.load
N_MFCC = 40                   # MFCC coefficients per frame (features end up with 3 * N_MFCC rows)
MAX_PAD_LEN = 174             # every feature matrix is padded/truncated to this many time frames (tunable)
BATCH_SIZE = 32               # mini-batch size passed to model.fit
EPOCHS = 40                   # upper bound on training epochs; early stopping may end training sooner
TEST_SIZE = 0.2               # fraction of samples held out by train_test_split
RANDOM_STATE = 42             # seed passed to train_test_split
MODEL_OUT = "ravdess_ser_cnn.h5"   # checkpoint file written while training the CNN model
# ---------------------------

# Emotion code -> label lookup. The code is the third dash-separated field of
# a RAVDESS file name.
EMOTIONS = {
    '01': 'neutral',
    '02': 'calm',
    '03': 'happy',
    '04': 'sad',
    '05': 'angry',
    '06': 'fearful',
    '07': 'disgust',
    '08': 'surprised'
}


def extract_emotion_from_filename(filename):
    """Return the emotion label encoded in a RAVDESS-style file name.

    The directory part is ignored, everything from the first '.' onwards is
    dropped, and the remaining stem is split on '-'. The third field is then
    looked up in EMOTIONS.

    Args:
        filename: Path or bare file name, e.g. ".../03-01-05-01-02-01-12.wav".

    Returns:
        The emotion label, or None when the name has fewer than three fields
        or the code is not present in EMOTIONS.
    """
    stem = os.path.basename(filename).partition('.')[0]
    fields = stem.split('-')
    # Guard clause: a usable name needs at least three dash-separated fields.
    return EMOTIONS.get(fields[2]) if len(fields) >= 3 else None

def pad_or_truncate(mfcc, max_len=MAX_PAD_LEN):
    """Force the time axis (axis 1) of a 2-D feature matrix to `max_len` frames.

    Args:
        mfcc: Array of shape (n_features, time).
        max_len: Target number of time frames.

    Returns:
        The input cut to its first `max_len` frames when it is long enough,
        otherwise the input with constant padding appended on the right of
        the time axis.
    """
    n_frames = mfcc.shape[1]
    if n_frames >= max_len:
        return mfcc[:, :max_len]
    missing = max_len - n_frames
    # Pad only at the end of the time axis; the feature axis is left untouched.
    return np.pad(mfcc, pad_width=((0, 0), (0, missing)), mode='constant')

def extract_features(file_path, sr=SR, n_mfcc=N_MFCC, max_pad_len=MAX_PAD_LEN):
    """Convert one audio file into a fixed-width MFCC + delta feature matrix.

    Args:
        file_path: Path of the audio file to load.
        sr: Sampling rate requested from librosa.load.
        n_mfcc: Number of MFCC coefficients to compute.
        max_pad_len: Number of time frames the result is padded/truncated to.

    Returns:
        Array of shape (3 * n_mfcc, max_pad_len) holding the MFCCs followed
        row-wise by their first- and second-order deltas, or None when the
        file could not be loaded (the error is printed).
    """
    try:
        signal, rate = librosa.load(file_path, sr=sr)
    except Exception as e:
        print(f"Error loading {file_path}: {e}")
        return None

    coeffs = librosa.feature.mfcc(y=signal, sr=rate, n_mfcc=n_mfcc)
    # First- and second-order temporal derivatives of the coefficients.
    derivatives = [librosa.feature.delta(coeffs, order=k) for k in (1, 2)]
    # Rows are stacked (not channels): the result is (n_mfcc * 3, time).
    features = np.vstack([coeffs, *derivatives])
    return pad_or_truncate(features, max_pad_len)



def load_dataset(data_path):
    """Build the feature and label arrays for every WAV file under `data_path`.

    The directory is searched recursively. Files whose name does not yield a
    known emotion, or whose audio cannot be loaded, are skipped.

    Args:
        data_path: Root directory containing the *.wav files.

    Returns:
        Tuple (X, Y): X stacks the per-file feature matrices, Y holds the
        matching emotion labels as strings. Both are empty arrays when no
        usable file is found.
    """
    pattern = os.path.join(data_path, '**', '*.wav')
    # glob's ordering depends on the filesystem; sorting makes the sample
    # order, and therefore any fixed-seed train/test split, reproducible.
    files = sorted(glob.glob(pattern, recursive=True))
    print(f"Found {len(files)} wav files.")
    # NOTE(review): if the dataset directory stores the same recordings under
    # more than one folder, each copy is loaded and duplicates can end up on
    # both sides of a train/test split - check the count printed above.
    X, Y = [], []
    for f in files:
        emo = extract_emotion_from_filename(f)
        if emo is None:
            continue
        feat = extract_features(f)
        if feat is None:
            continue
        X.append(feat)
        Y.append(emo)
    X = np.array(X)
    Y = np.array(Y)
    print("X shape (num_samples, features, time):", X.shape)
    return X, Y

def build_cnn_model(input_shape, num_classes):
    """Build and compile a three-stage 2-D CNN classifier.

    Args:
        input_shape: Sequence whose first two entries are (features, time).
            Only indices 0 and 1 are read; a single-channel axis is appended
            here so the feature matrix is treated like a grayscale image.
        num_classes: Width of the final softmax layer.

    Returns:
        A Keras model compiled with the Adam optimizer, categorical
        cross-entropy loss and an accuracy metric.
    """
    n_features, n_frames = input_shape[0], input_shape[1]
    inp = layers.Input(shape=(n_features, n_frames, 1))

    # Three identical stages that differ only in filter count:
    # convolution -> batch norm -> 2x2 max pooling.
    x = inp
    for n_filters in (32, 64, 128):
        x = layers.Conv2D(n_filters, (3, 3), activation='relu', padding='same')(x)
        x = layers.BatchNormalization()(x)
        x = layers.MaxPooling2D((2, 2))(x)

    # Classification head.
    x = layers.GlobalAveragePooling2D()(x)
    x = layers.Dense(128, activation='relu')(x)
    x = layers.Dropout(0.4)(x)
    out = layers.Dense(num_classes, activation='softmax')(x)

    model = models.Model(inputs=inp, outputs=out)
    model.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return model

def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion matrix'):
    """Draw a confusion matrix as an annotated heat map and show it.

    Args:
        cm: Square confusion matrix (rows: true labels, columns: predictions).
        classes: Tick labels, one per class.
        normalize: When True, each row is divided by its sum (plus a small
            epsilon to avoid division by zero) and cells are printed with two
            decimals; otherwise cells are printed as integers.
        title: Figure title.
    """
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        cm = cm.astype('float') / (row_totals + 1e-8)

    plt.figure(figsize=(8,6))
    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title(title)
    plt.colorbar()

    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=45)
    plt.yticks(positions, classes)

    # Cells darker than half of the maximum get white text for contrast.
    midpoint = cm.max() / 2.
    for row, col in np.ndindex(cm.shape[0], cm.shape[1]):
        cell = cm[row, col]
        if normalize:
            text = f"{cell:.2f}"
        else:
            text = f"{int(cell)}"
        colour = "white" if cell > midpoint else "black"
        plt.text(col, row, text, horizontalalignment="center", color=colour)

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()
    plt.show()

def main():
    """Train and evaluate the CNN model end to end.

    Loads the dataset from DATA_PATH, one-hot encodes the labels, makes a
    stratified train/test split, trains the CNN with early stopping and
    checkpointing to MODEL_OUT, then prints the test metrics and a
    classification report and shows the confusion matrices and training
    curves.

    Returns:
        dict with keys 'Model', 'Test Accuracy' and 'Test Loss' describing
        the CNN's performance on the held-out test set.

    Raises:
        ValueError: If no usable audio sample was loaded from DATA_PATH.
    """
    X, Y = load_dataset(DATA_PATH)
    if len(X) == 0:
        # Fail early with an actionable message instead of an obscure
        # error further down in the split/training code.
        raise ValueError(
            f"No usable audio samples found under {DATA_PATH!r}; check DATA_PATH."
        )

    # limit classes if you want (example: use only a subset). Here we use all emotions found.
    labels = np.unique(Y)
    print("Emotions in data:", labels)

    # reshape X for CNN: currently (N, features, time) -> convert to (N, features, time, 1)
    X = X[..., np.newaxis]

    # encode labels: strings -> integer ids -> one-hot vectors
    le = LabelEncoder()
    y_encoded = le.fit_transform(Y)
    num_classes = len(le.classes_)
    y_cat = tf.keras.utils.to_categorical(y_encoded, num_classes=num_classes)

    # train-test split, stratified on the integer labels so both sides keep
    # the same class proportions
    X_train, X_test, y_train, y_test = train_test_split(
        X, y_cat, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y_encoded
    )

    print("Train shape:", X_train.shape, y_train.shape)
    print("Test shape:", X_test.shape, y_test.shape)

    # build model; build_cnn_model adds the channel axis itself
    input_shape = (X.shape[1], X.shape[2])  # (features, time)
    model = build_cnn_model(input_shape, num_classes)
    model.summary()

    # callbacks: stop when val_loss stalls and keep the best weights on disk
    cb = [
        callbacks.EarlyStopping(monitor='val_loss', patience=6, restore_best_weights=True),
        callbacks.ModelCheckpoint(MODEL_OUT, save_best_only=True, monitor='val_loss')
    ]

    history = model.fit(
        X_train,
        y_train,
        validation_split=0.1,
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        callbacks=cb,
        verbose=1
    )

    # evaluate
    loss, acc = model.evaluate(X_test, y_test, verbose=0)
    print(f"Test loss: {loss:.4f}, Test accuracy: {acc:.4f}")

    cnn_result = {'Model': 'CNN', 'Test Accuracy': acc, 'Test Loss': loss}

    # predictions and classification report
    y_pred = model.predict(X_test)
    y_pred_labels = np.argmax(y_pred, axis=1)
    y_true_labels = np.argmax(y_test, axis=1)

    print("Classification report:")
    print(classification_report(y_true_labels, y_pred_labels, target_names=le.classes_))

    # confusion matrix
    cm = confusion_matrix(y_true_labels, y_pred_labels)
    plot_confusion_matrix(cm, classes=le.classes_, normalize=False, title='Confusion matrix (counts)')
    plot_confusion_matrix(cm, classes=le.classes_, normalize=True, title='Confusion matrix (normalized)')

    # plot training history
    plt.figure(figsize=(12,4))
    plt.subplot(1,2,1)
    plt.plot(history.history['loss'], label='loss')
    plt.plot(history.history['val_loss'], label='val_loss')
    plt.legend(); plt.title('Loss')
    plt.subplot(1,2,2)
    plt.plot(history.history['accuracy'], label='acc')
    plt.plot(history.history['val_accuracy'], label='val_acc')
    plt.legend(); plt.title('Accuracy')
    plt.show()

    # Hand the metrics back to the caller instead of discarding them.
    return cnn_result

if __name__ == "__main__":
    main()


In [None]:
#----------------------------------------------
# Experiment 1: LSTM Model Only
#----------------------------------------------
print("--- Starting LSTM Model Experiment ---")


def build_lstm_model(input_shape, num_classes):
    """Build and compile a two-layer stacked LSTM classifier.

    Args:
        input_shape: Shape of one sample, passed unchanged to the Input layer
            (the callers in this notebook pass (time, features)).
        num_classes: Width of the final softmax layer.

    Returns:
        A Keras model compiled with the Adam optimizer, categorical
        cross-entropy loss and an accuracy metric.
    """
    drop_rate = 0.3
    sequence_in = layers.Input(shape=input_shape)

    # The first LSTM emits the full sequence so the second one can consume it;
    # the second keeps only its final state.
    h = layers.LSTM(128, return_sequences=True)(sequence_in)
    h = layers.Dropout(drop_rate)(h)
    h = layers.LSTM(64)(h)
    h = layers.Dropout(drop_rate)(h)

    # Dense classification head.
    h = layers.Dense(64, activation='relu')(h)
    h = layers.Dropout(drop_rate)(h)
    probabilities = layers.Dense(num_classes, activation='softmax')(h)

    model = models.Model(inputs=sequence_in, outputs=probabilities)
    model.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return model

# Step 1: Load data and encode labels (strings -> integer ids -> one-hot)
X, Y = load_dataset(DATA_PATH)
le = LabelEncoder()
y_encoded = le.fit_transform(Y)
num_classes = len(le.classes_)
y_cat = tf.keras.utils.to_categorical(y_encoded, num_classes=num_classes)

# Step 2: Reshape data for LSTM (transpose)
# Shape change: (samples, features, time) -> (samples, time, features)
print("Reshaping data for LSTM...")
X_for_rnn = np.transpose(X, (0, 2, 1))
print("New data shape:", X_for_rnn.shape)

# Step 3: Split data into Train-Test sets (stratified on the integer labels)
X_train, X_test, y_train, y_test = train_test_split(
    X_for_rnn, y_cat, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y_encoded
)

# Step 4: Build the LSTM model
input_shape_lstm = (X_train.shape[1], X_train.shape[2]) # Shape: (time, features)
model_lstm = build_lstm_model(input_shape_lstm, num_classes)
model_lstm.summary()

# Checkpoint to a dedicated file: MODEL_OUT is the CNN's checkpoint path, so
# reusing it here would overwrite the saved CNN model.
cb = [
    callbacks.EarlyStopping(monitor='val_loss', patience=6, restore_best_weights=True),
    callbacks.ModelCheckpoint("ravdess_ser_lstm.h5", save_best_only=True, monitor='val_loss')
]

# Step 5: Train the model
print("\n--- Starting LSTM Model Training ---")
history_lstm = model_lstm.fit(
    X_train, y_train,
    validation_split=0.1,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=cb,
    verbose=1
)

# Step 6: Evaluate the trained LSTM on the held-out test split
print("\n--- LSTM Model Evaluation ---")
loss, acc = model_lstm.evaluate(X_test, y_test, verbose=0)
print(f"LSTM Test Loss: {loss:.4f}, Test Accuracy: {acc:.4f}")

# Summary record for this model (same keys as the other experiments use)
lstm_result = {'Model': 'LSTM', 'Test Accuracy': acc, 'Test Loss': loss}

# Classification report and confusion matrix
# argmax turns softmax outputs / one-hot targets back into integer class ids
y_pred = model_lstm.predict(X_test)
y_pred_labels = np.argmax(y_pred, axis=1)
y_true_labels = np.argmax(y_test, axis=1)
print("\nLSTM Classification Report:")
print(classification_report(y_true_labels, y_pred_labels, target_names=le.classes_))
cm = confusion_matrix(y_true_labels, y_pred_labels)

# Plot Confusion Matrix (Counts)
plot_confusion_matrix(cm, classes=le.classes_, title='LSTM Confusion Matrix (Counts)')

# Plot Confusion Matrix (Normalized per true-label row)
plot_confusion_matrix(cm, classes=le.classes_, normalize=True, title='LSTM Normalized Confusion Matrix')

# Plot Training & Validation Accuracy and Loss (one value per epoch actually run)
print("\n--- Plotting Training History ---")
plt.figure(figsize=(12, 5))

# Left panel: accuracy
plt.subplot(1, 2, 1)
plt.plot(history_lstm.history['accuracy'], label='Training Accuracy')
plt.plot(history_lstm.history['val_accuracy'], label='Validation Accuracy')
plt.title('Accuracy Over Epochs')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()

# Right panel: loss
plt.subplot(1, 2, 2)
plt.plot(history_lstm.history['loss'], label='Training Loss')
plt.plot(history_lstm.history['val_loss'], label='Validation Loss')
plt.title('Loss Over Epochs')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()

plt.tight_layout()
plt.show()

In [None]:
#----------------------------------------------
# Experiment 2: CNN-LSTM Hybrid Model
#----------------------------------------------
print("--- Starting CNN-LSTM Hybrid Model Experiment ---")

def build_cnn_lstm_model(input_shape, num_classes):
    """Build and compile a CNN front end followed by an LSTM classifier.

    Two convolution stages extract local patterns from the feature matrix.
    Their output is then rearranged into a sequence over the time axis and
    summarised by an LSTM.

    Args:
        input_shape: Shape of one sample, (features, time, 1).
        num_classes: Width of the final softmax layer.

    Returns:
        A Keras model compiled with the Adam optimizer, categorical
        cross-entropy loss and an accuracy metric.
    """
    inp = layers.Input(shape=input_shape)

    # CNN part for feature extraction
    x = layers.Conv2D(32, (3,3), activation='relu', padding='same')(inp)
    x = layers.BatchNormalization()(x)
    x = layers.MaxPooling2D((2,2))(x)

    x = layers.Conv2D(64, (3,3), activation='relu', padding='same')(x)
    x = layers.BatchNormalization()(x)

    # Reshape CNN output to feed into LSTM.
    # x is (batch, features, time, channels). Reshape alone only reinterprets
    # the memory layout, so the time axis has to be moved to the front first;
    # otherwise each "time step" would mix values from different frames.
    current_shape = x.shape
    n_features, n_frames, n_channels = current_shape[1], current_shape[2], current_shape[3]
    x = layers.Permute((2, 1, 3))(x)  # -> (batch, time, features, channels)
    x = layers.Reshape((n_frames, n_features * n_channels))(x)

    # LSTM part for sequence analysis
    x = layers.LSTM(64)(x)
    x = layers.Dropout(0.4)(x)

    out = layers.Dense(num_classes, activation='softmax')(x)

    model = models.Model(inputs=inp, outputs=out)
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model

# Step 1: Load data and encode labels (strings -> integer ids -> one-hot)
X, Y = load_dataset(DATA_PATH)
le = LabelEncoder()
y_encoded = le.fit_transform(Y)
num_classes = len(le.classes_)
y_cat = tf.keras.utils.to_categorical(y_encoded, num_classes=num_classes)

# Step 2: Reshape data for CNN input (add channel dimension)
# Shape change: (samples, features, time) -> (samples, features, time, 1)
print("Reshaping data for CNN-LSTM...")
X_for_cnn = X[..., np.newaxis]
print("New data shape:", X_for_cnn.shape)

# Step 3: Split data into Train-Test sets (stratified on the integer labels)
X_train, X_test, y_train, y_test = train_test_split(
    X_for_cnn, y_cat, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y_encoded
)

# Step 4: Build the CNN-LSTM model
input_shape_cnn_lstm = (X_train.shape[1], X_train.shape[2], X_train.shape[3]) # Shape: (features, time, 1)
model_cnn_lstm = build_cnn_lstm_model(input_shape_cnn_lstm, num_classes)
model_cnn_lstm.summary()

# Define Callbacks: stop when val_loss stalls for 6 epochs and checkpoint the
# best model to its own file
cb = [
    callbacks.EarlyStopping(monitor='val_loss', patience=6, restore_best_weights=True),
    callbacks.ModelCheckpoint("ravdess_ser_cnn_lstm.h5", save_best_only=True, monitor='val_loss')
]

# Step 5: Train the model (validation_split reserves 10% of the training set)
print("\n--- Starting CNN-LSTM Model Training ---")
history_cnn_lstm = model_cnn_lstm.fit(
    X_train, y_train,
    validation_split=0.1,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=cb,
    verbose=1
)

# Step 6: Evaluate the model on the held-out test split
print("\n--- CNN-LSTM Model Evaluation ---")
loss, acc = model_cnn_lstm.evaluate(X_test, y_test, verbose=0)
print(f"CNN-LSTM Test Loss: {loss:.4f}, Test Accuracy: {acc:.4f}")

# Summary record for this model (same keys as the other experiments use)
cnn_lstm_result = {'Model': 'CNN-LSTM', 'Test Accuracy': acc, 'Test Loss': loss}


# Classification report and confusion matrix
# argmax turns softmax outputs / one-hot targets back into integer class ids
y_pred = model_cnn_lstm.predict(X_test)
y_pred_labels = np.argmax(y_pred, axis=1)
y_true_labels = np.argmax(y_test, axis=1)
print("\nCNN-LSTM Classification Report:")
print(classification_report(y_true_labels, y_pred_labels, target_names=le.classes_))
cm = confusion_matrix(y_true_labels, y_pred_labels)

# Plot Confusion Matrix (Counts)
plot_confusion_matrix(cm, classes=le.classes_, title='CNN-LSTM Confusion Matrix (Counts)')

# Plot Confusion Matrix (Normalized per true-label row)
plot_confusion_matrix(cm, classes=le.classes_, normalize=True, title='CNN-LSTM Normalized Confusion Matrix')

# Plot Training & Validation Accuracy and Loss (one value per epoch actually run)
print("\n--- Plotting Training History ---")
plt.figure(figsize=(12, 5))

# Left panel: accuracy
plt.subplot(1, 2, 1)
plt.plot(history_cnn_lstm.history['accuracy'], label='Training Accuracy')
plt.plot(history_cnn_lstm.history['val_accuracy'], label='Validation Accuracy')
plt.title('Accuracy Over Epochs')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()

# Right panel: loss
plt.subplot(1, 2, 2)
plt.plot(history_cnn_lstm.history['loss'], label='Training Loss')
plt.plot(history_cnn_lstm.history['val_loss'], label='Validation Loss')
plt.title('Loss Over Epochs')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()

plt.tight_layout()
plt.show()

In [None]:
# ===================================================================
#        Concise Final Comparison (Assuming Functions are Pre-defined)
# ===================================================================
# Requires load_dataset, build_cnn_model, build_lstm_model and
# build_cnn_lstm_model to have been defined by earlier cells.

# Step 1: Prepare data shapes and results list
print("Loading data once for all experiments...")
X, Y = load_dataset(DATA_PATH)
le = LabelEncoder()
y_encoded = le.fit_transform(Y)
num_classes = len(le.classes_)
y_cat = tf.keras.utils.to_categorical(y_encoded, num_classes=num_classes)

# Prepare different data shapes required for the models
X_cnn_shape = X[..., np.newaxis]             # (samples, features, time, 1) for the CNN-based models
X_lstm_shape = np.transpose(X, (0, 2, 1))    # (samples, time, features) for the LSTM

# One metrics dict per model is appended to `results`.
results = []
# The same early-stopping callback list is passed to all three fits below;
# no checkpoint is written in this cell.
cb = [callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)]

# ---------------------------
# Step 2: Run Experiments Sequentially
# ---------------------------
# Every split below uses the same test_size, random_state and stratify labels.

# --- Experiment 1: CNN ---
print("\n\n--- Running CNN Model Experiment ---")
X_train, X_test, y_train, y_test = train_test_split(X_cnn_shape, y_cat, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y_encoded)
# build_cnn_model only reads the first two entries of the shape tuple.
model_cnn = build_cnn_model((X_train.shape[1], X_train.shape[2], 1), num_classes)
model_cnn.fit(X_train, y_train, validation_split=0.1, epochs=EPOCHS, batch_size=BATCH_SIZE, callbacks=cb, verbose=0)
loss, acc = model_cnn.evaluate(X_test, y_test, verbose=0)
results.append({'Model': 'CNN', 'Test Accuracy': acc, 'Test Loss': loss})
print("CNN Model Evaluation Complete.")

# --- Experiment 2: LSTM ---
print("\n--- Running LSTM Model Experiment ---")
X_train, X_test, y_train, y_test = train_test_split(X_lstm_shape, y_cat, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y_encoded)
model_lstm = build_lstm_model((X_train.shape[1], X_train.shape[2]), num_classes)
model_lstm.fit(X_train, y_train, validation_split=0.1, epochs=EPOCHS, batch_size=BATCH_SIZE, callbacks=cb, verbose=0)
loss, acc = model_lstm.evaluate(X_test, y_test, verbose=0)
results.append({'Model': 'LSTM', 'Test Accuracy': acc, 'Test Loss': loss})
print("LSTM Model Evaluation Complete.")

# --- Experiment 3: CNN-LSTM ---
print("\n--- Running CNN-LSTM Model Experiment ---")
X_train, X_test, y_train, y_test = train_test_split(X_cnn_shape, y_cat, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y_encoded)
model_cnn_lstm = build_cnn_lstm_model((X_train.shape[1], X_train.shape[2], 1), num_classes)
model_cnn_lstm.fit(X_train, y_train, validation_split=0.1, epochs=EPOCHS, batch_size=BATCH_SIZE, callbacks=cb, verbose=0)
loss, acc = model_cnn_lstm.evaluate(X_test, y_test, verbose=0)
results.append({'Model': 'CNN-LSTM', 'Test Accuracy': acc, 'Test Loss': loss})
print("CNN-LSTM Model Evaluation Complete.")

# ---------------------------
# Step 3: Display Final Summary Table
# ---------------------------
print("\n\n" + "="*50)
print("             FINAL MODEL COMPARISON")
print("="*50)

df_results = pd.DataFrame(results)
# Rank on the numeric accuracy first; formatting to strings beforehand would
# make sort_values compare text lexicographically instead of by value.
df_results.sort_values(by='Test Accuracy', ascending=False, inplace=True)
# Format for display only, after the ordering is fixed.
df_results['Test Accuracy'] = df_results['Test Accuracy'].apply(lambda x: f"{x:.4f}")
df_results['Test Loss'] = df_results['Test Loss'].apply(lambda x: f"{x:.4f}")
df_results.set_index('Model', inplace=True)

print(df_results)
print("\n--- Comparison Complete ---")