In [20]:
#RAVDESS Speech Emotion Recognition by Arif
# MFCC Extraction + LSTM Model

!pip install librosa soundfile tensorflow keras tqdm --quiet

import os
import librosa
import numpy as np
import soundfile as sf
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
DATA_PATH = "/kaggle/input/ravdess-emotional-speech-audio"

In [21]:
#Emotion labels according to filename convention
emotion_map = {
    '01': 'neutral',
    '02': 'calm',
    '03': 'happy',
    '04': 'sad',
    '05': 'angry',
    '06': 'fear',
    '07': 'disgust',
    '08': 'surprise'
}

In [22]:
#Feature Extraction Function (MFCCs)
def extract_features(file_path, max_len=216):
    audio, sr = librosa.load(file_path, duration=3, offset=0.5)
    mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=40)

    # Pad or slice MFCCs to ensure consistent length
    if mfcc.shape[1] < max_len:
        pad_width = max_len - mfcc.shape[1]
        mfcc = np.pad(mfcc, pad_width=((0,0),(0,pad_width)), mode='constant')
    else:
        mfcc = mfcc[:, :max_len]

    return mfcc

In [23]:
#Load Dataset and Extract Features
X, Y = [], []

print("Extracting MFCC features from RAVDESS dataset...\n")

for root, dirs, files in os.walk(DATA_PATH):
    for file in files:
        if file.endswith(".wav"):
            file_path = os.path.join(root, file)
            emotion_id = file.split("-")[2]     # "03" → happy
            emotion = emotion_map[emotion_id]

            features = extract_features(file_path)
            X.append(features)
            Y.append(emotion)

X = np.array(X)
Y = np.array(Y)

print("Feature extraction completed!")
print("X shape:", X.shape)
print("Y shape:", Y.shape)

Extracting MFCC features from RAVDESS dataset...

Feature extraction completed!
X shape: (2880, 40, 216)
Y shape: (2880,)


In [24]:
#Encode Labels + Train/Test Split
encoder = LabelEncoder()
Y_encoded = encoder.fit_transform(Y)
Y_onehot = to_categorical(Y_encoded)

X_train, X_test, y_train, y_test = train_test_split(
    X, Y_onehot, test_size=0.2, shuffle=True, random_state=42
)

print("\nTrain shape:", X_train.shape)
print("Test shape:", X_test.shape)


Train shape: (2304, 40, 216)
Test shape: (576, 40, 216)


In [25]:
#Build LSTM Model
model = Sequential([
    LSTM(128, return_sequences=False, input_shape=(40, 216)),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(8, activation='softmax')
])

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

  super().__init__(**kwargs)


In [26]:
#Train Model
history = model.fit(
    X_train, y_train,
    epochs=60,
    batch_size=32,
    validation_data=(X_test, y_test),
    verbose=1
)

Epoch 1/60
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - accuracy: 0.1766 - loss: 2.1206 - val_accuracy: 0.3108 - val_loss: 1.8726
Epoch 2/60
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.3778 - loss: 1.7486 - val_accuracy: 0.4236 - val_loss: 1.6856
Epoch 3/60
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.5012 - loss: 1.4748 - val_accuracy: 0.5330 - val_loss: 1.4834
Epoch 4/60
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.6237 - loss: 1.2044 - val_accuracy: 0.5816 - val_loss: 1.2891
Epoch 5/60
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.7131 - loss: 0.9450 - val_accuracy: 0.6927 - val_loss: 1.0544
Epoch 6/60
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.7996 - loss: 0.7056 - val_accuracy: 0.7396 - val_loss: 0.8960
Epoch 7/60
[1m72/72[0m [32m━━━━━━━━━

In [27]:
#Evaluate Model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"\nModel Test Accuracy: {accuracy * 100:.2f}%")


[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8350 - loss: 0.7653

Model Test Accuracy: 84.20%


In [28]:
#Predict Emotion from Custom Audio File
def predict_emotion(audio_path):
    mfcc = extract_features(audio_path)
    mfcc = mfcc.reshape(1, 40, 216)

    pred = model.predict(mfcc)
    emotion = encoder.inverse_transform([np.argmax(pred)])
    return emotion[0]

# Example usage (upload your .wav file in Kaggle first):
# print(predict_emotion("/kaggle/input/yourfile.wav"))