In [12]:
pip install librosa numpy pandas scikit-learn matplotlib tensorflow sounddevice

Note: you may need to restart the kernel to use updated packages.


In [13]:
import zipfile
import os
import librosa
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split


# Path to the RAVDESS dataset folder
dataset_path = r"C:\Users\Hp\Downloads\RAVDESS"

# Emotion labels dictionary
emotion_labels = {
    '01': 'neutral', '02': 'calm', '03': 'happy', '04': 'sad',
    '05': 'angry', '06': 'fearful', '07': 'disgust', '08': 'surprised'
}

# Lists to store extracted features and labels
X = []
y = []

# Loop through all audio files in the dataset
for root, dirs, files in os.walk(dataset_path):
    for file in files:
        if file.endswith('.wav'):
            # Extract emotion from the filename (3rd field in the name)
            file_parts = file.split('-')
            emotion_code = file_parts[2]  # Third part of the filename is the emotion code

            # Check if the emotion code is valid
            if emotion_code in emotion_labels:
                emotion = emotion_labels[emotion_code]

                # Load audio file using librosa
                file_path = os.path.join(root, file)
                y_audio, sr = librosa.load(file_path, sr=None)

                # Extract MFCC features
                mfccs = librosa.feature.mfcc(y=y_audio, sr=sr, n_mfcc=40)
                mfccs_mean = np.mean(mfccs.T, axis=0)  # Mean of MFCCs

                # Append features and label
                X.append(mfccs_mean)
                y.append(emotion)

# Convert features and labels to numpy arrays
X = np.array(X)
y = np.array(y)

# Print the sizes of X and y to check if they are empty
print(f"Length of X: {len(X)}")
print(f"Length of y: {len(y)}")

# Encode emotion labels to integers
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Print the shapes of X and y_encoded
print(f"Shape of X: {X.shape}")
print(f"Shape of y_encoded: {y_encoded.shape}")

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)


Length of X: 2880
Length of y: 2880
Shape of X: (2880, 40)
Shape of y_encoded: (2880,)


In [26]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout

# Define the LSTM model
model = Sequential()

# LSTM Layer
model.add(LSTM(128, return_sequences=False, input_shape=(X_train.shape[1], 1)))
model.add(Dropout(0.3))

# Fully connected Dense layers
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(32, activation='relu'))

# Output layer (number of unique emotions)
model.add(Dense(len(np.unique(y_encoded)), activation='softmax'))

# Compile the model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Reshape X_train and X_test for LSTM input
X_train_lstm = X_train[..., np.newaxis]
X_test_lstm = X_test[..., np.newaxis]

# Train the model
history = model.fit(X_train_lstm, y_train, epochs=100, batch_size=32, validation_data=(X_test_lstm, y_test))



Epoch 1/100
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 25ms/step - accuracy: 0.1540 - loss: 2.0732 - val_accuracy: 0.2361 - val_loss: 1.9571
Epoch 2/100
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 29ms/step - accuracy: 0.2282 - loss: 1.9709 - val_accuracy: 0.2674 - val_loss: 1.8995
Epoch 3/100
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 30ms/step - accuracy: 0.2693 - loss: 1.9112 - val_accuracy: 0.3021 - val_loss: 1.8567
Epoch 4/100
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 28ms/step - accuracy: 0.2823 - loss: 1.8782 - val_accuracy: 0.3142 - val_loss: 1.7928
Epoch 5/100
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 28ms/step - accuracy: 0.3000 - loss: 1.8110 - val_accuracy: 0.3142 - val_loss: 1.7768
Epoch 6/100
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 26ms/step - accuracy: 0.2920 - loss: 1.8102 - val_accuracy: 0.2969 - val_loss: 1.7583
Epoch 7/100
[1m72/72[0m [

In [27]:
# Evaluate the model on test data
test_loss, test_accuracy = model.evaluate(X_test_lstm, y_test)
print(f'Test accuracy: {test_accuracy:.2f}')


[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.7507 - loss: 0.8984
Test accuracy: 0.76


In [28]:
# Save the model
model.save('ravdess_emotion_recognition_model.h5')
print("Model saved as 'ravdess_emotion_recognition_model.h5'")



Model saved as 'ravdess_emotion_recognition_model.h5'


In [22]:
import numpy as np
import librosa
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder

# Define the emotion labels dictionary (same as used during training)
emotion_labels = {
    '01': 'neutral', '02': 'calm', '03': 'happy', '04': 'sad',
    '05': 'angry', '06': 'fearful', '07': 'disgust', '08': 'surprised'
}

# Load the trained model
model = tf.keras.models.load_model('ravdess_emotion_recognition_model.h5')

# Initialize Label Encoder (used to encode the labels)
label_encoder = LabelEncoder()
label_encoder.fit(list(emotion_labels.values()))

# Function to extract MFCC features from an audio file
def extract_features(audio_path):
    y_audio, sr = librosa.load(audio_path, sr=None)  # Load audio
    mfccs = librosa.feature.mfcc(y=y_audio, sr=sr, n_mfcc=40)  # Extract MFCC features
    mfccs_mean = np.mean(mfccs.T, axis=0)  # Take mean of MFCCs
    return mfccs_mean

# Function to predict emotion
def predict_emotion(audio_path):
    # Extract MFCC features
    features = extract_features(audio_path)

    # Reshape the features for the model input (LSTM expects 3D input)
    features_reshaped = np.expand_dims(features, axis=(0, 2))  # Shape: (1, number_of_features, 1)

    # Predict using the model
    predictions = model.predict(features_reshaped)
    predicted_label = np.argmax(predictions, axis=1)

    # Decode the predicted label to the corresponding emotion
    predicted_emotion = label_encoder.inverse_transform(predicted_label)

    return predicted_emotion[0]

# Example usage
audio_file_path = r'C:\Users\Hp\Downloads\RAVDESS\Actor_01\03-01-01-01-02-02-01.wav'
predicted_emotion = predict_emotion(audio_file_path)
print(f"The predicted emotion is: {predicted_emotion}")




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 109ms/step
The predicted emotion is: calm
