In [44]:
import os
import librosa
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM
from tensorflow.keras.utils import to_categorical

In [45]:
# Function to extract features from an audio file
def extract_features(file_path):
    audio, sample_rate = librosa.load(file_path, res_type='kaiser_fast')
    mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
    mfccs_scaled = np.mean(mfccs.T, axis=0)
    return mfccs_scaled

In [46]:
# Path to your RAVDESS dataset
main_dataset_path = 'RAVDESS_dataset/'

# Emotions dictionary
emotions = {
    '01': 'neutral', '02': 'calm', '03': 'happy', '04': 'sad',
    '05': 'angry', '06': 'fearful', '07': 'disgust', '08': 'surprised'
}

# Initialize lists to hold the data and labels
X_train = []
X_test = []
y_train = []
y_test = []

In [47]:
# Iterate over each actor folder in the train and test directories
for dataset_folder in ['train', 'test']:
    dataset_path = os.path.join(main_dataset_path, dataset_folder)
    # Iterate over each actor folder in the dataset path
    for actor_folder in os.listdir(dataset_path):
        actor_path = os.path.join(dataset_path, actor_folder)
        if os.path.isdir(actor_path):
            # Iterate over each file in the actor folder
            for file_name in os.listdir(actor_path):
                if file_name.endswith('.wav'):
                    file_path = os.path.join(actor_path, file_name)
                    # Extract features and append them to the respective lists
                    feature = extract_features(file_path)
                    if dataset_folder == 'train':
                        X_train.append(feature)
                        # Extract the emotion label from the file name and append to the training labels list
                        emotion = int(file_name.split('-')[2]) - 1  # Adjust label to be zero-based
                        y_train.append(emotion)
                    elif dataset_folder == 'test':
                        X_test.append(feature)
                        # Extract the emotion label from the file name and append to the testing labels list
                        emotion = int(file_name.split('-')[2]) - 1  # Adjust label to be zero-based
                        y_test.append(emotion)

In [40]:

# Convert lists to numpy arrays
X_train = np.array(X_train)
X_test = np.array(X_test)
y_train = np.array(y_train)
y_test = np.array(y_test)


In [57]:
# Encode the labels
label_encoder = LabelEncoder()
y_train_encoded = to_categorical(label_encoder.fit_transform(y_train))
y_test_encoded = to_categorical(label_encoder.transform(y_test))

In [53]:
# Build the LSTM model
model = Sequential()
model.add(LSTM(128, input_shape=(X_train.shape[1], 1), return_sequences=True))
model.add(LSTM(128))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(y_train_encoded.shape[1], activation='softmax'))

In [54]:
# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_train[..., np.newaxis], y_train_encoded, epochs=50, batch_size=32, validation_data=(X_test[..., np.newaxis], y_test_encoded))


Epoch 1/50
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 83ms/step - accuracy: 0.1296 - loss: 2.0782 - val_accuracy: 0.2049 - val_loss: 1.9965
Epoch 2/50
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 58ms/step - accuracy: 0.2085 - loss: 1.9936 - val_accuracy: 0.2826 - val_loss: 1.9237
Epoch 3/50
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 64ms/step - accuracy: 0.2340 - loss: 1.9508 - val_accuracy: 0.2882 - val_loss: 1.8799
Epoch 4/50
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 56ms/step - accuracy: 0.2648 - loss: 1.9060 - val_accuracy: 0.2958 - val_loss: 1.8354
Epoch 5/50
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 53ms/step - accuracy: 0.3018 - loss: 1.8684 - val_accuracy: 0.2840 - val_loss: 1.8628
Epoch 6/50
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 54ms/step - accuracy: 0.2820 - loss: 1.8779 - val_accuracy: 0.3229 - val_loss: 1.7772
Epoch 7/50
[1m45/45[0m [32m━━━━

<keras.src.callbacks.history.History at 0x282ee106410>

In [62]:
# Evaluate the model on the test set
test_loss, test_acc = model.evaluate(X_test[..., np.newaxis], y_test_encoded)
print(f'Test accuracy: {test_acc}')

[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 20ms/step - accuracy: 0.7694 - loss: 0.6682
Test accuracy: 0.7708333134651184


In [64]:
# Save the trained model
model.save('Audio_emotion_model.h5')

