In [1]:
# Install all required libraries
!pip install librosa resampy scikit-learn tensorflow

Collecting resampy
  Downloading resampy-0.4.3-py3-none-any.whl.metadata (3.0 kB)
Downloading resampy-0.4.3-py3-none-any.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m31.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: resampy
Successfully installed resampy-0.4.3


In [3]:
# Imports
import os
import librosa
import resampy
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import tensorflow as tf
from tensorflow.keras import layers, models

In [4]:
!wget https://zenodo.org/record/1188976/files/Audio_Speech_Actors_01-24.zip
!unzip Audio_Speech_Actors_01-24.zip


--2025-07-10 09:10:30--  https://zenodo.org/record/1188976/files/Audio_Speech_Actors_01-24.zip
Resolving zenodo.org (zenodo.org)... 188.185.45.92, 188.185.48.194, 188.185.43.25, ...
Connecting to zenodo.org (zenodo.org)|188.185.45.92|:443... connected.
HTTP request sent, awaiting response... 301 MOVED PERMANENTLY
Location: /records/1188976/files/Audio_Speech_Actors_01-24.zip [following]
--2025-07-10 09:10:30--  https://zenodo.org/records/1188976/files/Audio_Speech_Actors_01-24.zip
Reusing existing connection to zenodo.org:443.
HTTP request sent, awaiting response... 200 OK
Length: 208468073 (199M) [application/octet-stream]
Saving to: ‘Audio_Speech_Actors_01-24.zip’


2025-07-10 09:11:21 (3.92 MB/s) - ‘Audio_Speech_Actors_01-24.zip’ saved [208468073/208468073]

Archive:  Audio_Speech_Actors_01-24.zip
   creating: Actor_01/
  inflating: Actor_01/03-01-01-01-01-01-01.wav  
  inflating: Actor_01/03-01-01-01-01-02-01.wav  
  inflating: Actor_01/03-01-01-01-02-01-01.wav  
  inflating: Actor

In [5]:
import os
import pandas as pd

base_dir = '.'
data = []

for actor in sorted(os.listdir(base_dir)):
    actor_dir = os.path.join(base_dir, actor)
    if not os.path.isdir(actor_dir):
        continue
    if not actor.startswith('Actor'):
        continue
    for filename in os.listdir(actor_dir):
        if not filename.endswith('.wav'):
            continue
        parts = filename.split('-')
        emotion = int(parts[2])
        path = os.path.join(actor_dir, filename)
        data.append([path, emotion])

df = pd.DataFrame(data, columns=['path', 'emotion'])
print("Total samples:", len(df))
print(df.head())

Total samples: 1440
                                  path  emotion
0  ./Actor_01/03-01-08-02-02-01-01.wav        8
1  ./Actor_01/03-01-04-02-02-01-01.wav        4
2  ./Actor_01/03-01-05-01-01-01-01.wav        5
3  ./Actor_01/03-01-02-01-01-02-01.wav        2
4  ./Actor_01/03-01-04-02-01-01-01.wav        4


In [6]:
import librosa
import numpy as np

def extract_features(file_path, max_pad_len=174):
    try:
        audio, sample_rate = librosa.load(file_path, res_type='kaiser_fast')
        mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
        pad_width = max_pad_len - mfccs.shape[1]
        if pad_width > 0:
            mfccs = np.pad(mfccs, pad_width=((0, 0), (0, pad_width)), mode='constant')
        else:
            mfccs = mfccs[:, :max_pad_len]
        return mfccs
    except Exception as e:
        print("Error with:", file_path)
        return None

X, y = [], []

for index, row in df.iterrows():
    features = extract_features(row['path'])
    if features is not None:
        X.append(features)
        y.append(row['emotion'])

X = np.array(X)
y = np.array(y)

X = X[..., np.newaxis]  # Add channel dim for CNN
print("Features shape:", X.shape)
print("Labels shape:", y.shape)

Features shape: (1440, 40, 174, 1)
Labels shape: (1440,)


In [7]:
np.save('X_ravdess.npy', X)
np.save('y_ravdess.npy', y)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)

Train shape: (1152, 40, 174, 1)
Test shape: (288, 40, 174, 1)


In [9]:
X_train_flat = X_train.reshape(X_train.shape[0], -1)
X_test_flat = X_test.reshape(X_test.shape[0], -1)

clf = RandomForestClassifier()
clf.fit(X_train_flat, y_train)

y_pred = clf.predict(X_test_flat)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           1       0.20      0.20      0.20        15
           2       0.60      0.87      0.71        38
           3       0.48      0.40      0.44        40
           4       0.47      0.41      0.44        34
           5       0.76      0.59      0.67        37
           6       0.59      0.37      0.46        43
           7       0.47      0.64      0.54        36
           8       0.54      0.60      0.57        45

    accuracy                           0.53       288
   macro avg       0.51      0.51      0.50       288
weighted avg       0.54      0.53      0.53       288



In [10]:
model = models.Sequential([
    layers.Conv2D(64, (3,3), activation='relu', input_shape=X_train.shape[1:]),
    layers.MaxPooling2D((2,2)),
    layers.Conv2D(128, (3,3), activation='relu'),
    layers.MaxPooling2D((2,2)),
    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.3),
    layers.Dense(9, activation='softmax')  # 8 emotions + neutral
])

model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

model.summary()

history = model.fit(X_train, y_train, epochs=30, batch_size=32,
                    validation_data=(X_test, y_test))


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/30
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 1s/step - accuracy: 0.1310 - loss: 27.3186 - val_accuracy: 0.2708 - val_loss: 1.8994
Epoch 2/30
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 962ms/step - accuracy: 0.3001 - loss: 1.8548 - val_accuracy: 0.4201 - val_loss: 1.6567
Epoch 3/30
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 962ms/step - accuracy: 0.4238 - loss: 1.5405 - val_accuracy: 0.4479 - val_loss: 1.4810
Epoch 4/30
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 939ms/step - accuracy: 0.5846 - loss: 1.1756 - val_accuracy: 0.5069 - val_loss: 1.3818
Epoch 5/30
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 1s/step - accuracy: 0.6567 - loss: 0.9967 - val_accuracy: 0.5278 - val_loss: 1.2768
Epoch 6/30
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 983ms/step - accuracy: 0.7499 - loss: 0.6905 - val_accuracy: 0.5417 - val_loss: 1.3026
Epoch 7/30
[1m36/36[0m [

In [12]:
model.save('ser_cnn_model.h5')  # Save locally



In [13]:
from tensorflow.keras.models import load_model

model = load_model('ser_cnn_model.h5')




In [14]:
def extract_features(file_path, max_pad_len=174):
    try:
        audio, sample_rate = librosa.load(file_path, sr=None)  # use native sr
        mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
        pad_width = max_pad_len - mfccs.shape[1]
        if pad_width > 0:
            mfccs = np.pad(mfccs, pad_width=((0, 0), (0, pad_width)), mode='constant')
        else:
            mfccs = mfccs[:, :max_pad_len]
        return mfccs
    except Exception as e:
        print("Error with:", file_path)
        print(e)
        return None

new_features = extract_features('Actor_01/03-01-05-01-02-02-12.wav')

if new_features is None:
    print("Feature extraction failed. Try another file.")
else:
    new_features = np.expand_dims(new_features, axis=(0, -1))
    prediction = model.predict(new_features)
    predicted_label = np.argmax(prediction)
    print("Predicted emotion:", predicted_label)


Error with: Actor_01/03-01-05-01-02-02-12.wav
[Errno 2] No such file or directory: 'Actor_01/03-01-05-01-02-02-12.wav'
Feature extraction failed. Try another file.


  audio, sample_rate = librosa.load(file_path, sr=None)  # use native sr
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


In [18]:
# Example: use a file you verified actually exists
test_file = './Actor_01/03-01-05-01-02-02-01.wav'
print("Does file exist?", os.path.exists(test_file))  # should print True

new_features = extract_features(test_file)

if new_features is None:
    print("Feature extraction failed.")
else:
    new_features = np.expand_dims(new_features, axis=(0, -1))
    prediction = model.predict(new_features)
    predicted_label = np.argmax(prediction)
    print("Predicted emotion:", predicted_label)


Does file exist? True
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 701ms/step
Predicted emotion: 2


In [19]:
emotion_dict = {
    1: "neutral",
    2: "calm",
    3: "happy",
    4: "sad",
    5: "angry",
    6: "fearful",
    7: "disgust",
    8: "surprised"
}

print("Predicted emotion label:", predicted_label)
print("Predicted emotion name:", emotion_dict.get(predicted_label, "Unknown"))


Predicted emotion label: 2
Predicted emotion name: calm
