Install Required Libraries and Packages

In [None]:
!pip install numpy pandas librosa tensorflow soundfile scikit-learn




Download & Extract RAVDESS Dataset

In [None]:



!wget https://zenodo.org/record/1188976/files/Audio_Speech_Actors_01-24.zip
!unzip Audio_Speech_Actors_01-24.zip -d ravdess_data


--2025-02-18 13:46:08--  https://zenodo.org/record/1188976/files/Audio_Speech_Actors_01-24.zip
Resolving zenodo.org (zenodo.org)... 188.185.48.194, 188.185.43.25, 188.185.45.92, ...
Connecting to zenodo.org (zenodo.org)|188.185.48.194|:443... connected.
HTTP request sent, awaiting response... 301 MOVED PERMANENTLY
Location: /records/1188976/files/Audio_Speech_Actors_01-24.zip [following]
--2025-02-18 13:46:08--  https://zenodo.org/records/1188976/files/Audio_Speech_Actors_01-24.zip
Reusing existing connection to zenodo.org:443.
HTTP request sent, awaiting response... 200 OK
Length: 208468073 (199M) [application/octet-stream]
Saving to: ‘Audio_Speech_Actors_01-24.zip.2’


2025-02-18 13:50:54 (715 KB/s) - ‘Audio_Speech_Actors_01-24.zip.2’ saved [208468073/208468073]

Archive:  Audio_Speech_Actors_01-24.zip
replace ravdess_data/Actor_01/03-01-01-01-01-01-01.wav? [y]es, [n]o, [A]ll, [N]one, [r]ename: A
  inflating: ravdess_data/Actor_01/03-01-01-01-01-01-01.wav  
  inflating: ravdess_data/

Load & Preprocess Audio Data

In [None]:
import os
import librosa
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf

# Path to dataset folder
DATA_PATH = "ravdess_data/"

# Emotion labels, keyed by the third dash-separated field of a RAVDESS filename.
# In RAVDESS, code "01" is "neutral" and "02" is "calm"; they are distinct
# classes and must not both be mapped to "calm".
emotion_labels = {
    "01": "neutral", "02": "calm", "03": "happy", "04": "sad",
    "05": "angry", "06": "fearful", "07": "disgust", "08": "surprised"
}

# Extract MFCCs from an audio file
def extract_features(file_path, max_pad_len=100, sr=16000, n_mfcc=40):
    """Load an audio file and return a fixed-width MFCC matrix.

    Args:
        file_path: Path of the audio file, handed to ``librosa.load``.
        max_pad_len: Number of frames (columns) in the returned matrix.
        sr: Sample rate passed to ``librosa.load``.
        n_mfcc: Number of MFCC coefficients (rows) to compute.

    Returns:
        Array of shape ``(n_mfcc, max_pad_len)``. Clips with fewer frames are
        zero-padded on the right; clips with more frames are truncated.
    """
    signal, sample_rate = librosa.load(file_path, sr=sr)
    mfccs = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=n_mfcc)

    # Pad or truncate along the frame axis to a fixed size
    missing_frames = max_pad_len - mfccs.shape[1]
    if missing_frames > 0:
        return np.pad(mfccs, pad_width=((0, 0), (0, missing_frames)), mode='constant')
    return mfccs[:, :max_pad_len]

# Load dataset
data, labels = [], []
for root, dirs, files in os.walk(DATA_PATH):
    # os.walk yields entries in arbitrary order. Sorting fixes the sample order
    # so that the seeded train/test split below is reproducible.
    dirs.sort()
    for file in sorted(files):
        if not file.endswith(".wav"):
            continue
        name_parts = file.split("-")
        # The emotion code is the third dash-separated field of the filename.
        emotion = emotion_labels.get(name_parts[2]) if len(name_parts) > 2 else None
        if emotion is None:
            # Unexpected name or unknown emotion code: skip this file instead of
            # aborting the whole load with a KeyError / IndexError.
            print(f"Skipping unrecognised file: {os.path.join(root, file)}")
            continue
        features = extract_features(os.path.join(root, file))
        data.append(features)
        labels.append(emotion)

if not data:
    raise FileNotFoundError(f"No labelled .wav files found under {DATA_PATH!r}")

# Convert to numpy arrays
# NOTE(review): features are passed to the network unscaled; if training stalls,
# consider standardising them (the same transform would be needed at inference).
X = np.array(data)
# One-hot encoding. Cast explicitly because recent pandas versions return
# boolean dummy columns, while the loss expects numeric targets.
y = pd.get_dummies(labels).to_numpy(dtype="float32")

# Train-Test Split (stratified so both splits keep the class proportions)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=labels
)

# Reshape for CNN input: append a channel axis without hard-coding 40 x 100
X_train = X_train[..., np.newaxis]
X_test = X_test[..., np.newaxis]


Build & Train the Deep Learning Model

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, LSTM, Dropout, Reshape, Input, Permute
from tensorflow.keras.utils import set_random_seed

# Seed Python, NumPy and TensorFlow so weight initialisation and batch
# shuffling are repeatable between runs.
set_random_seed(42)

# Take the input shape from the data instead of hard-coding (40, 100, 1).
input_shape = X_train.shape[1:]  # (n_mfcc, n_frames, channels)

CONV_FILTERS = (64, 128, 256, 512)
# Every conv stage ends in a 2x2 max-pool, which floor-halves the frame axis.
time_steps = input_shape[1] // 2 ** len(CONV_FILTERS)

# Build CNN-LSTM Model
stack = [Input(shape=input_shape)]

# CNN Layers: conv -> pool -> dropout, once per filter count
for n_filters in CONV_FILTERS:
    stack += [
        Conv2D(n_filters, (3, 3), activation='relu', padding='same'),
        MaxPooling2D((2, 2)),
        Dropout(0.3),
    ]

stack += [
    # The CNN output is laid out as (freq, time, channels). Move the time axis
    # to the front before flattening, so that each LSTM step receives all
    # frequency bins and channels of ONE time slice. Reshaping directly (as
    # Reshape((8, -1)) did) would cut across the frequency axis instead.
    Permute((2, 1, 3)),
    Reshape((time_steps, -1)),

    # LSTM Layer
    LSTM(64, return_sequences=False),
    Dropout(0.3),

    # Fully Connected Layer; output width follows the number of one-hot classes
    Dense(64, activation='relu'),
    Dense(y_train.shape[1], activation='softmax'),
]

model = Sequential(stack)

# Compile Model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train Model
# NOTE(review): the test split doubles as validation data here, so the reported
# val_* metrics are not an independent estimate if they are used for tuning.
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_test, y_test))

# Save Model
model.save("voice_emotion_model.h5")
print("✅ Model saved successfully!")


Epoch 1/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 1s/step - accuracy: 0.1600 - loss: 1.9798 - val_accuracy: 0.2014 - val_loss: 1.9444
Epoch 2/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 1s/step - accuracy: 0.1761 - loss: 1.9578 - val_accuracy: 0.2014 - val_loss: 1.9358
Epoch 3/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 1s/step - accuracy: 0.1546 - loss: 1.9574 - val_accuracy: 0.2014 - val_loss: 1.9306
Epoch 4/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m85s[0m 1s/step - accuracy: 0.1749 - loss: 1.9566 - val_accuracy: 0.2014 - val_loss: 1.9286
Epoch 5/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 1s/step - accuracy: 0.1791 - loss: 1.9465 - val_accuracy: 0.1285 - val_loss: 1.9467
Epoch 6/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 1s/step - accuracy: 0.1580 - loss: 1.9562 - val_accuracy: 0.2014 - val_loss: 1.9349
Epoch 7/50
[1m36/36[0m [32m━━━━━━━━━━



✅ Model saved successfully!


Test the Model in Google Colab

In [None]:
from google.colab import files

# Upload an audio file
uploaded = files.upload()

# The result is keyed by file name and is empty when the upload dialog is
# cancelled; fail with a clear message instead of an IndexError.
if not uploaded:
    raise ValueError("No file was uploaded; re-run this cell and choose an audio file.")

# Get file path (first uploaded file)
file_path = next(iter(uploaded))
print(f"📂 Uploaded file: {file_path}")


Saving y.wav to y (1).wav
📂 Uploaded file: y (1).wav


Predict Emotion from the Uploaded File

In [None]:
import librosa
import numpy as np

# Extract Features Function
# NOTE: this redefines `extract_features` from the preprocessing cell; this
# version additionally adds the batch and channel axes the model expects.
def extract_features(file_path, max_pad_len=100, sr=16000, n_mfcc=40):
    """Load an audio file and return its MFCCs shaped as a one-sample batch.

    Args:
        file_path: Path of the audio file, handed to ``librosa.load``.
        max_pad_len: Number of frames kept along the time axis.
        sr: Sample rate passed to ``librosa.load``.
        n_mfcc: Number of MFCC coefficients to compute.

    Returns:
        Array of shape ``(1, n_mfcc, max_pad_len, 1)``. Clips with fewer frames
        are zero-padded on the right; clips with more frames are truncated.
    """
    signal, sample_rate = librosa.load(file_path, sr=sr)
    mfccs = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=n_mfcc)

    # Pad or truncate
    missing_frames = max_pad_len - mfccs.shape[1]
    if missing_frames > 0:
        mfccs = np.pad(mfccs, pad_width=((0, 0), (0, missing_frames)), mode='constant')
    else:
        mfccs = mfccs[:, :max_pad_len]

    # Add batch and channel axes. They are derived from the array itself, so a
    # non-default max_pad_len / n_mfcc no longer fails on a hard-coded
    # reshape(1, 40, 100, 1).
    return mfccs[np.newaxis, :, :, np.newaxis]

# Predict Emotion
def predict_emotion(file_path):
    """Print the emotion the trained model predicts for an audio file.

    Args:
        file_path: Path of the audio file to classify.

    Uses the notebook-level ``model``, ``emotion_labels`` and
    ``extract_features``.
    """
    features = extract_features(file_path)
    prediction = model.predict(features)
    emotion_index = int(np.argmax(prediction))

    # The training targets are built with pd.get_dummies(labels), whose columns
    # are the unique label names in sorted order. The output index must be
    # decoded against that same order, not against the insertion order of
    # emotion_labels.values() (which can also contain duplicates).
    # Assumes every emotion in emotion_labels occurred in the training data.
    class_names = sorted(set(emotion_labels.values()))
    emotion = class_names[emotion_index]

    print(f"🎭 Detected Emotion: {emotion}")

# Classify the file chosen in the upload cell (`file_path`).
predict_emotion(file_path)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 354ms/step
🎭 Detected Emotion: calm


Download the Trained Model

In [None]:
from google.colab import files
# Download the model file written by model.save(...) in the training cell.
files.download("voice_emotion_model.h5")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>