In [20]:
import os
import numpy as np
import pandas as pd
import librosa
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
import warnings
warnings.filterwarnings('ignore')


In [2]:
# Set path to dataset (update this if needed)
DATA_PATH = "/Users/macbookpro/Desktop/Audio_Song_Actors_01-24"

# Map emotion codes to labels (RAVDESS format)
emotion_map = {
    "01": "neutral",
    "02": "calm",
    "03": "happy",
    "04": "sad",
    "05": "angry",
    "06": "fearful",
    "07": "disgust",
    "08": "surprised"
}


In [3]:
def extract_features(file_path):
    try:
        audio, sr = librosa.load(file_path, res_type='kaiser_fast')
        mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=40)
        mfcc_scaled = np.mean(mfcc.T, axis=0)
        return mfcc_scaled
    except Exception as e:
        print(f"Error processing file: {file_path}\n {e}")
        return None


In [4]:
features = []
labels = []

for folder in tqdm(os.listdir(DATA_PATH)):
    folder_path = os.path.join(DATA_PATH, folder)
    if not os.path.isdir(folder_path) or folder.startswith('.'):
        continue  # Skip system files like .DS_Store
    for file in os.listdir(folder_path):
        if file.endswith(".wav"):
            emotion_code = file.split("-")[2]
            emotion = emotion_map.get(emotion_code)
            if emotion:
                feature = extract_features(os.path.join(folder_path, file))
                if feature is not None:
                    features.append(feature)
                    labels.append(emotion)

df = pd.DataFrame(features)
df['label'] = labels
df.head()


100%|███████████████████████████████████████████| 26/26 [01:16<00:00,  2.96s/it]


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,31,32,33,34,35,36,37,38,39,label
0,-523.92218,36.252666,-19.905519,12.971987,-9.616239,-15.922702,-15.657978,-8.893446,-14.755669,6.043025,...,2.381103,-6.505177,-6.48541,-2.818655,1.587407,0.349844,1.351712,6.588576,7.494193,happy
1,-587.782104,43.909641,-17.656189,13.169994,-7.121557,-13.213776,-12.981407,-11.661613,-12.594746,6.062702,...,3.169533,-4.859681,-6.721389,-4.410873,1.020319,0.56767,0.182629,5.530495,8.593582,happy
2,-564.099976,48.837444,-21.918192,7.789908,-10.597126,-16.043745,-17.649284,-11.985186,-10.88835,3.36891,...,2.934443,-6.871836,-8.179089,-4.061671,0.859207,0.858561,0.053792,8.430861,9.631927,calm
3,-583.713745,41.609371,-17.050117,9.65369,-8.41925,-13.660363,-19.128326,-11.646488,-10.947925,4.704983,...,1.834656,-7.705764,-7.545528,-3.399522,2.211384,0.861499,2.038282,7.278339,8.831779,calm
4,-619.61792,36.146912,-9.827855,9.078827,-10.291913,-10.712646,-16.410923,-7.202985,-11.408434,0.565057,...,0.205038,-3.876109,-3.776702,-4.453256,1.194393,0.420987,0.994291,1.842649,8.704984,neutral


In [5]:
# Encode emotion labels to integers
le = LabelEncoder()
df['label'] = le.fit_transform(df['label'])

# Separate features and labels
X = df.drop('label', axis=1)
y = df['label']

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [6]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Define a simple model
model = Sequential()
model.add(Dense(256, activation='relu', input_shape=(X_train.shape[1],)))
model.add(Dropout(0.3))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(64, activation='relu'))
model.add(Dense(len(le.classes_), activation='softmax'))  # Output layer

# Compile the model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_test, y_test))


Epoch 1/50
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 21ms/step - accuracy: 0.1904 - loss: 33.8758 - val_accuracy: 0.1970 - val_loss: 5.6314
Epoch 2/50
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.1939 - loss: 9.6931 - val_accuracy: 0.2167 - val_loss: 2.9975
Epoch 3/50
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.1952 - loss: 5.5906 - val_accuracy: 0.1527 - val_loss: 2.1091
Epoch 4/50
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.1947 - loss: 3.9362 - val_accuracy: 0.2069 - val_loss: 1.9451
Epoch 5/50
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.1757 - loss: 3.0521 - val_accuracy: 0.1478 - val_loss: 1.7834
Epoch 6/50
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.1740 - loss: 2.6636 - val_accuracy: 0.2069 - val_loss: 1.7945
Epoch 7/50
[1m26/26[0m [32m━━━━━━━━

In [7]:
# Evaluate model
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)

# Print results
print("✅ Accuracy:", accuracy_score(y_test, y_pred_classes))
print("\n📊 Classification Report:")
print(classification_report(y_test, y_pred_classes, target_names=le.classes_))


[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
✅ Accuracy: 0.22660098522167488

📊 Classification Report:
              precision    recall  f1-score   support

       angry       0.27      0.73      0.39        30
        calm       0.00      0.00      0.00        44
     fearful       0.13      0.45      0.21        29
       happy       0.00      0.00      0.00        39
     neutral       0.00      0.00      0.00        21
         sad       0.50      0.28      0.35        40

    accuracy                           0.23       203
   macro avg       0.15      0.24      0.16       203
weighted avg       0.16      0.23      0.16       203



In [19]:
import joblib
model.save("final_emotion_model.h5")
joblib.dump(le, "label_encoder.pkl")


['label_encoder.pkl']

In [17]:
import numpy as np
import librosa
import joblib
from tensorflow.keras.models import load_model

# Load trained model and label encoder
model = load_model("final_emotion_model.h5")
label_encoder = joblib.load("label_encoder.pkl")

def predict_emotion(audio_path):
    try:
        audio, sr = librosa.load(audio_path, res_type='kaiser_fast')
        mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=40)
        mfcc_scaled = np.mean(mfcc.T, axis=0).reshape(1, -1)

        prediction = model.predict(mfcc_scaled)
        predicted_label = label_encoder.inverse_transform([np.argmax(prediction)])

        print(f"✅ Predicted Emotion: {predicted_label[0]}")
    except Exception as e:
        print(f"❌ Error: {e}")

# 🔁 Run prediction with a real audio file path
predict_emotion("/Users/macbookpro/Desktop/Audio_Song_Actors_01-24/Actor_01/03-02-01-01-01-01-01.wav")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 76ms/step
✅ Predicted Emotion: fearful


In [18]:
import os
import librosa
import numpy as np
from tensorflow.keras.models import load_model
import joblib

# Load the trained model and label encoder
model = load_model("final_emotion_model.h5")
label_encoder = joblib.load("label_encoder.pkl")

# Function to predict emotion
def predict_emotion(audio_path):
    try:
        audio, sr = librosa.load(audio_path, res_type='kaiser_fast')
        mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=40)
        mfcc_scaled = np.mean(mfcc.T, axis=0).reshape(1, -1)
        prediction = model.predict(mfcc_scaled)
        predicted_label = label_encoder.inverse_transform([np.argmax(prediction)])
        return predicted_label[0]
    except Exception as e:
        return f"❌ Error for {audio_path}: {str(e)}"

# Folder containing audio files
audio_folder = "/Users/macbookpro/Desktop/Audio_Song_Actors_01-24/Actor_01"

# Predict emotions for all .wav files
for file in os.listdir(audio_folder):
    if file.endswith(".wav"):
        full_path = os.path.join(audio_folder, file)
        emotion = predict_emotion(full_path)
        print(f"🎧 {file} => 🧠 Predicted Emotion: {emotion}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step
🎧 03-02-03-02-02-02-01.wav => 🧠 Predicted Emotion: angry
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
🎧 03-02-03-01-01-02-01.wav => 🧠 Predicted Emotion: angry
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
🎧 03-02-02-02-01-01-01.wav => 🧠 Predicted Emotion: fearful
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
🎧 03-02-02-01-02-01-01.wav => 🧠 Predicted Emotion: fearful
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step
🎧 03-02-01-01-02-02-01.wav => 🧠 Predicted Emotion: fearful
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
🎧 03-02-06-01-02-02-01.wav => 🧠 Predicted Emotion: fearful
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
🎧 03-02-05-01-02-01-01.wav => 🧠 Predicted Emotion: angry
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
🎧 03-02-05-02-01-