In [13]:
import os
import numpy as np
import librosa
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Dense, Flatten, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical

# Reproducibility
np.random.seed(42)
tf.random.set_seed(42)

# Emotion mappings
ravdess_emotions = {
    '01': 'neutral', '02': 'calm', '03': 'happy', '04': 'sad',
    '05': 'angry', '06': 'fearful', '07': 'disgust', '08': 'surprised'
}
cremad_emotions = {
    'NEU': 'neutral', 'HAP': 'happy', 'SAD': 'sad',
    'ANG': 'angry', 'FEA': 'fearful', 'DIS': 'disgust'
}





In [15]:
# Label Encoder
label_encoder = LabelEncoder()
label_encoder.fit(list(set(ravdess_emotions.values()) | set(cremad_emotions.values())))

# Data augmentation
def add_noise(audio, noise_factor=0.005):
    noise = np.random.randn(len(audio))
    return (audio + noise_factor * noise).astype(np.float32)

# Feature extraction
def extract_features(file_path, max_pad_len=174, augment=False):
    try:
        audio, sr = librosa.load(file_path, sr=22050)
        if augment:
            audio = add_noise(audio)
        mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=40)
        pad_width = max_pad_len - mfcc.shape[1]
        if pad_width > 0:
            mfcc = np.pad(mfcc, ((0, 0), (0, pad_width)), mode='constant')
        else:
            mfcc = mfcc[:, :max_pad_len]
        return mfcc
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None

# Load dataset
def load_dataset(ravdess_path, cremad_path):
    X, y = [], []

    # RAVDESS
    for root, _, files in os.walk(ravdess_path):
        for file in files:
            if not file.endswith('.wav') or '-' not in file:
                continue
            parts = file.split('-')
            if len(parts) < 3:
                continue
            emo_code = parts[2]
            label = ravdess_emotions.get(emo_code)
            if label:
                path = os.path.join(root, file)
                for aug in [False, True]:
                    feat = extract_features(path, augment=aug)
                    if feat is not None:
                        X.append(feat)
                        y.append(label)

    # CREMA-D
    for root, _, files in os.walk(cremad_path):
        for file in files:
            if not file.endswith('.wav') or '_' not in file:
                continue
            parts = file.split('_')
            if len(parts) < 3:
                continue
            emo_code = parts[-2]
            label = cremad_emotions.get(emo_code)
            if label:
                path = os.path.join(root, file)
                for aug in [False, True]:
                    feat = extract_features(path, augment=aug)
                    if feat is not None:
                        X.append(feat)
                        y.append(label)

    return np.array(X), np.array(y)

# Paths to datasets (update accordingly)
ravdess_path = r'/Users/Appy/work/project_speech/archive (1)'
cremad_path = r'/Users/Appy/work/project_speech/AudioWAV'

# Load and prepare data
X, y = load_dataset(ravdess_path, cremad_path)
if len(X) == 0:
    raise RuntimeError("No data loaded. Check dataset paths and structure.")

In [16]:
y_cremad

['angry',
 'angry',
 'neutral',
 'neutral',
 'disgust',
 'disgust',
 'sad',
 'fearful',
 'sad',
 'happy',
 'fearful',
 'sad',
 'happy',
 'happy',
 'happy',
 'fearful',
 'fearful',
 'angry',
 'angry',
 'disgust',
 'disgust',
 'angry',
 'sad',
 'neutral',
 'sad',
 'neutral',
 'angry',
 'happy',
 'fearful',
 'neutral',
 'disgust',
 'disgust',
 'happy',
 'fearful',
 'disgust',
 'happy',
 'fearful',
 'neutral',
 'angry',
 'neutral',
 'neutral',
 'angry',
 'angry',
 'disgust',
 'disgust',
 'sad',
 'sad',
 'happy',
 'happy',
 'fearful',
 'fearful',
 'sad',
 'disgust',
 'sad',
 'disgust',
 'sad',
 'fearful',
 'angry',
 'fearful',
 'fearful',
 'happy',
 'disgust',
 'disgust',
 'happy',
 'neutral',
 'neutral',
 'angry',
 'angry',
 'sad',
 'sad',
 'neutral',
 'neutral',
 'neutral',
 'happy',
 'disgust',
 'disgust',
 'disgust',
 'disgust',
 'disgust',
 'fearful',
 'disgust',
 'disgust',
 'disgust',
 'happy',
 'happy',
 'disgust',
 'disgust',
 'sad',
 'happy',
 'sad',
 'sad',
 'neutral',
 'fearful'

In [17]:
y_encoded = label_encoder.transform(y)
y_categorical = to_categorical(y_encoded)
X = X[..., np.newaxis]

X_train, X_test, y_train, y_test = train_test_split(X, y_categorical, test_size=0.2, stratify=y, random_state=42)

# Build CNN model
model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(40, 174, 1)),
    BatchNormalization(),
    MaxPooling2D((2, 2)),
    Dropout(0.3),

    Conv2D(64, (3, 3), activation='relu'),
    BatchNormalization(),
    MaxPooling2D((2, 2)),
    Dropout(0.3),

    Conv2D(128, (3, 3), activation='relu'),
    BatchNormalization(),
    MaxPooling2D((2, 2)),
    Dropout(0.3),

    Flatten(),
    Dense(256, activation='relu'),
    Dropout(0.4),
    Dense(len(label_encoder.classes_), activation='softmax')
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train with EarlyStopping
early_stop = EarlyStopping(monitor='val_loss', patience=8, restore_best_weights=True)
model.fit(X_train, y_train, validation_data=(X_test, y_test), batch_size=32, epochs=100, callbacks=[early_stop], verbose=1)

# Evaluate
test_loss, test_acc = model.evaluate(X_test, y_test, verbose=0)
print(f"Test Accuracy: {test_acc:.4f}")

# Save model
model.save('cnn_emotion_model_combined.keras')

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m517/517[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 77ms/step - accuracy: 0.2390 - loss: 2.4551 - val_accuracy: 0.2952 - val_loss: 1.6431
Epoch 2/100
[1m517/517[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 85ms/step - accuracy: 0.2962 - loss: 1.6574 - val_accuracy: 0.3904 - val_loss: 1.4936
Epoch 3/100
[1m517/517[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 93ms/step - accuracy: 0.3291 - loss: 1.6050 - val_accuracy: 0.4442 - val_loss: 1.4461
Epoch 4/100
[1m517/517[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 114ms/step - accuracy: 0.3551 - loss: 1.5615 - val_accuracy: 0.4883 - val_loss: 1.3771
Epoch 5/100
[1m517/517[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 107ms/step - accuracy: 0.3851 - loss: 1.5076 - val_accuracy: 0.4289 - val_loss: 1.4552
Epoch 6/100
[1m517/517[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 106ms/step - accuracy: 0.4025 - loss: 1.4737 - val_accuracy: 0.4996 - val_loss: 1.2698
Epoch 7

In [37]:
model.summary()

In [18]:
def predict_emotion(file_path):
    features = extract_features(file_path, augment=False)
    if features is None:
        return "Error processing audio"
    features = features[np.newaxis, ..., np.newaxis]
    prediction = model.predict(features)
    emotion = label_encoder.inverse_transform([np.argmax(prediction)])[0]
    return emotion

In [39]:
# Run a prediction
if __name__ == "__main__":
    test_file = r'/Users/Appy/work/project_speech/audio/Thapar-University-Area-9_2.wav'  # Replace with valid file path

    if os.path.isfile(test_file):
        predicted = predict_emotion(test_file)
        print(f"Predicted Emotion: {predicted}")
    else:
        print("Test file not found.")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 82ms/step
Predicted Emotion: angry
