In [None]:
# Mount Google Drive (Colab runtime) so that the paths under /content/drive
# used later in this notebook resolve.
from google.colab import drive
drive.mount('/content/drive')

import os
import numpy as np
import pandas as pd
import librosa
import librosa.display
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    accuracy_score
)
import tensorflow as tf
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (
    Input, Conv2D, BatchNormalization, MaxPooling2D,
    GlobalAveragePooling2D, Dense, Dropout
)
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import (
    EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
)


In [None]:
DATASET_PATH = "/content/drive/MyDrive/ProjectData/Audio_Speech_Actors_01-24"

# Third dash-separated field of a file name -> emotion label.
emotion_map = {
    "01": "neutral",
    "02": "calm",
    "03": "happy",
    "04": "sad",
    "05": "angry",
    "06": "fearful",
    "07": "disgust",
    "08": "surprised"
}


def parse_ravdess_filename(filename):
    """Extract ``(emotion, gender)`` from a dash-separated audio file name.

    The third field is looked up in ``emotion_map``. The seventh field (with
    the extension stripped) is read as an integer -- presumably the actor
    number -- and its parity decides the gender: odd -> "male",
    even -> "female".

    Args:
        filename: Base name of the file, e.g. ``"03-01-05-01-02-01-12.wav"``.

    Returns:
        A ``(emotion, gender)`` tuple, or ``None`` when the name has fewer
        than seven fields, an unknown emotion code, or a non-numeric seventh
        field.
    """
    parts = filename.split("-")
    if len(parts) < 7:
        return None

    emotion = emotion_map.get(parts[2])
    if emotion is None:
        return None

    try:
        speaker_number = int(parts[6].split(".")[0])
    except ValueError:
        return None

    gender = "male" if speaker_number % 2 == 1 else "female"
    return emotion, gender


data = []
skipped = []
for root, dirs, files in os.walk(DATASET_PATH):
    # os.walk yields entries in arbitrary (filesystem) order. Sorting both the
    # sub-directories (in place, which steers the traversal) and the files
    # makes the row order of `df` -- and every seeded split derived from it --
    # reproducible between runs.
    dirs.sort()
    for file in sorted(files):
        if not file.endswith(".wav"):
            continue
        parsed = parse_ravdess_filename(file)
        if parsed is None:
            # Do not let one stray recording abort the whole scan.
            skipped.append(os.path.join(root, file))
            continue
        emotion, gender = parsed
        data.append([os.path.join(root, file), emotion, gender])

df = pd.DataFrame(data, columns=["path", "emotion", "gender"])
print(f"Loaded {len(df)} audio files")
if skipped:
    print(f"Skipped {len(skipped)} .wav files with unrecognised names, e.g. {skipped[:3]}")
print(df.head())

In [None]:
class AudioAugmenter:
    """Random waveform augmentations: pitch shift, time stretch, additive noise.

    Randomness comes from NumPy's global ``np.random`` state, so calling
    ``np.random.seed(...)`` beforehand makes the augmentations repeatable.

    Args:
        sr: Sampling rate in Hz, forwarded to the pitch-shift routine.
    """

    # Candidate pitch shifts in semitones. Zero is deliberately absent: a
    # 0-step shift would return a near-copy of the input and add a duplicate
    # sample instead of a real augmentation.
    PITCH_STEPS = (-2, -1, 1, 2)

    def __init__(self, sr=22050):
        self.sr = sr

    def pitch_shift(self, y, n_steps):
        """Return ``y`` shifted in pitch by ``n_steps`` semitones."""
        return librosa.effects.pitch_shift(y, sr=self.sr, n_steps=n_steps)

    def time_stretch(self, y, rate):
        """Return ``y`` time-stretched by ``rate`` (>1 faster, <1 slower)."""
        return librosa.effects.time_stretch(y, rate=rate)

    def add_noise(self, y, noise_factor=0.005):
        """Return ``y`` plus standard-normal noise scaled by ``noise_factor``.

        Floating-point inputs keep their dtype. ``np.random.randn`` produces
        float64, which would otherwise promote float32 audio to float64 and,
        once samples are stacked, upcast the whole feature array.
        """
        y = np.asarray(y)
        noise = np.random.randn(len(y))
        noisy = y + noise_factor * noise
        if np.issubdtype(y.dtype, np.floating):
            noisy = noisy.astype(y.dtype, copy=False)
        return noisy

    def augment_audio(self, y):
        """Apply exactly one randomly chosen augmentation to ``y``.

        The choice is uniform between a pitch shift by a non-zero step from
        ``PITCH_STEPS``, a time stretch with rate drawn from [0.9, 1.1), and
        additive noise with the default noise factor.
        """
        augmentations = [
            lambda x: self.pitch_shift(x, int(np.random.choice(self.PITCH_STEPS))),
            lambda x: self.time_stretch(x, np.random.uniform(0.9, 1.1)),
            lambda x: self.add_noise(x),
        ]
        # Pick by index rather than handing a list of callables to
        # np.random.choice, which would first coerce them into an array.
        chosen = augmentations[np.random.randint(len(augmentations))]
        return chosen(y)

print("Audio augmenter created")

In [None]:
SR = 22050      # sampling rate (Hz) every file is resampled to on load
N_MELS = 128    # number of mel bands (height of the spectrogram)
MAX_LEN = 300   # number of time frames every spectrogram is padded/cropped to

def preprocess_audio(path, augment=False):
    """Turn one audio file into a fixed-size, standardised log-mel spectrogram.

    Steps: load at ``SR``, trim leading/trailing silence (``top_db=20``),
    optionally apply one random augmentation, compute an ``N_MELS``-band mel
    spectrogram in dB relative to its peak, pad or crop the time axis to
    ``MAX_LEN`` frames, standardise to zero mean / unit variance, and append a
    channel axis.

    Args:
        path: Path of the audio file to load.
        augment: When True, apply ``AudioAugmenter.augment_audio`` to the
            trimmed waveform before feature extraction.

    Returns:
        Array of shape ``(N_MELS, MAX_LEN, 1)``, or ``None`` if any step
        raised (the error is printed together with the offending path).
    """
    try:
        y, sr = librosa.load(path, sr=SR)
        y, _ = librosa.effects.trim(y, top_db=20)

        if augment:
            y = AudioAugmenter(sr=sr).augment_audio(y)

        mel = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=N_MELS)
        mel = librosa.power_to_db(mel, ref=np.max)

        n_frames = mel.shape[1]
        if n_frames < MAX_LEN:
            # With ref=np.max the loudest bin sits at 0 dB and everything else
            # is negative, so zero-padding would fill the tail with *peak*
            # energy. Pad with the quietest value so the tail reads as silence.
            mel = np.pad(
                mel,
                ((0, 0), (0, MAX_LEN - n_frames)),
                mode="constant",
                constant_values=mel.min(),
            )
        else:
            mel = mel[:, :MAX_LEN]

        # Per-sample standardisation; the epsilon guards against a constant
        # spectrogram (zero standard deviation).
        mel = (mel - mel.mean()) / (mel.std() + 1e-6)
        mel = mel[..., np.newaxis]
        return mel
    except Exception as e:
        # Best effort: report which file failed and let the caller skip it.
        print(f"Error processing {path}: {e}")
        return None

print("Preprocessing function created")

In [None]:
print("Loading and augmenting audio files...")
X, y = [], []

# For every recording: one clean pass first, then two augmented passes.
# Passes whose preprocessing failed (returned None) are simply left out.
for idx, (audio_path, emotion_label) in enumerate(zip(df["path"], df["emotion"])):
    if idx % 100 == 0:
        print(f"Processing {idx}/{len(df)}...")

    for use_augmentation in (False, True, True):
        features = preprocess_audio(audio_path, augment=use_augmentation)
        if features is None:
            continue
        X.append(features)
        y.append(emotion_label)

# Stack the per-sample spectrograms and labels into arrays.
X = np.array(X)
y = np.array(y)

print(f"Total samples after augmentation: {len(X)}")
print(f"Shape: {X.shape}")

In [None]:
# Stratified 80/10/10 split: hold out 10% for testing, then take 11.11% of the
# remaining 90% (about 10% of the total) for validation.
#
# NOTE(review): X already contains the augmented copies of every recording
# (they are generated before this split), so variants of the same source file
# can land in different partitions. Validation/test scores are therefore likely
# optimistic; splitting by file (or speaker) before augmenting would avoid it.
TEST_FRACTION = 0.10
VAL_FRACTION_OF_REMAINDER = 0.1111
SPLIT_SEED = 42

X_temp, X_test, y_temp, y_test = train_test_split(
    X,
    y,
    test_size=TEST_FRACTION,
    stratify=y,
    random_state=SPLIT_SEED,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp,
    y_temp,
    test_size=VAL_FRACTION_OF_REMAINDER,
    stratify=y_temp,
    random_state=SPLIT_SEED,
)

# Learn the label -> integer mapping from the training labels only, then apply
# the same mapping to all three partitions.
le = LabelEncoder()
le.fit(y_train)
y_train, y_val, y_test = (
    le.transform(labels) for labels in (y_train, y_val, y_test)
)


def normalize_spec(spec):
    """Standardise one spectrogram to zero mean and (near) unit variance.

    A small epsilon is added to the standard deviation so a constant input
    does not divide by zero.
    """
    centered = spec - np.mean(spec)
    return centered / (np.std(spec) + 1e-6)

# Standardise every spectrogram independently, partition by partition.
X_train, X_val, X_test = (
    np.array([normalize_spec(spec) for spec in partition])
    for partition in (X_train, X_val, X_test)
)

print(f"Train: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}")

In [None]:
def conv_block(filters, dropout=0.25):
    """Return the layers of one convolutional stage.

    A stage is two 3x3 same-padded ReLU convolutions (each followed by batch
    normalisation), a 2x2 max-pool, and dropout.

    Args:
        filters: Number of output channels for both convolutions.
        dropout: Dropout rate applied after pooling.

    Returns:
        List of freshly constructed Keras layers, in application order.
    """
    return [
        Conv2D(filters, (3, 3), activation='relu', padding='same'),
        BatchNormalization(),
        Conv2D(filters, (3, 3), activation='relu', padding='same'),
        BatchNormalization(),
        MaxPooling2D((2, 2)),
        Dropout(dropout),
    ]


# Derive the input shape and output width from the preprocessing constants and
# the fitted label encoder instead of repeating 128 / 300 / 8 here, so changing
# the feature configuration or label set cannot silently desynchronise the model.
input_shape = (N_MELS, MAX_LEN, 1)
num_classes = len(le.classes_)

model = Sequential([
    Input(shape=input_shape),

    *conv_block(64),
    *conv_block(128),
    *conv_block(256),

    # Collapse the spatial dimensions, then classify with a small MLP head.
    GlobalAveragePooling2D(),
    Dense(512, activation='relu'),
    BatchNormalization(),
    Dropout(0.4),
    Dense(256, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),
    Dense(num_classes, activation='softmax')
])

# Labels are integer-encoded, hence the sparse categorical loss.
model.compile(
    optimizer=Adam(learning_rate=3e-4),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

print("Model created")
model.summary()

In [None]:
# Start from inverse-frequency ("balanced") weights for the encoded classes.
encoded_classes = np.unique(y_train)
class_weights = compute_class_weight(
    'balanced',
    classes=encoded_classes,
    y=y_train
)
class_weight_dict = dict(zip(encoded_classes, class_weights))

# Hand-tuned extra emphasis on selected emotions, applied on top of the
# balanced weights.
manual_boosts = (('happy', 2), ('sad', 2), ('neutral', 1.5))
for emotion_name, boost in manual_boosts:
    class_weight_dict[le.transform([emotion_name])[0]] *= boost

print("Updated class weights:", class_weight_dict)

# Stop when validation loss stalls (restoring the best weights), halve the
# learning rate on shorter plateaus, and keep the checkpoint with the best
# validation accuracy on disk.
early_stop = EarlyStopping(
    monitor='val_loss', patience=10, restore_best_weights=True
)
lr_reduce = ReduceLROnPlateau(
    monitor='val_loss', factor=0.5, patience=5, min_lr=1e-7
)
model_checkpoint = ModelCheckpoint(
    'best_emotion_model.h5', monitor='val_accuracy', save_best_only=True
)
training_callbacks = [early_stop, lr_reduce, model_checkpoint]

history = model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=50,
    validation_data=(X_val, y_val),
    class_weight=class_weight_dict,
    callbacks=training_callbacks,
    verbose=1,
)

print("Training complete")

In [None]:
# Predicted class = index of the highest softmax probability per test sample.
test_probabilities = model.predict(X_test)
y_pred = np.argmax(test_probabilities, axis=1)

accuracy = accuracy_score(y_test, y_pred)
macro_f1 = f1_score(y_test, y_pred, average='macro')
weighted_f1 = f1_score(y_test, y_pred, average='weighted')

print("\n" + "="*80)
print("EVALUATION METRICS")
print("="*80)
print(f"Accuracy: {accuracy:.4f}")
print(f"Macro F1: {macro_f1:.4f}")
print(f"Weighted F1: {weighted_f1:.4f}")
print("\n" + classification_report(y_test, y_pred, target_names=le.classes_))

# Confusion matrix: rows are true labels, columns are predicted labels.
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=le.classes_,
    yticklabels=le.classes_,
    ax=ax,
)
ax.set_title('Confusion Matrix')
ax.set_ylabel('True Label')
ax.set_xlabel('Predicted Label')
fig.tight_layout()
plt.show()

In [None]:
print("\n" + "="*80)
print("BIAS ANALYSIS - PERFORMANCE BY GENDER")
print("="*80)

# `df` has one row per audio file, whereas X / y hold three samples per file
# (the original plus two augmented copies, appended in row order) that
# train_test_split then shuffles. Slicing the tail of `df` therefore pairs
# predictions with the gender of unrelated recordings. Instead, expand the
# gender column to one entry per sample and replay the first split: given the
# same stratification labels, test_size and random_state, train_test_split
# selects the same test indices.
SAMPLES_PER_FILE = 3
sample_gender = np.repeat(df['gender'].to_numpy(), SAMPLES_PER_FILE)

gender_test = None
# The lengths only match when every preprocessing pass succeeded, which is
# what the three-per-file expansion above relies on.
if len(sample_gender) == len(y):
    _, y_test_replayed, _, gender_replayed = train_test_split(
        y, sample_gender, test_size=0.10, stratify=y, random_state=42
    )
    # Only trust the replay if it reproduces the actual test labels exactly.
    if np.array_equal(le.transform(y_test_replayed), y_test):
        gender_test = gender_replayed

if gender_test is None:
    print("\nSkipping gender analysis: test samples could not be aligned with their source files")
else:
    df_test = pd.DataFrame({
        'gender': gender_test,
        'pred': y_pred,
        'true': y_test,
    })

    for gender in ['male', 'female']:
        subset = df_test[df_test['gender'] == gender]
        if len(subset) > 0:
            gender_acc = accuracy_score(subset['true'], subset['pred'])
            gender_f1 = f1_score(subset['true'], subset['pred'], average='macro')
            print(f"\n{gender.upper()}:")
            print(f"  Accuracy: {gender_acc:.4f}")
            print(f"  F1-Score: {gender_f1:.4f}")
            print(f"  Samples: {len(subset)}")

print("="*80)

# Persist the trained model to Google Drive.
model.save('/content/drive/MyDrive/emotion_recognition_model.h5')
print("\nModel saved to Google Drive")