In [None]:
import numpy as np
import pandas as pd
import random
import os
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Flatten, Dense, Dropout, BatchNormalization, GlobalAveragePooling2D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.regularizers import l2
import librosa

# RAVDESS: two-digit emotion code (read from the filename) -> emotion label.
ravdess_emotion_map = {
    '01': 'neutral',
    '02': 'calm',
    '03': 'happy',
    '04': 'sad',
    '05': 'angry',
    '06': 'fearful',
    '07': 'disgust',
    '08': 'surprised',
}

# TESS: keyword searched for in the lower-cased filename -> emotion label.
# Insertion order matters: the first keyword found in a filename wins.
tess_emotion_map = {
    'neutral': 'neutral',
    'happy': 'happy',
    'sad': 'sad',
    'angry': 'angry',
    'fear': 'fearful',
    'disgust': 'disgust',
    'ps': 'surprised',
}

# Dataset locations (Kaggle input mounts).
ravdess_root_dir = "/kaggle/input/audio-data"
tess_root_dir = "/kaggle/input/toronto-emotional-speech-set-tess"

print("Processing file paths...")
all_data = []


def _iter_wav_files(root_dir):
    """Yield ``(directory, filename)`` for each ``.wav`` file found under *root_dir*."""
    for folder, _, names in os.walk(root_dir):
        for name in names:
            if name.endswith(".wav"):
                yield folder, name


# RAVDESS: the emotion code is the third hyphen-separated field of the filename.
if not os.path.exists(ravdess_root_dir):
    print(f"Warning: RAVDESS directory not found at {ravdess_root_dir}")
else:
    for folder, wav_name in _iter_wav_files(ravdess_root_dir):
        fields = wav_name.split("-")
        if len(fields) < 3:
            # Filename does not follow the expected pattern; skip it.
            continue
        label = ravdess_emotion_map.get(fields[2])
        if label is None:
            # Unknown emotion code; skip it.
            continue
        all_data.append((os.path.join(folder, wav_name), label, 'RAVDESS'))

# TESS: the emotion is the first map keyword contained in the lower-cased filename.
if not os.path.exists(tess_root_dir):
    print(f"Warning: TESS directory not found at {tess_root_dir}")
else:
    for folder, wav_name in _iter_wav_files(tess_root_dir):
        lowered = wav_name.lower()
        label = next(
            (em for key, em in tess_emotion_map.items() if key in lowered),
            None,
        )
        if label is None:
            # No known emotion keyword in the filename; skip it.
            continue
        all_data.append((os.path.join(folder, wav_name), label, 'TESS'))

# One row per audio file; 'calm' rows are dropped from the combined table.
df = pd.DataFrame(all_data, columns=["path", "emotion", "dataset"])
df = df.loc[df['emotion'].ne('calm')].reset_index(drop=True)

print("\nRebalancing datasets to address performance disparity...")
ravdess_df = df[df['dataset'] == 'RAVDESS']
tess_df = df[df['dataset'] == 'TESS']

if df.empty:
    # Fail here with a clear message instead of deep inside the split/training code.
    raise ValueError("No usable .wav files were found under the RAVDESS or TESS directories.")

# Both corpora contribute the same number of clips. The count is capped at the
# smaller corpus because DataFrame.sample (without replacement) raises when
# asked for more rows than are available.
n_per_dataset = min(len(ravdess_df), len(tess_df))

if n_per_dataset == 0:
    # Only one corpus was found: there is nothing to balance against, so keep
    # every available row and just shuffle.
    print("Warning: only one dataset is available; skipping rebalancing.")
    tess_df_sampled = tess_df
    balanced_df = df.sample(frac=1, random_state=42).reset_index(drop=True)
else:
    if len(ravdess_df) > n_per_dataset:
        # RAVDESS is the larger corpus in this case, so it is the one down-sampled.
        ravdess_df = ravdess_df.sample(n=n_per_dataset, random_state=42)
    tess_df_sampled = tess_df.sample(n=n_per_dataset, random_state=42)
    # Concatenate and shuffle so the two corpora are interleaved.
    balanced_df = (
        pd.concat([ravdess_df, tess_df_sampled])
        .sample(frac=1, random_state=42)
        .reset_index(drop=True)
    )


def add_noise(y, noise_factor=0.005):
    """Return *y* with additive Gaussian noise, as a float32 array.

    One standard-normal sample is drawn per element of *y* from NumPy's
    global random state and scaled by *noise_factor* before being added.
    """
    gaussian = np.random.randn(len(y))
    noisy = y + gaussian * noise_factor
    return noisy.astype(np.float32)

def pitch_shift(y, sr, n_steps=3):
    """Return *y* pitch-shifted by *n_steps* via ``librosa.effects.pitch_shift``.

    *sr* is the sample rate of *y*. With librosa's default of 12 bins per
    octave, one step corresponds to one semitone.
    """
    shifted = librosa.effects.pitch_shift(y=y, sr=sr, n_steps=n_steps)
    return shifted

def time_stretch(y, rate=0.9):
    """Return *y* time-stretched by *rate* via ``librosa.effects.time_stretch``.

    Per librosa's convention a rate below 1 slows the signal down (longer
    output) and a rate above 1 speeds it up.
    """
    stretched = librosa.effects.time_stretch(y=y, rate=rate)
    return stretched

def spec_augment(spec, time_masking_para=40, freq_masking_para=30, num_masks=1):
    """Return a copy of a 2-D feature matrix with random row/column bands zeroed.

    For each of *num_masks* rounds, one band of rows (first axis) and one band
    of columns (second axis) are set to 0. Band widths are drawn uniformly
    from ``[0, masking_para)`` and band positions uniformly so the band fits
    inside the matrix. Randomness comes from the stdlib ``random`` module.

    Args:
        spec: 2-D NumPy array; it is not modified.
        time_masking_para: Exclusive upper bound on the column-band width.
        freq_masking_para: Exclusive upper bound on the row-band width.
        num_masks: Number of row-band/column-band rounds to apply.

    Returns:
        A new array of the same shape and dtype as *spec*.
    """
    masked = spec.copy()
    n_mels, n_steps = masked.shape

    # Clamp the width range to the axis size. Without this, a drawn width that
    # reaches the axis size (or a masking parameter of 0) makes randrange()
    # raise ValueError on an empty range. When the parameter already fits the
    # axis, the random calls are exactly the ones made before.
    max_f = min(freq_masking_para, n_mels)
    max_t = min(time_masking_para, n_steps)

    for _ in range(num_masks):
        if max_f > 0:
            f = random.randrange(0, max_f)
            f0 = random.randrange(0, n_mels - f)
            if f > 0:
                masked[f0:f0 + f, :] = 0
        if max_t > 0:
            t = random.randrange(0, max_t)
            t0 = random.randrange(0, n_steps - t)
            if t > 0:
                masked[:, t0:t0 + t] = 0
    return masked

def extract_features_robust(y, sr, n_mels=128, n_mfcc=40, max_len=250):
    """Build a fixed-width, globally standardised feature matrix for one clip.

    The log-mel spectrogram, the MFCCs and the MFCC first- and second-order
    deltas are stacked along the first axis. The result is zero-padded or
    truncated to *max_len* columns and then standardised with the mean and
    standard deviation of the whole matrix.

    Returns:
        The feature matrix, or ``None`` when the MFCCs are (near-)constant or
        the standardised matrix contains NaN/inf values.
    """
    mel_power = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels, fmin=20)
    log_mel = librosa.power_to_db(mel_power, ref=np.max, top_db=80)

    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
    if np.std(mfcc) < 1e-5:
        # (Near-)constant MFCCs: reject the clip.
        return None

    stacked = np.vstack((
        log_mel,
        mfcc,
        librosa.feature.delta(mfcc),
        librosa.feature.delta(mfcc, order=2),
    ))

    # Force exactly max_len columns: truncate long clips, zero-pad short ones.
    n_frames = stacked.shape[1]
    if n_frames >= max_len:
        stacked = stacked[:, :max_len]
    else:
        stacked = np.pad(stacked, ((0, 0), (0, max_len - n_frames)), mode='constant')

    # The small epsilon keeps the division defined for a constant matrix.
    normalized = (stacked - np.mean(stacked)) / (np.std(stacked) + 1e-6)
    if not np.isfinite(normalized).all():
        return None
    return normalized

# Integer-encode the emotion labels; `le` is kept for decoding class names later.
le = LabelEncoder()
le.fit(balanced_df['emotion'])
balanced_df['emotion_encoded'] = le.transform(balanced_df['emotion'])

# 80/20 split, stratified on the emotion label.
train_df, test_df = train_test_split(
    balanced_df,
    test_size=0.2,
    stratify=balanced_df['emotion'],
    random_state=42,
)

def process_data(df, augment=False):
    """Load each audio file listed in *df* and turn it into feature matrices.

    Args:
        df: DataFrame with a ``path`` column (audio file) and an
            ``emotion_encoded`` column (integer label).
        augment: When true, each clip additionally yields a noisy, a
            pitch-shifted and a time-stretched variant, and the features of
            those three variants are passed through ``spec_augment``.

    Returns:
        ``(X, y)`` NumPy arrays of feature matrices and labels. Clips shorter
        than 400 samples, clips whose features are rejected, and files that
        raise during processing are left out, so the output can contain fewer
        entries than *df* has rows.
    """
    SR = 16000
    feature_list, label_list = [], []

    for _, record in df.iterrows():
        try:
            waveform, _ = librosa.load(record['path'], sr=SR)
            if len(waveform) < 400:
                continue

            # (audio, is_augmented) pairs; the untouched clip always comes first.
            variants = [(waveform, False)]
            if augment:
                variants += [
                    (add_noise(waveform), True),
                    (pitch_shift(waveform, SR), True),
                    (time_stretch(waveform), True),
                ]

            for clip, is_augmented in variants:
                feats = extract_features_robust(clip, SR)
                if feats is None:
                    continue
                if is_augmented:
                    # Only the augmented variants get spectrogram masking.
                    feats = spec_augment(feats)
                feature_list.append(feats)
                label_list.append(record['emotion_encoded'])
        except Exception as e:
            # Best effort: report the failing file and move on to the next one.
            print(f"Error processing file {record['path']}: {e}")

    return np.array(feature_list), np.array(label_list)

# Hold out part of the training portion for validation. Checkpointing, early
# stopping and LR scheduling all select on validation metrics, so feeding them
# the test set would bias the test scores reported afterwards.
fit_df, val_df = train_test_split(
    train_df, test_size=0.15, random_state=42, stratify=train_df['emotion']
)

print("Processing training data (with augmentation)...")
X_train, y_train = process_data(fit_df, augment=True)

print("\nProcessing validation data (no augmentation)...")
X_val, y_val = process_data(val_df, augment=False)

print("\nProcessing testing data (no augmentation)...")
X_test, y_test = process_data(test_df, augment=False)

# Validate before reshaping so an empty result is reported clearly.
if X_train.size == 0 or X_test.size == 0:
    raise ValueError("Training or testing data is empty. Please check data processing pipeline.")
if X_val.size == 0:
    raise ValueError("Validation data is empty. Please check data processing pipeline.")

# Add a trailing channel axis for the Conv2D layers.
X_train = X_train[..., np.newaxis]
X_val = X_val[..., np.newaxis]
X_test = X_test[..., np.newaxis]

# Key each weight by the class id it was computed for. Enumerating the weights
# would attach them to the wrong classes if any class id were missing from y_train.
present_classes = np.unique(y_train)
class_weights = class_weight.compute_class_weight('balanced', classes=present_classes, y=y_train)
class_weights_dict = {int(c): float(w) for c, w in zip(present_classes, class_weights)}

strategy = tf.distribute.MirroredStrategy()

with strategy.scope():
    def build_model(input_shape, num_classes):
        """Build and compile the CNN classifier.

        Three convolutional stages (64, 128 and 256 filters) with batch
        normalisation, max pooling and dropout feed a global average pooling
        layer and two L2-regularised dense layers, ending in a softmax over
        *num_classes*. The model is compiled with Adam (learning rate 5e-4)
        and sparse categorical cross-entropy.

        Args:
            input_shape: Shape of one input sample (without the batch axis).
            num_classes: Number of output classes.

        Returns:
            The compiled Keras ``Model``.
        """
        inp = Input(shape=input_shape)
        l2_strength = 0.005
        x = Conv2D(64, (3, 3), activation='relu', padding='same', kernel_regularizer=l2(l2_strength))(inp)
        x = BatchNormalization()(x)
        x = Conv2D(64, (3, 3), activation='relu', padding='same', kernel_regularizer=l2(l2_strength))(x)
        x = BatchNormalization()(x)
        x = MaxPooling2D((2, 2))(x)
        x = Dropout(0.25)(x)
        x = Conv2D(128, (3, 3), activation='relu', padding='same', kernel_regularizer=l2(l2_strength))(x)
        x = BatchNormalization()(x)
        x = Conv2D(128, (3, 3), activation='relu', padding='same', kernel_regularizer=l2(l2_strength))(x)
        x = BatchNormalization()(x)
        x = MaxPooling2D((2, 2))(x)
        x = Dropout(0.3)(x)
        x = Conv2D(256, (3, 3), activation='relu', padding='same', kernel_regularizer=l2(l2_strength))(x)
        x = BatchNormalization()(x)
        x = MaxPooling2D((2, 3))(x)
        x = Dropout(0.35)(x)
        x = GlobalAveragePooling2D()(x)
        x = Dense(512, activation='relu', kernel_regularizer=l2(l2_strength))(x)
        x = BatchNormalization()(x)
        x = Dropout(0.5)(x)
        x = Dense(256, activation='relu', kernel_regularizer=l2(l2_strength))(x)
        x = BatchNormalization()(x)
        x = Dropout(0.5)(x)
        x = Dense(num_classes, activation='softmax')(x)
        model = Model(inputs=inp, outputs=x)
        optimizer = Adam(learning_rate=0.0005)
        model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
        return model

    # The model is created inside the strategy scope.
    model = build_model(X_train.shape[1:], len(le.classes_))

# Scale the global batch size with the number of replicas (32 per replica).
BATCH_SIZE = 32 * strategy.num_replicas_in_sync
# The checkpoint keeps the best val_accuracy epoch on disk; early stopping and
# the LR schedule follow val_loss.
checkpoint = ModelCheckpoint("best_model_balanced.keras", save_best_only=True, monitor="val_accuracy", mode="max", verbose=1)
early_stop = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True, verbose=1)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=8, min_lr=1e-7, verbose=1)

# NOTE(review): the output recorded with this notebook shows `loss: nan` and
# `accuracy: nan` for the training batches while val_loss stays finite. The
# cause cannot be determined from this code alone - investigate before
# trusting the training curves.
history = model.fit(
    X_train, y_train,
    epochs=150,
    validation_data=(X_val, y_val),
    callbacks=[early_stop, reduce_lr, checkpoint],
    batch_size=BATCH_SIZE,
    class_weight=class_weights_dict
)

# Evaluate the checkpoint with the best val_accuracy (saved during training).
model.load_weights("best_model_balanced.keras")
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=0)

# These values were previously computed but never reported.
print("\n--- Final Model Evaluation on Test Set (using best saved weights) ---")
print(f"\nOverall Test Loss: {test_loss:.4f}")
print(f"\nOverall Test Accuracy: {test_accuracy * 100:.2f}%")

y_pred_probs = model.predict(X_test)
y_pred = np.argmax(y_pred_probs, axis=1)

# Pin the label set to the encoder's classes so the report rows and the
# confusion-matrix axes always line up with le.classes_, even if a class is
# absent from both y_test and y_pred.
class_ids = np.arange(len(le.classes_))

print("\nClassification Report (Overall):")
print(classification_report(y_test, y_pred, labels=class_ids, target_names=le.classes_, zero_division=0))

# The per-dataset breakdown pairs predictions with test_df rows by position.
# That is only valid when feature extraction kept every test row in order, so
# check the row count and the labels before trusting it.
test_df_reset = test_df.reset_index(drop=True)
rows_align = (
    len(test_df_reset) == len(y_pred)
    and np.array_equal(test_df_reset['emotion_encoded'].to_numpy(), y_test)
)

print("\n--- HONEST PERFORMANCE METRICS (Per-Dataset Accuracy) ---")
if rows_align:
    pred_df = test_df_reset.copy()
    pred_df['pred_emotion_encoded'] = y_pred
    pred_df['correct'] = pred_df['emotion_encoded'] == pred_df['pred_emotion_encoded']
    dataset_accuracy = pred_df.groupby('dataset')['correct'].mean()
    print(dataset_accuracy)
else:
    print(
        "Warning: some test files were skipped during feature extraction, so "
        "predictions cannot be matched to test rows; per-dataset accuracy skipped."
    )

conf_matrix = confusion_matrix(y_test, y_pred, labels=class_ids)
plt.figure(figsize=(10, 8))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=le.classes_, yticklabels=le.classes_)
plt.title('Confusion Matrix (Balanced Dataset)')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.xticks(rotation=45)
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

2025-06-15 05:58:22.818958: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749967102.983950      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749967103.032855      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Processing file paths...

Rebalancing datasets to address performance disparity...
Processing training data (with augmentation)...

Processing testing data (no augmentation)...


I0000 00:00:1749967361.103072      35 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13942 MB memory:  -> device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5
I0000 00:00:1749967361.103695      35 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 13942 MB memory:  -> device: 1, name: Tesla T4, pci bus id: 0000:00:05.0, compute capability: 7.5


Epoch 1/150


E0000 00:00:1749967376.231419      35 meta_optimizer.cc:966] layout failed: INVALID_ARGUMENT: Size of values 0 does not match size of permutation 4 @ fanin shape inStatefulPartitionedCall/functional_1/dropout_1/stateless_dropout/SelectV2-2-TransposeNHWCToNCHW-LayoutOptimizer
I0000 00:00:1749967377.895721     123 cuda_dnn.cc:529] Loaded cuDNN version 90300
I0000 00:00:1749967378.534418     125 cuda_dnn.cc:529] Loaded cuDNN version 90300


125/125 ━━━━━━━━━━━━━━━━━━━━ 0s 783ms/step - accuracy: nan - loss: nan
Epoch 1: val_accuracy improved from -inf to 0.13435, saving model to best_model_balanced.keras
125/125 ━━━━━━━━━━━━━━━━━━━━ 120s 828ms/step - accuracy: nan - loss: nan - val_accuracy: 0.1344 - val_loss: 7.2400 - learning_rate: 5.0000e-04
Epoch 2/150
125/125 ━━━━━━━━━━━━━━━━━━━━ 0s 773ms/step - accuracy: nan - loss: nan
Epoch 2: val_accuracy improved from 0.13435 to 0.13650, saving model to best_model_balanced.keras
125/125 ━━━━━━━━━━━━━━━━━━━━ 99s 791ms/step - accuracy: nan - loss: nan - val_accuracy: 0.1365 - val_loss: 8.1752 - learning_rate: 5.0000e-04
Epoch 3/150
125/125 ━━━━━━━━━━━━━━━━━━━━ 0s 780ms/step - accuracy: nan - loss: nan
Epoch 3: val_accuracy did not improve from 0.13650
125/125 ━━━━━━━━━━━━━━━━━━━━ … (output truncated)

--- Final Model Evaluation on Test Set (using best saved weights) ---

Overall Test Loss: 0.7289

Overall Test Accuracy: 91.02%
16/16 ━━━━━━━━━━━━━━━━━━━━ 2s 109ms/step

Classification Report (Overall):

              precision    recall  f1-score   support

       angry       0.99      0.96      0.97        76
     disgust       0.92      0.92      0.92        78
     fearful       0.93      0.86      0.90        74
       happy       0.87      0.93      0.90        73
     neutral       0.96      0.93      0.94        54
         sad       0.82      0.94      0.88        72
   surprised       0.97      0.88      0.92        73

    accuracy                           0.92       500
   macro avg       0.92      0.92      0.92       500
weighted avg       0.92      0.92      0.92       500





--- HONEST PERFORMANCE METRICS (Per-Dataset Accuracy) ---

dataset

RAVDESS 0.846154

TESS 0.995833