# Sound Emotion Detection System

This notebook trains a deep learning model to detect emotions from audio files using the TESS Toronto emotional speech dataset.

## Dataset Structure:
- **OAF_***: Older actress female recordings
- **YAF_***: Younger actress female recordings
- **Emotions**: angry, disgust, fear, happy, neutral, sad, pleasant_surprise

## Features:
- **MFCCs**: Mel-frequency cepstral coefficients
- **Chroma**: Pitch-related features
- **Spectral Contrast**: Frequency band differences
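- **Zero Crossing Rate**: How often the waveform changes sign
- **Spectral Rolloff**: Frequency below which most of the spectral energy lies
- **RMS Energy**: Frame-level loudness of the signal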

In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import os
import librosa
import librosa.display
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, Conv1D, MaxPooling1D, Flatten, BatchNormalization
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
import joblib

# Silence every warning for the rest of the session. This keeps cell output
# readable but also hides deprecation notices from the imported libraries.
warnings.filterwarnings('ignore')

# Set random seeds for reproducibility
# (seeds NumPy's global RNG and TensorFlow's global seed only)
np.random.seed(42)
tf.random.set_seed(42)

In [None]:
# Define data path and parameters
# These names are module-level globals; the feature-extraction function and
# later cells read them directly rather than receiving them as arguments.
data_dir = "../data/sound_data/TESS Toronto emotional speech set data"
sample_rate = 22050  # Standard audio sample rate
duration = 3  # Maximum seconds of audio read per file (passed to librosa.load)
n_mfcc = 40  # Number of MFCC features
n_fft = 2048  # FFT window size
hop_length = 512  # Hop length for STFT

# Echo the configuration so it is recorded alongside the run output
print(f"Data directory: {data_dir}")
print(f"Sample rate: {sample_rate} Hz")
print(f"Duration: {duration} seconds")
print(f"MFCC features: {n_mfcc}")

In [None]:
# Explore dataset structure
# Builds `emotions` (distinct labels, in first-seen order) and `file_counts`
# (number of .wav files per class folder) for a quick sanity check.
emotions = []
file_counts = {}

# Sorted so the printed summary is stable across runs and platforms
for folder in sorted(os.listdir(data_dir)):
    folder_path = os.path.join(data_dir, folder)
    if not os.path.isdir(folder_path):
        continue

    # Folder names look like "<speaker>_<emotion>". Split on the FIRST
    # underscore only, so multi-word emotions ("pleasant_surprise") survive,
    # and lowercase so case variants of one emotion are not listed twice.
    _, separator, emotion = folder.partition('_')
    if separator:
        emotion = emotion.lower()
        # Both spellings of the surprise class map to a single label
        if emotion in ('pleasant_surprise', 'pleasant_surprised'):
            emotion = 'surprise'
        if emotion not in emotions:
            emotions.append(emotion)

    # Count files
    file_counts[folder] = sum(
        1 for f in os.listdir(folder_path) if f.endswith('.wav')
    )

print("Dataset Structure:")
for folder, count in file_counts.items():
    print(f"  {folder}: {count} files")

print(f"\nEmotions found: {emotions}")
print(f"Total files: {sum(file_counts.values())}")

In [None]:
def extract_audio_features(file_path, max_pad_length=173):
    """
    Extract a fixed-size set of frame-level features from one audio file.

    The clip is loaded at ``sample_rate``, zero-padded or cropped to
    ``duration`` seconds, and each feature matrix is then zero-padded or
    truncated along the frame axis to exactly ``max_pad_length`` frames so
    that every file yields arrays of identical shape.

    Args:
        file_path: Path of the audio file to read.
        max_pad_length: Number of time frames every returned matrix has.

    Returns:
        dict mapping feature name ('mfcc', 'chroma', 'spectral_contrast',
        'zcr', 'spectral_rolloff', 'rms') to a 2-D array with
        ``max_pad_length`` columns, or None if the file could not be
        processed (the error is printed, not raised).
    """
    try:
        # Load audio file
        signal, sr = librosa.load(file_path, duration=duration, sr=sample_rate)

        # Ensure consistent length in SAMPLES. max_pad_length is a frame
        # count and must not be applied to the raw waveform.
        target_samples = int(duration * sample_rate)
        if len(signal) < target_samples:
            signal = np.pad(signal, (0, target_samples - len(signal)), mode='constant')
        else:
            signal = signal[:target_samples]

        raw_features = {
            # 1. MFCCs (most important for speech emotion)
            'mfcc': librosa.feature.mfcc(
                y=signal, sr=sr, n_mfcc=n_mfcc, n_fft=n_fft, hop_length=hop_length
            ),
            # 2. Chroma features (pitch-related); the librosa function is
            #    chroma_stft - there is no librosa.feature.chroma
            'chroma': librosa.feature.chroma_stft(y=signal, sr=sr, hop_length=hop_length),
            # 3. Spectral Contrast
            'spectral_contrast': librosa.feature.spectral_contrast(
                y=signal, sr=sr, hop_length=hop_length
            ),
            # 4. Zero Crossing Rate
            'zcr': librosa.feature.zero_crossing_rate(signal, hop_length=hop_length),
            # 5. Spectral Rolloff
            'spectral_rolloff': librosa.feature.spectral_rolloff(
                y=signal, sr=sr, hop_length=hop_length
            ),
            # 6. RMS Energy
            'rms': librosa.feature.rms(y=signal, hop_length=hop_length),
        }

        # Force every matrix to the same number of frames
        return {
            name: _fit_frames(matrix, max_pad_length)
            for name, matrix in raw_features.items()
        }

    except Exception as e:
        # Best-effort: one unreadable file must not abort a whole dataset pass
        print(f"Error processing {file_path}: {e}")
        return None


def _fit_frames(matrix, n_frames):
    """Zero-pad or truncate a 2-D feature matrix to ``n_frames`` columns."""
    missing = n_frames - matrix.shape[1]
    if missing > 0:
        return np.pad(matrix, ((0, 0), (0, missing)), mode='constant')
    return matrix[:, :n_frames]

In [None]:
# Test feature extraction on a sample file
# Picks the first .wav of the first class folder, prints the shape of every
# extracted feature and draws all six features in a 2x3 grid.
sample_folders = [
    entry for entry in os.listdir(data_dir)
    if os.path.isdir(os.path.join(data_dir, entry))
]
sample_folder = os.path.join(data_dir, sample_folders[0])
sample_files = [entry for entry in os.listdir(sample_folder) if entry.endswith('.wav')]
sample_file = os.path.join(sample_folder, sample_files[0])

print(f"Sample file: {sample_file}")

# Extract features
features = extract_audio_features(sample_file)

if not features:
    print("Failed to extract features")
else:
    print("\nFeature shapes:")
    for name, feature in features.items():
        print(f"  {name}: {feature.shape}")

    plt.figure(figsize=(12, 8))

    # Top row: 2-D features rendered as spectrogram-style images
    # (grid position, feature key, panel title, extra specshow arguments)
    image_panels = [
        (1, 'mfcc', 'MFCCs', {}),
        (2, 'chroma', 'Chroma', {'y_axis': 'chroma'}),
        (3, 'spectral_contrast', 'Spectral Contrast', {}),
    ]
    for position, key, title, extra in image_panels:
        plt.subplot(2, 3, position)
        librosa.display.specshow(
            features[key],
            sr=sample_rate,
            hop_length=hop_length,
            x_axis='time',
            cmap='coolwarm',
            **extra,
        )
        plt.colorbar()
        plt.title(title)

    # Bottom row: single-row features drawn as line plots
    curve_panels = [
        (4, 'zcr', 'Zero Crossing Rate'),
        (5, 'spectral_rolloff', 'Spectral Rolloff'),
        (6, 'rms', 'RMS Energy'),
    ]
    for position, key, title in curve_panels:
        plt.subplot(2, 3, position)
        plt.plot(features[key][0])
        plt.title(title)

    plt.tight_layout()
    plt.show()

In [None]:
# Process entire dataset
# Walks every "<speaker>_<emotion>" folder, extracts features for each .wav
# file and accumulates:
#   X          - one stacked feature matrix per file
#   y          - the emotion label per file
#   file_paths - the source path per file (same order as X / y)
X = []
y = []
file_paths = []
processed_files = 0

print("Processing audio files...")

# os.listdir() returns entries in arbitrary order; sorting keeps the sample
# order - and therefore the seeded train/val/test split - reproducible.
for folder in sorted(os.listdir(data_dir)):
    folder_path = os.path.join(data_dir, folder)
    if not os.path.isdir(folder_path) or '_' not in folder:
        continue

    # Extract emotion label: everything after the FIRST underscore, lowercased.
    # Splitting on the last underscore would cut "pleasant_surprise" down to
    # "surprise"/"surprised" and make the normalization below unreachable.
    emotion = folder.split('_', 1)[1].lower()

    # Normalize emotion names
    if emotion in ('pleasant_surprise', 'pleasant_surprised'):
        emotion = 'surprise'

    for file in sorted(os.listdir(folder_path)):
        if not file.endswith('.wav'):
            continue

        file_path = os.path.join(folder_path, file)

        # Extract features; None means the file failed and was already reported
        features = extract_audio_features(file_path)
        if features is None:
            continue

        # Combine all features by stacking them along the feature axis
        combined_features = np.concatenate([
            features['mfcc'],
            features['chroma'],
            features['spectral_contrast'],
            features['zcr'],
            features['spectral_rolloff'],
            features['rms']
        ], axis=0)

        X.append(combined_features)
        y.append(emotion)
        file_paths.append(file_path)
        processed_files += 1

        if processed_files % 100 == 0:
            print(f"Processed {processed_files} files...")

print(f"\nProcessing complete!")
print(f"Total files processed: {processed_files}")
print(f"Feature shape: {np.array(X).shape}")

In [None]:
# Convert to numpy arrays
# X and y were accumulated as Python lists; rebind them as arrays in place.
X = np.array(X)
y = np.array(y)

print(f"Final dataset shape:")
print(f"  X (features): {X.shape}")
print(f"  y (labels): {y.shape}")

# Check emotion distribution
# np.unique returns the distinct labels together with how often each occurs
unique_emotions, counts = np.unique(y, return_counts=True)
print(f"\nEmotion distribution:")
for emotion, count in zip(unique_emotions, counts):
    print(f"  {emotion}: {count} samples")

In [None]:
# Encode labels
# Map the string labels in y to integer ids; the fitted encoder is kept so
# predictions can be decoded back to emotion names later.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
n_classes = len(label_encoder.classes_)

# Show the id -> emotion mapping
print(f"Encoded labels:")
for i, emotion in enumerate(label_encoder.classes_):
    print(f"  {i}: {emotion}")

print(f"\nNumber of classes: {n_classes}")

In [None]:
# Normalize features
# StandardScaler only accepts 2-D (n_samples, n_values) input, so each
# sample is flattened into a single row first. This is a no-op if X is
# already 2-D.
X_flat = X.reshape(len(X), -1)

# NOTE(review): the scaler is fitted on ALL samples before the train/val/test
# split below, so validation/test statistics leak into the normalization.
# Consider fitting on the training split only.
scaler = StandardScaler()
X_normalized = scaler.fit_transform(X_flat)

print(f"Features normalized")
print(f"Normalized shape: {X_normalized.shape}")

# Save scaler for later use (create the output directory on a fresh checkout)
os.makedirs('../model', exist_ok=True)
joblib.dump(scaler, '../model/sound_emotion_scaler.pkl')
print("Scaler saved to ../model/sound_emotion_scaler.pkl")

In [None]:
# Reshape features for CNN/LSTM input
# Current shape: (n_samples, n_features)
# We need: (n_samples, time_steps, n_features_per_step)

# Calculate time steps and features per step
total_features = X_normalized.shape[1]
n_time_steps = 173  # Based on MFCC time frames
# Integer division: the reshape below only succeeds when total_features is an
# exact multiple of n_time_steps.
n_features_per_step = total_features // n_time_steps

# Reshape for sequential models
# NOTE(review): reshape only reinterprets each row in memory order. If a row
# is a flattened (features, frames) matrix - as the axis=0 concatenation in
# the dataset-processing cell suggests - frames do not end up on the
# time-step axis here; a transpose would be required. Any change must be
# mirrored in predict_sound_emotion, which repeats this reshape. TODO confirm.
X_reshaped = X_normalized.reshape(X_normalized.shape[0], n_time_steps, n_features_per_step)

print(f"Reshaped for sequential processing:")
print(f"  Original shape: {X_normalized.shape}")
print(f"  Reshaped: {X_reshaped.shape}")
print(f"  Time steps: {n_time_steps}")
print(f"  Features per step: {n_features_per_step}")

In [None]:
# Split dataset
# First hold out 20% of all samples as the test set (stratified by label) ...
X_train, X_test, y_train, y_test = train_test_split(
    X_reshaped, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

# ... then take 20% of the remaining samples as the validation set,
# giving roughly a 64% / 16% / 20% train / validation / test division.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

print(f"Dataset split:")
print(f"  Training: {X_train.shape} ({len(X_train)} samples)")
print(f"  Validation: {X_val.shape} ({len(X_val)} samples)")
print(f"  Testing: {X_test.shape} ({len(X_test)} samples)")

# Convert to categorical
# One-hot targets are what the categorical_crossentropy loss is trained on;
# the integer y_* arrays are kept for the evaluation metrics.
y_train_cat = to_categorical(y_train, num_classes=n_classes)
y_val_cat = to_categorical(y_val, num_classes=n_classes)
y_test_cat = to_categorical(y_test, num_classes=n_classes)

In [None]:
# Build advanced sound emotion detection model
def build_sound_emotion_model(input_shape, n_classes):
    """
    Assemble the (uncompiled) Conv1D -> LSTM -> Dense emotion classifier.

    Args:
        input_shape: Shape of one input sample, handed to the first Conv1D
            layer as its ``input_shape``.
        n_classes: Width of the final softmax layer.

    Returns:
        A Keras ``Sequential`` model; the caller is responsible for compiling it.
    """
    network = Sequential()

    # CNN layers for feature extraction: three identical blocks that differ
    # only in filter count. Only the first convolution declares the input shape.
    for block_index, n_filters in enumerate((64, 128, 256)):
        conv_options = {'kernel_size': 3, 'activation': 'relu', 'padding': 'same'}
        if block_index == 0:
            conv_options['input_shape'] = input_shape
        network.add(Conv1D(n_filters, **conv_options))
        network.add(BatchNormalization())
        network.add(MaxPooling1D(pool_size=2))
        network.add(Dropout(0.3))

    # LSTM layers for temporal patterns; the first returns the full sequence
    # so the second can consume it.
    network.add(LSTM(128, return_sequences=True, dropout=0.3, recurrent_dropout=0.3))
    network.add(LSTM(64, dropout=0.3, recurrent_dropout=0.3))

    # Dense layers for classification
    network.add(Dense(64, activation='relu'))
    network.add(BatchNormalization())
    network.add(Dropout(0.4))
    network.add(Dense(n_classes, activation='softmax'))

    return network

# Build model
# One sample is (n_time_steps, n_features_per_step), matching X_reshaped[i]
input_shape = (n_time_steps, n_features_per_step)
model = build_sound_emotion_model(input_shape, n_classes)

# Compile model
# categorical_crossentropy pairs with the one-hot y_*_cat targets
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

# Print the layer-by-layer architecture and parameter counts
model.summary()

In [None]:
# Setup callbacks
callbacks = [
    # Stop once validation accuracy has not improved for 10 epochs and
    # roll the model back to its best weights
    EarlyStopping(
        monitor='val_accuracy',
        patience=10,
        restore_best_weights=True,
        verbose=1
    ),
    # Halve the learning rate after 5 epochs without validation-loss
    # improvement, never going below 1e-7
    ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.5,
        patience=5,
        min_lr=1e-7,
        verbose=1
    ),
    # Write the model to disk whenever validation accuracy reaches a new best
    ModelCheckpoint(
        '../model/best_sound_emotion_model.keras',
        monitor='val_accuracy',
        save_best_only=True,
        verbose=1
    )
]

# Training parameters
epochs = 50  # Upper bound; EarlyStopping may end training sooner
batch_size = 32

print(f"Training parameters:")
print(f"  Epochs: {epochs}")
print(f"  Batch size: {batch_size}")
print(f"  Training samples: {len(X_train)}")
print(f"  Validation samples: {len(X_val)}")

In [None]:
# Train the model
# Fits on the training split, validates each epoch on the validation split,
# and keeps the per-epoch metrics in `history` for plotting afterwards.
# (Banner emoji restored from a mis-decoded UTF-8 byte sequence.)
print("\n🎵 Training Sound Emotion Detection Model...")
print("=" * 50)

history = model.fit(
    X_train, y_train_cat,
    validation_data=(X_val, y_val_cat),
    epochs=epochs,
    batch_size=batch_size,
    callbacks=callbacks,
    verbose=1
)

In [None]:
# Evaluate model
# Reports loss/accuracy on the held-out test split, then a per-class report
# and a confusion-matrix heatmap.
# (Banner emoji restored from a mis-decoded UTF-8 byte sequence.)
print("\n📊 Evaluating Model Performance...")
print("=" * 40)

# Test set evaluation
test_loss, test_accuracy = model.evaluate(X_test, y_test_cat, verbose=0)
print(f"Test Accuracy: {test_accuracy:.4f} ({test_accuracy*100:.2f}%)")
print(f"Test Loss: {test_loss:.4f}")

# Generate predictions
y_pred = model.predict(X_test, verbose=0)
y_pred_classes = np.argmax(y_pred, axis=1)
y_true_classes = y_test

# Pin the label set to every encoded class so the report rows and the
# heatmap axes always line up with class_names, even if a class happens to
# be absent from both the true and the predicted test labels.
class_names = label_encoder.classes_
class_ids = np.arange(len(class_names))

# Classification report
print("\nClassification Report:")
print(classification_report(
    y_true_classes, y_pred_classes, labels=class_ids, target_names=class_names
))

# Confusion matrix
plt.figure(figsize=(10, 8))
cm = confusion_matrix(y_true_classes, y_pred_classes, labels=class_ids)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names)
plt.title('Confusion Matrix - Sound Emotion Detection')
plt.ylabel('True Emotion')
plt.xlabel('Predicted Emotion')
plt.show()

In [None]:
# Plot training history
def _draw_curves(axis, train_key, val_key, train_label, val_label, title, y_label):
    """Plot the training and validation series of one metric on ``axis``."""
    axis.plot(history.history[train_key], label=train_label, linewidth=2)
    axis.plot(history.history[val_key], label=val_label, linewidth=2)
    axis.set_title(title)
    axis.set_xlabel('Epoch')
    axis.set_ylabel(y_label)
    axis.legend()
    axis.grid(True, alpha=0.3)


fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

# Left panel: accuracy; right panel: loss
_draw_curves(
    ax1, 'accuracy', 'val_accuracy',
    'Training Accuracy', 'Validation Accuracy', 'Model Accuracy', 'Accuracy',
)
_draw_curves(
    ax2, 'loss', 'val_loss',
    'Training Loss', 'Validation Loss', 'Model Loss', 'Loss',
)

plt.tight_layout()
plt.show()

# Headline numbers: last-epoch accuracies plus the best validation epoch
print(f"\nTraining Summary:")
print(f"  Final training accuracy: {history.history['accuracy'][-1]:.4f}")
print(f"  Final validation accuracy: {history.history['val_accuracy'][-1]:.4f}")
print(f"  Best validation accuracy: {max(history.history['val_accuracy']):.4f}")

In [None]:
# Save the final model and components
# Writes the trained model, the fitted label encoder and a small plain-text
# description of the model under ../model.
# (Status emoji restored from mis-decoded UTF-8 byte sequences.)
os.makedirs('../model', exist_ok=True)  # a fresh checkout may lack the directory

model.save('../model/sound_emotion_detector.keras')
print("✅ Model saved as: ../model/sound_emotion_detector.keras")

# Save label encoder
joblib.dump(label_encoder, '../model/sound_emotion_label_encoder.pkl')
print("✅ Label encoder saved as: ../model/sound_emotion_label_encoder.pkl")

# Save model architecture
# Class names are read straight from the encoder so this cell does not depend
# on the evaluation cell having been run; the encoding is explicit so the
# file is written the same way on every platform.
with open('../model/sound_emotion_model_info.txt', 'w', encoding='utf-8') as f:
    f.write("Sound Emotion Detection Model\n")
    f.write(f"Input shape: {input_shape}\n")
    f.write(f"Number of classes: {n_classes}\n")
    f.write(f"Classes: {list(label_encoder.classes_)}\n")
    f.write(f"Sample rate: {sample_rate}\n")
    f.write(f"Duration: {duration} seconds\n")
    f.write(f"MFCC features: {n_mfcc}\n")

print("✅ Model information saved")
print("\n🎉 Sound Emotion Detection Model Training Complete!")

In [None]:
# Test model on a new audio file
def predict_sound_emotion(audio_path, model, scaler, label_encoder):
    """
    Predict the emotion expressed in a single audio file.

    Args:
        audio_path: Path of the audio file to classify.
        model: Trained classifier exposing ``predict``.
        scaler: Fitted scaler exposing ``transform``.
        label_encoder: Fitted encoder exposing ``classes_`` and
            ``inverse_transform``.

    Returns:
        dict with keys 'emotion' (decoded label), 'confidence' (probability
        of that label as a float) and 'all_probabilities' (label -> float),
        or None when feature extraction failed.
    """
    # Extract features
    features = extract_audio_features(audio_path)

    if features is None:
        return None

    # Combine features, stacked in the same order as the training matrix
    combined_features = np.concatenate([
        features['mfcc'],
        features['chroma'],
        features['spectral_contrast'],
        features['zcr'],
        features['spectral_rolloff'],
        features['rms']
    ], axis=0)

    # Normalize. The scaler works on 2-D (n_samples, n_values) input, so the
    # feature matrix is flattened into a single row before transforming.
    normalized_features = scaler.transform(combined_features.reshape(1, -1))

    # Reshape to the (batch, time_steps, features_per_step) layout used in training
    reshaped_features = normalized_features.reshape(1, n_time_steps, n_features_per_step)

    # Predict: one row of class probabilities for the single sample
    probabilities = model.predict(reshaped_features, verbose=0)[0]
    predicted_class = int(np.argmax(probabilities))

    # Decode label
    predicted_emotion = label_encoder.inverse_transform([predicted_class])[0]

    return {
        'emotion': predicted_emotion,
        'confidence': float(probabilities[predicted_class]),
        # classes_[i] is the label encoded as id i, so it lines up with
        # the i-th probability
        'all_probabilities': {
            emotion: float(prob)
            for emotion, prob in zip(label_encoder.classes_, probabilities)
        }
    }

# Smoke-test the predictor on the first file collected during dataset processing
if file_paths:
    test_file = file_paths[0]
    print(f"\nTesting on: {test_file}")

    prediction = predict_sound_emotion(test_file, model, scaler, label_encoder)

    if not prediction:
        print("Failed to predict emotion")
    else:
        print(f"Predicted emotion: {prediction['emotion']}")
        print(f"Confidence: {prediction['confidence']:.4f}")
        print(f"\nAll probabilities:")
        # One line per class with its predicted probability
        for emotion, prob in prediction['all_probabilities'].items():
            print(f"  {emotion}: {prob:.4f}")