In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and echo the full path of every file in it.
for input_dir, _subdirs, input_files in os.walk('/kaggle/input'):
    for input_file in input_files:
        print(os.path.join(input_dir, input_file))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# ============================================
# CELL 1: Setup and Installation
# ============================================

In [2]:
# ============================================
# CELL 1: Setup and Installation (FINAL VERSION)
# ============================================
"""
Speech Emotion Recognition System
For: Speech Processing & ANN/DL Course
Author: Ahad Imran
"""

import warnings
warnings.filterwarnings('ignore')

# Check what's already installed
import sys
print(f"Python: {sys.version}")

# Check librosa
try:
    import librosa
    print(f"‚úì librosa {librosa.__version__}")
except:
    print("Installing librosa...")
    !pip install -q librosa

# Skip audiomentations - not critical, we have built-in augmentation

# Import all required packages
import os
import gc
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from tqdm.auto import tqdm
from typing import Dict, List, Tuple, Optional
import pickle
import json
from collections import Counter

# Audio processing
import librosa
import librosa.display
import soundfile as sf
import IPython.display as ipd

# Deep Learning
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
import torchaudio
import torchaudio.transforms as T

# Machine Learning
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier

# Visualization
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Set seeds
def set_seed(seed=42):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy's legacy global RNG and PyTorch (CPU and
    all CUDA devices), then configures cuDNN for repeatable kernel selection.

    Args:
        seed: Integer seed applied to all generators. Defaults to 42.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Restrict cuDNN to deterministic algorithms ...
    torch.backends.cudnn.deterministic = True
    # ... and switch off the autotuner, which benchmarks candidate kernels at
    # runtime and may otherwise pick a different algorithm from run to run.
    torch.backends.cudnn.benchmark = False

set_seed(42)

# Pick the compute device: the default CUDA GPU when one is visible, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"\nUsing device: {device}")

# On a GPU machine, also report which card was assigned and its total memory.
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

print("\n Setup complete! Ready to proceed.")

Python: 3.11.13 (main, Jun  4 2025, 08:57:29) [GCC 11.4.0]
‚úì librosa 0.11.0

Using device: cuda
GPU: Tesla T4
Memory: 15.83 GB

 Setup complete! Ready to proceed.


# ============================================
# CELL 1a: Setup for Dual T4 GPUs
# ============================================

In [3]:
from torch.nn.parallel import DataParallel

# Check available GPUs: list every visible CUDA device with its name and memory.
gpu_count = torch.cuda.device_count()
print(f"GPUs available: {gpu_count}")
for i in range(gpu_count):
    print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
    print(f"Memory: {torch.cuda.get_device_properties(i).total_memory / 1e9:.2f} GB")

# Report the training mode that matches what was actually found. The CPU-only
# case gets its own message instead of being reported as "Single GPU detected".
if gpu_count > 1:
    print("\n‚úÖ Multiple GPUs detected! Will use DataParallel for faster training.")
elif gpu_count == 1:
    print("\n‚úÖ Single GPU detected. Will proceed with standard training.")
else:
    print("\nNo GPU detected. Will proceed with standard training on the CPU.")

GPUs available: 2
GPU 0: Tesla T4
Memory: 15.83 GB
GPU 1: Tesla T4
Memory: 15.83 GB

‚úÖ Multiple GPUs detected! Will use DataParallel for faster training.


# ============================================
# CELL 2: Download and Prepare Datasets
# ============================================

To set up environment variables for the Kaggle API using Python, you can use the `os` module to assign your credentials directly in your script. This is especially useful when you don't want to rely on a `kaggle.json` file. Here's how to do it:

---

### 🔑 Step-by-Step: Set Kaggle API Key with `os.environ`

```python
import os

# Set your Kaggle credentials
os.environ['KAGGLE_USERNAME'] = 'your_kaggle_username'
os.environ['KAGGLE_KEY'] = 'your_kaggle_api_key'
```

Replace `'your_kaggle_username'` and `'your_kaggle_api_key'` with the actual values from your [Kaggle account settings](https://www.kaggle.com/settings).

---

### 📦 Then You Can Download Datasets Like This

```python
!pip install kaggle

# Example: Download Titanic dataset
!kaggle competitions download -c titanic
```

This will work in environments like Jupyter, Colab, or Kaggle Notebooks — as long as the API key is valid and you've accepted the competition rules (if required).

## Usage Example
```python
# !kaggle competitions download -c titanic


# import zipfile

# with zipfile.ZipFile('/kaggle/working/titanic.zip', 'r') as zip_ref:
#    zip_ref.extractall('/kaggle/working')
```

In [4]:
"""
Using Kaggle datasets for emotion recognition
"""

import os
import zipfile
from pathlib import Path

# Read the Kaggle API credentials from the notebook's Secrets store and export
# them as environment variables. Any failure (module missing, secret not
# defined, ...) leaves the API flagged as unavailable.
api_available = False
try:
    from kaggle_secrets import UserSecretsClient
    user_secrets = UserSecretsClient()
    os.environ['KAGGLE_USERNAME'] = user_secrets.get_secret("kaggle_username")
    os.environ['KAGGLE_KEY'] = user_secrets.get_secret("kaggle_key")
except Exception as e:
    print(f"‚ö†Ô∏è Kaggle secrets not found: {e}")
    print("Please add datasets via 'Add Data' button or configure secrets.")
else:
    print("‚úì Kaggle API configured with secrets")
    api_available = True

# Make sure the output folders for data, checkpoints and results exist.
for working_dir in (
    '/kaggle/working/data',
    '/kaggle/working/models',
    '/kaggle/working/results',
):
    os.makedirs(working_dir, exist_ok=True)

# Datasets attached through the "Add Data" UI show up under /kaggle/input/.
# If anything is there, use it and skip the download step entirely.
datasets_found = False
if os.path.exists('/kaggle/input/'):
    input_datasets = os.listdir('/kaggle/input/')
    if input_datasets:
        print("Datasets found in /kaggle/input/:")
        for dataset in input_datasets:
            print(f"  ‚úì {dataset}")
        datasets_found = True
        DATA_PATH = '/kaggle/input/'

# Method 2: Download if not added via UI (only if API is available)
if not datasets_found and api_available:  # <-- FIXED: Added api_available check
    print("\nNo datasets found in input. Downloading...")
    
    # Only download if not already present
    if not os.path.exists('/kaggle/working/ravdess'):
        print("Downloading RAVDESS...")
        !kaggle datasets download -d uwrfkaggler/ravdess-emotional-speech-audio -p /kaggle/working --quiet
        
        if os.path.exists('/kaggle/working/ravdess-emotional-speech-audio.zip'):
            with zipfile.ZipFile('/kaggle/working/ravdess-emotional-speech-audio.zip', 'r') as zip_ref:
                zip_ref.extractall('/kaggle/working/ravdess')
            os.remove('/kaggle/working/ravdess-emotional-speech-audio.zip')
            print("‚úì RAVDESS downloaded")
    
    if not os.path.exists('/kaggle/working/tess'):
        print("Downloading TESS...")
        !kaggle datasets download -d ejlok1/toronto-emotional-speech-set-tess -p /kaggle/working --quiet
        
        if os.path.exists('/kaggle/working/toronto-emotional-speech-set-tess.zip'):
            with zipfile.ZipFile('/kaggle/working/toronto-emotional-speech-set-tess.zip', 'r') as zip_ref:
                zip_ref.extractall('/kaggle/working/tess')
            os.remove('/kaggle/working/toronto-emotional-speech-set-tess.zip')
            print("‚úì TESS downloaded")
    
    if not os.path.exists('/kaggle/working/cremad'):
        print("Downloading CREMA-D...")
        !kaggle datasets download -d ejlok1/cremad -p /kaggle/working --quiet
        
        if os.path.exists('/kaggle/working/cremad.zip'):
            with zipfile.ZipFile('/kaggle/working/cremad.zip', 'r') as zip_ref:
                zip_ref.extractall('/kaggle/working/cremad')
            os.remove('/kaggle/working/cremad.zip')
            print("‚úì CREMA-D downloaded")
    
    DATA_PATH = '/kaggle/working/'

elif not datasets_found and not api_available:
    print("\n‚ö†Ô∏è No datasets found and API not configured.")
    print("Please either:")
    print("1. Add datasets using the 'Add Data' button, or")
    print("2. Configure Kaggle API secrets (kaggle_username and kaggle_key)")
    DATA_PATH = '/kaggle/working/'  # Set default path anyway

else:
    # Datasets already found
    pass

# Verify final state
print(f"\nUsing DATA_PATH: {DATA_PATH}")
if os.path.exists(DATA_PATH):
    contents = os.listdir(DATA_PATH)
    if contents:
        print(f"Found {len(contents)} items in {DATA_PATH}")
    else:
        print("‚ö†Ô∏è DATA_PATH is empty. Please add datasets.")

print("\n Setup complete!")

‚úì Kaggle API configured with secrets

No datasets found in input. Downloading...
Downloading RAVDESS...
Dataset URL: https://www.kaggle.com/datasets/uwrfkaggler/ravdess-emotional-speech-audio
License(s): CC-BY-NC-SA-4.0
‚úì RAVDESS downloaded
Downloading TESS...
Dataset URL: https://www.kaggle.com/datasets/ejlok1/toronto-emotional-speech-set-tess
License(s): Attribution-NonCommercial-NoDerivatives 4.0 International (CC BY-NC-ND 4.0)
‚úì TESS downloaded
Downloading CREMA-D...
Dataset URL: https://www.kaggle.com/datasets/ejlok1/cremad
License(s): ODC Attribution License (ODC-By)
‚úì CREMA-D downloaded

Using DATA_PATH: /kaggle/working/
Found 7 items in /kaggle/working/

 Setup complete!


In [5]:
# Quick Debug: Check what's actually in the directories
# DEBUG CELL: Check dataset structure
import os

# Print a shallow directory tree for each extracted dataset, with a .wav count
# and one sample filename for the top two levels.
for dataset in ['ravdess', 'tess', 'cremad']:
    path = f'/kaggle/working/{dataset}'
    if not os.path.exists(path):
        continue  # this dataset was never extracted

    print(f"\n{dataset.upper()} structure:")
    for root, dirs, files in os.walk(path):
        # Depth of `root` below the dataset directory (0 for the directory itself).
        level = root.replace(path, '').count(os.sep)
        if level >= 3:
            continue  # Only show first 3 levels

        indent = ' ' * 2 * level
        print(f"{indent}{os.path.basename(root)}/")
        if level >= 2:
            continue  # file summaries are limited to the top two levels

        wav_files = [f for f in files if f.endswith('.wav')]
        if wav_files:
            print(f"{indent}  [{len(wav_files)} .wav files]")
            print(f"{indent}  Sample: {wav_files[0]}")


RAVDESS structure:
ravdess/
  Actor_16/
    [60 .wav files]
    Sample: 03-01-03-02-02-02-16.wav
  Actor_02/
    [60 .wav files]
    Sample: 03-01-03-01-01-02-02.wav
  Actor_01/
    [60 .wav files]
    Sample: 03-01-02-02-01-02-01.wav
  Actor_10/
    [60 .wav files]
    Sample: 03-01-05-01-01-02-10.wav
  Actor_12/
    [60 .wav files]
    Sample: 03-01-05-02-01-01-12.wav
  Actor_15/
    [60 .wav files]
    Sample: 03-01-07-01-01-02-15.wav
  Actor_13/
    [60 .wav files]
    Sample: 03-01-01-01-01-01-13.wav
  Actor_05/
    [60 .wav files]
    Sample: 03-01-02-02-01-01-05.wav
  Actor_14/
    [60 .wav files]
    Sample: 03-01-07-01-01-01-14.wav
  Actor_07/
    [60 .wav files]
    Sample: 03-01-04-02-02-01-07.wav
  Actor_06/
    [60 .wav files]
    Sample: 03-01-08-01-01-01-06.wav
  Actor_03/
    [60 .wav files]
    Sample: 03-01-03-01-02-01-03.wav
  Actor_08/
    [60 .wav files]
    Sample: 03-01-03-01-02-01-08.wav
  Actor_09/
    [60 .wav files]
    Sample: 03-01-05-01-01-02-09.wav
  Act

# ============================================
# CELL 3: Configuration
# ============================================

In [13]:
# ============================================
# IMPROVED CELL 3: Better Configuration
# ============================================
class Config:
    """Improved configuration for better accuracy

    Central collection of hyperparameters and paths, held as class attributes.
    The dataset, model and trainer code in this notebook read their settings
    from an instance of this class.
    """
    
    # Project
    project_name = "Speech Emotion Recognition"
    
    # Data
    sample_rate = 16000  # Hz; audio is resampled to this rate when loaded
    duration = 3.0  # seconds; clips are padded or truncated to sample_rate * duration samples
    n_classes = 7  # initial value; prepare_data() overwrites it with the number of labels found
    
    # Features
    n_mfcc = 40  # not referenced by the mel-spectrogram pipeline in this notebook section
    n_mels = 128  # mel bands: spectrogram height and LSTM/Transformer input width
    n_fft = 2048  # FFT window size passed to MelSpectrogram
    hop_length = 512  # hop between successive frames passed to MelSpectrogram
    
    # Data splits
    train_size = 0.7  # informational; the split code derives it from val_size and test_size
    val_size = 0.15  # fraction of the full data set
    test_size = 0.15  # fraction of the full data set
    
    # IMPROVED Training parameters
    batch_size = 64
    epochs = 150  # Increased from 100
    learning_rate = 5e-4  # Reduced from 1e-3
    early_stopping_patience = 15  # Increased from 10
    
    # Model
    model_type = 'ensemble'
    dropout = 0.4  # Increased from 0.3 for better regularization
    
    # IMPROVED Augmentation
    use_augmentation = True
    augment_prob = 0.7  # Increased from 0.5

    # Paths
    data_path = '/kaggle/working/'  # Or '/kaggle/input/' if using Add Data
    save_path = '/kaggle/working/'
    
# Instantiate the shared configuration object and echo the key training settings.
config = Config()
print("‚úÖ Configuration loaded")
print(f"Model type: {config.model_type}")
print(f"Batch size: {config.batch_size}")
print(f"Epochs: {config.epochs}")

‚úÖ Configuration loaded
Model type: ensemble
Batch size: 64
Epochs: 150


# ============================================
# CELL 4: Dataset Class with Memory Optimization
# ============================================

In [14]:
class EmotionDataset(Dataset):
    """
    Memory-efficient dataset for Kaggle

    Audio is loaded from disk on demand in ``__getitem__``, padded or truncated
    to a fixed length, optionally augmented, and converted to a log-mel
    spectrogram. Nothing is cached in memory between calls.
    """

    def __init__(
        self,
        file_paths: List[str],
        labels: List[int],
        config: Config,
        transform=None,
        augment=False
    ):
        """Store the sample list and build the feature-extraction transforms.

        Args:
            file_paths: Paths of the audio files, one per sample.
            labels: Integer class label for each entry of ``file_paths``.
            config: Settings object providing ``sample_rate``, ``duration``,
                ``n_mels``, ``n_fft``, ``hop_length`` and ``augment_prob``.
            transform: Stored on the instance; not applied anywhere in this class.
            augment: When True, each sample is augmented with probability
                ``config.augment_prob``.
        """
        self.file_paths = file_paths
        self.labels = labels
        self.config = config
        self.transform = transform
        self.augment = augment

        # Pre-calculate fixed length
        self.target_length = int(config.sample_rate * config.duration)

        # Number of spectrogram frames produced for a clip of target_length
        # samples (MelSpectrogram pads the signal by default, giving
        # 1 + n_samples // hop_length frames). Used for the error fallback so
        # its shape follows the configuration instead of a hard-coded 94.
        self.n_frames = self.target_length // config.hop_length + 1

        # Build the transforms once; they depend only on the configuration, so
        # re-creating them for every sample is wasted work.
        self.mel_transform = T.MelSpectrogram(
            sample_rate=config.sample_rate,
            n_mels=config.n_mels,
            n_fft=config.n_fft,
            hop_length=config.hop_length
        )
        self.amplitude_to_db = T.AmplitudeToDB()

    def __len__(self):
        """Return the number of samples."""
        return len(self.file_paths)

    def __getitem__(self, idx):
        """Load one sample and return ``(features, label)``.

        ``features`` is the log-mel spectrogram of the clip. If anything fails
        while loading or processing the file, the error is printed and an
        all-zero tensor of shape ``(n_mels, n_frames)`` is returned together
        with the original label, so a bad file does not abort the epoch.
        """
        # Load audio on-demand to save memory
        audio_path = self.file_paths[idx]
        label = self.labels[idx]

        try:
            # Load audio
            waveform, sr = librosa.load(audio_path, sr=self.config.sample_rate, mono=True)

            # Pad or truncate
            if len(waveform) > self.target_length:
                waveform = waveform[:self.target_length]
            else:
                waveform = np.pad(waveform, (0, self.target_length - len(waveform)))

            # Convert to tensor
            waveform = torch.FloatTensor(waveform).unsqueeze(0)

            # Apply augmentation
            if self.augment and random.random() < self.config.augment_prob:
                waveform = self.augment_audio(waveform)

            # Extract features
            features = self.extract_features(waveform)

            return features, label

        except Exception as e:
            print(f"Error loading {audio_path}: {e}")
            # Return zeros if error
            return torch.zeros((self.config.n_mels, self.n_frames)), label

    def augment_audio(self, waveform):
        """Simple augmentation

        Independently, each with probability 0.5: adds Gaussian noise scaled by
        0.005, and circularly shifts the waveform by up to 10% of its length in
        either direction.
        """
        # Add noise
        if random.random() > 0.5:
            noise = torch.randn_like(waveform) * 0.005
            waveform = waveform + noise

        # Time shift
        if random.random() > 0.5:
            shift = int(random.uniform(-0.1, 0.1) * waveform.shape[1])
            waveform = torch.roll(waveform, shift, dims=1)

        return waveform

    def extract_features(self, waveform):
        """Extract mel-spectrogram features

        Converts the waveform to a mel spectrogram, rescales it to decibels and
        drops the leading dimension.
        """
        mel_spec = self.mel_transform(waveform)
        mel_spec_db = self.amplitude_to_db(mel_spec)

        return mel_spec_db.squeeze(0)

# ============================================
# CELL 5: Data Loading and Preparation
# ============================================

In [15]:
def prepare_data(config):
    """
    Load and prepare datasets with correct paths

    Scans the RAVDESS, TESS and CREMA-D folders below the data root and builds
    parallel lists of audio file paths and integer emotion labels. "calm" is
    merged into "neutral". If no audio file is found at all, 100 placeholder
    entries with random labels are generated so the rest of the pipeline can
    still be exercised.

    Args:
        config: Settings object. ``config.data_path`` is used as the data root
            when present (falling back to ``/kaggle/working``), and
            ``config.n_classes`` is overwritten with the number of distinct
            labels found.

    Returns:
        Tuple ``(all_files, all_labels)``: file paths as strings and the
        matching integer labels.
    """
    all_files = []
    all_labels = []

    # Emotion mapping
    emotion_map = {
        'neutral': 0, 'calm': 0,  # Merge calm into neutral
        'happy': 1, 'sad': 2, 'angry': 3,
        'fearful': 4, 'fear': 4,  # Handle variations
        'disgust': 5, 'surprised': 6, 'surprise': 6
    }

    # Dataset-specific codes, built once instead of once per file.
    ravdess_emotions = {
        1: 'neutral', 2: 'calm', 3: 'happy', 4: 'sad',
        5: 'angry', 6: 'fear', 7: 'disgust', 8: 'surprise'
    }
    cremad_emotions = {
        'ANG': 'angry', 'DIS': 'disgust', 'FEA': 'fear',
        'HAP': 'happy', 'NEU': 'neutral', 'SAD': 'sad'
    }
    # Substrings looked for in TESS folder names, in the order they are tested.
    tess_keywords = ('angry', 'disgust', 'fear', 'happy', 'sad', 'neutral', 'surprise')

    # Honour the configured data root rather than a hard-coded directory.
    base_path = Path(getattr(config, 'data_path', None) or '/kaggle/working')

    # RAVDESS dataset - files are in Actor_XX folders
    ravdess_path = base_path / 'ravdess'
    if ravdess_path.exists():
        print("Loading RAVDESS dataset...")
        for actor_folder in ravdess_path.glob('Actor_*'):
            if not actor_folder.is_dir():
                continue
            for audio_file in actor_folder.glob('*.wav'):
                # Parse RAVDESS filename (03-01-06-01-02-01-12.wav); the third
                # field is the numeric emotion code.
                parts = audio_file.stem.split('-')
                if len(parts) < 3:
                    continue
                try:
                    emotion_code = int(parts[2])
                except ValueError:
                    # A stray file with a non-numeric field must not abort the scan.
                    continue
                emotion = ravdess_emotions.get(emotion_code)
                if emotion is not None:
                    all_files.append(str(audio_file))
                    all_labels.append(emotion_map[emotion])
        print(f"  Found {len(all_files)} RAVDESS files")

    # TESS dataset - files are in emotion-specific folders
    tess_path = base_path / 'tess' / 'TESS Toronto emotional speech set data'
    if tess_path.exists():
        print("Loading TESS dataset...")
        initial_count = len(all_files)

        # TESS has folders like OAF_angry, YAF_happy, etc.
        for emotion_folder in tess_path.glob('*'):
            if not emotion_folder.is_dir():
                continue
            folder_name = emotion_folder.name.lower()

            # Extract emotion from folder name
            emotion = next((kw for kw in tess_keywords if kw in folder_name), None)
            if emotion is None:
                continue  # Skip unknown folders

            # Add all wav files from this emotion folder
            for audio_file in emotion_folder.glob('*.wav'):
                all_files.append(str(audio_file))
                all_labels.append(emotion_map[emotion])

        print(f"  Found {len(all_files) - initial_count} TESS files")

    # CREMA-D dataset - files are in AudioWAV folder
    cremad_path = base_path / 'cremad' / 'AudioWAV'
    if cremad_path.exists():
        print("Loading CREMA-D dataset...")
        initial_count = len(all_files)

        for audio_file in cremad_path.glob('*.wav'):
            # CREMA-D format: 1001_DFA_ANG_XX.wav; the third field is the emotion code.
            parts = audio_file.stem.split('_')
            if len(parts) < 3:
                continue
            emotion = cremad_emotions.get(parts[2])
            if emotion is not None:
                all_files.append(str(audio_file))
                all_labels.append(emotion_map[emotion])

        print(f"  Found {len(all_files) - initial_count} CREMA-D files")

    # Summary
    if len(all_files) == 0:
        print("\n‚ö†Ô∏è No audio files found. Please check dataset paths.")
        print("Creating synthetic data for testing...")
        for i in range(100):
            all_files.append(f"dummy_{i}.wav")
            all_labels.append(random.randint(0, 6))
    else:
        print(f"\n‚úÖ Successfully loaded all datasets!")

    print(f"Total samples: {len(all_files)}")

    # Show label distribution. Several names map to the same label; report the
    # first name listed in emotion_map for each label.
    label_counts = Counter(all_labels)
    display_names = {}
    for name, label in emotion_map.items():
        display_names.setdefault(label, name)
    print("\nEmotion distribution:")
    for label, count in sorted(label_counts.items()):
        print(f"  {display_names[label]}: {count} samples")

    # Update number of classes
    config.n_classes = len(set(all_labels))
    print(f"\nNumber of emotion classes: {config.n_classes}")

    return all_files, all_labels

# Collect every labelled audio file once; both splits below derive from these lists.
file_paths, labels = prepare_data(config)

# First cut: hold out the test fraction, keeping class proportions intact.
X_temp, X_test, y_temp, y_test = train_test_split(
    file_paths,
    labels,
    test_size=config.test_size,
    stratify=labels,
    random_state=42,
)

# Second cut: val_size is a fraction of the whole data set, so rescale it to
# the share of the remaining (non-test) samples before splitting again.
val_fraction = config.val_size / (1 - config.test_size)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp,
    y_temp,
    test_size=val_fraction,
    stratify=y_temp,
    random_state=42,
)

print(f"\nDataset splits:")
print(f"  Train: {len(X_train)} samples")
print(f"  Val: {len(X_val)} samples")
print(f"  Test: {len(X_test)} samples")

Loading RAVDESS dataset...
  Found 1440 RAVDESS files
Loading TESS dataset...
  Found 2800 TESS files
Loading CREMA-D dataset...
  Found 7442 CREMA-D files

‚úÖ Successfully loaded all datasets!
Total samples: 11682

Emotion distribution:
  neutral: 1775 samples
  happy: 1863 samples
  sad: 1863 samples
  angry: 1863 samples
  fearful: 1863 samples
  disgust: 1863 samples
  surprised: 592 samples

Number of emotion classes: 7

Dataset splits:
  Train: 8176 samples
  Val: 1753 samples
  Test: 1753 samples


# ============================================
# CELL 6: Model Architectures
# ============================================

In [17]:
class CNNModel(nn.Module):
    """Convolutional classifier over spectrogram inputs.

    Three conv -> batch-norm -> ReLU -> 2x2 max-pool stages (64, 128, 256
    channels) are followed by global average pooling and a three-layer MLP
    that outputs ``config.n_classes`` logits.
    """

    def __init__(self, config):
        """Build the layers; ``config`` supplies ``dropout`` and ``n_classes``."""
        super().__init__()

        def conv_stage(in_channels, out_channels):
            # One feature stage: 3x3 convolution, normalisation, activation, 2x downsampling.
            return nn.Sequential(
                nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1),
                nn.BatchNorm2d(out_channels),
                nn.ReLU(),
                nn.MaxPool2d(2),
            )

        self.conv1 = conv_stage(1, 64)
        self.conv2 = conv_stage(64, 128)
        self.conv3 = conv_stage(128, 256)

        # Collapse whatever spatial extent is left to a single value per channel.
        self.global_pool = nn.AdaptiveAvgPool2d((1, 1))

        # MLP head: 256 -> 128 -> 64 -> n_classes with dropout after each hidden layer.
        self.classifier = nn.Sequential(
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(config.dropout),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(config.dropout),
            nn.Linear(64, config.n_classes),
        )

    def forward(self, x):
        """Return class logits for a 3-D (channel-less) or 4-D input batch."""
        # A 3-D batch has no channel axis yet; insert one for Conv2d.
        if x.dim() == 3:
            x = x.unsqueeze(1)

        for stage in (self.conv1, self.conv2, self.conv3):
            x = stage(x)

        pooled = self.global_pool(x)
        flat = pooled.view(pooled.size(0), -1)
        return self.classifier(flat)


class LSTMModel(nn.Module):
    """Bidirectional LSTM classifier with attention pooling over time.

    A 2-layer bidirectional LSTM (128 hidden units per direction) reads the
    spectrogram frame by frame; a small scoring network weights the time steps
    and the weighted sum is classified into ``config.n_classes`` logits.
    """

    def __init__(self, config):
        """Build the layers; ``config`` supplies ``n_mels``, ``dropout`` and ``n_classes``."""
        super().__init__()

        hidden_size = 128
        feature_size = 2 * hidden_size  # forward and backward states concatenated

        self.lstm = nn.LSTM(
            input_size=config.n_mels,
            hidden_size=hidden_size,
            num_layers=2,
            batch_first=True,
            dropout=config.dropout,
            bidirectional=True,
        )

        # Produces one unnormalised relevance score per time step.
        self.attention = nn.Sequential(
            nn.Linear(feature_size, 128),
            nn.Tanh(),
            nn.Linear(128, 1),
        )

        self.classifier = nn.Sequential(
            nn.Linear(feature_size, 128),
            nn.ReLU(),
            nn.Dropout(config.dropout),
            nn.Linear(128, config.n_classes),
        )

    def forward(self, x):
        """Return class logits for a 3-D batch or a 4-D batch with a singleton channel axis."""
        # Drop the channel axis when present, then put the last axis (time) before the features.
        if x.dim() == 4:
            x = x.squeeze(1)
        sequence = x.transpose(1, 2)

        hidden_states, _ = self.lstm(sequence)

        # Normalise the scores across time and take the weighted sum of the states.
        scores = self.attention(hidden_states)
        time_weights = F.softmax(scores, dim=1)
        context = (hidden_states * time_weights).sum(dim=1)

        return self.classifier(context)


class TransformerModel(nn.Module):
    """Transformer for emotion recognition

    Each spectrogram frame is projected to 256 dimensions, tagged with a
    sinusoidal positional encoding, passed through a 4-layer Transformer
    encoder and mean-pooled over time before classification.

    The positional encoding is computed on the fly and holds no parameters or
    buffers, so the state-dict layout is the same as without it. It is needed
    because self-attention followed by mean pooling is otherwise invariant to
    the order of the frames, i.e. the model could not tell a clip from its
    time-shuffled version.
    """

    def __init__(self, config):
        """Build the layers; ``config`` supplies ``n_mels``, ``dropout`` and ``n_classes``."""
        super().__init__()

        self.input_projection = nn.Linear(config.n_mels, 256)

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=256,
            nhead=8,
            dim_feedforward=512,
            dropout=config.dropout,
            batch_first=True
        )

        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=4)

        self.classifier = nn.Sequential(
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(config.dropout),
            nn.Linear(128, config.n_classes)
        )

    @staticmethod
    def _positional_encoding(seq_len, d_model, device, dtype):
        """Return the (seq_len, d_model) sinusoidal position table.

        Even columns hold sines and odd columns cosines, with wavelengths
        growing geometrically across the feature dimension. Computed in float32
        and cast to ``dtype`` so it can be added to half-precision activations.
        """
        positions = torch.arange(seq_len, device=device, dtype=torch.float32).unsqueeze(1)
        exponents = torch.arange(0, d_model, 2, device=device, dtype=torch.float32) / d_model
        angles = positions / (10000.0 ** exponents)

        table = torch.zeros(seq_len, d_model, device=device, dtype=torch.float32)
        table[:, 0::2] = torch.sin(angles)
        table[:, 1::2] = torch.cos(angles)
        return table.to(dtype)

    def forward(self, x):
        """Return class logits for a 3-D batch or a 4-D batch with a singleton channel axis."""
        # Reshape (batch, time, features)
        if x.dim() == 4:
            x = x.squeeze(1)
        x = x.transpose(1, 2)

        x = self.input_projection(x)
        # Inject frame order; broadcast over the batch dimension.
        x = x + self._positional_encoding(x.size(1), x.size(2), x.device, x.dtype)
        x = self.transformer(x)

        # Global average pooling
        x = x.mean(dim=1)

        return self.classifier(x)


class EnsembleModel(nn.Module):
    """Ensemble of multiple models

    Runs the CNN, LSTM and Transformer classifiers on the same input and
    returns a weighted average of their logits. The three mixing weights are
    learnable and are passed through a softmax so they stay positive and sum
    to one.
    """

    def __init__(self, config):
        """Instantiate the three sub-models from ``config`` and the mixing weights."""
        super().__init__()

        self.cnn = CNNModel(config)
        self.lstm = LSTMModel(config)
        self.transformer = TransformerModel(config)

        # Learnable weights for ensemble (start out equal)
        self.weights = nn.Parameter(torch.ones(3) / 3)

    def forward(self, x):
        """Return the softmax-weighted average of the three sub-model logits."""
        cnn_out = self.cnn(x)
        lstm_out = self.lstm(x)
        transformer_out = self.transformer(x)

        # Weighted average
        w = F.softmax(self.weights, dim=0)
        output = w[0] * cnn_out + w[1] * lstm_out + w[2] * transformer_out

        return output

    def augment_audio(self, waveform):
        """Enhanced augmentation for better generalization

        NOTE(review): this operates on raw waveforms and is not called by
        ``forward``; it looks like it was meant for the dataset class rather
        than the model. It is kept here so existing callers keep working.

        Applies up to four independent perturbations, each with a 30% chance:
        additive Gaussian noise, circular time shift, speed change by index
        resampling (padded or truncated back to the input length) and volume
        scaling. The returned waveform has the same shape as the input.
        """
        # 1. Add noise (30% chance)
        if random.random() > 0.7:
            noise_factor = random.uniform(0.002, 0.01)
            noise = torch.randn_like(waveform) * noise_factor
            waveform = waveform + noise

        # 2. Time shift (30% chance)
        if random.random() > 0.7:
            shift = int(random.uniform(-0.2, 0.2) * waveform.shape[1])
            waveform = torch.roll(waveform, shift, dims=1)

        # 3. Speed change simulation (30% chance)
        if random.random() > 0.7:
            speed_factor = random.uniform(0.9, 1.1)
            # Simple speed change by resampling
            old_length = waveform.shape[1]
            new_length = int(old_length * speed_factor)
            indices = torch.linspace(0, old_length - 1, new_length).long()
            waveform = waveform[:, indices]
            # Pad or truncate back to original length
            if waveform.shape[1] > old_length:
                waveform = waveform[:, :old_length]
            else:
                padding = old_length - waveform.shape[1]
                waveform = torch.nn.functional.pad(waveform, (0, padding))

        # 4. Volume change (30% chance)
        if random.random() > 0.7:
            volume_factor = random.uniform(0.7, 1.3)
            waveform = waveform * volume_factor

        return waveform

# ============================================
# CELL 7: Training Functions
# ============================================

In [18]:
# ============================================
# CELL 7: IMPROVED Trainer with Better Scheduling
# ============================================
from torch.cuda.amp import autocast, GradScaler
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
import gc

class Trainer:
    """Training manager: mixed precision, gradient clipping, cosine warm restarts, early stopping.

    Args:
        model: Network to train; it is moved to ``device``.
        config: Object exposing ``learning_rate``, ``epochs`` and
            ``early_stopping_patience``. It is also stored inside each checkpoint.
        device: Target device (``torch.device`` or device string).
        checkpoint_path: File that receives the best checkpoint (highest
            validation accuracy seen so far). Defaults to the Kaggle working
            directory used by the rest of the notebook.

    Attributes:
        train_losses, val_losses, train_accs, val_accs: Per-epoch history lists
            appended to by :meth:`fit`.
    """

    def __init__(self, model, config, device,
                 checkpoint_path='/kaggle/working/best_model.pth'):
        self.model = model.to(device)
        self.config = config
        self.device = device
        self.checkpoint_path = checkpoint_path

        self.criterion = nn.CrossEntropyLoss()
        self.optimizer = optim.Adam(self.model.parameters(), lr=config.learning_rate)

        # IMPROVED: Use Cosine Annealing with Warm Restarts for better convergence
        self.scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(
            self.optimizer,
            T_0=10,  # Restart every 10 epochs
            T_mult=2,  # Double the restart interval each time
            eta_min=1e-6  # Minimum learning rate
        )

        # Mixed precision only makes sense on CUDA; on any other device both
        # autocast and the gradient scaler are created in disabled mode.
        self.use_amp = torch.device(device).type == 'cuda'
        self.scaler = GradScaler(enabled=self.use_amp)

        self.train_losses = []
        self.val_losses = []
        self.train_accs = []
        self.val_accs = []

    @staticmethod
    def _epoch_metrics(total_loss, correct, total, num_batches, phase):
        """Turn running sums into (mean loss per batch, accuracy in percent)."""
        if num_batches == 0 or total == 0:
            raise ValueError(
                f"{phase} dataloader yielded no samples; cannot compute loss/accuracy"
            )
        return total_loss / num_batches, 100. * correct / total

    def train_epoch(self, dataloader):
        """Run one optimisation pass over ``dataloader``.

        Returns:
            Tuple ``(mean_loss_per_batch, accuracy_percent)``.

        Raises:
            ValueError: If the dataloader yields no samples.
        """
        self.model.train()
        total_loss = 0
        correct = 0
        total = 0
        num_batches = 0

        for batch_idx, (features, labels) in enumerate(tqdm(dataloader, desc="Training")):
            features = features.to(self.device)
            labels = labels.to(self.device)

            self.optimizer.zero_grad()

            # Mixed precision forward pass (no-op when AMP is disabled)
            with autocast(enabled=self.use_amp):
                outputs = self.model(features)
                loss = self.criterion(outputs, labels)

            # Scaled backprop for mixed precision
            self.scaler.scale(loss).backward()

            # Gradients must be unscaled before clipping so the 1.0 threshold
            # applies to their true magnitude.
            self.scaler.unscale_(self.optimizer)
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)

            self.scaler.step(self.optimizer)
            self.scaler.update()

            total_loss += loss.item()
            _, predicted = outputs.max(1)
            total += labels.size(0)
            correct += predicted.eq(labels).sum().item()
            num_batches += 1

            # Clear cache periodically
            if batch_idx % 10 == 0:
                torch.cuda.empty_cache()

        return self._epoch_metrics(total_loss, correct, total, num_batches, "training")

    def validate(self, dataloader):
        """Evaluate the model on ``dataloader`` without updating weights.

        Returns:
            Tuple ``(mean_loss_per_batch, accuracy_percent)``.

        Raises:
            ValueError: If the dataloader yields no samples.
        """
        self.model.eval()
        total_loss = 0
        correct = 0
        total = 0
        num_batches = 0

        with torch.no_grad():
            with autocast(enabled=self.use_amp):
                for features, labels in tqdm(dataloader, desc="Validation"):
                    features = features.to(self.device)
                    labels = labels.to(self.device)

                    outputs = self.model(features)
                    loss = self.criterion(outputs, labels)

                    total_loss += loss.item()
                    _, predicted = outputs.max(1)
                    total += labels.size(0)
                    correct += predicted.eq(labels).sum().item()
                    num_batches += 1

        return self._epoch_metrics(total_loss, correct, total, num_batches, "validation")

    def fit(self, train_loader, val_loader):
        """Train for up to ``config.epochs`` epochs with early stopping.

        After every epoch the history lists are extended, the scheduler is
        stepped, and the model is checkpointed to ``self.checkpoint_path``
        whenever validation accuracy improves. Training stops once
        ``config.early_stopping_patience`` consecutive epochs bring no improvement.

        Returns:
            The model in its state after the last executed epoch (not
            necessarily the best checkpoint).
        """
        best_val_acc = 0
        patience_counter = 0

        for epoch in range(self.config.epochs):
            print(f"\nEpoch {epoch+1}/{self.config.epochs}")

            # Training
            train_loss, train_acc = self.train_epoch(train_loader)
            self.train_losses.append(train_loss)
            self.train_accs.append(train_acc)

            # Validation
            val_loss, val_acc = self.validate(val_loader)
            self.val_losses.append(val_loss)
            self.val_accs.append(val_acc)

            # IMPROVED: Step scheduler every epoch (for CosineAnnealingWarmRestarts).
            # The rate is read after the step, so the value printed below is the
            # one the next epoch will use.
            self.scheduler.step()
            current_lr = self.scheduler.get_last_lr()[0]

            print(f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}%")
            print(f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.2f}%")
            print(f"Learning Rate: {current_lr:.6f}")

            # Save best model ('epoch' is the 0-based loop index)
            if val_acc > best_val_acc:
                best_val_acc = val_acc
                torch.save({
                    'epoch': epoch,
                    'model_state_dict': self.model.state_dict(),
                    'optimizer_state_dict': self.optimizer.state_dict(),
                    'val_acc': val_acc,
                    'config': self.config
                }, self.checkpoint_path)
                patience_counter = 0
                print(f"‚úì Saved best model with {val_acc:.2f}% accuracy")
            else:
                patience_counter += 1

            # Early stopping
            if patience_counter >= self.config.early_stopping_patience:
                print(f"Early stopping at epoch {epoch+1}")
                break

            # Memory cleanup
            gc.collect()
            torch.cuda.empty_cache()

        return self.model

# ============================================
# CELL 8: Create DataLoaders
# ============================================

In [19]:
# Wrap every split in an EmotionDataset; augmentation is enabled for training only
train_dataset = EmotionDataset(X_train, y_train, config, augment=True)
val_dataset = EmotionDataset(X_val, y_val, config, augment=False)
test_dataset = EmotionDataset(X_test, y_test, config, augment=False)

# Loader options shared by all three splits (num_workers=0 for Kaggle)
_loader_kwargs = dict(batch_size=config.batch_size, num_workers=0, pin_memory=True)

# Only the training loader reshuffles; validation and test keep dataset order
train_loader = DataLoader(train_dataset, shuffle=True, **_loader_kwargs)
val_loader = DataLoader(val_dataset, shuffle=False, **_loader_kwargs)
test_loader = DataLoader(test_dataset, shuffle=False, **_loader_kwargs)

print(f"DataLoaders created successfully!")

DataLoaders created successfully!


# ============================================
# CELL 9: Train Model with DataParallel
# ============================================

In [20]:
# ============================================
# CELL 9a: Train Individual Models for Comparison
# ============================================

# Best validation accuracy (percent) reached by each single-architecture model
results = {}

# Train CNN only
# NOTE: `config` is shared and mutated in place; epochs=50 stays in effect for
# all three individual runs below.
print("Training CNN model...")
config.model_type = 'cnn'
config.epochs = 50
cnn_model = CNNModel(config)
cnn_trainer = Trainer(cnn_model, config, device)
cnn_trainer.fit(train_loader, val_loader)
results['CNN'] = max(cnn_trainer.val_accs)

# Train LSTM only
# NOTE(review): every Trainer saves its best checkpoint to the same
# '/kaggle/working/best_model.pth', so each run overwrites the previous one.
print("\nTraining LSTM model...")
config.model_type = 'lstm'
lstm_model = LSTMModel(config)
lstm_trainer = Trainer(lstm_model, config, device)
lstm_trainer.fit(train_loader, val_loader)
results['LSTM'] = max(lstm_trainer.val_accs)

# Train Transformer only
print("\nTraining Transformer model...")
config.model_type = 'transformer'
transformer_model = TransformerModel(config)
transformer_trainer = Trainer(transformer_model, config, device)
transformer_trainer.fit(train_loader, val_loader)
results['Transformer'] = max(transformer_trainer.val_accs)

# Summary of the three individual runs
print("\nIndividual Model Results:")
for model_name, acc in results.items():
    print(f"  {model_name}: {acc:.2f}%")

# Now train ensemble
# This cell only switches the shared config to the ensemble settings; the model
# is actually built and trained by the next cell, which reads config.model_type.
print("\nTraining Ensemble model...")
config.model_type = 'ensemble'
config.epochs = 100
# Continue with original CELL 9...

Training CNN model...

Epoch 1/50


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 128/128 [00:54<00:00,  2.33it/s]
Validation: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 28/28 [00:06<00:00,  4.00it/s]


Train Loss: 1.7324, Train Acc: 28.96%
Val Loss: 1.5430, Val Acc: 36.28%
Learning Rate: 0.000488
‚úì Saved best model with 36.28% accuracy

Epoch 2/50


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 128/128 [00:37<00:00,  3.39it/s]
Validation: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 28/28 [00:06<00:00,  4.05it/s]


Train Loss: 1.5942, Train Acc: 34.21%
Val Loss: 1.7762, Val Acc: 27.44%
Learning Rate: 0.000452

Epoch 3/50


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 128/128 [00:37<00:00,  3.38it/s]
Validation: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 28/28 [00:06<00:00,  4.10it/s]


Train Loss: 1.5504, Train Acc: 35.42%
Val Loss: 1.4692, Val Acc: 39.53%
Learning Rate: 0.000397
‚úì Saved best model with 39.53% accuracy

Epoch 4/50


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 128/128 [00:37<00:00,  3.39it/s]
Validation: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 28/28 [00:06<00:00,  4.18it/s]


Train Loss: 1.5025, Train Acc: 37.63%
Val Loss: 1.4117, Val Acc: 41.64%
Learning Rate: 0.000328
‚úì Saved best model with 41.64% accuracy

Epoch 5/50


Training:   4%|‚ñç         | 5/128 [00:01<00:40,  3.04it/s]


KeyboardInterrupt: 

In [None]:
# ============================================
# CELL 9: Train Model (COMPLETE VERSION)
# ============================================

# Pick the architecture named in the config; any unrecognised value means ensemble
_single_model_classes = {
    'cnn': CNNModel,
    'lstm': LSTMModel,
    'transformer': TransformerModel,
}
model = _single_model_classes.get(config.model_type, EnsembleModel)(config)

# Spread batches across GPUs when more than one is visible
gpu_count = torch.cuda.device_count()
if gpu_count > 1:
    print(f"Using {gpu_count} GPUs!")
    model = DataParallel(model)

parameter_count = sum(p.numel() for p in model.parameters())
print(f"Model: {config.model_type}")
print(f"Parameters: {parameter_count:,}")

# Build the trainer around the selected model
trainer = Trainer(model, config, device)

# Run the full training loop, framed by banner lines
banner_rule = "=" * 50
print("\n" + banner_rule)
print("Starting Training...")
print(banner_rule)

trained_model = trainer.fit(train_loader, val_loader)

print("\n" + banner_rule)
print("Training Complete!")
print(banner_rule)

# Confirm that training left a best-model checkpoint on disk
import os
best_checkpoint_file = '/kaggle/working/best_model.pth'
if not os.path.exists(best_checkpoint_file):
    print("‚ö†Ô∏è Model file not found. Training may have failed.")
else:
    print("‚úÖ Model saved successfully!")
    file_size = os.path.getsize(best_checkpoint_file) / (1024*1024)
    print(f"Model file size: {file_size:.2f} MB")

# ============================================
# CELL 10: Evaluation and Visualization
# ============================================

In [None]:
def evaluate_model(model, test_loader, device):
    """Run the model over a loader and collect predictions, labels and class probabilities.

    The model is switched to eval mode and evaluated without gradient tracking.

    Returns:
        Three NumPy arrays: predicted class indices, true labels, and the
        softmax probabilities of every sample.
    """
    model.eval()
    pred_rows, label_rows, prob_rows = [], [], []

    with torch.no_grad():
        for batch_features, batch_labels in tqdm(test_loader, desc="Testing"):
            logits = model(batch_features.to(device))
            class_probs = F.softmax(logits, dim=1)
            winners = logits.max(1)[1]

            # Accumulate per-sample entries on the CPU
            pred_rows += list(winners.cpu().numpy())
            label_rows += list(batch_labels.numpy())
            prob_rows += list(class_probs.cpu().numpy())

    return np.array(pred_rows), np.array(label_rows), np.array(prob_rows)

# Load best model - FIXED for PyTorch 2.6
# weights_only=False is needed because the checkpoint also stores the pickled config
# object; map_location=device lets a checkpoint written on one device be restored
# on another (e.g. a GPU checkpoint in a CPU-only session).
checkpoint = torch.load('/kaggle/working/best_model.pth', weights_only=False, map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
# The checkpoint stores the 0-based loop index; +1 matches the "Epoch N/M" training log.
print(f"‚úÖ Loaded best model from epoch {checkpoint['epoch'] + 1} with {checkpoint['val_acc']:.2f}% validation accuracy")

# Evaluate on test set
print("\nEvaluating on test set...")
preds, labels, probs = evaluate_model(model, test_loader, device)

# Calculate metrics
accuracy = accuracy_score(labels, preds)
print(f"\nüéØ Test Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")

# Classification report
emotion_names = ['neutral', 'happy', 'sad', 'angry', 'fear', 'disgust', 'surprise']
print("\n" + "="*60)
print("Classification Report:")
print("="*60)
print(classification_report(labels, preds, target_names=emotion_names[:config.n_classes], digits=3))

# Confusion Matrix (rows: true class, columns: predicted class)
cm = confusion_matrix(labels, preds)
print("\n" + "="*60)
print("Confusion Matrix:")
print("="*60)
print(cm)

# Calculate per-class accuracy: correct predictions divided by samples of that true class
per_class_acc = cm.diagonal() / cm.sum(axis=1)
print("\n" + "="*60)
print("Per-Class Accuracy:")
print("="*60)
for i, emotion in enumerate(emotion_names[:config.n_classes]):
    if i < len(per_class_acc):
        print(f"  {emotion:10s}: {per_class_acc[i]:.3f} ({per_class_acc[i]*100:.1f}%)")

print("\n‚úÖ Evaluation complete!")

# ============================================
# CELL 11: Advanced Visualizations
# ============================================

In [None]:
# 1. Training History: loss curves on the left panel, accuracy curves on the right
fig = make_subplots(rows=1, cols=2, subplot_titles=('Loss', 'Accuracy'))

history_traces = [
    (trainer.train_losses, 'Train Loss', 1),
    (trainer.val_losses, 'Val Loss', 1),
    (trainer.train_accs, 'Train Acc', 2),
    (trainer.val_accs, 'Val Acc', 2),
]
for series, trace_name, panel in history_traces:
    fig.add_trace(
        go.Scatter(y=series, name=trace_name, mode='lines'),
        row=1, col=panel
    )

fig.update_layout(height=400, title_text="Training History")
fig.show()

# 2. Confusion Matrix Heatmap, labelled with the class names in use
class_labels = emotion_names[:config.n_classes]
fig = px.imshow(
    cm,
    labels=dict(x="Predicted", y="True", color="Count"),
    x=class_labels,
    y=class_labels,
    title="Confusion Matrix",
    color_continuous_scale="Blues",
    text_auto=True
)
fig.update_layout(width=600, height=500)
fig.show()

# 3. Per-class Performance: diagonal of the confusion matrix over each row total
per_class_acc = cm.diagonal() / cm.sum(axis=1)
accuracy_bars = go.Bar(x=emotion_names[:config.n_classes], y=per_class_acc)
fig = go.Figure(data=[accuracy_bars])
fig.update_layout(
    title="Per-Class Accuracy",
    xaxis_title="Emotion",
    yaxis_title="Accuracy",
    yaxis_range=[0, 1]
)
fig.show()

# ============================================
# CELL 12: Feature Importance Analysis
# ============================================

In [None]:
def extract_features_classical(file_paths, config, max_files=100):
    """Extract a fixed-length feature vector per audio file for classical ML.

    Each vector concatenates the per-coefficient mean and standard deviation of
    the MFCCs and of the chroma features, followed by the mean spectral
    centroid, spectral bandwidth, spectral roll-off and zero-crossing rate.

    Args:
        file_paths: Sequence of audio file paths.
        config: Object exposing ``sample_rate`` and ``n_mfcc``.
        max_files: Only the first ``max_files`` paths are processed (demo
            limit). Pass ``None`` to process every path.

    Returns:
        Array with one row per processed file. A file that cannot be processed
        contributes an all-zero row of the same length, so row order stays
        aligned with ``file_paths``.
    """
    n_chroma = 12  # matches the 12 chroma names used for the importance plot
    # 2 statistics per MFCC + 2 statistics per chroma bin + 4 scalar descriptors
    feature_size = 2 * config.n_mfcc + 2 * n_chroma + 4

    selected_paths = file_paths if max_files is None else file_paths[:max_files]
    features = []
    failed_paths = []

    for path in tqdm(selected_paths, desc="Extracting features"):
        try:
            y, sr = librosa.load(path, sr=config.sample_rate)

            # MFCC
            mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=config.n_mfcc)
            mfcc_mean = np.mean(mfcc, axis=1)
            mfcc_std = np.std(mfcc, axis=1)

            # Chroma
            chroma = librosa.feature.chroma_stft(y=y, sr=sr, n_chroma=n_chroma)
            chroma_mean = np.mean(chroma, axis=1)
            chroma_std = np.std(chroma, axis=1)

            # Spectral features
            spec_cent = np.mean(librosa.feature.spectral_centroid(y=y, sr=sr))
            spec_bw = np.mean(librosa.feature.spectral_bandwidth(y=y, sr=sr))
            rolloff = np.mean(librosa.feature.spectral_rolloff(y=y, sr=sr))
            zcr = np.mean(librosa.feature.zero_crossing_rate(y))

            # Combine features
            feature_vector = np.hstack([
                mfcc_mean, mfcc_std,
                chroma_mean, chroma_std,
                spec_cent, spec_bw, rolloff, zcr
            ])

            features.append(feature_vector)
        except Exception:
            # Best effort: keep the row (as zeros) so features stay aligned with
            # labels, but remember the failure. KeyboardInterrupt is not caught.
            failed_paths.append(path)
            features.append(np.zeros(feature_size))

    if failed_paths:
        print(f"Feature extraction failed for {len(failed_paths)} file(s); zero vectors were used instead")

    return np.array(features)

# Build a small classical-ML baseline from the first 100 training samples
print("Extracting classical features for comparison...")
X_train_classical = extract_features_classical(X_train[:100], config)
y_train_classical = y_train[:100]

# Fit a Random Forest on the hand-crafted features
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_classical, y_train_classical)

# Names follow the layout of the feature vector: MFCC means, MFCC stds,
# chroma means, chroma stds, then the four scalar spectral descriptors
feature_names = [
    f'{family}_{i}_{stat}'
    for family, count in (('MFCC', config.n_mfcc), ('Chroma', 12))
    for stat in ('mean', 'std')
    for i in range(count)
] + ['Spec_Centroid', 'Spec_Bandwidth', 'Rolloff', 'ZCR']

# Indices of the 20 largest importances, in ascending order of importance
importances = rf_model.feature_importances_
top_features_idx = np.argsort(importances)[-20:]

top_feature_bars = go.Bar(
    x=importances[top_features_idx],
    y=[feature_names[i] for i in top_features_idx],
    orientation='h'
)
fig = go.Figure(data=[top_feature_bars])
fig.update_layout(
    title="Top 20 Most Important Features (Random Forest)",
    xaxis_title="Importance",
    yaxis_title="Feature",
    height=500
)
fig.show()

# ============================================
# CELL 13: Model Interpretation (Attention Weights)
# ============================================

In [None]:
# ============================================
# CELL 13: Model Interpretation (MODIFIED)
# ============================================

# For the ensemble, report how strongly each branch contributes
if config.model_type == 'ensemble':
    # A DataParallel wrapper keeps the real network under .module
    weights = model.module.weights if isinstance(model, DataParallel) else model.weights

    weights_normalized = F.softmax(weights, dim=0)

    print("Ensemble Model Weights:")
    for slot, branch in enumerate(('CNN', 'LSTM', 'Transformer')):
        print(f"  {branch} Weight: {weights_normalized[slot].item():.3f}")

    # Bar chart of the normalised branch weights
    import plotly.graph_objects as go

    weight_bars = go.Bar(
        x=['CNN', 'LSTM', 'Transformer'],
        y=weights_normalized.detach().cpu().numpy(),
        marker_color=['blue', 'green', 'red']
    )
    fig = go.Figure(data=[weight_bars])
    fig.update_layout(
        title="Ensemble Model Contribution Weights",
        yaxis_title="Weight",
        yaxis_range=[0, 1]
    )
    fig.show()

# Most frequent off-diagonal cells of the confusion matrix
section_rule = "=" * 60
print("\n" + section_rule)
print("Most Common Misclassifications:")
print(section_rule)

# Keep (true emotion, predicted emotion, count) for every wrong pairing seen more than 10 times
confusion_pairs = [
    (emotion_names[true_idx], emotion_names[pred_idx], cm[true_idx, pred_idx])
    for true_idx in range(len(cm))
    for pred_idx in range(len(cm))
    if true_idx != pred_idx and cm[true_idx, pred_idx] > 10
]

# Most frequent confusions first
confusion_pairs.sort(key=lambda x: x[2], reverse=True)

for true_em, pred_em, count in confusion_pairs[:10]:
    print(f"  {true_em:10s} misclassified as {pred_em:10s}: {count} times")

# Per-emotion summary ranked by accuracy
print("\n" + section_rule)
print("Performance Summary by Emotion:")
print(section_rule)

performance = [
    {
        'Emotion': emotion,
        'Accuracy': per_class_acc[i],
        'Correct': cm[i, i],
        'Total': cm[i].sum(),
        'Errors': cm[i].sum() - cm[i, i]
    }
    for i, emotion in enumerate(emotion_names[:config.n_classes])
    if i < len(per_class_acc)
]

# Best-recognised emotion first
performance.sort(key=lambda x: x['Accuracy'], reverse=True)

print(f"{'Rank':<5} {'Emotion':<10} {'Accuracy':<10} {'Correct/Total':<15}")
print("-" * 50)
for rank, perf in enumerate(performance, 1):
    print(f"{rank:<5} {perf['Emotion']:<10} {perf['Accuracy']*100:>6.1f}%    {perf['Correct']:>3}/{perf['Total']:<3}")

# ============================================
# CELL 14: Error Analysis
# ============================================

In [None]:
def error_analysis(preds, labels, probs, emotion_names):
    """Summarise where and how confidently the model is wrong.

    Prints the error count, the ten most frequent "true -> predicted" pairs and
    the mean top-class probability of correct versus incorrect predictions,
    then shows an overlaid histogram of both confidence distributions.
    Nothing is printed or plotted when there are no misclassifications.
    """
    # Boolean mask and positions of the misclassified samples
    wrong = preds != labels
    wrong_positions = np.where(wrong)[0]
    n_wrong = len(wrong_positions)
    if n_wrong == 0:
        return

    print(f"Total errors: {n_wrong} / {len(labels)} ({100*n_wrong/len(labels):.1f}%)")

    # Count each "true -> predicted" pairing
    pair_counts = {}
    for pos in wrong_positions:
        key = f"{emotion_names[labels[pos]]} -> {emotion_names[preds[pos]]}"
        pair_counts[key] = pair_counts.get(key, 0) + 1

    # Highest counts first
    ranked_pairs = sorted(pair_counts.items(), key=lambda item: item[1], reverse=True)

    print("\nMost Common Confusions:")
    for key, occurrences in ranked_pairs[:10]:
        print(f"  {key}: {occurrences} times")

    # Confidence = probability assigned to the winning class
    right_confidence = probs[~wrong].max(axis=1)
    wrong_confidence = probs[wrong].max(axis=1)
    right_mean = right_confidence.mean()
    wrong_mean = wrong_confidence.mean()

    print(f"\nAverage Confidence:")
    print(f"  Correct predictions: {right_mean:.3f}")
    print(f"  Incorrect predictions: {wrong_mean:.3f}")

    # Overlay both confidence distributions
    fig = go.Figure()
    for series, legend in ((right_confidence, 'Correct'), (wrong_confidence, 'Incorrect')):
        fig.add_trace(go.Histogram(
            x=series,
            name=legend,
            opacity=0.7,
            nbinsx=30
        ))
    fig.update_layout(
        title="Confidence Distribution",
        xaxis_title="Confidence",
        yaxis_title="Count",
        barmode='overlay'
    )
    fig.show()

# Perform error analysis on the test-set predictions, labels and probabilities,
# using only the first config.n_classes emotion names
error_analysis(preds, labels, probs, emotion_names[:config.n_classes])

# ============================================
# CELL 15: Save Results and Model
# ============================================

In [None]:
# Collect the per-epoch curves recorded by the trainer
training_history = {
    'train_losses': trainer.train_losses,
    'val_losses': trainer.val_losses,
    'train_accs': trainer.train_accs,
    'val_accs': trainer.val_accs
}

# Everything needed to reproduce the evaluation tables, as JSON-friendly lists
results = {
    'test_accuracy': accuracy,
    'predictions': preds.tolist(),
    'true_labels': labels.tolist(),
    'probabilities': probs.tolist(),
    'confusion_matrix': cm.tolist(),
    'training_history': training_history
}

with open('/kaggle/working/results.json', 'w') as results_file:
    json.dump(results, results_file)

print("Results saved to results.json")

# Bundle the weights with the config and label names for deployment
deployment_bundle = {
    'model_state_dict': model.state_dict(),
    'model_config': config,
    'emotion_names': emotion_names[:config.n_classes],
    'test_accuracy': accuracy
}
torch.save(deployment_bundle, '/kaggle/working/final_model.pth')

print("Model saved to final_model.pth")

# ============================================
# CELL 16: Generate Project Report
# ============================================

In [None]:
# Values interpolated into the report header
parameter_total = sum(p.numel() for p in model.parameters())
epochs_trained = len(trainer.train_losses)

report = f"""
# Speech Emotion Recognition Project Report

## 1. Project Overview
- **Objective**: Develop a deep learning system for emotion recognition from speech
- **Model Type**: {config.model_type.upper()}
- **Number of Classes**: {config.n_classes}
- **Total Samples**: {len(file_paths)}
- **Train/Val/Test Split**: {config.train_size}/{config.val_size}/{config.test_size}

## 2. Model Architecture
- **Parameters**: {parameter_total:,}
- **Input Features**: Mel-spectrogram ({config.n_mels} bins)
- **Batch Size**: {config.batch_size}
- **Learning Rate**: {config.learning_rate}
- **Epochs Trained**: {epochs_trained}

## 3. Performance Results
- **Test Accuracy**: {accuracy:.4f}
- **Best Validation Accuracy**: {checkpoint['val_acc']:.2f}%

## 4. Per-Class Performance
"""

# One bullet per emotion that has a per-class accuracy entry
report += "".join(
    f"- {emotion}: {per_class_acc[i]:.3f}\n"
    for i, emotion in enumerate(emotion_names[:config.n_classes])
    if i < len(per_class_acc)
)

report += """
## 5. Key Findings
1. The model successfully learns to distinguish between different emotions
2. Some emotion pairs show higher confusion rates (see error analysis)
3. Ensemble models generally perform better than individual architectures

## 6. Future Improvements
1. Implement data augmentation techniques (pitch shift, time stretch)
2. Try pre-trained models (Wav2Vec2, HuBERT)
3. Collect more diverse training data
4. Implement real-time emotion recognition

## 7. Technologies Used
- **Deep Learning**: PyTorch, TorchAudio
- **Audio Processing**: Librosa
- **Visualization**: Plotly
- **Environment**: Kaggle GPU
"""

print(report)

# Persist the report as Markdown next to the other artefacts
with open('/kaggle/working/project_report.md', 'w') as report_file:
    report_file.write(report)

print("\nProject completed successfully! üéâ")
print("Files saved:")
for saved_name in ("best_model.pth", "final_model.pth", "results.json", "project_report.md"):
    print(f"- {saved_name}")

# ============================================
# CELL 17: Test Time Augmentation for Better Accuracy
# ============================================

In [None]:
def evaluate_with_tta(model, test_loader, device, n_augmentations=5):
    """
    Evaluate with Test Time Augmentation (TTA).

    Every batch is scored once unchanged and ``n_augmentations - 1`` more times
    under light perturbations (noise, a small roll along the last axis,
    amplitude scaling, then further noise for any extra passes). The class
    probabilities of all passes are averaged before taking the argmax.
    Whether this helps depends on the model and data; compare against the
    plain evaluation.

    Args:
        model: Trained network; switched to eval mode.
        test_loader: Iterable of ``(features, labels)`` batches.
        device: Device the features are moved to for inference.
        n_augmentations: Total number of passes per batch, including the
            unmodified one.

    Returns:
        Three NumPy arrays: predicted class indices, true labels, and the
        averaged float32 class probabilities (one row per sample).
    """
    model.eval()
    all_preds = []
    all_labels = []
    all_probs_list = []

    # Autocast is only meaningful on CUDA; elsewhere it is created disabled.
    use_amp = torch.device(device).type == 'cuda'

    print(f"Applying Test Time Augmentation with {n_augmentations} augmentations per sample...")

    with torch.no_grad():
        for features, labels in tqdm(test_loader, desc="TTA Testing"):
            batch_probs = []

            # Original prediction
            features_gpu = features.to(device)
            with autocast(enabled=use_amp):
                outputs = model(features_gpu)
            # Logits produced under autocast can be half precision; convert them
            # so the softmax and the averaging below run in float32.
            probs = F.softmax(outputs.float(), dim=1)
            batch_probs.append(probs.cpu())

            # Augmented predictions
            for aug_idx in range(n_augmentations - 1):
                # Apply different augmentations
                aug_features = features.clone()

                if aug_idx == 0:
                    # Add slight noise
                    aug_features = aug_features + torch.randn_like(aug_features) * 0.003
                elif aug_idx == 1:
                    # Slight time shift (randint's upper bound is exclusive: -5..4)
                    shift_amount = torch.randint(-5, 5, (1,)).item()
                    aug_features = torch.roll(aug_features, shifts=shift_amount, dims=-1)
                elif aug_idx == 2:
                    # Slight amplitude scaling in [0.95, 1.05)
                    scale = 1.0 + (torch.rand(1).item() - 0.5) * 0.1
                    aug_features = aug_features * scale
                else:
                    # Random small perturbation
                    aug_features = aug_features + torch.randn_like(aug_features) * 0.002

                aug_features = aug_features.to(device)
                with autocast(enabled=use_amp):
                    aug_outputs = model(aug_features)
                aug_probs = F.softmax(aug_outputs.float(), dim=1)
                batch_probs.append(aug_probs.cpu())

            # Average predictions from all augmentations
            avg_probs = torch.stack(batch_probs).mean(dim=0)
            _, predicted = avg_probs.max(1)

            all_preds.extend(predicted.numpy())
            all_labels.extend(labels.numpy())
            all_probs_list.append(avg_probs.numpy())

    return np.array(all_preds), np.array(all_labels), np.vstack(all_probs_list)

# Apply TTA to your already trained model
print("="*60)
print("EVALUATING WITH TEST TIME AUGMENTATION")
print("="*60)

# TTA re-uses the trained network from the earlier cells. Without it there is
# nothing to load weights into, so fail with an explicit message.
if 'model' not in globals():
    raise NameError(
        "`model` is not defined - run the model construction/training and "
        "evaluation cells before the TTA cell"
    )

# Run TTA evaluation
tta_preds, tta_labels, tta_probs = evaluate_with_tta(model, test_loader, device, n_augmentations=5)

# Compare against the plain test accuracy measured in the evaluation cell
# (`accuracy`) instead of a number copied from an earlier run.
tta_accuracy = accuracy_score(tta_labels, tta_preds)
print(f"\nüéØ Original Test Accuracy: {accuracy*100:.2f}%")
print(f"üöÄ TTA Test Accuracy: {tta_accuracy:.4f} ({tta_accuracy*100:.2f}%)")
# The '+' format flag prints the real sign, so a regression shows as negative
print(f"üìà Improvement: {(tta_accuracy - accuracy)*100:+.2f}%")

# Detailed classification report
print("\n" + "="*60)
print("TTA Classification Report:")
print("="*60)
print(classification_report(tta_labels, tta_preds, target_names=emotion_names[:config.n_classes], digits=3))

# Confusion Matrix
tta_cm = confusion_matrix(tta_labels, tta_preds)
print("\n" + "="*60)
print("TTA Confusion Matrix:")
print("="*60)
print(tta_cm)

# Per-class accuracy, with the change relative to the plain evaluation (`per_class_acc`)
tta_per_class_acc = tta_cm.diagonal() / tta_cm.sum(axis=1)
print("\n" + "="*60)
print("TTA Per-Class Accuracy:")
print("="*60)
for i, emotion in enumerate(emotion_names[:config.n_classes]):
    if i < len(tta_per_class_acc):
        improvement = (tta_per_class_acc[i] - per_class_acc[i]) * 100
        print(f"  {emotion:10s}: {tta_per_class_acc[i]:.3f} ({tta_per_class_acc[i]*100:.1f}%) [{'‚Üë' if improvement > 0 else '‚Üì'}{abs(improvement):.1f}%]")

print("\n‚úÖ TTA Evaluation Complete!")
print("üí° TTA typically improves accuracy by 0.5-2% without any retraining!")

# Memory Management Tips for Kaggle

```python
# Add these between cells if you run out of memory
import gc
gc.collect()
torch.cuda.empty_cache()

# Monitor GPU usage
!nvidia-smi

# Clear variables
del train_loader, val_loader  # After training
```