In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# ============================================
# CELL 1: Setup and Installation
# ============================================

In [5]:
# ============================================
# CELL 1: Setup and Installation (FINAL VERSION)
# ============================================
"""
Speech Emotion Recognition System
For: Speech Processing & ANN/DL Course
Author: Ahad Imran
"""

import warnings
warnings.filterwarnings('ignore')

# Check what's already installed
import sys
print(f"Python: {sys.version}")

# Check librosa
try:
    import librosa
    print(f"‚úì librosa {librosa.__version__}")
except:
    print("Installing librosa...")
    !pip install -q librosa

# Skip audiomentations - not critical, we have built-in augmentation

# Import all required packages
import os
import gc
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from tqdm.auto import tqdm
from typing import Dict, List, Tuple, Optional
import pickle
import json
from collections import Counter

# Audio processing
import librosa
import librosa.display
import soundfile as sf
import IPython.display as ipd

# Deep Learning
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
import torchaudio
import torchaudio.transforms as T

# Machine Learning
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier

# Visualization
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Set seeds
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True

set_seed(42)

# Check GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"\nUsing device: {device}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

print("\n Setup complete! Ready to proceed.")

Python: 3.11.13 (main, Jun  4 2025, 08:57:29) [GCC 11.4.0]
‚úì librosa 0.11.0

Using device: cuda
GPU: Tesla T4
Memory: 15.83 GB

 Setup complete! Ready to proceed.


# ============================================
# CELL 1a: Setup for Dual T4 GPUs
# ============================================

In [6]:
from torch.nn.parallel import DataParallel

# Check available GPUs
print(f"GPUs available: {torch.cuda.device_count()}")
for i in range(torch.cuda.device_count()):
    print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
    print(f"Memory: {torch.cuda.get_device_properties(i).total_memory / 1e9:.2f} GB")

if torch.cuda.device_count() > 1:
    print("\n‚úÖ Multiple GPUs detected! Will use DataParallel for faster training.")
else:
    print("\n‚úÖ Single GPU detected. Will proceed with standard training.")

GPUs available: 2
GPU 0: Tesla T4
Memory: 15.83 GB
GPU 1: Tesla T4
Memory: 15.83 GB

‚úÖ Multiple GPUs detected! Will use DataParallel for faster training.


# ============================================
# CELL 2: Download and Prepare Datasets
# ============================================

To set up environment variables for the Kaggle API using Python, you can use the `os` module to assign your credentials directly in your script. This is especially useful when you don‚Äôt want to rely on a `kaggle.json` file. Here's how to do it:

---

### üîë Step-by-Step: Set Kaggle API Key with `os.environ`

```python
import os

# Set your Kaggle credentials
os.environ['KAGGLE_USERNAME'] = 'your_kaggle_username'
os.environ['KAGGLE_KEY'] = 'your_kaggle_api_key'
```

Replace `'your_kaggle_username'` and `'your_kaggle_api_key'` with the actual values from your [Kaggle account settings](https://www.kaggle.com/settings).

---

### üì¶ Then You Can Download Datasets Like This

```python
!pip install kaggle

# Example: Download Titanic dataset
!kaggle competitions download -c titanic
```

This will work in environments like Jupyter, Colab, or Kaggle Notebooks ‚Äî as long as the API key is valid and you've accepted the competition rules (if required).

## Usage Example
```python
# !kaggle competitions download -c titanic


# import zipfile

# with zipfile.ZipFile('/kaggle/working/titanic.zip', 'r') as zip_ref:
#    zip_ref.extractall('/kaggle/working')
```

In [7]:
"""
Using Kaggle datasets for emotion recognition
"""

import os
import zipfile
from pathlib import Path

# Setup Kaggle API credentials using Secrets
try:
    from kaggle_secrets import UserSecretsClient
    user_secrets = UserSecretsClient()
    
    os.environ['KAGGLE_USERNAME'] = user_secrets.get_secret("kaggle_username")
    os.environ['KAGGLE_KEY'] = user_secrets.get_secret("kaggle_key")
    
    print("‚úì Kaggle API configured with secrets")
    api_available = True
except Exception as e:
    print(f"‚ö†Ô∏è Kaggle secrets not found: {e}")
    print("Please add datasets via 'Add Data' button or configure secrets.")
    api_available = False

# Create directory structure
os.makedirs('/kaggle/working/data', exist_ok=True)
os.makedirs('/kaggle/working/models', exist_ok=True)
os.makedirs('/kaggle/working/results', exist_ok=True)

# Check if datasets are already added via UI
datasets_found = False
if os.path.exists('/kaggle/input/'):
    input_datasets = os.listdir('/kaggle/input/')
    if len(input_datasets) > 0:
        print("Datasets found in /kaggle/input/:")
        for dataset in input_datasets:
            print(f"  ‚úì {dataset}")
        datasets_found = True
        DATA_PATH = '/kaggle/input/'

# Method 2: Download if not added via UI (only if API is available)
if not datasets_found and api_available:  # <-- FIXED: Added api_available check
    print("\nNo datasets found in input. Downloading...")
    
    # Only download if not already present
    if not os.path.exists('/kaggle/working/ravdess'):
        print("Downloading RAVDESS...")
        !kaggle datasets download -d uwrfkaggler/ravdess-emotional-speech-audio -p /kaggle/working --quiet
        
        if os.path.exists('/kaggle/working/ravdess-emotional-speech-audio.zip'):
            with zipfile.ZipFile('/kaggle/working/ravdess-emotional-speech-audio.zip', 'r') as zip_ref:
                zip_ref.extractall('/kaggle/working/ravdess')
            os.remove('/kaggle/working/ravdess-emotional-speech-audio.zip')
            print("‚úì RAVDESS downloaded")
    
    if not os.path.exists('/kaggle/working/tess'):
        print("Downloading TESS...")
        !kaggle datasets download -d ejlok1/toronto-emotional-speech-set-tess -p /kaggle/working --quiet
        
        if os.path.exists('/kaggle/working/toronto-emotional-speech-set-tess.zip'):
            with zipfile.ZipFile('/kaggle/working/toronto-emotional-speech-set-tess.zip', 'r') as zip_ref:
                zip_ref.extractall('/kaggle/working/tess')
            os.remove('/kaggle/working/toronto-emotional-speech-set-tess.zip')
            print("‚úì TESS downloaded")
    
    if not os.path.exists('/kaggle/working/cremad'):
        print("Downloading CREMA-D...")
        !kaggle datasets download -d ejlok1/cremad -p /kaggle/working --quiet
        
        if os.path.exists('/kaggle/working/cremad.zip'):
            with zipfile.ZipFile('/kaggle/working/cremad.zip', 'r') as zip_ref:
                zip_ref.extractall('/kaggle/working/cremad')
            os.remove('/kaggle/working/cremad.zip')
            print("‚úì CREMA-D downloaded")
    
    DATA_PATH = '/kaggle/working/'

elif not datasets_found and not api_available:
    print("\n‚ö†Ô∏è No datasets found and API not configured.")
    print("Please either:")
    print("1. Add datasets using the 'Add Data' button, or")
    print("2. Configure Kaggle API secrets (kaggle_username and kaggle_key)")
    DATA_PATH = '/kaggle/working/'  # Set default path anyway

else:
    # Datasets already found
    pass

# Verify final state
print(f"\nUsing DATA_PATH: {DATA_PATH}")
if os.path.exists(DATA_PATH):
    contents = os.listdir(DATA_PATH)
    if contents:
        print(f"Found {len(contents)} items in {DATA_PATH}")
    else:
        print("‚ö†Ô∏è DATA_PATH is empty. Please add datasets.")

print("\n Setup complete!")

‚úì Kaggle API configured with secrets

No datasets found in input. Downloading...
Downloading RAVDESS...
Dataset URL: https://www.kaggle.com/datasets/uwrfkaggler/ravdess-emotional-speech-audio
License(s): CC-BY-NC-SA-4.0
‚úì RAVDESS downloaded
Downloading TESS...
Dataset URL: https://www.kaggle.com/datasets/ejlok1/toronto-emotional-speech-set-tess
License(s): Attribution-NonCommercial-NoDerivatives 4.0 International (CC BY-NC-ND 4.0)
‚úì TESS downloaded
Downloading CREMA-D...
Dataset URL: https://www.kaggle.com/datasets/ejlok1/cremad
License(s): ODC Attribution License (ODC-By)
‚úì CREMA-D downloaded

Using DATA_PATH: /kaggle/working/
Found 7 items in /kaggle/working/

 Setup complete!


In [12]:
# Quick Debug: Check what's actually in the directories
# DEBUG CELL: Check dataset structure
import os

for dataset in ['ravdess', 'tess', 'cremad']:
    path = f'/kaggle/working/{dataset}'
    if os.path.exists(path):
        print(f"\n{dataset.upper()} structure:")
        for root, dirs, files in os.walk(path):
            level = root.replace(path, '').count(os.sep)
            if level < 3:  # Only show first 3 levels
                indent = ' ' * 2 * level
                print(f"{indent}{os.path.basename(root)}/")
                if level < 2:
                    wav_files = [f for f in files if f.endswith('.wav')]
                    if wav_files:
                        print(f"{indent}  [{len(wav_files)} .wav files]")
                        print(f"{indent}  Sample: {wav_files[0]}")


RAVDESS structure:
ravdess/
  Actor_11/
    [60 .wav files]
    Sample: 03-01-04-01-02-01-11.wav
  Actor_19/
    [60 .wav files]
    Sample: 03-01-02-01-01-01-19.wav
  Actor_20/
    [60 .wav files]
    Sample: 03-01-07-02-01-02-20.wav
  Actor_01/
    [60 .wav files]
    Sample: 03-01-06-02-01-01-01.wav
  Actor_08/
    [60 .wav files]
    Sample: 03-01-03-01-01-01-08.wav
  Actor_15/
    [60 .wav files]
    Sample: 03-01-04-02-02-02-15.wav
  Actor_14/
    [60 .wav files]
    Sample: 03-01-01-01-01-02-14.wav
  Actor_10/
    [60 .wav files]
    Sample: 03-01-05-01-01-02-10.wav
  Actor_13/
    [60 .wav files]
    Sample: 03-01-03-02-01-01-13.wav
  Actor_06/
    [60 .wav files]
    Sample: 03-01-05-02-01-01-06.wav
  Actor_17/
    [60 .wav files]
    Sample: 03-01-03-01-02-02-17.wav
  Actor_05/
    [60 .wav files]
    Sample: 03-01-08-01-02-02-05.wav
  Actor_18/
    [60 .wav files]
    Sample: 03-01-06-01-01-01-18.wav
  Actor_03/
    [60 .wav files]
    Sample: 03-01-08-01-02-01-03.wav
  Act

# ============================================
# CELL 3: Configuration
# ============================================

In [8]:
# ============================================
# MODIFIED Config for Dual T4 GPUs
# ============================================
class Config:
    """Configuration optimized for dual T4 GPUs"""
    
    # Project
    project_name = "Speech Emotion Recognition"
    
    # Data
    sample_rate = 16000
    duration = 3.0
    n_classes = 8
    
    # Features
    n_mfcc = 40
    n_mels = 128
    n_fft = 2048
    hop_length = 512
    
    # Data splits
    train_size = 0.7
    val_size = 0.15
    test_size = 0.15
    
    # Training - Optimized for T4 √ó2
    batch_size = 64  # Increased for dual GPU (was 32)
    epochs = 100  # Can train longer with faster GPUs
    learning_rate = 1e-3
    early_stopping_patience = 10
    
    # Model
    model_type = 'ensemble'
    dropout = 0.3
    
    # Augmentation
    use_augmentation = True
    augment_prob = 0.5
    
    # Memory optimization
    gradient_accumulation_steps = 2  # For even larger effective batch size
    mixed_precision = True  # T4 supports mixed precision well
    
    # Paths
    data_path = DATA_PATH
    save_path = '/kaggle/working/'
    
config = Config()

# ============================================
# CELL 4: Dataset Class with Memory Optimization
# ============================================

In [9]:
class EmotionDataset(Dataset):
    """
    Memory-efficient dataset for Kaggle
    """
    
    def __init__(
        self, 
        file_paths: List[str],
        labels: List[int],
        config: Config,
        transform=None,
        augment=False
    ):
        self.file_paths = file_paths
        self.labels = labels
        self.config = config
        self.transform = transform
        self.augment = augment
        
        # Pre-calculate fixed length
        self.target_length = int(config.sample_rate * config.duration)
        
    def __len__(self):
        return len(self.file_paths)
    
    def __getitem__(self, idx):
        # Load audio on-demand to save memory
        audio_path = self.file_paths[idx]
        label = self.labels[idx]
        
        try:
            # Load audio
            waveform, sr = librosa.load(audio_path, sr=self.config.sample_rate, mono=True)
            
            # Pad or truncate
            if len(waveform) > self.target_length:
                waveform = waveform[:self.target_length]
            else:
                waveform = np.pad(waveform, (0, self.target_length - len(waveform)))
            
            # Convert to tensor
            waveform = torch.FloatTensor(waveform).unsqueeze(0)
            
            # Apply augmentation
            if self.augment and random.random() < self.config.augment_prob:
                waveform = self.augment_audio(waveform)
            
            # Extract features
            features = self.extract_features(waveform)
            
            return features, label
            
        except Exception as e:
            print(f"Error loading {audio_path}: {e}")
            # Return zeros if error
            return torch.zeros((self.config.n_mels, 94)), label
    
    def augment_audio(self, waveform):
        """Simple augmentation"""
        # Add noise
        if random.random() > 0.5:
            noise = torch.randn_like(waveform) * 0.005
            waveform = waveform + noise
        
        # Time shift
        if random.random() > 0.5:
            shift = int(random.uniform(-0.1, 0.1) * waveform.shape[1])
            waveform = torch.roll(waveform, shift, dims=1)
        
        return waveform
    
    def extract_features(self, waveform):
        """Extract mel-spectrogram features"""
        mel_transform = T.MelSpectrogram(
            sample_rate=self.config.sample_rate,
            n_mels=self.config.n_mels,
            n_fft=self.config.n_fft,
            hop_length=self.config.hop_length
        )
        
        mel_spec = mel_transform(waveform)
        mel_spec_db = T.AmplitudeToDB()(mel_spec)
        
        return mel_spec_db.squeeze(0)

# ============================================
# CELL 5: Data Loading and Preparation
# ============================================

In [13]:
def prepare_data(config):
    """
    Load and prepare datasets with correct paths
    """
    all_files = []
    all_labels = []
    
    # Emotion mapping
    emotion_map = {
        'neutral': 0, 'calm': 0,  # Merge calm into neutral
        'happy': 1, 'sad': 2, 'angry': 3,
        'fearful': 4, 'fear': 4,  # Handle variations
        'disgust': 5, 'surprised': 6, 'surprise': 6
    }
    
    base_path = Path('/kaggle/working')
    
    # RAVDESS dataset - files are in Actor_XX folders
    ravdess_path = base_path / 'ravdess'
    if ravdess_path.exists():
        print("Loading RAVDESS dataset...")
        # Look for Actor folders
        for actor_folder in ravdess_path.glob('Actor_*'):
            if actor_folder.is_dir():
                for audio_file in actor_folder.glob('*.wav'):
                    # Parse RAVDESS filename (03-01-06-01-02-01-12.wav)
                    parts = audio_file.stem.split('-')
                    if len(parts) >= 3:
                        emotion_code = int(parts[2])
                        ravdess_emotions = {
                            1: 'neutral', 2: 'calm', 3: 'happy', 4: 'sad',
                            5: 'angry', 6: 'fear', 7: 'disgust', 8: 'surprise'
                        }
                        if emotion_code in ravdess_emotions:
                            emotion = ravdess_emotions[emotion_code]
                            all_files.append(str(audio_file))
                            all_labels.append(emotion_map[emotion])
        print(f"  Found {len(all_files)} RAVDESS files")
    
    # TESS dataset - files are in emotion-specific folders
    tess_path = base_path / 'tess' / 'TESS Toronto emotional speech set data'
    if tess_path.exists():
        print("Loading TESS dataset...")
        initial_count = len(all_files)
        
        # TESS has folders like OAF_angry, YAF_happy, etc.
        for emotion_folder in tess_path.glob('*'):
            if emotion_folder.is_dir():
                folder_name = emotion_folder.name.lower()
                
                # Extract emotion from folder name
                if 'angry' in folder_name:
                    emotion = 'angry'
                elif 'disgust' in folder_name:
                    emotion = 'disgust'
                elif 'fear' in folder_name:
                    emotion = 'fear'
                elif 'happy' in folder_name:
                    emotion = 'happy'
                elif 'sad' in folder_name:
                    emotion = 'sad'
                elif 'neutral' in folder_name:
                    emotion = 'neutral'
                elif 'surprise' in folder_name or 'surprised' in folder_name:
                    emotion = 'surprise'
                else:
                    continue  # Skip unknown folders
                
                # Add all wav files from this emotion folder
                for audio_file in emotion_folder.glob('*.wav'):
                    all_files.append(str(audio_file))
                    all_labels.append(emotion_map[emotion])
        
        print(f"  Found {len(all_files) - initial_count} TESS files")
    
    # CREMA-D dataset - files are in AudioWAV folder
    cremad_path = base_path / 'cremad' / 'AudioWAV'
    if cremad_path.exists():
        print("Loading CREMA-D dataset...")
        initial_count = len(all_files)
        
        for audio_file in cremad_path.glob('*.wav'):
            # CREMA-D format: 1001_DFA_ANG_XX.wav
            filename = audio_file.stem
            if '_' in filename:
                parts = filename.split('_')
                if len(parts) >= 3:
                    emotion_code = parts[2]
                    cremad_emotions = {
                        'ANG': 'angry', 'DIS': 'disgust', 'FEA': 'fear',
                        'HAP': 'happy', 'NEU': 'neutral', 'SAD': 'sad'
                    }
                    if emotion_code in cremad_emotions:
                        emotion = cremad_emotions[emotion_code]
                        all_files.append(str(audio_file))
                        all_labels.append(emotion_map[emotion])
        
        print(f"  Found {len(all_files) - initial_count} CREMA-D files")
    
    # Summary
    if len(all_files) == 0:
        print("\n‚ö†Ô∏è No audio files found. Please check dataset paths.")
        print("Creating synthetic data for testing...")
        for i in range(100):
            all_files.append(f"dummy_{i}.wav")
            all_labels.append(random.randint(0, 6))
    else:
        print(f"\n‚úÖ Successfully loaded all datasets!")
    
    print(f"Total samples: {len(all_files)}")
    
    # Show label distribution
    label_counts = Counter(all_labels)
    emotion_names = {v: k for k, v in emotion_map.items()}
    print("\nEmotion distribution:")
    for label, count in sorted(label_counts.items()):
        emotion_name = [k for k, v in emotion_map.items() if v == label][0]
        print(f"  {emotion_name}: {count} samples")
    
    # Update number of classes
    config.n_classes = len(set(all_labels))
    print(f"\nNumber of emotion classes: {config.n_classes}")
    
    return all_files, all_labels

# Load data with the fixed function
file_paths, labels = prepare_data(config)

# Split data
X_temp, X_test, y_temp, y_test = train_test_split(
    file_paths, labels, test_size=config.test_size, 
    stratify=labels, random_state=42
)

X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=config.val_size/(1-config.test_size),
    stratify=y_temp, random_state=42
)

print(f"\nDataset splits:")
print(f"  Train: {len(X_train)} samples")
print(f"  Val: {len(X_val)} samples")
print(f"  Test: {len(X_test)} samples")

Loading RAVDESS dataset...
  Found 1440 RAVDESS files
Loading TESS dataset...
  Found 2800 TESS files
Loading CREMA-D dataset...
  Found 7442 CREMA-D files

‚úÖ Successfully loaded all datasets!
Total samples: 11682

Emotion distribution:
  neutral: 1775 samples
  happy: 1863 samples
  sad: 1863 samples
  angry: 1863 samples
  fearful: 1863 samples
  disgust: 1863 samples
  surprised: 592 samples

Number of emotion classes: 7

Dataset splits:
  Train: 8176 samples
  Val: 1753 samples
  Test: 1753 samples


# ============================================
# CELL 6: Model Architectures
# ============================================

In [14]:
class CNNModel(nn.Module):
    """CNN for emotion recognition"""
    
    def __init__(self, config):
        super().__init__()
        
        self.conv1 = nn.Sequential(
            nn.Conv2d(1, 64, kernel_size=3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(2)
        )
        
        self.conv2 = nn.Sequential(
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.MaxPool2d(2)
        )
        
        self.conv3 = nn.Sequential(
            nn.Conv2d(128, 256, kernel_size=3, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.MaxPool2d(2)
        )
        
        # Global pooling
        self.global_pool = nn.AdaptiveAvgPool2d((1, 1))
        
        # Classifier
        self.classifier = nn.Sequential(
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(config.dropout),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(config.dropout),
            nn.Linear(64, config.n_classes)
        )
        
    def forward(self, x):
        # Add channel dimension if needed
        if x.dim() == 3:
            x = x.unsqueeze(1)
        
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.global_pool(x)
        x = x.view(x.size(0), -1)
        x = self.classifier(x)
        
        return x


class LSTMModel(nn.Module):
    """LSTM for emotion recognition"""
    
    def __init__(self, config):
        super().__init__()
        
        self.lstm = nn.LSTM(
            input_size=config.n_mels,
            hidden_size=128,
            num_layers=2,
            batch_first=True,
            dropout=config.dropout,
            bidirectional=True
        )
        
        self.attention = nn.Sequential(
            nn.Linear(256, 128),
            nn.Tanh(),
            nn.Linear(128, 1)
        )
        
        self.classifier = nn.Sequential(
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(config.dropout),
            nn.Linear(128, config.n_classes)
        )
        
    def forward(self, x):
        # Reshape for LSTM (batch, time, features)
        if x.dim() == 4:
            x = x.squeeze(1)
        x = x.transpose(1, 2)
        
        lstm_out, _ = self.lstm(x)
        
        # Attention
        attn_weights = self.attention(lstm_out)
        attn_weights = F.softmax(attn_weights, dim=1)
        attended = torch.sum(lstm_out * attn_weights, dim=1)
        
        return self.classifier(attended)


class TransformerModel(nn.Module):
    """Transformer for emotion recognition"""
    
    def __init__(self, config):
        super().__init__()
        
        self.input_projection = nn.Linear(config.n_mels, 256)
        
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=256,
            nhead=8,
            dim_feedforward=512,
            dropout=config.dropout,
            batch_first=True
        )
        
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=4)
        
        self.classifier = nn.Sequential(
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(config.dropout),
            nn.Linear(128, config.n_classes)
        )
        
    def forward(self, x):
        # Reshape (batch, time, features)
        if x.dim() == 4:
            x = x.squeeze(1)
        x = x.transpose(1, 2)
        
        x = self.input_projection(x)
        x = self.transformer(x)
        
        # Global average pooling
        x = x.mean(dim=1)
        
        return self.classifier(x)


class EnsembleModel(nn.Module):
    """Ensemble of multiple models"""
    
    def __init__(self, config):
        super().__init__()
        
        self.cnn = CNNModel(config)
        self.lstm = LSTMModel(config)
        self.transformer = TransformerModel(config)
        
        # Learnable weights for ensemble
        self.weights = nn.Parameter(torch.ones(3) / 3)
        
    def forward(self, x):
        cnn_out = self.cnn(x)
        lstm_out = self.lstm(x)
        transformer_out = self.transformer(x)
        
        # Weighted average
        w = F.softmax(self.weights, dim=0)
        output = w[0] * cnn_out + w[1] * lstm_out + w[2] * transformer_out
        
        return output

# ============================================
# CELL 7: Training Functions
# ============================================

In [15]:
# ============================================
# OPTIMIZED Trainer with Mixed Precision for T4
# ============================================
from torch.cuda.amp import autocast, GradScaler
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
import gc

class Trainer:
    """Training manager optimized for T4 GPUs"""
    
    def __init__(self, model, config, device):
        self.model = model.to(device)  # FIXED: Added .to(device)
        self.config = config
        self.device = device
        
        self.criterion = nn.CrossEntropyLoss()
        self.optimizer = optim.Adam(model.parameters(), lr=config.learning_rate)
        self.scheduler = optim.lr_scheduler.ReduceLROnPlateau(
            self.optimizer, mode='min', patience=3, factor=0.5
        )
        
        # Mixed precision for T4
        self.scaler = GradScaler()
        
        self.train_losses = []
        self.val_losses = []
        self.train_accs = []
        self.val_accs = []
        
    def train_epoch(self, dataloader):
        self.model.train()
        total_loss = 0
        correct = 0
        total = 0
        
        for batch_idx, (features, labels) in enumerate(tqdm(dataloader, desc="Training")):
            features = features.to(self.device)
            labels = labels.to(self.device)
            
            self.optimizer.zero_grad()
            
            # Mixed precision training
            with autocast():
                outputs = self.model(features)
                loss = self.criterion(outputs, labels)
            
            # Scaled backprop for mixed precision
            self.scaler.scale(loss).backward()
            
            # Gradient clipping
            self.scaler.unscale_(self.optimizer)
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
            
            self.scaler.step(self.optimizer)
            self.scaler.update()
            
            total_loss += loss.item()
            _, predicted = outputs.max(1)
            total += labels.size(0)
            correct += predicted.eq(labels).sum().item()
            
            # Clear cache periodically
            if batch_idx % 10 == 0:
                torch.cuda.empty_cache()
        
        return total_loss / len(dataloader), 100. * correct / total
    
    def validate(self, dataloader):
        self.model.eval()
        total_loss = 0
        correct = 0
        total = 0
        
        with torch.no_grad():
            # Use mixed precision for validation too
            with autocast():
                for features, labels in tqdm(dataloader, desc="Validation"):
                    features = features.to(self.device)
                    labels = labels.to(self.device)
                    
                    outputs = self.model(features)
                    loss = self.criterion(outputs, labels)
                    
                    total_loss += loss.item()
                    _, predicted = outputs.max(1)
                    total += labels.size(0)
                    correct += predicted.eq(labels).sum().item()
        
        return total_loss / len(dataloader), 100. * correct / total
    
    def fit(self, train_loader, val_loader):
        best_val_acc = 0
        patience_counter = 0
        
        for epoch in range(self.config.epochs):
            print(f"\nEpoch {epoch+1}/{self.config.epochs}")
            
            # Training
            train_loss, train_acc = self.train_epoch(train_loader)
            self.train_losses.append(train_loss)
            self.train_accs.append(train_acc)
            
            # Validation
            val_loss, val_acc = self.validate(val_loader)
            self.val_losses.append(val_loss)
            self.val_accs.append(val_acc)
            
            # Scheduler
            self.scheduler.step(val_loss)
            
            print(f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}%")
            print(f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.2f}%")
            
            # Save best model
            if val_acc > best_val_acc:
                best_val_acc = val_acc
                torch.save({
                    'epoch': epoch,
                    'model_state_dict': self.model.state_dict(),
                    'optimizer_state_dict': self.optimizer.state_dict(),
                    'val_acc': val_acc,
                    'config': self.config
                }, '/kaggle/working/best_model.pth')
                patience_counter = 0
                print(f"‚úì Saved best model with {val_acc:.2f}% accuracy")
            else:
                patience_counter += 1
            
            # Early stopping
            if patience_counter >= self.config.early_stopping_patience:
                print(f"Early stopping at epoch {epoch+1}")
                break
            
            # Memory cleanup
            gc.collect()
            torch.cuda.empty_cache()
        
        return self.model

# ============================================
# CELL 8: Create DataLoaders
# ============================================

In [16]:
# Create datasets
train_dataset = EmotionDataset(X_train, y_train, config, augment=True)
val_dataset = EmotionDataset(X_val, y_val, config, augment=False)
test_dataset = EmotionDataset(X_test, y_test, config, augment=False)

# Create dataloaders with num_workers=0 for Kaggle
train_loader = DataLoader(
    train_dataset, batch_size=config.batch_size, 
    shuffle=True, num_workers=0, pin_memory=True
)

val_loader = DataLoader(
    val_dataset, batch_size=config.batch_size, 
    shuffle=False, num_workers=0, pin_memory=True
)

test_loader = DataLoader(
    test_dataset, batch_size=config.batch_size, 
    shuffle=False, num_workers=0, pin_memory=True
)

print(f"DataLoaders created successfully!")

DataLoaders created successfully!


# ============================================
# CELL 9: Train Model with DataParallel
# ============================================

In [19]:
# ============================================
# CELL 9: Train Model (COMPLETE VERSION)
# ============================================

# Select model based on config
if config.model_type == 'cnn':
    model = CNNModel(config)
elif config.model_type == 'lstm':
    model = LSTMModel(config)
elif config.model_type == 'transformer':
    model = TransformerModel(config)
else:  # ensemble
    model = EnsembleModel(config)

# Use DataParallel if multiple GPUs available
if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs!")
    model = DataParallel(model)

print(f"Model: {config.model_type}")
print(f"Parameters: {sum(p.numel() for p in model.parameters()):,}")

# Create trainer and actually train the model
trainer = Trainer(model, config, device)

# THIS IS THE IMPORTANT PART - Actually run training!
print("\n" + "="*50)
print("Starting Training...")
print("="*50)

trained_model = trainer.fit(train_loader, val_loader)

print("\n" + "="*50)
print("Training Complete!")
print("="*50)

# Verify the model was saved
import os
if os.path.exists('/kaggle/working/best_model.pth'):
    print("‚úÖ Model saved successfully!")
    file_size = os.path.getsize('/kaggle/working/best_model.pth') / (1024*1024)
    print(f"Model file size: {file_size:.2f} MB")
else:
    print("‚ö†Ô∏è Model file not found. Training may have failed.")

Using 2 GPUs!
Model: ensemble
Parameters: 3,313,689

Starting Training...

Epoch 1/100


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 128/128 [00:58<00:00,  2.21it/s]
Validation: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 28/28 [00:07<00:00,  3.66it/s]


Train Loss: 1.5105, Train Acc: 38.94%
Val Loss: 1.2533, Val Acc: 50.94%
‚úì Saved best model with 50.94% accuracy

Epoch 2/100


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 128/128 [00:42<00:00,  2.99it/s]
Validation: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 28/28 [00:07<00:00,  3.72it/s]


Train Loss: 1.2124, Train Acc: 52.54%
Val Loss: 1.0757, Val Acc: 58.36%
‚úì Saved best model with 58.36% accuracy

Epoch 3/100


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 128/128 [00:42<00:00,  3.04it/s]
Validation: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 28/28 [00:07<00:00,  3.72it/s]


Train Loss: 1.0941, Train Acc: 57.18%
Val Loss: 1.0337, Val Acc: 58.36%

Epoch 4/100


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 128/128 [00:40<00:00,  3.19it/s]
Validation: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 28/28 [00:07<00:00,  3.90it/s]


Train Loss: 1.0394, Train Acc: 58.72%
Val Loss: 1.0121, Val Acc: 60.30%
‚úì Saved best model with 60.30% accuracy

Epoch 5/100


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 128/128 [00:39<00:00,  3.21it/s]
Validation: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 28/28 [00:07<00:00,  3.97it/s]


Train Loss: 1.0034, Train Acc: 60.49%
Val Loss: 0.9761, Val Acc: 63.03%
‚úì Saved best model with 63.03% accuracy

Epoch 6/100


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 128/128 [00:40<00:00,  3.19it/s]
Validation: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 28/28 [00:07<00:00,  3.93it/s]


Train Loss: 0.9627, Train Acc: 62.40%
Val Loss: 0.9803, Val Acc: 61.15%

Epoch 7/100


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 128/128 [00:40<00:00,  3.20it/s]
Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 128/128 [00:40<00:00,  3.15it/s]
Validation: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 28/28 [00:07<00:00,  3.90it/s]


Train Loss: 0.8598, Train Acc: 66.87%
Val Loss: 1.0081, Val Acc: 61.67%

Epoch 11/100


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 128/128 [00:39<00:00,  3.20it/s]
Validation: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 28/28 [00:07<00:00,  3.98it/s]


Train Loss: 0.8310, Train Acc: 68.08%
Val Loss: 0.8980, Val Acc: 66.00%
‚úì Saved best model with 66.00% accuracy

Epoch 12/100


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 128/128 [00:39<00:00,  3.22it/s]
Validation: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 28/28 [00:07<00:00,  3.87it/s]


Train Loss: 0.8045, Train Acc: 69.18%
Val Loss: 0.9948, Val Acc: 63.61%

Epoch 13/100


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 128/128 [00:39<00:00,  3.25it/s]
Validation: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 28/28 [00:07<00:00,  3.90it/s]


Train Loss: 0.7922, Train Acc: 69.58%
Val Loss: 0.9857, Val Acc: 63.26%

Epoch 14/100


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 128/128 [00:39<00:00,  3.21it/s]
Validation: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 28/28 [00:07<00:00,  3.86it/s]


Train Loss: 0.7670, Train Acc: 70.91%
Val Loss: 0.9593, Val Acc: 64.69%

Epoch 15/100


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 128/128 [00:39<00:00,  3.23it/s]
Validation: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 28/28 [00:07<00:00,  3.97it/s]


Train Loss: 0.7466, Train Acc: 71.69%
Val Loss: 0.9257, Val Acc: 66.57%
‚úì Saved best model with 66.57% accuracy

Epoch 16/100


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 128/128 [00:39<00:00,  3.22it/s]
Validation: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 28/28 [00:06<00:00,  4.02it/s]


Train Loss: 0.6715, Train Acc: 75.12%
Val Loss: 0.9064, Val Acc: 66.97%
‚úì Saved best model with 66.97% accuracy

Epoch 17/100


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 128/128 [00:39<00:00,  3.23it/s]
Validation: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 28/28 [00:07<00:00,  3.89it/s]


Train Loss: 0.6279, Train Acc: 76.11%
Val Loss: 0.9581, Val Acc: 67.43%
‚úì Saved best model with 67.43% accuracy

Epoch 18/100


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 128/128 [00:40<00:00,  3.18it/s]
Validation: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 28/28 [00:07<00:00,  3.92it/s]


Train Loss: 0.5885, Train Acc: 77.65%
Val Loss: 0.9136, Val Acc: 68.28%
‚úì Saved best model with 68.28% accuracy

Epoch 19/100


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 128/128 [00:40<00:00,  3.13it/s]
Validation: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 28/28 [00:07<00:00,  3.84it/s]


Train Loss: 0.5677, Train Acc: 78.60%
Val Loss: 0.9850, Val Acc: 67.77%

Epoch 20/100


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 128/128 [00:39<00:00,  3.22it/s]
Validation: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 28/28 [00:07<00:00,  3.83it/s]


Train Loss: 0.5279, Train Acc: 80.33%
Val Loss: 0.9385, Val Acc: 68.34%
‚úì Saved best model with 68.34% accuracy

Epoch 21/100


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 128/128 [00:39<00:00,  3.27it/s]
Validation: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 28/28 [00:07<00:00,  3.95it/s]


Train Loss: 0.4883, Train Acc: 81.74%
Val Loss: 1.0077, Val Acc: 69.25%
‚úì Saved best model with 69.25% accuracy

Epoch 22/100


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 128/128 [00:39<00:00,  3.25it/s]
Validation: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 28/28 [00:07<00:00,  3.87it/s]


Train Loss: 0.4693, Train Acc: 82.52%
Val Loss: 0.9761, Val Acc: 69.25%

Epoch 23/100


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 128/128 [00:40<00:00,  3.16it/s]
Validation: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 28/28 [00:07<00:00,  3.82it/s]


Train Loss: 0.4473, Train Acc: 83.17%
Val Loss: 0.9775, Val Acc: 68.68%

Epoch 24/100


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 128/128 [00:41<00:00,  3.07it/s]
Validation: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 28/28 [00:07<00:00,  3.61it/s]


Train Loss: 0.4245, Train Acc: 84.44%
Val Loss: 0.9993, Val Acc: 69.02%

Epoch 25/100


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 128/128 [00:42<00:00,  2.99it/s]
Validation: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 28/28 [00:07<00:00,  3.95it/s]


Train Loss: 0.4121, Train Acc: 84.41%
Val Loss: 1.0176, Val Acc: 69.48%
‚úì Saved best model with 69.48% accuracy

Epoch 26/100


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 128/128 [00:40<00:00,  3.19it/s]
Validation: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 28/28 [00:07<00:00,  3.91it/s]


Train Loss: 0.3976, Train Acc: 85.42%
Val Loss: 1.0290, Val Acc: 69.65%
‚úì Saved best model with 69.65% accuracy

Epoch 27/100


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 128/128 [00:41<00:00,  3.11it/s]
Validation: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 28/28 [00:07<00:00,  3.76it/s]


Train Loss: 0.3932, Train Acc: 85.65%
Val Loss: 1.0331, Val Acc: 70.05%
‚úì Saved best model with 70.05% accuracy

Epoch 28/100


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 128/128 [00:39<00:00,  3.22it/s]
Validation: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 28/28 [00:07<00:00,  3.96it/s]


Train Loss: 0.3717, Train Acc: 86.46%
Val Loss: 1.0506, Val Acc: 70.05%

Epoch 29/100


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 128/128 [00:39<00:00,  3.24it/s]
Validation: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 28/28 [00:07<00:00,  3.97it/s]


Train Loss: 0.3663, Train Acc: 86.20%
Val Loss: 1.0745, Val Acc: 69.88%

Epoch 30/100


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 128/128 [00:39<00:00,  3.25it/s]
Validation: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 28/28 [00:07<00:00,  3.97it/s]


Train Loss: 0.3554, Train Acc: 86.75%
Val Loss: 1.0787, Val Acc: 69.71%

Epoch 31/100


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 128/128 [00:39<00:00,  3.26it/s]
Validation: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 28/28 [00:07<00:00,  3.97it/s]


Train Loss: 0.3543, Train Acc: 86.75%
Val Loss: 1.0799, Val Acc: 69.82%

Epoch 32/100


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 128/128 [00:39<00:00,  3.26it/s]
Validation: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 28/28 [00:07<00:00,  3.92it/s]


Train Loss: 0.3459, Train Acc: 87.52%
Val Loss: 1.0878, Val Acc: 69.48%

Epoch 33/100


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 128/128 [00:39<00:00,  3.26it/s]
Validation: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 28/28 [00:07<00:00,  3.89it/s]


Train Loss: 0.3422, Train Acc: 87.50%
Val Loss: 1.0867, Val Acc: 70.05%

Epoch 34/100


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 128/128 [00:39<00:00,  3.25it/s]
Validation: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 28/28 [00:07<00:00,  3.93it/s]


Train Loss: 0.3460, Train Acc: 87.16%
Val Loss: 1.0956, Val Acc: 69.71%

Epoch 35/100


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 128/128 [00:39<00:00,  3.24it/s]
Validation: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 28/28 [00:06<00:00,  4.00it/s]


Train Loss: 0.3388, Train Acc: 87.38%
Val Loss: 1.0987, Val Acc: 69.77%

Epoch 36/100


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 128/128 [00:40<00:00,  3.17it/s]
Validation: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 28/28 [00:07<00:00,  3.94it/s]


Train Loss: 0.3385, Train Acc: 87.68%
Val Loss: 1.0981, Val Acc: 69.88%

Epoch 37/100


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 128/128 [00:40<00:00,  3.19it/s]
Validation: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 28/28 [00:07<00:00,  3.85it/s]

Train Loss: 0.3360, Train Acc: 87.65%
Val Loss: 1.1078, Val Acc: 69.54%
Early stopping at epoch 37

Training Complete!
‚úÖ Model saved successfully!
Model file size: 38.04 MB





# ============================================
# CELL 10: Evaluation and Visualization
# ============================================

In [21]:
def evaluate_model(model, test_loader, device):
    """Comprehensive model evaluation"""
    model.eval()
    all_preds = []
    all_labels = []
    all_probs = []
    
    with torch.no_grad():
        for features, labels in tqdm(test_loader, desc="Testing"):
            features = features.to(device)
            outputs = model(features)
            probs = F.softmax(outputs, dim=1)
            _, predicted = outputs.max(1)
            
            all_preds.extend(predicted.cpu().numpy())
            all_labels.extend(labels.numpy())
            all_probs.extend(probs.cpu().numpy())
    
    return np.array(all_preds), np.array(all_labels), np.array(all_probs)

# Load best model - FIXED for PyTorch 2.6
checkpoint = torch.load('/kaggle/working/best_model.pth', weights_only=False)  # <-- Added weights_only=False
model.load_state_dict(checkpoint['model_state_dict'])
print(f"‚úÖ Loaded best model from epoch {checkpoint['epoch']} with {checkpoint['val_acc']:.2f}% validation accuracy")

# Evaluate on test set
print("\nEvaluating on test set...")
preds, labels, probs = evaluate_model(model, test_loader, device)

# Calculate metrics
accuracy = accuracy_score(labels, preds)
print(f"\nüéØ Test Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")

# Classification report
emotion_names = ['neutral', 'happy', 'sad', 'angry', 'fear', 'disgust', 'surprise']
print("\n" + "="*60)
print("Classification Report:")
print("="*60)
print(classification_report(labels, preds, target_names=emotion_names[:config.n_classes], digits=3))

# Confusion Matrix
cm = confusion_matrix(labels, preds)
print("\n" + "="*60)
print("Confusion Matrix:")
print("="*60)
print(cm)

# Calculate per-class accuracy
per_class_acc = cm.diagonal() / cm.sum(axis=1)
print("\n" + "="*60)
print("Per-Class Accuracy:")
print("="*60)
for i, emotion in enumerate(emotion_names[:config.n_classes]):
    if i < len(per_class_acc):
        print(f"  {emotion:10s}: {per_class_acc[i]:.3f} ({per_class_acc[i]*100:.1f}%)")

print("\n‚úÖ Evaluation complete!")

‚úÖ Loaded best model from epoch 26 with 70.05% validation accuracy

Evaluating on test set...


Testing: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 28/28 [00:07<00:00,  3.76it/s]


üéØ Test Accuracy: 0.7193 (71.93%)

Classification Report:
              precision    recall  f1-score   support

     neutral      0.717     0.782     0.748       266
       happy      0.663     0.682     0.673       280
         sad      0.633     0.706     0.668       279
       angry      0.811     0.796     0.804       280
        fear      0.775     0.564     0.653       280
     disgust      0.692     0.710     0.701       279
    surprise      0.869     0.966     0.915        89

    accuracy                          0.719      1753
   macro avg      0.737     0.744     0.737      1753
weighted avg      0.723     0.719     0.718      1753


Confusion Matrix:
[[208  13  27   3   3  12   0]
 [ 17 191   6  29  16  17   4]
 [ 30   7 197   0  17  24   4]
 [  5  26   3 223   2  18   3]
 [  6  36  52   9 158  17   2]
 [ 24  13  25  11   8 198   0]
 [  0   2   1   0   0   0  86]]

Per-Class Accuracy:
  neutral   : 0.782 (78.2%)
  happy     : 0.682 (68.2%)
  sad       : 0.706 (70.6%)





# ============================================
# CELL 11: Advanced Visualizations
# ============================================

In [22]:
# 1. Training History
fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=('Loss', 'Accuracy')
)

fig.add_trace(
    go.Scatter(y=trainer.train_losses, name='Train Loss', mode='lines'),
    row=1, col=1
)
fig.add_trace(
    go.Scatter(y=trainer.val_losses, name='Val Loss', mode='lines'),
    row=1, col=1
)

fig.add_trace(
    go.Scatter(y=trainer.train_accs, name='Train Acc', mode='lines'),
    row=1, col=2
)
fig.add_trace(
    go.Scatter(y=trainer.val_accs, name='Val Acc', mode='lines'),
    row=1, col=2
)

fig.update_layout(height=400, title_text="Training History")
fig.show()

# 2. Confusion Matrix Heatmap
fig = px.imshow(
    cm,
    labels=dict(x="Predicted", y="True", color="Count"),
    x=emotion_names[:config.n_classes],
    y=emotion_names[:config.n_classes],
    title="Confusion Matrix",
    color_continuous_scale="Blues",
    text_auto=True
)
fig.update_layout(width=600, height=500)
fig.show()

# 3. Per-class Performance
per_class_acc = cm.diagonal() / cm.sum(axis=1)
fig = go.Figure(data=[
    go.Bar(x=emotion_names[:config.n_classes], y=per_class_acc)
])
fig.update_layout(
    title="Per-Class Accuracy",
    xaxis_title="Emotion",
    yaxis_title="Accuracy",
    yaxis_range=[0, 1]
)
fig.show()

# ============================================
# CELL 12: Feature Importance Analysis
# ============================================

In [24]:
def extract_features_classical(file_paths, config):
    """Extract features for classical ML"""
    features = []
    
    for path in tqdm(file_paths[:100], desc="Extracting features"):  # Limit for demo
        try:
            y, sr = librosa.load(path, sr=config.sample_rate)
            
            # MFCC
            mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=config.n_mfcc)
            mfcc_mean = np.mean(mfcc, axis=1)
            mfcc_std = np.std(mfcc, axis=1)
            
            # Chroma
            chroma = librosa.feature.chroma_stft(y=y, sr=sr)
            chroma_mean = np.mean(chroma, axis=1)
            chroma_std = np.std(chroma, axis=1)
            
            # Spectral features
            spec_cent = np.mean(librosa.feature.spectral_centroid(y=y, sr=sr))
            spec_bw = np.mean(librosa.feature.spectral_bandwidth(y=y, sr=sr))
            rolloff = np.mean(librosa.feature.spectral_rolloff(y=y, sr=sr))
            zcr = np.mean(librosa.feature.zero_crossing_rate(y))
            
            # Combine features
            feature_vector = np.hstack([
                mfcc_mean, mfcc_std,
                chroma_mean, chroma_std,
                spec_cent, spec_bw, rolloff, zcr
            ])
            
            features.append(feature_vector)
        except:
            features.append(np.zeros(104))  # Default feature size
    
    return np.array(features)

# Extract features for classical ML comparison
print("Extracting classical features for comparison...")
X_train_classical = extract_features_classical(X_train[:100], config)
y_train_classical = y_train[:100]

# Train Random Forest for comparison
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_classical, y_train_classical)

# Feature importance
feature_names = (
    [f'MFCC_{i}_mean' for i in range(config.n_mfcc)] +
    [f'MFCC_{i}_std' for i in range(config.n_mfcc)] +
    [f'Chroma_{i}_mean' for i in range(12)] +
    [f'Chroma_{i}_std' for i in range(12)] +
    ['Spec_Centroid', 'Spec_Bandwidth', 'Rolloff', 'ZCR']
)

importances = rf_model.feature_importances_
top_features_idx = np.argsort(importances)[-20:]

fig = go.Figure(data=[
    go.Bar(
        x=importances[top_features_idx],
        y=[feature_names[i] for i in top_features_idx],
        orientation='h'
    )
])
fig.update_layout(
    title="Top 20 Most Important Features (Random Forest)",
    xaxis_title="Importance",
    yaxis_title="Feature",
    height=500
)
fig.show()

Extracting classical features for comparison...


Extracting features: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [00:02<00:00, 36.99it/s]


# ============================================
# CELL 13: Model Interpretation (Attention Weights)
# ============================================

In [26]:
# ============================================
# CELL 13: Model Interpretation (MODIFIED)
# ============================================

# Since we're using ensemble, let's analyze ensemble weights
if config.model_type == 'ensemble':
    # Check if model is wrapped in DataParallel
    if isinstance(model, DataParallel):
        weights = model.module.weights
    else:
        weights = model.weights
    
    weights_normalized = F.softmax(weights, dim=0)
    
    print("Ensemble Model Weights:")
    print(f"  CNN Weight: {weights_normalized[0].item():.3f}")
    print(f"  LSTM Weight: {weights_normalized[1].item():.3f}")
    print(f"  Transformer Weight: {weights_normalized[2].item():.3f}")
    
    # Visualize ensemble weights
    import plotly.graph_objects as go
    
    fig = go.Figure(data=[
        go.Bar(
            x=['CNN', 'LSTM', 'Transformer'],
            y=weights_normalized.detach().cpu().numpy(),
            marker_color=['blue', 'green', 'red']
        )
    ])
    fig.update_layout(
        title="Ensemble Model Contribution Weights",
        yaxis_title="Weight",
        yaxis_range=[0, 1]
    )
    fig.show()

# Analyze common misclassifications
print("\n" + "="*60)
print("Most Common Misclassifications:")
print("="*60)

# Create confusion pairs
confusion_pairs = []
for true_idx in range(len(cm)):
    for pred_idx in range(len(cm)):
        if true_idx != pred_idx and cm[true_idx, pred_idx] > 10:
            true_emotion = emotion_names[true_idx]
            pred_emotion = emotion_names[pred_idx]
            count = cm[true_idx, pred_idx]
            confusion_pairs.append((true_emotion, pred_emotion, count))

# Sort by frequency
confusion_pairs.sort(key=lambda x: x[2], reverse=True)

for true_em, pred_em, count in confusion_pairs[:10]:
    print(f"  {true_em:10s} misclassified as {pred_em:10s}: {count} times")

# Success rate by emotion
print("\n" + "="*60)
print("Performance Summary by Emotion:")
print("="*60)

performance = []
for i, emotion in enumerate(emotion_names[:config.n_classes]):
    if i < len(per_class_acc):
        total = cm[i].sum()
        correct = cm[i, i]
        performance.append({
            'Emotion': emotion,
            'Accuracy': per_class_acc[i],
            'Correct': correct,
            'Total': total,
            'Errors': total - correct
        })

# Sort by accuracy
performance.sort(key=lambda x: x['Accuracy'], reverse=True)

print(f"{'Rank':<5} {'Emotion':<10} {'Accuracy':<10} {'Correct/Total':<15}")
print("-" * 50)
for rank, perf in enumerate(performance, 1):
    print(f"{rank:<5} {perf['Emotion']:<10} {perf['Accuracy']*100:>6.1f}%    {perf['Correct']:>3}/{perf['Total']:<3}")

Ensemble Model Weights:
  CNN Weight: 0.266
  LSTM Weight: 0.510
  Transformer Weight: 0.225



Most Common Misclassifications:
  fear       misclassified as sad       : 52 times
  fear       misclassified as happy     : 36 times
  sad        misclassified as neutral   : 30 times
  happy      misclassified as angry     : 29 times
  neutral    misclassified as sad       : 27 times
  angry      misclassified as happy     : 26 times
  disgust    misclassified as sad       : 25 times
  sad        misclassified as disgust   : 24 times
  disgust    misclassified as neutral   : 24 times
  angry      misclassified as disgust   : 18 times

Performance Summary by Emotion:
Rank  Emotion    Accuracy   Correct/Total  
--------------------------------------------------
1     surprise     96.6%     86/89 
2     angry        79.6%    223/280
3     neutral      78.2%    208/266
4     disgust      71.0%    198/279
5     sad          70.6%    197/279
6     happy        68.2%    191/280
7     fear         56.4%    158/280


# ============================================
# CELL 14: Error Analysis
# ============================================

In [27]:
def error_analysis(preds, labels, probs, emotion_names):
    """Analyze model errors"""
    
    # Find misclassified samples
    errors = preds != labels
    error_indices = np.where(errors)[0]
    
    if len(error_indices) > 0:
        print(f"Total errors: {len(error_indices)} / {len(labels)} ({100*len(error_indices)/len(labels):.1f}%)")
        
        # Confusion pairs
        confusion_pairs = {}
        for idx in error_indices:
            true_label = emotion_names[labels[idx]]
            pred_label = emotion_names[preds[idx]]
            pair = f"{true_label} -> {pred_label}"
            confusion_pairs[pair] = confusion_pairs.get(pair, 0) + 1
        
        # Most common confusions
        sorted_pairs = sorted(confusion_pairs.items(), key=lambda x: x[1], reverse=True)
        
        print("\nMost Common Confusions:")
        for pair, count in sorted_pairs[:10]:
            print(f"  {pair}: {count} times")
        
        # Confidence analysis
        correct_confidence = probs[~errors].max(axis=1).mean()
        error_confidence = probs[errors].max(axis=1).mean()
        
        print(f"\nAverage Confidence:")
        print(f"  Correct predictions: {correct_confidence:.3f}")
        print(f"  Incorrect predictions: {error_confidence:.3f}")
        
        # Plot confidence distribution
        fig = go.Figure()
        fig.add_trace(go.Histogram(
            x=probs[~errors].max(axis=1),
            name='Correct',
            opacity=0.7,
            nbinsx=30
        ))
        fig.add_trace(go.Histogram(
            x=probs[errors].max(axis=1),
            name='Incorrect',
            opacity=0.7,
            nbinsx=30
        ))
        fig.update_layout(
            title="Confidence Distribution",
            xaxis_title="Confidence",
            yaxis_title="Count",
            barmode='overlay'
        )
        fig.show()

# Perform error analysis
error_analysis(preds, labels, probs, emotion_names[:config.n_classes])

Total errors: 492 / 1753 (28.1%)

Most Common Confusions:
  fear -> sad: 52 times
  fear -> happy: 36 times
  sad -> neutral: 30 times
  happy -> angry: 29 times
  neutral -> sad: 27 times
  angry -> happy: 26 times
  disgust -> sad: 25 times
  disgust -> neutral: 24 times
  sad -> disgust: 24 times
  angry -> disgust: 18 times

Average Confidence:
  Correct predictions: 0.879
  Incorrect predictions: 0.668


# ============================================
# CELL 15: Save Results and Model
# ============================================

In [28]:
# Save results
results = {
    'test_accuracy': accuracy,
    'predictions': preds.tolist(),
    'true_labels': labels.tolist(),
    'probabilities': probs.tolist(),
    'confusion_matrix': cm.tolist(),
    'training_history': {
        'train_losses': trainer.train_losses,
        'val_losses': trainer.val_losses,
        'train_accs': trainer.train_accs,
        'val_accs': trainer.val_accs
    }
}

with open('/kaggle/working/results.json', 'w') as f:
    json.dump(results, f)

print("Results saved to results.json")

# Save model for deployment
torch.save({
    'model_state_dict': model.state_dict(),
    'model_config': config,
    'emotion_names': emotion_names[:config.n_classes],
    'test_accuracy': accuracy
}, '/kaggle/working/final_model.pth')

print("Model saved to final_model.pth")

Results saved to results.json
Model saved to final_model.pth


# ============================================
# CELL 16: Generate Project Report
# ============================================

In [29]:
report = f"""
# Speech Emotion Recognition Project Report

## 1. Project Overview
- **Objective**: Develop a deep learning system for emotion recognition from speech
- **Model Type**: {config.model_type.upper()}
- **Number of Classes**: {config.n_classes}
- **Total Samples**: {len(file_paths)}
- **Train/Val/Test Split**: {config.train_size}/{config.val_size}/{config.test_size}

## 2. Model Architecture
- **Parameters**: {sum(p.numel() for p in model.parameters()):,}
- **Input Features**: Mel-spectrogram ({config.n_mels} bins)
- **Batch Size**: {config.batch_size}
- **Learning Rate**: {config.learning_rate}
- **Epochs Trained**: {len(trainer.train_losses)}

## 3. Performance Results
- **Test Accuracy**: {accuracy:.4f}
- **Best Validation Accuracy**: {checkpoint['val_acc']:.2f}%

## 4. Per-Class Performance
"""

for i, emotion in enumerate(emotion_names[:config.n_classes]):
    if i < len(per_class_acc):
        report += f"- {emotion}: {per_class_acc[i]:.3f}\n"

report += """
## 5. Key Findings
1. The model successfully learns to distinguish between different emotions
2. Some emotion pairs show higher confusion rates (see error analysis)
3. Ensemble models generally perform better than individual architectures

## 6. Future Improvements
1. Implement data augmentation techniques (pitch shift, time stretch)
2. Try pre-trained models (Wav2Vec2, HuBERT)
3. Collect more diverse training data
4. Implement real-time emotion recognition

## 7. Technologies Used
- **Deep Learning**: PyTorch, TorchAudio
- **Audio Processing**: Librosa
- **Visualization**: Plotly
- **Environment**: Kaggle GPU
"""

print(report)

# Save report
with open('/kaggle/working/project_report.md', 'w') as f:
    f.write(report)

print("\nProject completed successfully! üéâ")
print("Files saved:")
print("- best_model.pth")
print("- final_model.pth")
print("- results.json")
print("- project_report.md")


# Speech Emotion Recognition Project Report

## 1. Project Overview
- **Objective**: Develop a deep learning system for emotion recognition from speech
- **Model Type**: ENSEMBLE
- **Number of Classes**: 7
- **Total Samples**: 11682
- **Train/Val/Test Split**: 0.7/0.15/0.15

## 2. Model Architecture
- **Parameters**: 3,313,689
- **Input Features**: Mel-spectrogram (128 bins)
- **Batch Size**: 64
- **Learning Rate**: 0.001
- **Epochs Trained**: 37

## 3. Performance Results
- **Test Accuracy**: 0.7193
- **Best Validation Accuracy**: 70.05%

## 4. Per-Class Performance
- neutral: 0.782
- happy: 0.682
- sad: 0.706
- angry: 0.796
- fear: 0.564
- disgust: 0.710
- surprise: 0.966

## 5. Key Findings
1. The model successfully learns to distinguish between different emotions
2. Some emotion pairs show higher confusion rates (see error analysis)
3. Ensemble models generally perform better than individual architectures

## 6. Future Improvements
1. Implement data augmentation techniques (pitch 

# Memory Management Tips for Kaggle

```python
# Add these between cells if you run out of memory
import gc
gc.collect()
torch.cuda.empty_cache()

# Monitor GPU usage
!nvidia-smi

# Clear variables
del train_loader, val_loader  # After training
```