# Emotion Recognition Training Example

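This notebook walks through the pipeline for training an emotion recognition model on the RAVDESS dataset: exploring the data, preprocessing audio, setting up the model, and running a sample inference. Full training is run with the command-line script referenced in Section 4.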

## Table of Contents
1. Setup and Installation
2. Data Exploration
3. Data Preprocessing
4. Model Training
5. Inference Example
6. Next Steps

## 1. Setup and Installation

In [None]:
# Install required packages
!pip install -q transformers datasets torch torchaudio librosa pandas scikit-learn matplotlib seaborn

In [None]:
import os
import sys
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import librosa
import librosa.display

import torch
from transformers import (
    Wav2Vec2FeatureExtractor,
    Wav2Vec2ForSequenceClassification,
    Wav2Vec2Processor,
)

# Set style shared by every figure in the notebook
sns.set_style('darkgrid')
plt.rcParams['figure.figsize'] = (12, 6)

# Seed the random number generators so that Restart & Run All reproduces the
# same outputs: pandas' DataFrame.sample falls back to NumPy's global RNG when
# no random_state is given, and torch's RNG drives the initialisation of
# newly created model weights.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Check GPU availability
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

## 2. Data Exploration

In [None]:
# Show the summary statistics stored in the processed-data folder, if present
import json

stats_path = Path('../data/processed/ravdess_statistics.json')
if not stats_path.exists():
    # Nothing to display yet: point the reader at the script to run.
    print("Statistics file not found. Please run prepare_ravdess.py first.")
else:
    with open(stats_path) as stats_file:
        stats = json.load(stats_file)

    print("Dataset Statistics:")
    print(json.dumps(stats, indent=2))

In [None]:
# Read the metadata table of each split (train / validation / test)
train_df = pd.read_csv('../data/processed/ravdess_train.csv')
val_df = pd.read_csv('../data/processed/ravdess_val.csv')
test_df = pd.read_csv('../data/processed/ravdess_test.csv')

# Report how many rows each split contains
for split_name, split_df in (("Train", train_df), ("Val", val_df), ("Test", test_df)):
    print(f"{split_name} samples: {len(split_df)}")

# Display first few rows (last expression, so it renders as a rich table)
train_df.head()

In [None]:
# Bar chart of label counts, one panel per split
splits = {"Train": train_df, "Val": val_df, "Test": test_df}

fig, axes = plt.subplots(1, 3, figsize=(18, 5))

for ax, (split_name, split_df) in zip(axes, splits.items()):
    # sort_index orders the bars by label so the three panels line up
    counts = split_df['label'].value_counts().sort_index()
    ax.bar(counts.index, counts.values)
    ax.set_title(f"{split_name} Set Emotion Distribution")
    ax.set_xlabel("Emotion")
    ax.set_ylabel("Count")
    ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

### Audio Visualization

In [None]:
# Load and visualize sample audio for each emotion:
# one row per emotion, waveform on the left and mel spectrogram on the right.
emotions = train_df['label'].unique()

# squeeze=False keeps `axes` two-dimensional even when only one emotion is
# present; without it plt.subplots returns a 1-D array for a single row and
# the axes[i, 0] / axes[i, 1] indexing below raises an IndexError.
fig, axes = plt.subplots(
    len(emotions), 2, figsize=(15, 3 * len(emotions)), squeeze=False
)

for i, emotion in enumerate(sorted(emotions)):
    # Get first sample for this emotion
    sample_path = train_df[train_df['label'] == emotion].iloc[0]['file_path']

    # Load audio, resampled to 16 kHz
    y, sr = librosa.load(sample_path, sr=16000)

    # Plot waveform
    librosa.display.waveshow(y, sr=sr, ax=axes[i, 0])
    axes[i, 0].set_title(f"{emotion.upper()} - Waveform")
    axes[i, 0].set_xlabel("Time (s)")

    # Plot mel spectrogram in dB relative to the clip's peak power
    mel_spec = librosa.feature.melspectrogram(y=y, sr=sr)
    mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
    librosa.display.specshow(mel_spec_db, sr=sr, x_axis='time', y_axis='mel', ax=axes[i, 1])
    axes[i, 1].set_title(f"{emotion.upper()} - Mel Spectrogram")

plt.tight_layout()
plt.show()

## 3. Data Preprocessing

In [None]:
# Make the project root importable so the training utilities can be loaded
sys.path.insert(0, os.path.abspath('..'))

from training.utils.audio_preprocessing import load_and_preprocess_audio, extract_features

# Run the preprocessing helper on the first training clip
sample_path = train_df.iloc[0]['file_path']
y, sr = load_and_preprocess_audio(sample_path, target_sr=16000)

print(f"Sample rate: {sr}")
print(f"Audio shape: {y.shape}")
print(f"Duration: {len(y) / sr:.2f} seconds")

# Extract features and summarise each one: arrays by shape, anything else as a
# number with four decimals
features = extract_features(y, sr)
print(f"\nExtracted features:")
for feat_name, feat_value in features.items():
    if isinstance(feat_value, np.ndarray):
        summary = f"shape {feat_value.shape}"
    else:
        summary = f"{feat_value:.4f}"
    print(f"  {feat_name}: {summary}")

## 4. Model Training

For full training, use the command-line script:
```bash
python training/train.py --dataset ravdess --epochs 10 --batch-size 8
```

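Here we walk through the building blocks of the training pipeline step by step: loading the pretrained checkpoint's preprocessor, preparing a batch, and initializing the model.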

In [None]:
# Load the audio preprocessor of the pretrained checkpoint.
# "facebook/wav2vec2-base" was pretrained on raw audio only and does not
# publish a tokenizer/vocabulary, so Wav2Vec2Processor (feature extractor +
# CTC tokenizer) cannot be loaded from it. Sequence classification only needs
# the feature extractor; it accepts the same call arguments used in this
# notebook (sampling_rate, return_tensors, padding, max_length, truncation).
model_name = "facebook/wav2vec2-base"
processor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)

# Define emotion labels: sorted so the label <-> id mapping is deterministic
emotion_labels = sorted(train_df['label'].unique())
label2id = {label: idx for idx, label in enumerate(emotion_labels)}
id2label = {idx: label for label, idx in label2id.items()}

print(f"Emotion labels: {emotion_labels}")
print(f"Label mapping: {label2id}")

In [None]:
# Prepare a small batch for demonstration
def prepare_batch(df, num_samples=4, sampling_rate=16000, max_seconds=10, random_state=None):
    """Draw a random mini-batch from ``df`` and convert it to model inputs.

    Args:
        df: DataFrame with a 'file_path' column (audio file to load) and a
            'label' column whose values are keys of ``label2id``.
        num_samples: Number of rows to draw. Capped at ``len(df)`` so that a
            small split does not make ``DataFrame.sample`` raise.
        sampling_rate: Rate in Hz the audio is resampled to and that is
            reported to the processor.
        max_seconds: Clips longer than this many seconds are truncated.
        random_state: Forwarded to ``DataFrame.sample``; pass an int for a
            reproducible draw.

    Returns:
        The processor output (padded 'input_values' tensor, plus whatever else
        the processor returns) with a 'labels' tensor of class ids added.

    Raises:
        ValueError: If ``df`` is empty or ``num_samples`` is less than 1.
    """
    if num_samples < 1:
        raise ValueError("num_samples must be at least 1")
    if len(df) == 0:
        raise ValueError("Cannot prepare a batch from an empty DataFrame")

    batch = df.sample(n=min(num_samples, len(df)), random_state=random_state)

    audio_arrays = []
    labels = []

    for _, row in batch.iterrows():
        y, _ = librosa.load(row['file_path'], sr=sampling_rate)
        audio_arrays.append(y)
        labels.append(label2id[row['label']])

    # Pad to the longest clip in the batch, truncating anything over the limit
    inputs = processor(
        audio_arrays,
        sampling_rate=sampling_rate,
        return_tensors='pt',
        padding=True,
        max_length=sampling_rate * max_seconds,
        truncation=True
    )

    inputs['labels'] = torch.tensor(labels, dtype=torch.long)

    return inputs

# Test batch preparation (fixed random_state so the printed batch is repeatable)
batch = prepare_batch(train_df, num_samples=4, random_state=42)
print(f"Batch input_values shape: {batch['input_values'].shape}")
print(f"Batch labels: {batch['labels']}")

In [None]:
# Build the classifier: pretrained encoder plus a classification head sized to
# the number of emotion labels, with the label mappings stored in its config
classifier_kwargs = dict(
    num_labels=len(emotion_labels),
    label2id=label2id,
    id2label=id2label,
)
model = Wav2Vec2ForSequenceClassification.from_pretrained(model_name, **classifier_kwargs)

model = model.to(device)
print(f"Model loaded on {device}")

total_params = sum(param.numel() for param in model.parameters())
print(f"Number of parameters: {total_params:,}")

## 5. Inference Example

In [None]:
# Run a single forward pass on the first test clip.
# NOTE: no fine-tuning happens in this notebook, so unless `model` has been
# replaced with a trained checkpoint the probabilities below come from an
# untrained classification head and only demonstrate the mechanics.
model.eval()

# Prepare single sample
first_test_row = test_df.iloc[0]
sample_path = first_test_row['file_path']
true_label = first_test_row['label']

y, sr = librosa.load(sample_path, sr=16000)
encoded = processor(y, sampling_rate=16000, return_tensors='pt', padding=True)

# Move every input tensor to the same device as the model
inputs = {name: tensor.to(device) for name, tensor in encoded.items()}

# Predict without tracking gradients
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits
    # [0] selects the only item in the batch
    probs = torch.softmax(logits, dim=-1)[0]
    predicted_id = logits.argmax(dim=-1).item()

predicted_label = id2label[predicted_id]

print(f"True label: {true_label}")
print(f"Predicted label: {predicted_label}")
print(f"\nProbabilities:")
for class_id, class_prob in enumerate(probs.cpu().numpy()):
    print(f"  {id2label[class_id]:12s}: {class_prob:.4f}")

In [None]:
# Side-by-side view of the clip and the model's class probabilities
fig, (wave_ax, prob_ax) = plt.subplots(1, 2, figsize=(15, 5))

# Left panel: waveform, titled with the true and predicted labels
librosa.display.waveshow(y, sr=sr, ax=wave_ax)
wave_ax.set_title(f"Audio Waveform\nTrue: {true_label} | Predicted: {predicted_label}")

# Right panel: one bar per emotion label
probs_np = probs.cpu().numpy()
prob_ax.bar(emotion_labels, probs_np)
prob_ax.set_title("Emotion Probabilities")
prob_ax.set_ylabel("Probability")
prob_ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

## Next Steps

1. **Full Training**: Run the complete training script:
   ```bash
   python training/train.py --dataset ravdess --epochs 10 --batch-size 8
   ```

2. **Monitor Training**: Use Weights & Biases:
   ```bash
   python training/train.py --dataset ravdess --use-wandb
   ```

3. **Custom Dataset**: Prepare your own data:
   ```bash
   python training/prepare_custom.py
   python training/train.py --dataset custom
   ```

4. **Deploy Model**: Update the backend to use your trained model:
   - Edit `backend/app/models/emotion_inference.py`
   - Change model path to your checkpoint
   - Restart the backend server