# 2. Audio Emotion Training - Wav2Vec 2.0 on RAVDESS

Fine-tune Wav2Vec 2.0 for speech emotion recognition on the RAVDESS dataset. Run this notebook in Google Colab with a GPU runtime.

In [None]:
!pip install transformers datasets librosa soundfile -q

In [None]:
import os
import numpy as np
import torch
import librosa
from pathlib import Path
from sklearn.model_selection import train_test_split
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor, TrainingArguments, Trainer
from torch.utils.data import Dataset

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f'Device: {device}')

In [None]:
# RAVDESS stores the emotion as the third dash-separated field of each file
# name (Actor_XX/03-01-EMOTION-...), using the two-digit codes 01-08 below.
EMOTION_MAP = {
    1: 'neutral',
    2: 'calm',
    3: 'happy',
    4: 'sad',
    5: 'angry',
    6: 'fearful',
    7: 'disgust',
    8: 'surprised',
}
# Class ids are 0-based: the RAVDESS emotion code minus one.
ID2LABEL = {code - 1: name for code, name in EMOTION_MAP.items()}
LABEL2ID = {name: class_id for class_id, name in ID2LABEL.items()}
# Sampling rate (Hz) used when loading audio and calling the feature extractor.
SAMPLE_RATE = 16000

In [None]:
# Load and prepare RAVDESS dataset
# DATA_PATH is the folder that get_ravdess_files() scans for Actor_*
# sub-directories, so it must point at their direct parent.
DATA_PATH = '../data/audio_data'  # Adjust path to your dataset location

def get_ravdess_files(data_path):
    """Collect RAVDESS ``.wav`` paths and their 0-indexed emotion labels.

    Scans every ``Actor_*`` directory directly under ``data_path`` and reads
    the emotion code from the third dash-separated field of each file name
    (``03-01-EMOTION-...``).

    Directories and files are visited in sorted order so the returned lists
    are the same on every machine. Raw ``glob`` order depends on the file
    system, which would make a seeded train/test split irreproducible.

    Args:
        data_path: Directory containing the ``Actor_*`` folders.

    Returns:
        A ``(files, labels)`` tuple of parallel lists: file paths as strings
        and emotion codes shifted to start at 0. Both are empty when nothing
        matches.

    Raises:
        ValueError: If a ``.wav`` file name has no integer emotion field.
    """
    files, labels = [], []
    for actor_dir in sorted(Path(data_path).glob('Actor_*')):
        for wav in sorted(actor_dir.glob('*.wav')):
            try:
                emotion_code = int(wav.stem.split('-')[2])
            except (IndexError, ValueError) as err:
                # Name the offending file instead of surfacing a bare parse error.
                raise ValueError(f'Unexpected RAVDESS file name: {wav}') from err
            files.append(str(wav))
            labels.append(emotion_code - 1)  # 0-indexed
    return files, labels

audio_files, labels = get_ravdess_files(DATA_PATH)
print(f'Found {len(audio_files)} audio files')
# Fail fast when the path is wrong: with no files the notebook would only
# break later, inside the train/test split, with an unrelated message.
if not audio_files:
    raise FileNotFoundError(
        f'No RAVDESS .wav files found under {DATA_PATH!r}; '
        'expected Actor_*/ folders containing .wav files'
    )

In [None]:
# Hold out 20% of the clips, stratified by emotion label, with a fixed seed.
# NOTE(review): the split is per clip, not per actor, so clips from the same
# actor can land in both parts - confirm that is intended.
X_train, X_test, y_train, y_test = train_test_split(
    audio_files,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)
print(f'Train: {len(X_train)}, Test: {len(X_test)}')

In [None]:
# Custom Dataset
class AudioDataset(Dataset):
    """Dataset that loads one audio clip from disk each time it is indexed.

    Instance attributes:
        files: paths of the audio clips.
        labels: class id of each clip, parallel to ``files``.
        fe: feature extractor called on every loaded waveform.
    """

    def __init__(self, files, labels, feature_extractor):
        self.files = files
        self.labels = labels
        self.fe = feature_extractor

    def __len__(self):
        return len(self.files)

    def __getitem__(self, idx):
        """Load clip ``idx`` and return its model inputs and label.

        Returns:
            Dict with ``input_values`` (the extractor output with size-1
            dimensions squeezed away) and ``labels`` (the label as a tensor).
        """
        # Load at SAMPLE_RATE; the rate librosa returns is not needed.
        waveform, _ = librosa.load(self.files[idx], sr=SAMPLE_RATE)
        features = self.fe(
            waveform,
            sampling_rate=SAMPLE_RATE,
            return_tensors='pt',
            padding=True,
        )
        item = {
            'input_values': features.input_values.squeeze(),
            'labels': torch.tensor(self.labels[idx]),
        }
        return item

In [None]:
# Load model and feature extractor
MODEL = 'facebook/wav2vec2-base'
fe = Wav2Vec2FeatureExtractor.from_pretrained(MODEL)
# Size the classification head from the label mapping rather than a
# hard-coded 8, so it cannot drift from ID2LABEL / LABEL2ID.
model = Wav2Vec2ForSequenceClassification.from_pretrained(
    MODEL,
    num_labels=len(ID2LABEL),
    id2label=ID2LABEL,
    label2id=LABEL2ID,
)

# Both splits go through the same feature extractor.
train_ds = AudioDataset(X_train, y_train, fe)
test_ds = AudioDataset(X_test, y_test, fe)

In [None]:
# Data collator for variable length audio
def collate_fn(batch):
    """Merge dataset items into one batch, padding shorter clips.

    Args:
        batch: Items to merge, each a dict holding an ``input_values`` tensor
            and a ``labels`` tensor.

    Returns:
        Dict with ``input_values`` padded to the longest sequence in the
        batch (batch dimension first) and ``labels`` stacked into one tensor.
    """
    waveforms = [sample['input_values'] for sample in batch]
    label_column = [sample['labels'] for sample in batch]
    stacked_labels = torch.stack(label_column)
    # pad_sequence fills the tail of shorter sequences with its default 0.0.
    pad_sequence = torch.nn.utils.rnn.pad_sequence
    padded = pad_sequence(waveforms, batch_first=True)
    return {'input_values': padded, 'labels': stacked_labels}

In [None]:
# Training
args = TrainingArguments(
    output_dir='./wav2vec_audio',
    num_train_epochs=10,
    per_device_train_batch_size=4,  # Small batch for memory
    per_device_eval_batch_size=4,
    # Evaluate and checkpoint on the same schedule; load_best_model_at_end
    # needs a checkpoint for every evaluation it compares.
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    # fp16 mixed precision needs a CUDA device; enable it only when one is
    # present so the notebook still runs on the CPU fallback.
    fp16=torch.cuda.is_available(),
    gradient_accumulation_steps=4,  # 4 x 4 = 16 samples per optimizer step per device
)

# The held-out split doubles as the per-epoch evaluation set.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    data_collator=collate_fn,
)

In [None]:
# Fine-tune, then write the model and the feature extractor to the same
# folder so both can be loaded from one path.
trainer.train()
export_dir = '../models/wav2vec_audio'
trainer.save_model(export_dir)
fe.save_pretrained(export_dir)
print('Model saved!')