In [None]:
# Emotion Classifier Training with RAVDESS Dataset

# This notebook fine-tunes the HuBERT model (`superb/hubert-base-superb-er`) for emotion classification using the RAVDESS dataset.

# **Dataset**: RAVDESS (Ryerson Audio-Visual Database of Emotional Speech and Song)
# - 8 emotions: neutral, calm, happy, sad, angry, fearful, surprise, disgust
# - ~7,350 audio files from 24 actors

# **Training Strategy**:
# - Freeze all HuBERT layers (keep pre-trained features)
# - Train only the classification head (8 classes)
# - Speaker-independent train/val/test split

In [1]:
import os
import re
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import librosa
import soundfile as sf
from pathlib import Path
from typing import List, Tuple, Dict
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from transformers import (AutoModelForAudioClassification, AutoFeatureExtractor, AutoConfig)
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# RAVDESS emotion labels (8 classes)
RAVDESS_EMOTIONS = [
    "neutral",
    "calm",
    "happy",
    "sad",
    "angry",
    "fearful",
    "surprise",
    "disgust"
]

EMOTION_TO_IDX = {emotion: idx for idx, emotion in enumerate(RAVDESS_EMOTIONS)}
IDX_TO_EMOTION = {idx: emotion for emotion, idx in EMOTION_TO_IDX.items()}

print(f"Emotion classes: {RAVDESS_EMOTIONS}")
print(f"Number of classes: {len(RAVDESS_EMOTIONS)}")


Using device: cuda
Emotion classes: ['neutral', 'calm', 'happy', 'sad', 'angry', 'fearful', 'surprise', 'disgust']
Number of classes: 8


In [2]:
## Step 1: Load and Preprocess RAVDESS Dataset

# RAVDESS files are named with pattern: `[Modality]-[Vocal]-[Emotion]-[Intensity]-[Statement]-[Repetition]-[Actor].wav`

# - Modality: 01=full AV, 02=video-only, 03=audio-only
# - Emotion: 01=neutral, 02=calm, 03=happy, 04=sad, 05=angry, 06=fearful, 07=surprise, 08=disgust
# - Actor: 01-24 (12 male, 12 female)


In [3]:
from google.colab import drive

drive.mount('/content/drive')


Mounted at /content/drive


In [4]:
import zipfile
import os

# Define the path to your zipped RAVDESS file on Google Drive
zip_path = '/content/drive/MyDrive/ravdess/ravdess.zip'
extract_path = '/content/ravdess_dataset'

os.makedirs(extract_path, exist_ok=True)

print(f"Unzipping {zip_path} to {extract_path}...")
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

print("Unzipping complete.")

# List contents to verify
# The actual audio files are usually inside a subdirectory like 'Actor_01', 'Actor_02', etc.
# The path should be to the parent directory containing these actor folders.
# For example, if you unzip and find '/content/ravdess_dataset/Actor_01', then RAVDESS_DATA_DIR should be '/content/ravdess_dataset'
print("Listing extracted contents (first 5):")
!ls -d {extract_path}/* | head -n 5

Unzipping /content/drive/MyDrive/ravdess/ravdess.zip to /content/ravdess_dataset...
Unzipping complete.
Listing extracted contents (first 5):
/content/ravdess_dataset/Actor_01
/content/ravdess_dataset/Actor_02
/content/ravdess_dataset/Actor_03
/content/ravdess_dataset/Actor_04
/content/ravdess_dataset/Actor_05


In [5]:
class RAVDESSDataset(Dataset):
    """Dataset class for RAVDESS audio files."""

    def __init__(self, file_paths: List[str], labels: List[int], feature_extractor, max_length: int = 16000 * 4):
        """
        Args:
            file_paths: List of audio file paths
            labels: List of emotion labels (indices)
            feature_extractor: HuggingFace feature extractor
            max_length: Maximum audio length in samples (default: 4 seconds at 16kHz)
        """
        self.file_paths = file_paths
        self.labels = labels
        self.feature_extractor = feature_extractor
        self.max_length = max_length
        self.sample_rate = 16000  # RAVDESS is 16kHz

    def __len__(self):
        return len(self.file_paths)

    def __getitem__(self, idx):
        file_path = self.file_paths[idx]
        label = self.labels[idx]

        # Load audio file
        try:
            audio, sr = librosa.load(file_path, sr=self.sample_rate)

            if len(audio) > self.max_length:
                audio = audio[:self.max_length]
            else:
                audio = np.pad(audio, (0, self.max_length - len(audio)), mode='constant')

            inputs = self.feature_extractor(audio, sampling_rate=self.sample_rate, return_tensors="pt", padding=True)

            return {
                'input_values': inputs['input_values'].squeeze(0),
                'labels': torch.tensor(label, dtype=torch.long)
            }
        except Exception as e:
            print(f"Error loading {file_path}: {e}")
            # Return zeros if file can't be loaded
            inputs = self.feature_extractor(
                np.zeros(self.max_length),
                sampling_rate=self.sample_rate,
                return_tensors="pt",
                padding=True
            )
            return {
                'input_values': inputs['input_values'].squeeze(0),
                'labels': torch.tensor(0, dtype=torch.long)
            }


def parse_ravdess_filename(filename: str) -> Dict[str, int]:
    """
    Parse RAVDESS filename to extract metadata.

    Format: [Modality]-[Vocal]-[Emotion]-[Intensity]-[Statement]-[Repetition]-[Actor].wav

    Returns:
        Dictionary with parsed values
    """
    basename = Path(filename).stem
    parts = basename.split('-')

    if len(parts) != 7:
        return None

    return {
        'modality': int(parts[0]),
        'vocal': int(parts[1]),
        'emotion': int(parts[2]),
        'intensity': int(parts[3]),
        'statement': int(parts[4]),
        'repetition': int(parts[5]),
        'actor': int(parts[6])
    }


def load_ravdess_dataset(data_dir: str) -> Tuple[List[str], List[int], List[int]]:
    """
    Load RAVDESS dataset and extract file paths, labels, and actor IDs.

    Args:
        data_dir: Directory containing RAVDESS audio files

    Returns:
        Tuple of (file_paths, labels, actors)
    """
    data_dir = Path(data_dir)
    file_paths = []
    labels = []
    actors = []

    # Emotion mapping: RAVDESS uses 01-08, we map to 0-7
    # Note: RAVDESS emotion 01=neutral, 02=calm, 03=happy, 04=sad, 05=angry, 06=fearful, 07=surprise, 08=disgust
    emotion_mapping = {
        1: 0,  # neutral
        2: 1,  # calm
        3: 2,  # happy
        4: 3,  # sad
        5: 4,  # angry
        6: 5,  # fearful
        7: 6,  # surprise
        8: 7   # disgust
    }

    # Find all WAV files
    audio_files = list(data_dir.rglob("*.wav"))

    print(f"Found {len(audio_files)} audio files")

    for audio_file in audio_files:
        parsed = parse_ravdess_filename(audio_file.name)
        if parsed is None:
            continue

        # Only use audio-only files (modality 03)
        if parsed['modality'] != 3:
            continue

        emotion_code = parsed['emotion']
        if emotion_code in emotion_mapping:
            file_paths.append(str(audio_file))
            labels.append(emotion_mapping[emotion_code])
            actors.append(parsed['actor'])

    print(f"Loaded {len(file_paths)} audio-only files")
    print(f"Emotion distribution: {np.bincount(labels)}")

    return file_paths, labels, actors


# Load dataset
# Update this path to your RAVDESS dataset location
RAVDESS_DATA_DIR = "/content/ravdess_dataset"

file_paths, labels, actors = load_ravdess_dataset(RAVDESS_DATA_DIR)

Found 2880 audio files
Loaded 2880 audio-only files
Emotion distribution: [192 384 384 384 384 384 384 384]


In [None]:
## Step 2: Train/Validation/Test Split (Speaker-Independent)

In [6]:
def create_speaker_independent_split(file_paths: List[str], labels: List[int], actors: List[int],
                                     train_ratio: float = 0.7, val_ratio: float = 0.15):
    """
    Create speaker-independent train/val/test split.

    Args:
        file_paths: List of file paths
        labels: List of labels
        actors: List of actor IDs
        train_ratio: Proportion for training
        val_ratio: Proportion for validation

    Returns:n        Train, validation, and test splits
    """
    # Get unique actors
    unique_actors = sorted(set(actors))
    num_actors = len(unique_actors)

    # Split actors (not files)
    num_train_actors = int(num_actors * train_ratio)
    num_val_actors = int(num_actors * val_ratio)

    train_actors = set(unique_actors[:num_train_actors])
    val_actors = set(unique_actors[num_train_actors:num_train_actors + num_val_actors])
    test_actors = set(unique_actors[num_train_actors + num_val_actors:])

    # Split files based on actor membership
    train_files, train_labels = [], []
    val_files, val_labels = [], []
    test_files, test_labels = [], []

    for file_path, label, actor in zip(file_paths, labels, actors):
        if actor in train_actors:
            train_files.append(file_path)
            train_labels.append(label)
        elif actor in val_actors:
            val_files.append(file_path)
            val_labels.append(label)
        elif actor in test_actors:
            test_files.append(file_path)
            test_labels.append(label)

    print(f"Train: {len(train_files)} files from {len(train_actors)} actors")
    print(f"Validation: {len(val_files)} files from {len(val_actors)} actors")
    print(f"Test: {len(test_files)} files from {len(test_actors)} actors")

    return (train_files, train_labels), (val_files, val_labels), (test_files, test_labels)


# Uncomment when dataset is loaded:
train_data, val_data, test_data = create_speaker_independent_split(file_paths, labels, actors)
train_files, train_labels = train_data
val_files, val_labels = val_data
test_files, test_labels = test_data

Train: 1920 files from 16 actors
Validation: 360 files from 3 actors
Test: 600 files from 5 actors


In [22]:
model_name = "superb/hubert-base-superb-er"
num_classes = len(RAVDESS_EMOTIONS)

print(f"Loading model: {model_name}")
print(f"Number of classes: {num_classes}")

feature_extractor = AutoFeatureExtractor.from_pretrained(model_name, trust_remote_code=True)

config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
config.num_labels = num_classes

model = AutoModelForAudioClassification.from_pretrained(model_name,  config=config, trust_remote_code=True,  ignore_mismatched_sizes=True)

hidden_size = getattr(config, 'hidden_size', None)

if hasattr(model, 'classifier'):
    if isinstance(model.classifier, nn.Linear):
        hidden_size = model.classifier.in_features
    elif isinstance(model.classifier, nn.Sequential):
        for layer in reversed(model.classifier):
            if isinstance(layer, nn.Linear):
                hidden_size = layer.in_features
                break

print(f"Hidden size detected: {hidden_size}")

new_classifier = nn.Sequential(
    nn.Linear(hidden_size, 512),
    nn.ReLU(),
    nn.Dropout(p=0.2),
    nn.Linear(512, num_classes)
)

if hasattr(model, 'classification_head'):
    model.classification_head = new_classifier
else:
    model.classifier = new_classifier


for name, param in model.named_parameters():
    if ("hubert.feature_extractor" in name) or ("hubert.encoder" in name):
        param.requires_grad = False
    else:
        param.requires_grad = True
        print(f"Trainable: {name}")


trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
total_params = sum(p.numel() for p in model.parameters())

print(f"\nTrainable parameters: {trainable_params:,} / {total_params:,} "
      f"({100 * trainable_params / total_params:.2f}%)")

model = model.to(device)
print(f"Model moved to {device}")


Loading model: superb/hubert-base-superb-er
Number of classes: 8


Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at superb/hubert-base-superb-er and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([4, 256]) in the checkpoint and torch.Size([8, 256]) in the model instantiated
- classifier.bias: found shape torch.Size([4]) in the checkpoint and torch.Size([8]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Hidden size detected: 256
Trainable: layer_weights
Trainable: hubert.masked_spec_embed
Trainable: hubert.feature_projection.layer_norm.weight
Trainable: hubert.feature_projection.layer_norm.bias
Trainable: hubert.feature_projection.projection.weight
Trainable: hubert.feature_projection.projection.bias
Trainable: projector.weight
Trainable: projector.bias
Trainable: classifier.0.weight
Trainable: classifier.0.bias
Trainable: classifier.3.weight
Trainable: classifier.3.bias

Trainable parameters: 728,341 / 94,704,277 (0.77%)
Model moved to cuda


In [23]:
train_dataset = RAVDESSDataset(train_files, train_labels, feature_extractor)
val_dataset = RAVDESSDataset(val_files, val_labels, feature_extractor)
test_dataset = RAVDESSDataset(test_files, test_labels, feature_extractor)

batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=2)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=2)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=2)

In [24]:
from torch.optim import lr_scheduler

# Training hyperparameters
learning_rate = 2e-4
num_epochs = 20
patience = 4  # Early stopping patience

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.3, patience=2, threshold=1e-4, threshold_mode='rel', cooldown=1, min_lr=1e-6)

print("Training setup complete")
print(f"Learning rate: {learning_rate}")
print(f"Epochs: {num_epochs}")


Training setup complete
Learning rate: 0.0002
Epochs: 20


In [25]:
def train_epoch(model, train_loader, criterion, optimizer, device):
    """Train for one epoch."""
    model.train()
    total_loss = 0
    correct = 0
    total = 0

    progress_bar = tqdm(train_loader, desc="Training")
    for batch in progress_bar:
        input_values = batch['input_values'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        outputs = model(input_values=input_values)
        logits = outputs.logits

        loss = criterion(logits, labels)
        loss.backward()

        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        optimizer.step()

        total_loss += loss.item()
        _, predicted = torch.max(logits.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

        progress_bar.set_postfix({
            'loss': loss.item(),
            'acc': 100 * correct / total
        })

    avg_loss = total_loss / len(train_loader)
    accuracy = 100 * correct / total
    return avg_loss, accuracy


def validate(model, val_loader, criterion, device):
    """Validate the model."""
    model.eval()
    total_loss = 0
    correct = 0
    total = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Validating"):
            input_values = batch['input_values'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_values=input_values)
            logits = outputs.logits

            loss = criterion(logits, labels)
            total_loss += loss.item()

            _, predicted = torch.max(logits.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

            all_preds.extend(predicted.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    avg_loss = total_loss / len(val_loader)
    accuracy = 100 * correct / total
    return avg_loss, accuracy, all_preds, all_labels


# Training loop
# Uncomment when data loaders are ready:

best_val_loss = float('inf')
best_val_acc = 0
patience_counter = 0
train_losses = []
val_losses = []
train_accs = []
val_accs = []

for epoch in range(num_epochs):
    print(f"\nEpoch {epoch + 1}/{num_epochs}")
    print("-" * 50)

    # Train
    train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer, device)
    train_losses.append(train_loss)
    train_accs.append(train_acc)

    # Validate
    val_loss, val_acc, val_preds, val_labels = validate(model, val_loader, criterion, device)
    val_losses.append(val_loss)
    val_accs.append(val_acc)

    print(f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}%")
    print(f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.2f}%")

    # Save best model
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_val_acc = val_acc
        patience_counter = 0

        # Save model
        os.makedirs("checkpoints", exist_ok=True)
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'val_loss': val_loss,
            'val_acc': val_acc,
        }, "checkpoints/best_model.pt")
        print("✓ Saved best model")
    else:
        patience_counter += 1

    if patience_counter >= patience:
        print(f"Early stopping at epoch {epoch + 1}")
        break

print(f"\nBest validation accuracy: {best_val_acc:.2f}%")



Epoch 1/20
--------------------------------------------------


Training: 100%|██████████| 120/120 [01:31<00:00,  1.32it/s, loss=1.79, acc=31.8]
Validating: 100%|██████████| 23/23 [00:08<00:00,  2.57it/s]


Train Loss: 1.8897, Train Acc: 31.77%
Val Loss: 1.6636, Val Acc: 37.78%
✓ Saved best model

Epoch 2/20
--------------------------------------------------


Training: 100%|██████████| 120/120 [01:12<00:00,  1.65it/s, loss=1.46, acc=43.4]
Validating: 100%|██████████| 23/23 [00:08<00:00,  2.80it/s]


Train Loss: 1.5124, Train Acc: 43.39%
Val Loss: 1.4985, Val Acc: 40.00%
✓ Saved best model

Epoch 3/20
--------------------------------------------------


Training: 100%|██████████| 120/120 [01:12<00:00,  1.66it/s, loss=1.57, acc=50.7]
Validating: 100%|██████████| 23/23 [00:08<00:00,  2.77it/s]


Train Loss: 1.3053, Train Acc: 50.73%
Val Loss: 1.3732, Val Acc: 44.44%
✓ Saved best model

Epoch 4/20
--------------------------------------------------


Training: 100%|██████████| 120/120 [01:13<00:00,  1.63it/s, loss=1.22, acc=55.7]
Validating: 100%|██████████| 23/23 [00:09<00:00,  2.51it/s]


Train Loss: 1.1700, Train Acc: 55.73%
Val Loss: 1.3184, Val Acc: 48.89%
✓ Saved best model

Epoch 5/20
--------------------------------------------------


Training: 100%|██████████| 120/120 [01:14<00:00,  1.61it/s, loss=0.937, acc=59.3]
Validating: 100%|██████████| 23/23 [00:08<00:00,  2.69it/s]


Train Loss: 1.0702, Train Acc: 59.32%
Val Loss: 1.3294, Val Acc: 46.67%

Epoch 6/20
--------------------------------------------------


Training: 100%|██████████| 120/120 [01:14<00:00,  1.60it/s, loss=1.05, acc=64.1]
Validating: 100%|██████████| 23/23 [00:08<00:00,  2.72it/s]


Train Loss: 0.9768, Train Acc: 64.11%
Val Loss: 1.3147, Val Acc: 50.56%
✓ Saved best model

Epoch 7/20
--------------------------------------------------


Training: 100%|██████████| 120/120 [01:15<00:00,  1.59it/s, loss=0.939, acc=68.6]
Validating: 100%|██████████| 23/23 [00:08<00:00,  2.74it/s]


Train Loss: 0.8866, Train Acc: 68.59%
Val Loss: 1.2346, Val Acc: 54.44%
✓ Saved best model

Epoch 8/20
--------------------------------------------------


Training: 100%|██████████| 120/120 [01:15<00:00,  1.59it/s, loss=1.01, acc=70.7]
Validating: 100%|██████████| 23/23 [00:08<00:00,  2.57it/s]


Train Loss: 0.8090, Train Acc: 70.73%
Val Loss: 1.2176, Val Acc: 55.00%
✓ Saved best model

Epoch 9/20
--------------------------------------------------


Training: 100%|██████████| 120/120 [01:16<00:00,  1.58it/s, loss=0.645, acc=73.9]
Validating: 100%|██████████| 23/23 [00:08<00:00,  2.70it/s]


Train Loss: 0.7191, Train Acc: 73.91%
Val Loss: 1.2083, Val Acc: 57.78%
✓ Saved best model

Epoch 10/20
--------------------------------------------------


Training: 100%|██████████| 120/120 [01:16<00:00,  1.57it/s, loss=0.833, acc=78.5]
Validating: 100%|██████████| 23/23 [00:09<00:00,  2.44it/s]


Train Loss: 0.6257, Train Acc: 78.54%
Val Loss: 1.2286, Val Acc: 58.89%

Epoch 11/20
--------------------------------------------------


Training: 100%|██████████| 120/120 [01:16<00:00,  1.57it/s, loss=0.745, acc=81.3]
Validating: 100%|██████████| 23/23 [00:08<00:00,  2.59it/s]


Train Loss: 0.5467, Train Acc: 81.30%
Val Loss: 1.3100, Val Acc: 57.22%

Epoch 12/20
--------------------------------------------------


Training: 100%|██████████| 120/120 [01:17<00:00,  1.54it/s, loss=0.593, acc=82.8]
Validating: 100%|██████████| 23/23 [00:08<00:00,  2.66it/s]


Train Loss: 0.5243, Train Acc: 82.81%
Val Loss: 1.2305, Val Acc: 61.67%

Epoch 13/20
--------------------------------------------------


Training: 100%|██████████| 120/120 [01:16<00:00,  1.57it/s, loss=0.651, acc=84.1]
Validating: 100%|██████████| 23/23 [00:09<00:00,  2.30it/s]

Train Loss: 0.4599, Train Acc: 84.06%
Val Loss: 1.3301, Val Acc: 57.78%
Early stopping at epoch 13

Best validation accuracy: 57.78%





In [26]:
# Load best model and evaluate on test set
# Uncomment when training is complete:
# Load best model
checkpoint = torch.load("checkpoints/best_model.pt")
model.load_state_dict(checkpoint['model_state_dict'])
print("Loaded best model from checkpoint")

# Evaluate on test set
test_loss, test_acc, test_preds, test_labels = validate(model, test_loader, criterion, device)
print(f"\nTest Results:")
print(f"Test Loss: {test_loss:.4f}")
print(f"Test Accuracy: {test_acc:.2f}%")

# Classification report
print("\nClassification Report:")
print(classification_report(
    test_labels,
    test_preds,
    target_names=RAVDESS_EMOTIONS,
    digits=4
))

# Confusion matrix
print("\nConfusion Matrix:")
cm = confusion_matrix(test_labels, test_preds)
print(cm)


Loaded best model from checkpoint


Validating: 100%|██████████| 38/38 [00:13<00:00,  2.78it/s]


Test Results:
Test Loss: 1.2383
Test Accuracy: 60.00%

Classification Report:
              precision    recall  f1-score   support

     neutral     0.6429    0.9000    0.7500        40
        calm     0.6102    0.9000    0.7273        80
       happy     0.4634    0.4750    0.4691        80
         sad     0.6500    0.3250    0.4333        80
       angry     0.6364    0.7000    0.6667        80
     fearful     0.8182    0.2250    0.3529        80
    surprise     0.7857    0.5500    0.6471        80
     disgust     0.5072    0.8750    0.6422        80

    accuracy                         0.6000       600
   macro avg     0.6392    0.6188    0.5861       600
weighted avg     0.6390    0.6000    0.5751       600


Confusion Matrix:
[[36  4  0  0  0  0  0  0]
 [ 6 72  0  2  0  0  0  0]
 [ 8  2 38  2  4  0  0 26]
 [ 2 40  8 26  0  0  4  0]
 [ 2  0  0  0 56  0  4 18]
 [ 0  0 32 10  0 18  4 16]
 [ 2  0  2  0 24  0 44  8]
 [ 0  0  2  0  4  4  0 70]]





In [29]:
# Save the final model
# Uncomment when training is complete:

save_dir = "checkpoints/emotion_classifier_ravdess"
os.makedirs(save_dir, exist_ok=True)

# Save model and tokenizer
model.save_pretrained(save_dir)
feature_extractor.save_pretrained(save_dir)

# Also save PyTorch state dict
torch.save(model.state_dict(), os.path.join(save_dir, "pytorch_model.bin"))

print(f"Model saved to {save_dir}")
print("Model is ready to use in worker.py!")


Model saved to checkpoints/emotion_classifier_ravdess
Model is ready to use in worker.py!


In [30]:
import torch
import torch.nn as nn
from transformers import AutoConfig, AutoFeatureExtractor, AutoModelForAudioClassification

# 1) Base model + labels
base_model_name = "superb/hubert-base-superb-er"
num_classes = len(RAVDESS_EMOTIONS)   # same as training
checkpoint_path = "/content/checkpoints/best_model.pt"

print("Loading config & feature extractor...")
config = AutoConfig.from_pretrained(base_model_name, trust_remote_code=True)
config.num_labels = num_classes

feature_extractor = AutoFeatureExtractor.from_pretrained(base_model_name, trust_remote_code=True)

print("Rebuilding model architecture...")
model = AutoModelForAudioClassification.from_pretrained(
    base_model_name,
    config=config,
    trust_remote_code=True,
    ignore_mismatched_sizes=True,
)

hidden_size = getattr(config, 'hidden_size', None)

if hasattr(model, 'classifier'):
    if isinstance(model.classifier, nn.Linear):
        hidden_size = model.classifier.in_features
    elif isinstance(model.classifier, nn.Sequential):
        for layer in reversed(model.classifier):
            if isinstance(layer, nn.Linear):
                hidden_size = layer.in_features
                break

print(f"Hidden size detected: {hidden_size}")

new_classifier = nn.Sequential(
    nn.Linear(hidden_size, 512),
    nn.ReLU(),
    nn.Dropout(p=0.2),
    nn.Linear(512, num_classes)
)

if hasattr(model, "classification_head"):
    model.classification_head = new_classifier
else:
    model.classifier = new_classifier

print("Loading checkpoint weights...")
ckpt = torch.load(checkpoint_path, map_location="cpu")
model.load_state_dict(ckpt["model_state_dict"])
model.eval()

print("Model reloaded from checkpoint ✅")


Loading config & feature extractor...
Rebuilding model architecture...


Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at superb/hubert-base-superb-er and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([4, 256]) in the checkpoint and torch.Size([8, 256]) in the model instantiated
- classifier.bias: found shape torch.Size([4]) in the checkpoint and torch.Size([8]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Hidden size detected: 256
Loading checkpoint weights...
Model reloaded from checkpoint ✅


In [36]:
!pip install huggingface_hub==0.34.0



In [40]:
from huggingface_hub import notebook_login, upload_folder

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [41]:
upload_folder(folder_path="/content/checkpoints/emotion_classifier_ravdess", repo_id="BerkayPolat/hubert_ravdess_emotion", repo_type="model", commit_message="Add fine-tuned HuBERT emotion model with custom classifier and MLP.")

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...lassifier_ravdess/model.safetensors:  11%|#         | 40.5MB /  379MB            

  ...lassifier_ravdess/pytorch_model.bin:   9%|8         | 33.0MB /  379MB            

CommitInfo(commit_url='https://huggingface.co/BerkayPolat/hubert_ravdess_emotion/commit/0e0c5cf63dabf1f2064e2211ecc8bbd573fa3a49', commit_message='Add fine-tuned HuBERT emotion model with custom classifier and MLP.', commit_description='', oid='0e0c5cf63dabf1f2064e2211ecc8bbd573fa3a49', pr_url=None, repo_url=RepoUrl('https://huggingface.co/BerkayPolat/hubert_ravdess_emotion', endpoint='https://huggingface.co', repo_type='model', repo_id='BerkayPolat/hubert_ravdess_emotion'), pr_revision=None, pr_num=None)