In [None]:
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
from datasets import ClassLabel, Dataset

In [None]:
# Fetch the three CREMA-D splits as dict-style examples (as_supervised=False),
# along with the info object that describes the dataset's features.
ds_data, ds_info = tfds.load(
    name="crema_d",
    split=['train', 'validation', 'test'],
    as_supervised=False,
    with_info=True,
)

# Build both directions of the class-name <-> integer-id lookup from the
# label names recorded in the TFDS metadata.
label_names = ds_info.features['label'].names
id2label = dict(enumerate(label_names))
label2id = {name: idx for idx, name in id2label.items()}
print(f"Labels found: {label_names}")

# --- 2. Bridge: Convert TFDS to Hugging Face Dataset ---
# Wav2Vec2 Trainer works best with Hugging Face Datasets.
# Since CREMA-D is small (~2GB), we can convert it in memory.

def tfds_to_hf_dataset(tf_dataset):
    """Convert an iterable of TFDS examples into an in-memory Hugging Face Dataset.

    Args:
        tf_dataset: Iterable of examples. Each example is a mapping with an
            ``'audio'`` and a ``'label'`` entry whose values expose ``.numpy()``.

    Returns:
        The result of ``Dataset.from_dict`` with two columns: ``"audio"``
        (float32 waveforms) and ``"label"`` (integer class ids).
    """
    data_dict = {"audio": [], "label": []}
    for sample in tf_dataset:
        raw_audio = sample['audio'].numpy()
        label = int(sample['label'].numpy())

        # Decide whether this is integer PCM from the source dtype *before*
        # casting. Looking only at the peak amplitude would leave a very quiet
        # integer clip (peak <= 1) unscaled and turn it into a full-scale one.
        is_integer_pcm = np.issubdtype(raw_audio.dtype, np.integer)
        audio = raw_audio.astype(np.float32)

        # Integer PCM is always rescaled from the 16-bit range to [-1, 1].
        # Float input keeps the peak-based heuristic; the size check avoids
        # calling max() on an empty clip, which would raise.
        needs_scaling = is_integer_pcm or (
            audio.size > 0 and np.abs(audio).max() > 1.0
        )
        if needs_scaling:
            audio = audio / 32768.0

        data_dict["audio"].append(audio)
        data_dict["label"].append(label)

    return Dataset.from_dict(data_dict)

print("Converting TFDS to Hugging Face format... (this may take a minute)")
# Dataset.from_dict stores "label" as a plain integer column, which drops the
# class names. Re-attach them as a ClassLabel feature so they are saved with
# the dataset and `features['label'].names` is available after reloading.
label_feature = ClassLabel(names=label_names)
train_dataset = tfds_to_hf_dataset(ds_data[0]).cast_column("label", label_feature)
eval_dataset = tfds_to_hf_dataset(ds_data[1]).cast_column("label", label_feature)
test_dataset = tfds_to_hf_dataset(ds_data[2]).cast_column("label", label_feature)


print("Saving converted dataset to disk...")
train_dataset.save_to_disk("./crema_hf/train")
eval_dataset.save_to_disk("./crema_hf/eval")
test_dataset.save_to_disk("./crema_hf/test")

# The loader in `datasets` is load_from_disk(); there is no from_disk().
print("Saved! You can now load it later using load_from_disk()")

Labels found: ['NEU', 'HAP', 'SAD', 'ANG', 'FEA', 'DIS']
Converting TFDS to Hugging Face format... (this may take a minute)
Saving converted dataset to disk...


Saving the dataset (0/2 shards):   0%|          | 0/5144 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/738 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1556 [00:00<?, ? examples/s]

Saved! You can now load it later using from_disk()


In [None]:
# crema_resnet_train.py
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torchaudio
from datasets import load_from_disk
import os
from tqdm import tqdm

# -------------------------------
# 1. Dataset class: Mel spectrogram
# -------------------------------
class CREMADataset(Dataset):
    """Map-style dataset that turns raw waveforms into fixed-size log-mel images.

    Each item of ``hf_dataset`` must provide an ``'audio'`` waveform and an
    integer ``'label'``. ``__getitem__`` returns ``(mel_spec_db, label)`` where
    ``mel_spec_db`` has shape ``(1, n_mels, max_len)`` for 1-D (mono) audio.
    """

    def __init__(self, hf_dataset, n_mels=128, max_len=128,
                 sample_rate=16000, n_fft=1024, hop_length=512):
        """
        hf_dataset: Hugging Face dataset loaded from disk
        n_mels: number of mel bins
        max_len: resize mel spectrogram time dimension to max_len
        sample_rate: sampling rate (Hz) passed to the mel transform
        n_fft: FFT size passed to the mel transform
        hop_length: hop between STFT frames passed to the mel transform
        """
        self.dataset = hf_dataset
        self.n_mels = n_mels
        self.max_len = max_len
        self.sample_rate = sample_rate
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.mel_transform = torchaudio.transforms.MelSpectrogram(
            sample_rate=self.sample_rate,
            n_fft=self.n_fft,
            hop_length=self.hop_length,
            n_mels=self.n_mels
        )
        self.amplitude_to_db = torchaudio.transforms.AmplitudeToDB()

    def __len__(self):
        """Return the number of examples in the wrapped dataset."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return ``(mel_spec_db, label)`` for the example at ``idx``."""
        sample = self.dataset[idx]
        # as_tensor avoids an extra copy (and a warning) when the stored audio
        # is already an array or tensor.
        audio = torch.as_tensor(sample['audio'], dtype=torch.float32)
        label = torch.tensor(sample['label'], dtype=torch.long)

        # Mono audio: add a leading channel dimension -> (1, num_samples)
        if audio.dim() == 1:
            audio = audio.unsqueeze(0)

        mel_spec = self.mel_transform(audio)
        mel_spec_db = self.amplitude_to_db(mel_spec)

        # Resize to fixed size (C, H, W). F.interpolate needs a batch dimension,
        # so one is added for the call and removed again afterwards.
        mel_spec_db = F.interpolate(
            mel_spec_db.unsqueeze(0), size=(self.n_mels, self.max_len)
        )
        mel_spec_db = mel_spec_db.squeeze(0)

        return mel_spec_db, label

# -------------------------------
# 2. Load Hugging Face datasets
# -------------------------------
# Reload the three saved splits, in train / eval / test order.
train_dataset, eval_dataset, test_dataset = map(
    load_from_disk,
    ("./crema_hf/train", "./crema_hf/eval", "./crema_hf/test"),
)

# Wrap every split so that items come out as (log-mel spectrogram, label).
train_ds, eval_ds, test_ds = map(
    CREMADataset,
    (train_dataset, eval_dataset, test_dataset),
)

# Only the training loader shuffles; the evaluation loaders keep dataset order.
train_loader = DataLoader(
    train_ds,
    batch_size=32,
    shuffle=True,
    num_workers=4,
)
eval_loader = DataLoader(
    eval_ds,
    batch_size=32,
    num_workers=2,
)
test_loader = DataLoader(
    test_ds,
    batch_size=32,
    num_workers=2,
)

# -------------------------------
# 3. Model: ResNet18 for 1-channel input
# -------------------------------
import torchvision.models as models

# Number of output classes. A ClassLabel "label" column exposes the class
# names; a plain integer column has no `.names` attribute, so in that case
# fall back to the largest label id found in the training split.
train_label_feature = train_dataset.features['label']
class_names = getattr(train_label_feature, 'names', None)
if class_names is not None:
    num_classes = len(class_names)
else:
    num_classes = max(train_dataset.unique('label')) + 1

model = models.resnet18(weights=None)
# Replace the stem convolution so the network accepts 1-channel spectrograms,
# and the classifier head so it emits one logit per class.
model.conv1 = nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, bias=False)
model.fc = nn.Linear(model.fc.in_features, num_classes)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# -------------------------------
# 4. Training setup
# -------------------------------
# Run-level bookkeeping: epoch budget, best validation accuracy seen so far,
# and the file that receives the best checkpoint.
save_path = "best_resnet_crema.pth"
best_val_acc = 0.0
num_epochs = 30

# Loss, optimizer, and a step schedule that multiplies the learning rate by
# 0.5 after every 5 scheduler steps (optional LR decay).
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)
scheduler = optim.lr_scheduler.StepLR(
    optimizer=optimizer,
    step_size=5,
    gamma=0.5,
)

# -------------------------------
# 5. Training loop
# -------------------------------
for epoch in range(num_epochs):
    # ---- optimisation pass over the training split ----
    model.train()
    running_loss = 0.0
    progress = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}")
    for batch_mel, batch_labels in progress:
        batch_mel = batch_mel.to(device)
        batch_labels = batch_labels.to(device)

        optimizer.zero_grad()
        batch_loss = criterion(model(batch_mel), batch_labels)
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()

    # One scheduler step per epoch.
    scheduler.step()

    # ---- validation pass: gradients off, count correct top-1 predictions ----
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for batch_mel, batch_labels in eval_loader:
            batch_mel = batch_mel.to(device)
            batch_labels = batch_labels.to(device)

            predicted = model(batch_mel).max(dim=1).indices
            total += batch_labels.size(0)
            correct += (predicted == batch_labels).sum().item()

    val_acc = 100 * correct / total
    print(f"Epoch {epoch+1}: Train Loss={running_loss/len(train_loader):.4f}, Val Acc={val_acc:.2f}%")

    # Checkpoint only when validation accuracy strictly improves.
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save(model.state_dict(), save_path)
        print(f"Best model saved with val acc: {best_val_acc:.2f}%")

# -------------------------------
# 6. Test evaluation
# -------------------------------
# Reload the best checkpoint before scoring the test split. map_location remaps
# the stored tensors onto the device selected for this run, so a checkpoint
# written from a CUDA model still loads on a CPU-only host.
model.load_state_dict(torch.load(save_path, map_location=device))
model.eval()
correct, total = 0, 0
with torch.no_grad():
    for mel, labels in test_loader:
        mel, labels = mel.to(device), labels.to(device)
        outputs = model(mel)
        # Index of the largest logit is the predicted class.
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
test_acc = 100 * correct / total
print(f"Test Accuracy: {test_acc:.2f}%")
