In [None]:
from google.colab import drive

# Expose Google Drive inside the Colab VM; the dataset and run outputs used
# below live under this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip -q install --upgrade pip
!pip -q install speechbrain==0.5.16 torch torchaudio==2.3.0 torchmetrics==1.4.0 numpy==1.26.4 scikit-learn==1.4.2 pandas==2.2.2 librosa==0.10.2.post1

In [None]:
# Clean, runnable SER training cell (SpeechBrain + torchaudio)
import os, json
from pathlib import Path
import torch, torchaudio
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import numpy as np

import speechbrain as sb
from speechbrain.nnet.losses import nll_loss
from torchmetrics.functional import f1_score, confusion_matrix, accuracy
from torch.utils.data import DataLoader
from speechbrain.dataio.dataset import DynamicItemDataset


# Dataset locations on the mounted Drive
BASE = Path("/content/drive/MyDrive/emotional_ai")
DATA_DIR = BASE.joinpath("data", "audio", "ravdess")
AUDIO_ROOT = DATA_DIR.joinpath("wav")
META_DIR = DATA_DIR.joinpath("meta")

# Label inventory: the position of a label in LABELS is its class index
with open(META_DIR.joinpath("labels.json")) as labels_file:
    LABELS = json.load(labels_file)
NUM_CLASSES = len(LABELS)
label_to_index = dict(zip(LABELS, range(NUM_CLASSES)))

# Split manifests; fail early if any of them is absent
train_csv = META_DIR.joinpath("train.csv")
valid_csv = META_DIR.joinpath("valid.csv")
test_csv = META_DIR.joinpath("test.csv")
assert all(csv_path.exists() for csv_path in (train_csv, valid_csv, test_csv)), "Missing CSVs!"

# Sampling rate (Hz) every waveform is converted to before feature extraction
SAMPLE_RATE = 16000

def read_csv_to_items(path: Path):
    """Read one split manifest and return its rows as a list of item dicts.

    The CSV must provide the columns ``ID``, ``wav`` and ``emotion`` (other
    columns such as ``duration`` are ignored). Rows whose ``emotion`` is not
    listed in ``LABELS`` are dropped.

    Args:
        path: Location of the CSV manifest.

    Returns:
        A list of ``{"id": str, "wav": str, "emotion": int}`` dicts where
        ``emotion`` is the class index looked up in ``label_to_index``.
    """
    manifest = pd.read_csv(path)
    known = manifest[manifest["emotion"].isin(LABELS)]
    return [
        {
            "id": str(utt_id),
            "wav": str(wav_path),
            "emotion": label_to_index[str(emotion)],
        }
        for utt_id, wav_path, emotion in zip(known["ID"], known["wav"], known["emotion"])
    ]

def items_to_mapping(items):
    """Re-key a list of item dicts by their ``"id"`` field.

    Args:
        items: Iterable of dicts carrying ``"id"``, ``"wav"`` and ``"emotion"``.

    Returns:
        ``{id: {"wav": ..., "emotion": ...}}`` — the id-keyed dict layout that
        is handed to ``DynamicItemDataset``. If an id repeats, the last
        occurrence wins.
    """
    mapping = {}
    for item in items:
        mapping[item["id"]] = {"wav": item["wav"], "emotion": item["emotion"]}
    return mapping

# Parse the three manifests (train, valid, test — in that order)
train_items, valid_items, test_items = (
    read_csv_to_items(csv_path) for csv_path in (train_csv, valid_csv, test_csv)
)

# Wrap each split's id-keyed records in a SpeechBrain dataset
datasets = {
    split_name: DynamicItemDataset(items_to_mapping(split_items))
    for split_name, split_items in (
        ("train", train_items),
        ("valid", valid_items),
        ("test", test_items),
    )
}


# Pipelines (audio + labels)
def audio_pipeline(wav_path):
    """Load an audio file as a mono waveform at ``SAMPLE_RATE``.

    Args:
        wav_path: Path of the audio file, passed straight to ``torchaudio.load``.

    Returns:
        A 1-D tensor of samples ``[T]``. Multi-channel audio is averaged down
        to one channel, and audio whose native rate differs from
        ``SAMPLE_RATE`` is resampled.
    """
    waveform, native_sr = torchaudio.load(wav_path)  # [channels, samples]
    num_channels = waveform.shape[0]
    if num_channels > 1:
        # Downmix by averaging the channels, keeping the channel axis for now
        waveform = waveform.mean(dim=0, keepdim=True)
    if native_sr != SAMPLE_RATE:
        to_target_rate = torchaudio.transforms.Resample(orig_freq=native_sr, new_freq=SAMPLE_RATE)
        waveform = to_target_rate(waveform)
    return waveform.squeeze(0)

def label_pipeline(idx):
    """Convert an integer class index into a 0-dim ``torch.long`` tensor."""
    class_index = torch.tensor(idx, dtype=torch.long)
    return class_index

# Attach the same dynamic items to every split: "wav" -> "sig" and
# "emotion" -> "label", then expose only the keys the collate function reads.
for split, split_dataset in datasets.items():
    split_dataset.add_dynamic_item(audio_pipeline, takes="wav", provides="sig")
    split_dataset.add_dynamic_item(label_pipeline, takes="emotion", provides="label")
    split_dataset.set_output_keys(["id", "sig", "label"])


# Dataloaders (custom collate)
# Number of utterances per mini-batch; shared by the train, valid and test loaders.
BATCH_SIZE = 16

def collate_fn(batch):
    """Collate dataset examples without padding the audio.

    Args:
        batch: List of example dicts, each with a 1-D ``"sig"`` tensor and a
            scalar ``"label"`` tensor.

    Returns:
        ``{"sig": list of 1-D tensors (variable length), "label": LongTensor [B]}``.
        Other keys of the examples (e.g. ``"id"``) are not forwarded.
    """
    signals, targets = [], []
    for example in batch:
        signals.append(example["sig"])
        targets.append(int(example["label"].item()))
    return {"sig": signals, "label": torch.tensor(targets, dtype=torch.long)}

# All loaders share batch size and collate function; only training shuffles.
_loader_kwargs = {"batch_size": BATCH_SIZE, "collate_fn": collate_fn}
train_dataloader = DataLoader(datasets["train"], shuffle=True, **_loader_kwargs)
valid_dataloader = DataLoader(datasets["valid"], shuffle=False, **_loader_kwargs)
test_dataloader = DataLoader(datasets["test"], shuffle=False, **_loader_kwargs)


# Features (log-Mel)
# STFT window size, hop between frames (both in samples) and number of Mel bands
N_FFT, HOP_LENGTH, N_MELS = 1024, 256, 80

mel_spec = torchaudio.transforms.MelSpectrogram(
    sample_rate=SAMPLE_RATE,
    n_fft=N_FFT,
    hop_length=HOP_LENGTH,
    n_mels=N_MELS,
)
# Converts the Mel spectrogram to decibels (library defaults)
ampl_to_db = torchaudio.transforms.AmplitudeToDB()

def compute_features(sig_batch):
    """Turn a batch of raw waveforms into a zero-padded log-Mel tensor.

    Args:
        sig_batch: Non-empty sequence of 1-D waveform tensors of varying length.

    Returns:
        ``(padded, lens)`` where ``padded`` is ``[B, T_max, n_mels]`` and
        ``lens`` is a LongTensor ``[B]`` with each utterance's unpadded number
        of frames.
    """
    # Per-utterance features, time-major: [T_i, n_mels]
    feats_list = [ampl_to_db(mel_spec(sig)).transpose(0, 1) for sig in sig_batch]

    frame_counts = [feat.size(0) for feat in feats_list]
    lens = torch.tensor(frame_counts, dtype=torch.long)
    max_len = int(lens.max().item())
    feat_dim = feats_list[0].size(1)

    # NOTE(review): the padding value is 0.0, which on a dB scale is not
    # "silence" — confirm this is intended for the downstream model.
    padded = torch.zeros(len(feats_list), max_len, feat_dim)
    for row, feat in zip(padded, feats_list):
        # `row` is a view into `padded`, so this writes in place
        row[: feat.size(0)] = feat
    return padded, lens

# Model (CRDNN)
class CRDNN(nn.Module):
    """Conv -> BiGRU -> MLP classifier over log-Mel features.

    Two conv/pool stages halve both the time and the frequency axis twice, a
    bidirectional GRU runs over the remaining frames, and the GRU outputs are
    mean-pooled over time before classification.

    Args:
        n_mels: Number of Mel bands in the input features.
        cnn_channels: Channels of both convolution layers.
        rnn_hidden: Hidden size of the GRU (per direction).
        num_classes: Number of output classes.
    """

    def __init__(self, n_mels=80, cnn_channels=128, rnn_hidden=128, num_classes=NUM_CLASSES):
        super().__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(1, cnn_channels, kernel_size=(5,5), stride=(1,1), padding=2),
            nn.BatchNorm2d(cnn_channels),
            nn.ReLU(),
            nn.MaxPool2d((2,2)),
            nn.Conv2d(cnn_channels, cnn_channels, kernel_size=(3,3), padding=1),
            nn.BatchNorm2d(cnn_channels),
            nn.ReLU(),
            nn.MaxPool2d((2,2)),
        )
        self.rnn = nn.GRU(
            # Frequency axis is halved twice by the pooling layers
            input_size=(n_mels//4)*cnn_channels,
            hidden_size=rnn_hidden,
            num_layers=1,
            batch_first=True,
            bidirectional=True
        )
        self.classifier = nn.Sequential(
            nn.Linear(rnn_hidden*2, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, num_classes)
        )

    def forward(self, feats, lens):
        """Classify a padded batch of feature sequences.

        Args:
            feats: ``[B, T, F]`` padded features.
            lens: ``[B]`` number of valid (unpadded) frames per utterance, in
                input-frame units. May live on a different device than ``feats``.

        Returns:
            ``[B, num_classes]`` unnormalised logits.
        """
        # feats: [B, T, F] -> [B, 1, T, F]
        x = feats.unsqueeze(1)
        x = self.conv(x)           # [B, C, T', F']
        B, C, Tprime, Fprime = x.shape
        x = x.permute(0, 2, 1, 3).contiguous().view(B, Tprime, C*Fprime)
        out, _ = self.rnn(x)       # [B, T', 2*rnn_hidden]

        # Length-aware mean pooling: previously `lens` was ignored, so padded
        # frames were averaged in and an utterance's embedding depended on the
        # longest clip in its batch. (The GRU itself still sees the padding.)
        # Each MaxPool2d((2,2)) floor-halves the time axis, so do the same to lens.
        valid = lens.to(out.device)
        valid = torch.div(valid, 2, rounding_mode="floor")
        valid = torch.div(valid, 2, rounding_mode="floor").clamp(min=1)
        # 1-based frame positions built from tensor ops only, so the graph
        # stays shape-agnostic under torch.jit.trace.
        positions = torch.ones_like(out[:, :, 0]).cumsum(dim=1)              # [B, T']
        mask = (positions <= valid.unsqueeze(1).to(out.dtype)).to(out.dtype)   # [B, T']
        mask = mask.unsqueeze(-1)                                             # [B, T', 1]
        pooled = (out * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1.0)     # [B, 2*rnn_hidden]

        logits = self.classifier(pooled)
        return logits


# Brain (with built-in accuracy counter)
class SERBrain(sb.core.Brain):
    """SpeechBrain training loop for the emotion classifier.

    Computes log-Mel features on the fly, trains with NLL loss, tracks a
    running accuracy for the VALID/TEST stages and checkpoints on validation
    accuracy.
    """

    def on_fit_start(self):
        """Set up optimizer/checkpoint recovery, then register extra recoverables."""
        super().on_fit_start()
        # The base class already registers "optimizer" with the checkpointer in
        # SpeechBrain 0.5.x, so the former guard on the "optimizer" key was
        # never true and the model / epoch counter were never saved.
        # Each entry is now guarded on its own name. Registration stays after
        # the base-class call, as before, so starting a fit does not try to
        # restore model weights from an existing checkpoint directory.
        if self.checkpointer is not None:
            extra_recoverables = {
                "model": self.modules["model"],
                "epoch_counter": self.hparams.epoch_counter,
            }
            for name, obj in extra_recoverables.items():
                if name not in self.checkpointer.recoverables:
                    self.checkpointer.add_recoverable(name, obj)

    def compute_forward(self, batch, stage):
        """Run features + model on one collated batch.

        Args:
            batch: Dict produced by ``collate_fn`` (``"sig"`` list, ``"label"`` tensor).
            stage: Current ``sb.Stage`` (unused here).

        Returns:
            ``(log_probs [B, C], labels [B])``, both on ``self.device``.
        """
        sigs = batch["sig"]                               # list[Tensor(T_i)]
        labels = batch["label"].to(self.device)           # [B]
        feats, lens = compute_features(sigs)              # -> [B, Tmax, F], [B]
        feats = feats.to(self.device)
        lens = lens.to(self.device)                       # keep lengths alongside the features
        logits = self.modules.model(feats, lens)          # [B, C]
        outputs = F.log_softmax(logits, dim=-1)
        return outputs, labels

    def compute_objectives(self, predictions, batch, stage):
        """Return the NLL loss; outside TRAIN also update the accuracy counters."""
        outputs, labels = predictions
        loss = nll_loss(outputs, labels)

        # Running accuracy for VALID/TEST (no external imports)
        if stage != sb.Stage.TRAIN:
            with torch.no_grad():
                preds = outputs.argmax(dim=-1)
                # Counters may be missing when this is called outside fit/evaluate
                if not hasattr(self, "_acc_correct"):
                    self._acc_correct, self._acc_total = 0, 0
                self._acc_correct += (preds == labels).sum().item()
                self._acc_total += labels.numel()
        return loss

    def on_stage_start(self, stage, epoch=None):
        """Reset the accuracy counters at the start of every VALID/TEST stage."""
        if stage != sb.Stage.TRAIN:
            self._acc_correct, self._acc_total = 0, 0

    def on_stage_end(self, stage, stage_loss, epoch=None):
        """Log stage statistics; after VALID also save a checkpoint."""
        if stage == sb.Stage.VALID:
            acc = (self._acc_correct / max(1, self._acc_total))
            self.hparams.train_logger.log_stats(
                stats_meta={"epoch": epoch},
                valid_stats={"loss": stage_loss, "acc": acc},
            )
            # Accuracy is higher-is-better: rank with max_keys. The previous
            # min_keys=["acc"] retained the *worst* checkpoint instead.
            self.checkpointer.save_and_keep_only(meta={"acc": acc}, max_keys=["acc"])
        elif stage == sb.Stage.TEST:
            acc = (self._acc_correct / max(1, self._acc_total))
            self.hparams.train_logger.log_stats(
                stats_meta={"Epoch loaded": self.hparams.epoch_counter.current},
                test_stats={"loss": stage_loss, "acc": acc},
            )


# Hyperparams / objects
run_dir = BASE / "runs" / "speechbrain_ravdess"
run_dir.mkdir(parents=True, exist_ok=True)

# Seed the global RNGs before the model is built so weight initialisation and
# DataLoader shuffling are repeatable (best effort; GPU kernels may still vary).
SEED = 1234
torch.manual_seed(SEED)
np.random.seed(SEED)

# Single source of truth for the tunables: previously "lr" and "epochs" were
# stored but never read (Adam ran with its default lr, the counter limit was
# a separate literal), so editing them had no effect.
LEARNING_RATE = 1e-3
NUM_EPOCHS = 25

hparams = {
    "lr": LEARNING_RATE,
    "epochs": NUM_EPOCHS,
    # Brain calls opt_class(parameters); bind the learning rate here
    "opt_class": lambda params: torch.optim.Adam(params, lr=LEARNING_RATE),
    "model": CRDNN(n_mels=80, cnn_channels=128, rnn_hidden=128, num_classes=NUM_CLASSES),
    "train_logger": sb.utils.train_logger.FileTrainLogger(str(run_dir / "log.txt")),
    "epoch_counter": sb.utils.epoch_loop.EpochCounter(limit=NUM_EPOCHS),
    "checkpointer": sb.utils.checkpoints.Checkpointer(checkpoints_dir=str(run_dir / "ckpt")),
}
brain = SERBrain(
    modules={"model": hparams["model"]},
    opt_class=hparams["opt_class"],
    hparams=hparams,
    run_opts={"device": "cuda" if torch.cuda.is_available() else "cpu"},
    checkpointer=hparams["checkpointer"]
)


# Train
brain.fit(
    epoch_counter=brain.hparams.epoch_counter,
    train_set=train_dataloader,
    valid_set=valid_dataloader,
)


# Evaluate (best ckpt) + metrics
# Pick the checkpoint by its stored validation accuracy. Without a key,
# recover_if_possible() loads the most recent checkpoint, not the best one.
# NOTE(review): assumes the checkpoints in ckpt/ contain every registered
# recoverable — clear stale checkpoints from older runs if loading fails.
brain.checkpointer.recover_if_possible(max_key="acc", device=torch.device(brain.device))

# Make inference mode explicit (disables dropout, uses BatchNorm running stats)
brain.modules.eval()

all_preds, all_labels = [], []
with torch.no_grad():
    for batch in test_dataloader:
        outputs, labels = brain.compute_forward(batch, sb.Stage.TEST)
        # Metrics are computed from the collected predictions below, so the
        # loss/objective call is not needed here.
        preds = outputs.argmax(dim=-1).cpu().numpy().tolist()
        all_preds.extend(preds)
        all_labels.extend(labels.cpu().numpy().tolist())

all_preds_t = torch.tensor(all_preds)
all_labels_t = torch.tensor(all_labels)
test_acc = accuracy(all_preds_t, all_labels_t, task="multiclass", num_classes=NUM_CLASSES).item()
test_f1  = f1_score(all_preds_t, all_labels_t, task="multiclass", num_classes=NUM_CLASSES, average="macro").item()
cm       = confusion_matrix(all_preds_t, all_labels_t, task="multiclass", num_classes=NUM_CLASSES).cpu().numpy()

print(f"Test Accuracy: {test_acc:.4f}")
print(f"Test Macro-F1: {test_f1:.4f}")
print("Labels:", LABELS)
print("Confusion matrix (rows=true, cols=pred):\n", cm)

# Save artifacts
np.save(run_dir / "confusion_matrix.npy", cm)
with open(run_dir / "labels.json", "w") as f:
    json.dump(LABELS, f, indent=2)
torch.save(brain.modules["model"].state_dict(), run_dir / "model_best_state.pt")

# Robust TorchScript export (trace - fallback to script)
import json, torch

m = brain.modules["model"].eval()

# Put example tensors on the SAME device and dtype as the model
device = next(m.parameters()).device
ex_T = 400
example_feats = torch.randn(1, ex_T, 80, device=device, dtype=torch.float32).contiguous()
example_lens  = torch.tensor([ex_T], device=device, dtype=torch.long)

# A second input with a different batch size and length. Re-running the traced
# module on the tracing example cannot reveal shapes baked into the graph.
probe_T = ex_T // 2
probe_feats = torch.randn(2, probe_T, 80, device=device, dtype=torch.float32).contiguous()
probe_lens  = torch.tensor([probe_T, probe_T], device=device, dtype=torch.long)

with torch.inference_mode():
    try:
        # Try tracing first (fast, portable). strict=False tolerates control-flow it can’t follow.
        ts = torch.jit.trace(m, (example_feats, example_lens), strict=False)
        # Accept the trace only if it matches eager execution on the probe input
        if not torch.allclose(ts(probe_feats, probe_lens), m(probe_feats, probe_lens), atol=1e-4):
            raise RuntimeError("traced module disagrees with eager model on a different input shape")
    except Exception as trace_error:
        # Report why tracing was rejected instead of swallowing the error silently
        print(f"torch.jit.trace export rejected ({trace_error!r}); falling back to torch.jit.script")

        # Fallback to scripting (handles more dynamic bits)
        class Wrapper(torch.nn.Module):
            """Thin typed wrapper so TorchScript sees annotated tensor inputs."""

            def __init__(self, core):
                super().__init__()
                self.core = core

            def forward(self, feats: torch.Tensor, lens: torch.Tensor):
                return self.core(feats, lens)

        wrapped = Wrapper(m).to(device).eval()
        ts = torch.jit.script(wrapped)

# Save TorchScript on CPU for portability
ts = ts.to("cpu")
ts_path = run_dir / "model_best_ts.pt"
ts.save(str(ts_path))

# Save labels next to the model
with open(run_dir / "labels.json", "w") as f:
    json.dump(LABELS, f, indent=2)

print("Saved:", ts_path)
print("Labels:", LABELS)



100%|██████████| 87/87 [00:38<00:00,  2.24it/s, train_loss=1.9]
100%|██████████| 18/18 [00:06<00:00,  2.65it/s]
100%|██████████| 87/87 [00:39<00:00,  2.19it/s, train_loss=1.79]
100%|██████████| 18/18 [00:07<00:00,  2.38it/s]
100%|██████████| 87/87 [00:38<00:00,  2.23it/s, train_loss=1.78]
100%|██████████| 18/18 [00:06<00:00,  2.82it/s]
100%|██████████| 87/87 [00:39<00:00,  2.22it/s, train_loss=1.73]
100%|██████████| 18/18 [00:07<00:00,  2.34it/s]
100%|██████████| 87/87 [00:39<00:00,  2.21it/s, train_loss=1.68]
100%|██████████| 18/18 [00:06<00:00,  2.78it/s]
100%|██████████| 87/87 [00:39<00:00,  2.21it/s, train_loss=1.63]
100%|██████████| 18/18 [00:07<00:00,  2.36it/s]
100%|██████████| 87/87 [00:39<00:00,  2.23it/s, train_loss=1.59]
100%|██████████| 18/18 [00:06<00:00,  2.73it/s]
100%|██████████| 87/87 [00:39<00:00,  2.20it/s, train_loss=1.51]
100%|██████████| 18/18 [00:07<00:00,  2.38it/s]
100%|██████████| 87/87 [00:39<00:00,  2.18it/s, train_loss=1.47]
100%|██████████| 18/18 [00:06<00

Test Accuracy: 0.7593
Test Macro-F1: 0.7526
Labels: ['angry', 'calm', 'disgust', 'fearful', 'happy', 'neutral', 'sad', 'surprised']
Confusion matrix (rows=true, cols=pred):
 [[30  0  4  1  0  0  1  0]
 [ 0 34  0  0  0  0  0  0]
 [ 1  0 31  0  0  0  1  0]
 [ 0  1  0 25  2  0  8  0]
 [ 0  1  0  1 24  5  7  0]
 [ 0  8  0  0  0 11  0  0]
 [ 0 15  0  3  3  0 16  0]
 [ 0  0  1  1  1  0  0 34]]
Saved: /content/drive/MyDrive/emotional_ai/runs/speechbrain_ravdess/model_best_ts.pt
Labels: ['angry', 'calm', 'disgust', 'fearful', 'happy', 'neutral', 'sad', 'surprised']
