In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from google.colab import drive
drive.flush_and_unmount()


In [3]:

!pip -q install -U pip


!pip -q uninstall -y numpy pandas scipy torchvision fastai fastdownload timm \
  opencv-python opencv-python-headless opencv-contrib-python cuml-cu12 umap-learn || true


!pip -q install torch==2.3.0 torchaudio==2.3.0 --extra-index-url https://download.pytorch.org/whl/cu121


!pip -q install numpy==2.0.1 pandas==2.2.2 scipy==1.14.1


!pip -q install librosa==0.10.2.post1 speechbrain==0.5.16 torchmetrics==1.4.0





[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.8 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.2/1.8 MB[0m [31m6.3 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.8/1.8 MB[0m [31m29.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m23.5 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cudf-cu12 25.6.0 requires numpy<3.0a0,>=1.23, which is not installed.
cudf-cu12 25.6.0 requires pandas<2.2.4dev0,>=2.0, which is not installed.
sentence-transformers 5.1.0 requires scipy, which is not installed.
dask-cudf-cu12 25.6.0 requires numpy<3.0a0,>=1.23, which is not installed.
dask-cudf-cu12 25.6.0 requires pandas<2

In [None]:
import os, time; print("Restarting runtime..."); time.sleep(0.5); os.kill(os.getpid(), 9)

Restarting runtime...


In [2]:
import sys, numpy, pandas, torch, torchaudio, scipy, librosa
print("py", sys.version)
print("numpy", numpy.__version__)
print("pandas", pandas.__version__)
print("scipy", scipy.__version__)
print("torch", torch.__version__, "torchaudio", torchaudio.__version__)
print("librosa", librosa.__version__)


py 3.12.11 (main, Jun  4 2025, 08:56:18) [GCC 11.4.0]
numpy 2.0.2
pandas 2.2.2
scipy 1.16.1
torch 2.8.0+cu126 torchaudio 2.8.0+cu126
librosa 0.11.0


In [1]:
# Deterministic CRDNN training on RAVDESS (GPU)
import os, json, random
from pathlib import Path
import numpy as np
import torch, torchaudio, torch.nn as nn, torch.nn.functional as F
import pandas as pd

import speechbrain as sb
from speechbrain.nnet.losses import nll_loss
from torchmetrics.functional import f1_score, confusion_matrix, accuracy
from torch.utils.data import DataLoader
from speechbrain.dataio.dataset import DynamicItemDataset

# Reproducibility
def set_seed(s=42):
    random.seed(s); np.random.seed(s); torch.manual_seed(s)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(s)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
set_seed(42)

# Paths / labels
BASE = Path("/content/drive/MyDrive/emotional_ai")
DATA_DIR = BASE / "data" / "audio" / "ravdess"
AUDIO_ROOT = DATA_DIR / "wav"
META_DIR = DATA_DIR / "meta"

with open(META_DIR / "labels.json") as f:
    LABELS = json.load(f)
NUM_CLASSES = len(LABELS)
label_to_index = {lab: i for i, lab in enumerate(LABELS)}

train_csv = META_DIR / "train.csv"
valid_csv = META_DIR / "valid.csv"
test_csv  = META_DIR / "test.csv"
assert train_csv.exists() and valid_csv.exists() and test_csv.exists(), "Missing CSVs!"

# Dataset helpers
SAMPLE_RATE = 16000

def read_csv_to_items(path: Path):
    df = pd.read_csv(path)# expects: ID,wav,duration,emotion
    df = df[df["emotion"].isin(LABELS)]
    items = []
    for _, r in df.iterrows():
        items.append({
            "id": str(r["ID"]),
            "wav": str(r["wav"]),
            "emotion": label_to_index[str(r["emotion"])],
        })
    return items

def items_to_mapping(items):
    return {it["id"]: {"wav": it["wav"], "emotion": it["emotion"]} for it in items}

train_items = read_csv_to_items(train_csv)
valid_items = read_csv_to_items(valid_csv)
test_items  = read_csv_to_items(test_csv)

datasets = {
    "train": DynamicItemDataset(items_to_mapping(train_items)),
    "valid": DynamicItemDataset(items_to_mapping(valid_items)),
    "test":  DynamicItemDataset(items_to_mapping(test_items)),
}

# Pipelines (audio and labels)
def audio_pipeline(wav_path):
    sig, sr = torchaudio.load(wav_path)# [C, T]
    if sig.shape[0] > 1:
        sig = sig.mean(dim=0, keepdim=True)# mono
    if sr != SAMPLE_RATE:
        sig = torchaudio.transforms.Resample(orig_freq=sr, new_freq=SAMPLE_RATE)(sig)
    return sig.squeeze(0)# [T]

def label_pipeline(idx):
    return torch.tensor(idx, dtype=torch.long)

for split in datasets:
    datasets[split].add_dynamic_item(audio_pipeline, takes="wav", provides="sig")
    datasets[split].add_dynamic_item(label_pipeline,  takes="emotion", provides="label")
    datasets[split].set_output_keys(["id", "sig", "label"])

# Dataloaders
BATCH_SIZE = 16

def collate_fn(batch):
    sigs = [b["sig"] for b in batch]
    labels = torch.tensor([int(b["label"].item()) for b in batch], dtype=torch.long)
    return {"sig": sigs, "label": labels}

train_dataloader = DataLoader(datasets["train"], batch_size=BATCH_SIZE, shuffle=True,  collate_fn=collate_fn)
valid_dataloader = DataLoader(datasets["valid"], batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)
test_dataloader  = DataLoader(datasets["test"],  batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

# Features (log-Mel)
mel_spec = torchaudio.transforms.MelSpectrogram(
    sample_rate=SAMPLE_RATE, n_fft=1024, hop_length=256, n_mels=80
)
ampl_to_db = torchaudio.transforms.AmplitudeToDB()

def compute_features(sig_batch):
    feats_list = []
    for sig in sig_batch:
        S = mel_spec(sig)# [n_mels, T]
        S_db = ampl_to_db(S).transpose(0, 1)# [T, n_mels]
        feats_list.append(S_db)
    lens = torch.tensor([f.size(0) for f in feats_list], dtype=torch.long)
    max_len = int(lens.max().item())
    feat_dim = feats_list[0].size(1)
    padded = torch.zeros(len(feats_list), max_len, feat_dim)
    for i, f in enumerate(feats_list):
        T = f.size(0)
        padded[i, :T, :feat_dim] = f
    return padded, lens

# Model (CRDNN)
class CRDNN(nn.Module):
    def __init__(self, n_mels=80, cnn_channels=128, rnn_hidden=128, num_classes=NUM_CLASSES):
        super().__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(1, cnn_channels, kernel_size=(5,5), stride=(1,1), padding=2),
            nn.BatchNorm2d(cnn_channels),
            nn.ReLU(),
            nn.MaxPool2d((2,2)),
            nn.Conv2d(cnn_channels, cnn_channels, kernel_size=(3,3), padding=1),
            nn.BatchNorm2d(cnn_channels),
            nn.ReLU(),
            nn.MaxPool2d((2,2)),
        )
        self.rnn = nn.GRU(
            input_size=(n_mels//4)*cnn_channels,
            hidden_size=128,
            num_layers=1,
            batch_first=True,
            bidirectional=True
        )
        self.classifier = nn.Sequential(
            nn.Linear(128*2, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, num_classes)
        )

    def forward(self, feats, lens):
        x = feats.unsqueeze(1)# [B, 1, T, F]
        x = self.conv(x)# [B, C, T', F']
        B, C, Tprime, Fprime = x.shape
        x = x.permute(0, 2, 1, 3).contiguous().view(B, Tprime, C*Fprime)
        out, _ = self.rnn(x)
        out = out.mean(dim=1)# mean over time
        return self.classifier(out)

# Brain
class SERBrain(sb.core.Brain):
    def on_fit_start(self):
        super().on_fit_start()
        if "optimizer" not in self.checkpointer.recoverables:
            self.checkpointer.add_recoverables({
                "model": self.modules["model"],
                "optimizer": self.optimizer,
                "epoch_counter": self.hparams.epoch_counter,
            })

    def compute_forward(self, batch, stage):
        sigs = batch["sig"]
        labels = batch["label"].to(self.device)
        feats, lens = compute_features(sigs)
        feats = feats.to(self.device)
        logits = self.modules.model(feats, lens)
        outputs = F.log_softmax(logits, dim=-1)
        return outputs, labels

    def compute_objectives(self, predictions, batch, stage):
        outputs, labels = predictions
        loss = nll_loss(outputs, labels)
        if stage != sb.Stage.TRAIN:  # running accuracy
            with torch.no_grad():
                preds = outputs.argmax(dim=-1)
                if not hasattr(self, "_acc_correct"):
                    self._acc_correct, self._acc_total = 0, 0
                self._acc_correct += (preds == labels).sum().item()
                self._acc_total += labels.numel()
        return loss

    def on_stage_start(self, stage, epoch=None):
        if stage != sb.Stage.TRAIN:
            self._acc_correct, self._acc_total = 0, 0

    def on_stage_end(self, stage, stage_loss, epoch=None):
        if stage == sb.Stage.VALID:
            acc = (self._acc_correct / max(1, self._acc_total))
            self.hparams.train_logger.log_stats(
                stats_meta={"epoch": epoch},
                valid_stats={"loss": stage_loss, "acc": acc},
            )
            self.checkpointer.save_and_keep_only(meta={"acc": acc}, min_keys=["acc"])
        elif stage == sb.Stage.TEST:
            acc = (self._acc_correct / max(1, self._acc_total))
            self.hparams.train_logger.log_stats(
                stats_meta={"Epoch loaded": self.hparams.epoch_counter.current},
                test_stats={"loss": stage_loss, "acc": acc},
            )

# Hyperparams / objects
run_dir = BASE / "runs" / "speechbrain_ravdess"
run_dir.mkdir(parents=True, exist_ok=True)

hparams = {
    "lr": 1e-3,
    "epochs": 25,
    "opt_class": torch.optim.Adam,
    "model": CRDNN(n_mels=80, cnn_channels=128, rnn_hidden=128, num_classes=NUM_CLASSES),
    "train_logger": sb.utils.train_logger.FileTrainLogger(str(run_dir / "log.txt")),
    "epoch_counter": sb.utils.epoch_loop.EpochCounter(limit=25),
    "checkpointer": sb.utils.checkpoints.Checkpointer(checkpoints_dir=str(run_dir / "ckpt")),
}
brain = SERBrain(
    modules={"model": hparams["model"]},
    opt_class=hparams["opt_class"],
    hparams=hparams,
    run_opts={"device": "cuda" if torch.cuda.is_available() else "cpu"},
    checkpointer=hparams["checkpointer"]
)

# Train
brain.fit(
    epoch_counter=brain.hparams.epoch_counter,
    train_set=train_dataloader,
    valid_set=valid_dataloader,
)


100%|██████████| 87/87 [11:14<00:00,  7.76s/it, train_loss=1.92]
100%|██████████| 18/18 [02:32<00:00,  8.50s/it]
100%|██████████| 87/87 [00:35<00:00,  2.43it/s, train_loss=1.86]
100%|██████████| 18/18 [00:05<00:00,  3.09it/s]
100%|██████████| 87/87 [00:37<00:00,  2.35it/s, train_loss=1.74]
100%|██████████| 18/18 [00:06<00:00,  2.87it/s]
100%|██████████| 87/87 [00:35<00:00,  2.42it/s, train_loss=1.71]
100%|██████████| 18/18 [00:06<00:00,  2.60it/s]
100%|██████████| 87/87 [00:35<00:00,  2.46it/s, train_loss=1.61]
100%|██████████| 18/18 [00:06<00:00,  2.61it/s]
100%|██████████| 87/87 [00:35<00:00,  2.42it/s, train_loss=1.54]
100%|██████████| 18/18 [00:05<00:00,  3.05it/s]
100%|██████████| 87/87 [00:36<00:00,  2.41it/s, train_loss=1.46]
100%|██████████| 18/18 [00:05<00:00,  3.03it/s]
100%|██████████| 87/87 [00:36<00:00,  2.42it/s, train_loss=1.36]
100%|██████████| 18/18 [00:06<00:00,  2.81it/s]
100%|██████████| 87/87 [00:36<00:00,  2.39it/s, train_loss=1.29]
100%|██████████| 18/18 [00:06<0

In [None]:
import os, json, numpy as np, pandas as pd
from sklearn.metrics import classification_report

# Evaluate (best ckpt)
brain.checkpointer.recover_if_possible()

all_preds, all_labels = [], []
with torch.no_grad():
    for batch in test_dataloader:
        outputs, labels = brain.compute_forward(batch, sb.Stage.TEST)
        preds = outputs.argmax(dim=-1).cpu().numpy().tolist()
        all_preds.extend(preds)
        all_labels.extend(labels.cpu().numpy().tolist())

all_preds_t = torch.tensor(all_preds)
all_labels_t = torch.tensor(all_labels)
test_acc = accuracy(all_preds_t, all_labels_t, task="multiclass", num_classes=len(LABELS)).item()
test_f1  = f1_score(all_preds_t, all_labels_t, task="multiclass", num_classes=len(LABELS), average="macro").item()
cm       = confusion_matrix(all_preds_t, all_labels_t, task="multiclass", num_classes=len(LABELS)).cpu().numpy()

print(f"Test Accuracy: {test_acc:.4f}")
print(f"Test Macro-F1: {test_f1:.4f}")
print("Labels:", LABELS)
print("Confusion matrix (rows=true, cols=pred):\n", cm)

# Save artifacts
np.save(run_dir / "confusion_matrix.npy", cm)
with open(run_dir / "labels.json", "w") as f:
    json.dump(LABELS, f, indent=2)
torch.save(brain.modules["model"].state_dict(), run_dir / "model_best_state.pt")

# TorchScript export (trace - fallback to script)
m = brain.modules["model"].eval()
device = next(m.parameters()).device
ex_T = 400
example_feats = torch.randn(1, ex_T, 80, device=device, dtype=torch.float32).contiguous()
example_lens  = torch.tensor([ex_T], device=device, dtype=torch.long)

with torch.inference_mode():
    try:
        ts = torch.jit.trace(m, (example_feats, example_lens), strict=False)
        _ = ts(example_feats, example_lens)
    except Exception:
        class Wrapper(torch.nn.Module):
            def __init__(self, core): super().__init__(); self.core = core
            def forward(self, feats: torch.Tensor, lens: torch.Tensor): return self.core(feats, lens)
        ts = torch.jit.script(Wrapper(m).to(device).eval())

ts = ts.to("cpu")
ts_path = run_dir / "model_best_ts.pt"
ts.save(str(ts_path))
print("Saved TorchScript to:", ts_path)


# FORMAL TEST ARTIFACTS (overall and per-class)
# Where to store
MODEL_DIR = BASE / "models" / "audio" / "checkpoint"
EVAL_DIR  = MODEL_DIR / "eval_test"
EVAL_DIR.mkdir(parents=True, exist_ok=True)

# Overall summary
overall = {
    "accuracy": float(test_acc),
    "macro/f1": float(test_f1),
    "num_classes": int(len(LABELS)),
    "num_samples": int(len(all_labels)),
}

# Per-class table
rep = classification_report(all_labels, all_preds,
                            target_names=LABELS,
                            output_dict=True,
                            zero_division=0)
rep_df = pd.DataFrame(rep).transpose()  # includes each class and macro/weighted avg

# Save files
with open(EVAL_DIR / "metrics_overall.json", "w") as f:
    json.dump(overall, f, indent=2)

rep_df.to_csv(EVAL_DIR / "metrics_report.csv")# per-class P/R/F1 and support

# Move/copy confusion matrix into eval_test (if it’s elsewhere)
src_cm = run_dir / "confusion_matrix.npy"
dst_cm = EVAL_DIR / "confusion_matrix.npy"
if src_cm.exists() and str(src_cm) != str(dst_cm):
    import shutil; shutil.copy2(src_cm, dst_cm)

print("Saved:", EVAL_DIR / "metrics_overall.json")
print("Saved:", EVAL_DIR / "metrics_report.csv")
print("Saved:", dst_cm)



Test Accuracy: 0.8333
Test Macro-F1: 0.8293
Labels: ['angry', 'calm', 'disgust', 'fearful', 'happy', 'neutral', 'sad', 'surprised']
Confusion matrix (rows=true, cols=pred):
 [[35  0  1  0  0  0  0  0]
 [ 0 34  0  0  0  0  0  0]
 [ 2  0 31  0  0  0  0  0]
 [ 0  1  2 29  1  0  2  1]
 [ 1  0  1  3 19  4  4  6]
 [ 0  0  0  0  0 18  1  0]
 [ 1  6  1  3  1  0 24  1]
 [ 2  0  0  0  0  0  0 35]]
Saved TorchScript to: /content/drive/MyDrive/emotional_ai/runs/speechbrain_ravdess/model_best_ts.pt
Saved: /content/drive/MyDrive/emotional_ai/models/audio/checkpoint/eval_test/metrics_overall.json
Saved: /content/drive/MyDrive/emotional_ai/models/audio/checkpoint/eval_test/metrics_report.csv
Saved: /content/drive/MyDrive/emotional_ai/models/audio/checkpoint/eval_test/confusion_matrix.npy
