# create metadata CSV

In [1]:
import os
import pandas as pd

AUDIO_DIR = r"C:\Users\Dennismz\Desktop\CDAC_PROJECT\dataset for SER\CREMA-D\AudioWAV"

rows = []
for file in os.listdir(AUDIO_DIR):
    if file.endswith(".wav"):
        emotion = file.split("_")[2]  # ANG, HAP, SAD, etc.
        rows.append({
            "file": file,
            "label": emotion
        })

df = pd.DataFrame(rows)
df.to_csv("transcripts.csv", index=False)

print("CSV created with", len(df), "samples")
print(df.head())


CSV created with 4281 samples
                  file label
0  1001_DFA_ANG_XX.wav   ANG
1  1001_DFA_DIS_XX.wav   DIS
2  1001_DFA_FEA_XX.wav   FEA
3  1001_DFA_HAP_XX.wav   HAP
4  1001_DFA_NEU_XX.wav   NEU


imports

In [2]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, Subset
import numpy as np
import pandas as pd
import librosa
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tqdm import tqdm

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# =========================
# Dataset
# =========================
class SERDataset(Dataset):
    def __init__(self, csv_path, audio_dir, sr=16000, max_seconds=6):
        self.df = pd.read_csv(csv_path)
        self.audio_dir = audio_dir
        self.sr = sr
        self.max_len = sr * max_seconds

        self.le = LabelEncoder()
        self.labels = self.le.fit_transform(self.df["label"])

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        file = self.df.iloc[idx]["file"]
        label = self.labels[idx]

        path = os.path.join(self.audio_dir, file)
        if not os.path.exists(path):
            raise FileNotFoundError(path)

        y, _ = librosa.load(path, sr=self.sr, mono=True)

        if len(y) > self.max_len:
            y = y[:self.max_len]
        else:
            y = np.pad(y, (0, self.max_len - len(y)))

        return {
            "audio": torch.tensor(y, dtype=torch.float32),
            "label": torch.tensor(label, dtype=torch.long)
        }


In [4]:
# =========================
# Feature Extraction
# =========================
def collate_fn(batch, extractor, wav2vec):
    audios = [b["audio"].numpy() for b in batch]
    labels = torch.stack([b["label"] for b in batch]).to(DEVICE)

    inputs = extractor(
        audios,
        sampling_rate=16000,
        padding=True,
        return_tensors="pt"
    ).to(DEVICE)

    with torch.no_grad():
        outputs = wav2vec(**inputs)
        features = outputs.last_hidden_state  # (B, T, 768)

    return features, labels



In [5]:
# =========================
# Model
# =========================
class CNN_BiLSTM(nn.Module):
    def __init__(self, num_classes):
        super().__init__()

        self.conv1 = nn.Conv1d(768, 128, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(128, 256, kernel_size=3, padding=1)
        self.pool = nn.MaxPool1d(2)
        self.dropout = nn.Dropout(0.3)

        self.lstm = nn.LSTM(
            input_size=256,
            hidden_size=256,
            bidirectional=True,
            batch_first=True
        )

        self.fc = nn.Linear(512, num_classes)

    def forward(self, x):
        x = x.permute(0, 2, 1)          # (B, 768, T)
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.permute(0, 2, 1)          # (B, T, 256)

        x, _ = self.lstm(x)
        x = x.mean(dim=1)               # Temporal pooling
        x = self.dropout(x)

        return self.fc(x)


In [6]:
# =========================
# Training
# =========================
def train():
    CSV = "transcripts.csv"
    AUDIO_DIR = r"C:\Users\Dennismz\Desktop\CDAC_PROJECT\dataset for SER\CREMA-D\AudioWAV"
    EPOCHS = 10
    BATCH = 4

    dataset = SERDataset(CSV, AUDIO_DIR)

    train_idx, val_idx = train_test_split(
        range(len(dataset)),
        test_size=0.2,
        stratify=dataset.labels,
        random_state=42
    )

    train_ds = Subset(dataset, train_idx)
    val_ds = Subset(dataset, val_idx)

    extractor = Wav2Vec2FeatureExtractor.from_pretrained(
        "facebook/wav2vec2-base"
    )

    wav2vec = Wav2Vec2Model.from_pretrained(
        "facebook/wav2vec2-base"
    ).to(DEVICE)

    # Freeze wav2vec
    for p in wav2vec.parameters():
        p.requires_grad = False

    model = CNN_BiLSTM(num_classes=len(dataset.le.classes_)).to(DEVICE)

    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
    criterion = nn.CrossEntropyLoss()

    train_loader = DataLoader(
        train_ds,
        batch_size=BATCH,
        shuffle=True,
        collate_fn=lambda x: collate_fn(x, extractor, wav2vec)
    )

    val_loader = DataLoader(
        val_ds,
        batch_size=BATCH,
        shuffle=False,
        collate_fn=lambda x: collate_fn(x, extractor, wav2vec)
    )

    for epoch in range(1, EPOCHS + 1):
        model.train()
        total, correct, loss_sum = 0, 0, 0

        for x, y in tqdm(train_loader, desc=f"Epoch {epoch}"):
            optimizer.zero_grad()
            out = model(x)
            loss = criterion(out, y)
            loss.backward()
            optimizer.step()

            loss_sum += loss.item() * y.size(0)
            correct += (out.argmax(1) == y).sum().item()
            total += y.size(0)

        print(f"Train Acc: {correct/total:.4f}")

    torch.save(model.state_dict(), "ser_model.pt")
    print("Model saved.")

# =========================
if __name__ == "__main__":
    train()

Epoch 1: 100%|██████████| 856/856 [1:01:51<00:00,  4.34s/it]


Train Acc: 0.3373


Epoch 2: 100%|██████████| 856/856 [1:01:02<00:00,  4.28s/it]


Train Acc: 0.4095


Epoch 3: 100%|██████████| 856/856 [56:40<00:00,  3.97s/it]


Train Acc: 0.4384


Epoch 4: 100%|██████████| 856/856 [54:50<00:00,  3.84s/it]


Train Acc: 0.4743


Epoch 5: 100%|██████████| 856/856 [57:01<00:00,  4.00s/it]


Train Acc: 0.5000


Epoch 6: 100%|██████████| 856/856 [51:26<00:00,  3.61s/it]


Train Acc: 0.5091


Epoch 7: 100%|██████████| 856/856 [46:47<00:00,  3.28s/it]


Train Acc: 0.5315


Epoch 8: 100%|██████████| 856/856 [52:15<00:00,  3.66s/it]


Train Acc: 0.5459


Epoch 9: 100%|██████████| 856/856 [55:22<00:00,  3.88s/it] 


Train Acc: 0.5567


Epoch 10: 100%|██████████| 856/856 [50:45<00:00,  3.56s/it]

Train Acc: 0.5698
Model saved.



