In [41]:
# Import libraries
import os
import torch
import torchaudio
import torchaudio.transforms as T
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from torch import nn
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import numpy as np

## Data

### Load

In [42]:
emotion_map = {
    '01': 'neutral',
    '02': 'calm',
    '03': 'happy',
    '04': 'sad',
    '05': 'angry',
    '06': 'fearful',
    '07': 'disgust',
    '08': 'surprised'
}

def parse_emotion_from_filename(filename):
    emotion_code = filename.split('-')[2]
    return emotion_map.get(emotion_code, 'unknown')

# Collect all files and labels
def load_filepaths_and_labels(audio_dir):
    data = []
    for root, _, files in os.walk(audio_dir):
        for f in files:
            if f.endswith('.wav'):
                full_path = os.path.join(root, f)
                emotion = parse_emotion_from_filename(f)
                actor = root.split("\\")[-1]
                data.append((full_path, actor, f, emotion))
    return pd.DataFrame(data, columns=['path', 'actor', 'file', 'emotion'])


In [43]:
current_path = os.getcwd()
data_path = os.path.join(current_path, "data")

In [44]:
df = load_filepaths_and_labels(data_path)

### Class to process data

In [45]:
class EmotionDataset(Dataset):
    def __init__(self, df, transform=None):
        self.df = df
        self.transform = transform
        self.label2idx = {label: idx for idx, label in enumerate(sorted(df['emotion'].unique()))}

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        path = self.df.iloc[idx]['path']
        label = self.df.iloc[idx]['emotion']
        waveform, sr = torchaudio.load(path)
        if self.transform:
            waveform = self.transform(waveform)
        return waveform, self.label2idx[label]


### Transform data

In [46]:
mfcc_transform = T.MFCC(
    sample_rate=16000,
    n_mfcc=40,
    melkwargs={'n_fft': 400, 'hop_length': 160, 'n_mels': 64}
)

# Resample if needed
resampler = T.Resample(orig_freq=48000, new_freq=16000)  # optional if not 16kHz

def full_transform(waveform):
    if waveform.shape[1] > 0:
        waveform = resampler(waveform)
    return mfcc_transform(waveform)

## Model

### Train and validation split

In [47]:
train_df, val_df = train_test_split(df, test_size=0.2, stratify=df['emotion'], random_state=42)

### Emotion Dataset format

In [48]:
train_dataset = EmotionDataset(train_df, transform=full_transform)
val_dataset = EmotionDataset(val_df, transform=full_transform)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

### Define the model

In [49]:
class CNNEmotionClassifier(nn.Module):
    def __init__(self, num_classes):
        super().__init__()
        self.net = nn.Sequential(
            nn.Conv2d(1, 16, kernel_size=3, stride=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(16, 32, kernel_size=3),
            nn.ReLU(),
            nn.AdaptiveAvgPool2d((1, 1)),
            nn.Flatten(),
            nn.Linear(32, num_classes)
        )

    def forward(self, x):
        return self.net(x)

### Train the model

In [50]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CNNEmotionClassifier(num_classes=len(train_dataset.label2idx)).to(device)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

for epoch in range(10):
    model.train()
    for x, y in train_loader:
        x, y = x.to(device), y.to(device)
        x = x.unsqueeze(1)  # [B, 1, time, feat]
        preds = model(x)
        loss = loss_fn(preds, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1} done. Loss: {loss.item():.4f}")

RuntimeError: stack expects each tensor to be equal size, but got [1, 40, 371] at entry 0 and [1, 40, 431] at entry 1

### Test the model

In [None]:
# Put model in evaluation mode
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for x, y in val_loader:
        x, y = x.to(device), y.to(device)
        x = x.unsqueeze(1)  # [B, 1, n_mfcc, T]
        outputs = model(x)
        preds = torch.argmax(outputs, dim=1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(y.cpu().numpy())

# Accuracy
acc = accuracy_score(all_labels, all_preds)
print(f"Accuracy: {acc:.4f}")

# Detailed report
print(classification_report(all_labels, all_preds, target_names=train_dataset.label2idx.keys()))
