**Modules**
- `os`: to deal with the data files
- visualization: seaborn, matplotlib
- audio visualization: librosa
- audio playback: IPython `Audio`

In [1]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
import torchaudio
import torchaudio.transforms as T
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import matplotlib.pyplot as plt

In [2]:
# Ask PyTorch to release cached GPU memory it is not currently using, so a
# re-run in the same kernel starts without blocks cached by earlier runs.
torch.cuda.empty_cache()

In [3]:
# Sanity check of the GPU setup before any training is attempted.
# NOTE(review): torch is already imported in the imports cell above, so this
# repeated import is redundant (harmless, but could be dropped).
import torch
print(torch.cuda.is_available())  # Should be True when training on the GPU
print(torch.version.cuda)  # CUDA version reported by this PyTorch build

True
12.6


In [4]:
# Global hyper-parameters shared by the feature-extraction, model and training cells.
SAMPLE_RATE = 22050  # target sampling rate (Hz); audio loaded at another rate is resampled to this
N_MFCC = 50  # MFCC coefficients per frame; also the input size of the model's LSTM
BATCH_SIZE = 60  # batch size of both the training and the validation DataLoader
LEARNING_RATE = 0.01  # learning rate handed to the SGD optimizer
EPOCHS = 750  # upper bound on training epochs; early stopping can end training sooner

In [5]:
# Root folder that is scanned for .wav files. The location can be overridden
# with the RAVDESS_DATASET_PATH environment variable so the notebook also runs
# on other machines; the original local path stays as the default.
DATASET_PATH = os.environ.get("RAVDESS_DATASET_PATH", "C:/Users/NJS/Desktop/Thesis/RAVDESS_DATASET")

# Train on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [6]:
# Cache of MFCC transforms keyed by the constants they were built from, so the
# transform is constructed once instead of once per audio file.
_MFCC_TRANSFORM_CACHE = {}


def _get_mfcc_transform():
    """Return the MFCC transform for the current SAMPLE_RATE / N_MFCC, building it on first use."""
    key = (SAMPLE_RATE, N_MFCC)
    transform = _MFCC_TRANSFORM_CACHE.get(key)
    if transform is None:
        transform = T.MFCC(
            sample_rate=SAMPLE_RATE,
            n_mfcc=N_MFCC,
            melkwargs={'n_fft': 1024, 'hop_length': 512, 'n_mels': 128}
        )
        _MFCC_TRANSFORM_CACHE[key] = transform
    return transform


def extract_mfcc_sequence(waveform):
    """Convert a waveform into a sequence of MFCC frames.

    Input with more than one entry along its first dimension is averaged down
    to a single channel first. The transform uses the module-level SAMPLE_RATE
    and N_MFCC together with n_fft=1024, hop_length=512 and n_mels=128.

    Args:
        waveform: audio tensor whose first dimension is the channel axis
            (assumed (channels, samples), as returned by torchaudio.load --
            TODO confirm for other callers).

    Returns:
        The MFCC output with its leading size-1 dimension squeezed away and
        the first two remaining axes swapped; for a (channels, samples) input
        this is expected to be (frames, N_MFCC).
    """
    if waveform.shape[0] > 1:
        # Down-mix to mono so the squeeze(0) below removes the channel axis.
        waveform = waveform.mean(dim=0, keepdim=True)
    mfcc = _get_mfcc_transform()(waveform)
    mfcc = mfcc.squeeze(0).transpose(0, 1)
    return mfcc

In [7]:
def extract_prosodic_features(waveform, sample_rate=SAMPLE_RATE):
    """Summarise a waveform as three scalars: pitch mean, pitch std and RMS energy.

    Args:
        waveform: audio tensor passed to torchaudio's pitch detector.
        sample_rate: sampling rate of ``waveform`` in Hz.

    Returns:
        A float32 tensor ``[pitch_mean, pitch_std, energy]``. The pitch
        statistics are computed over the strictly positive pitch estimates and
        are 0.0 when there are none; ``energy`` is the root of the mean squared
        sample value.
    """
    pitch = torchaudio.functional.detect_pitch_frequency(waveform, sample_rate)
    pitch = pitch[pitch > 0]  # filter out zeros
    n_estimates = pitch.numel()
    pitch_mean = pitch.mean().item() if n_estimates > 0 else 0.0
    # torch.std applies Bessel's correction by default, so a single value would
    # give NaN; a NaN feature would then break scaler fitting and the loss.
    pitch_std = pitch.std().item() if n_estimates > 1 else 0.0
    energy = waveform.pow(2).mean().sqrt().item()
    return torch.tensor([pitch_mean, pitch_std, energy], dtype=torch.float32)

In [8]:
def pad_sequence(sequences):
    """Zero-pad 2-D sequences to a common length and stack them into one batch.

    Args:
        sequences: non-empty collection of tensors or arrays shaped
            (length, features). The feature size is read from the first item.

    Returns:
        A tensor shaped (len(sequences), longest length, features) in torch's
        default dtype; positions past each sequence's own length stay zero.
    """
    seq_lengths = [item.shape[0] for item in sequences]
    longest = max(seq_lengths)
    feature_dim = sequences[0].shape[1]
    batch = torch.zeros(len(sequences), longest, feature_dim)
    for row, (item, length) in enumerate(zip(sequences, seq_lengths)):
        if isinstance(item, torch.Tensor):
            # Copy detached values so the batch never aliases the input tensor.
            values = item.detach().clone()
        else:
            values = torch.tensor(item, dtype=torch.float32)
        batch[row, :length, :] = values
    return batch

In [9]:
def load_dataset(path):
    """Collect every .wav file below ``path`` together with its label.

    The label of a file is the lower-cased name of the directory that directly
    contains it. NOTE(review): this assumes one folder per emotion; the stock
    RAVDESS release groups files by actor and encodes the emotion in the file
    name -- confirm the dataset on disk was reorganised accordingly.

    Args:
        path: root directory that is walked recursively.

    Returns:
        A DataFrame with the columns ``path`` and ``label``, sorted by path.
        It is empty (but still has both columns) when nothing is found.
    """
    records = []
    for dirname, _, filenames in os.walk(path):
        label = os.path.basename(dirname).lower()
        for filename in filenames:
            # Case-insensitive match so files named *.WAV are not dropped.
            if filename.lower().endswith(".wav"):
                records.append((os.path.join(dirname, filename), label))
    # os.walk order depends on the file system; sorting makes the row order,
    # and with it any seeded train/validation split, reproducible.
    records.sort()
    df = pd.DataFrame(records, columns=["path", "label"])
    return df


In [10]:
class SERDataset(Dataset):
    """Dataset yielding (scaled MFCC sequence, scaled prosodic vector, label id) per audio file.

    Args:
        dataframe: frame with a ``path`` column (audio files) and a ``label``
            column (class names).
        label_encoder: fitted encoder whose ``transform`` maps the labels to ids.
        scaler: fitted scaler applied to the MFCC frames.
        prosodic_scaler: fitted scaler applied to the prosodic vector. When
            omitted, the module-level ``prosodic_scaler`` is looked up at item
            time, which keeps existing three-argument calls working.
    """

    def __init__(self, dataframe, label_encoder, scaler, prosodic_scaler=None):
        self.paths = dataframe['path'].values
        self.labels = label_encoder.transform(dataframe['label'].values)
        self.scaler = scaler
        self.prosodic_scaler = prosodic_scaler
        # Resample transforms keyed by (source rate, target rate), built on
        # first use instead of once per loaded file.
        self._resamplers = {}

    def __len__(self):
        return len(self.paths)

    def _resample(self, waveform, sr):
        """Return ``waveform`` at SAMPLE_RATE, resampling only when ``sr`` differs."""
        if sr == SAMPLE_RATE:
            return waveform
        key = (sr, SAMPLE_RATE)
        resampler = self._resamplers.get(key)
        if resampler is None:
            resampler = T.Resample(orig_freq=sr, new_freq=SAMPLE_RATE)
            self._resamplers[key] = resampler
        return resampler(waveform)

    def __getitem__(self, idx):
        """Load file ``idx`` and return ``(mfcc, prosodic, label)`` as tensors."""
        waveform, sr = torchaudio.load(self.paths[idx])
        waveform = self._resample(waveform, sr)
        mfcc = extract_mfcc_sequence(waveform)
        prosodic = extract_prosodic_features(waveform)
        mfcc = self.scaler.transform(mfcc.cpu().numpy())
        # Prefer the scaler given to the constructor; otherwise fall back to the
        # module-level one the original code relied on.
        active_prosodic_scaler = (
            self.prosodic_scaler if self.prosodic_scaler is not None else prosodic_scaler
        )
        # The scaler works on 2-D input, so add and then remove a batch axis.
        prosodic = active_prosodic_scaler.transform(prosodic.unsqueeze(0).numpy()).squeeze(0)
        prosodic = torch.tensor(prosodic, dtype=torch.float32)
        return torch.tensor(mfcc, dtype=torch.float32), prosodic, torch.tensor(self.labels[idx], dtype=torch.long)

In [11]:
def collate_fn(batch):
    """Merge dataset items into one batch for the DataLoader.

    Args:
        batch: iterable of ``(sequence, prosodic, label)`` triples.

    Returns:
        ``(sequences_padded, prosodics, labels)``: the sequences zero-padded to
        a common length by ``pad_sequence``, the prosodic vectors stacked and
        cast to float, and the labels stacked.
    """
    mfcc_seqs, prosodic_vecs, targets = zip(*batch)
    return (
        pad_sequence(mfcc_seqs),
        torch.stack(prosodic_vecs).float(),
        torch.stack(targets),
    )

In [12]:
class AttentionPooling(nn.Module):
    """Collapse dimension 1 of the input into a single vector using learned weights.

    A small MLP (Linear -> Tanh -> Linear) scores every position, the scores
    are soft-maxed along dimension 1 and used for a weighted sum over it.

    Args:
        dim: size of the last input dimension.
    """

    def __init__(self, dim):
        super().__init__()
        self.attention = nn.Sequential(
            nn.Linear(dim, 128),
            nn.Tanh(),
            nn.Linear(128, 1)
        )

    def forward(self, x, mask=None):
        """Pool ``x`` over dimension 1.

        Args:
            x: tensor laid out as (batch, positions, dim).
            mask: optional (batch, positions) tensor, truthy where a position
                holds real data. Masked-out positions (e.g. zero padding) get
                zero weight. With ``mask=None`` every position takes part,
                exactly as before this parameter existed.

        Returns:
            Tensor of shape (batch, dim).
        """
        scores = self.attention(x)
        if mask is not None:
            keep = mask.to(device=scores.device, dtype=torch.bool).unsqueeze(-1)
            # Use the most negative finite value rather than -inf so a fully
            # masked row yields uniform weights instead of NaN.
            scores = scores.masked_fill(~keep, torch.finfo(scores.dtype).min)
        weights = torch.softmax(scores, dim=1)
        return (x * weights).sum(dim=1)

In [13]:
class HybridLSTMTransformer(nn.Module):
    """Classifier chaining an LSTM, a Transformer encoder, attention pooling and an MLP head.

    The pooled sequence representation (64 values) is concatenated with a
    3-value prosodic vector before the fully connected head.

    Args:
        num_classes: number of output logits.
    """

    def __init__(self, num_classes):
        super().__init__()
        hidden_dim = 64    # LSTM hidden size, Transformer d_model and pooling width
        prosodic_dim = 3   # length of the prosodic vector appended before the head

        self.lstm = nn.LSTM(input_size=N_MFCC, hidden_size=hidden_dim, batch_first=True)
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=hidden_dim, nhead=4, dim_feedforward=512, dropout=0.3, batch_first=True
            ),
            num_layers=4,
        )
        self.attn_pool = AttentionPooling(hidden_dim)

        head_layers = [
            nn.Linear(hidden_dim + prosodic_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, num_classes),
        ]
        self.fc = nn.Sequential(*head_layers)

    def forward(self, x, prosodic):
        """Return class logits for a batch.

        Args:
            x: batch-first sequence input for the LSTM, with N_MFCC features
                per step.
            prosodic: per-sample vector concatenated to the pooled features
                along dimension 1 (3 values, matching the head's input size).
        """
        sequence_features, _ = self.lstm(x)
        pooled = self.attn_pool(self.transformer(sequence_features))
        return self.fc(torch.cat([pooled, prosodic], dim=1))

In [None]:
def _run_training_epoch(model, train_loader, criterion, optimizer, epoch):
    """Run one optimisation pass over ``train_loader`` and return the mean loss per batch."""
    model.train()
    running_loss = 0.0
    for x, p, y in tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}"):
        x, p, y = x.to(device), p.to(device), y.to(device)
        optimizer.zero_grad()
        output = model(x, p)
        loss = criterion(output, y)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    # max(..., 1) keeps an empty loader from raising ZeroDivisionError.
    return running_loss / max(len(train_loader), 1)


def _evaluate(model, val_loader):
    """Return ``(accuracy, predictions, labels)`` for one gradient-free pass over ``val_loader``."""
    model.eval()
    preds_all, labels_all = [], []
    correct, total = 0, 0
    with torch.no_grad():
        for x, p, y in val_loader:
            x, p, y = x.to(device), p.to(device), y.to(device)
            preds = model(x, p).argmax(dim=1)
            preds_all.extend(preds.cpu().numpy())
            labels_all.extend(y.cpu().numpy())
            correct += (preds == y).sum().item()
            total += y.size(0)
    accuracy = correct / total if total else 0.0
    return accuracy, preds_all, labels_all


def _plot_training_report(labels, preds, class_names, train_losses, val_accuracies):
    """Show the confusion matrix for ``labels``/``preds`` and the loss / accuracy curves."""
    if labels:
        # Passing the full id range keeps the matrix square and aligned with
        # class_names even when a class is absent from the validation data.
        label_ids = list(range(len(class_names))) if class_names is not None else None
        cm = confusion_matrix(labels, preds, labels=label_ids)
        disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
        disp.plot(cmap='Blues', xticks_rotation=45)
        plt.title("Confusion Matrix")
        plt.tight_layout()
        plt.show()

    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    plt.plot(train_losses, label='Train Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title('Training Loss')
    plt.legend()

    plt.subplot(1, 2, 2)
    plt.plot(val_accuracies, label='Validation Accuracy')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.title('Validation Accuracy')
    plt.legend()
    plt.tight_layout()
    plt.show()


def train_model(model, train_loader, val_loader, criterion, optimizer, patience=10,
                scheduler=None, class_names=None):
    """Train ``model`` with early stopping, checkpoint the best epoch and plot the results.

    Runs at most EPOCHS epochs. After each epoch the model is evaluated on
    ``val_loader``; when validation accuracy improves, the weights are saved
    to "best_hybrid_model.pth". Training stops once accuracy has not improved
    for ``patience`` consecutive epochs. Finally the confusion matrix of the
    best epoch and the loss / accuracy curves are displayed.

    Args:
        model: network called as ``model(x, p)``.
        train_loader: loader yielding ``(x, p, y)`` training batches.
        val_loader: loader yielding ``(x, p, y)`` validation batches.
        criterion: loss called as ``criterion(output, y)``.
        optimizer: optimizer stepping the model's parameters.
        patience: epochs without improvement tolerated before stopping.
        scheduler: optional scheduler stepped with the validation accuracy
            after every epoch. When omitted, a module-level ``scheduler`` is
            used if one exists (the behaviour the original code relied on).
        class_names: optional tick labels for the confusion matrix. When
            omitted, ``label_encoder.classes_`` from the module namespace is
            used if available.
    """
    module_globals = globals()
    if scheduler is None:
        scheduler = module_globals.get("scheduler")
    if class_names is None and "label_encoder" in module_globals:
        class_names = module_globals["label_encoder"].classes_

    train_losses, val_accuracies = [], []
    # -inf guarantees that the first epoch always produces a checkpoint.
    best_acc = float("-inf")
    best_epoch = 0
    best_preds, best_labels = [], []
    no_improve_counter = 0

    for epoch in range(EPOCHS):
        mean_loss = _run_training_epoch(model, train_loader, criterion, optimizer, epoch)
        train_losses.append(mean_loss)

        acc, preds, labels = _evaluate(model, val_loader)
        val_accuracies.append(acc)

        # Log the per-batch mean so the printed value matches the plotted curve.
        print(f"Epoch {epoch+1}: Train Loss = {mean_loss:.4f}, Val Acc = {acc*100:.2f}%")
        if acc > best_acc:
            best_acc = acc
            best_epoch = epoch
            no_improve_counter = 0
            # Keep the predictions that belong to the saved checkpoint; the
            # last epoch before early stopping is by definition not the best.
            best_preds, best_labels = preds, labels
            torch.save(model.state_dict(), "best_hybrid_model.pth")
            print("Model saved.")
        else:
            no_improve_counter += 1

        if scheduler is not None:
            scheduler.step(acc)

        if no_improve_counter >= patience:
            print(f"Early stopping triggered at epoch {epoch+1}. Best epoch was {best_epoch+1}.")
            break

    _plot_training_report(best_labels, best_preds, class_names, train_losses, val_accuracies)

In [15]:
# Seed PyTorch and NumPy so weight initialisation and DataLoader shuffling
# repeat from run to run.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

df = load_dataset(DATASET_PATH)
if df.empty:
    # Fail early with a readable message instead of an obscure split error.
    raise FileNotFoundError(f"No .wav files found under {DATASET_PATH!r}")
train_df, val_df = train_test_split(df, stratify=df['label'], test_size=0.1, random_state=SEED)
label_encoder = LabelEncoder()
label_encoder.fit(train_df['label'])

# Pass the classes in encoder order so that weight i belongs to class id i,
# which is the order CrossEntropyLoss expects.
class_weights = compute_class_weight(class_weight='balanced', classes=label_encoder.classes_, y=train_df['label'])
class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)

# Fit scalers on training data only, so no validation statistics leak into them
all_mfcc = []
all_prosodic = []
for path in train_df['path']:
    waveform, sr = torchaudio.load(path)
    if sr != SAMPLE_RATE:
        waveform = T.Resample(sr, SAMPLE_RATE)(waveform)
    mfcc = extract_mfcc_sequence(waveform)
    prosodic = extract_prosodic_features(waveform)
    all_mfcc.append(mfcc.cpu().numpy())
    all_prosodic.append(prosodic.numpy())
# Stack the frames of all files: one row per frame, one column per coefficient.
all_mfcc_flat = np.concatenate(all_mfcc, axis=0)
scaler = StandardScaler().fit(all_mfcc_flat)
# prosodic_scaler is read as a module-level name by SERDataset.__getitem__.
prosodic_scaler = StandardScaler().fit(np.stack(all_prosodic))

train_dataset = SERDataset(train_df, label_encoder, scaler)
val_dataset = SERDataset(val_df, label_encoder, scaler)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

model = HybridLSTMTransformer(num_classes=len(label_encoder.classes_)).to(device)
criterion = nn.CrossEntropyLoss(weight=class_weights)
optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE, momentum=0.9)
# mode='max' because the scheduler is stepped with validation accuracy.
# train_model picks this object up as the module-level name `scheduler`.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=5)

train_model(model, train_loader, val_loader, criterion, optimizer)


Epoch 1/750:   0%|          | 0/22 [00:02<?, ?it/s]

[Debug] model.device: cuda:0, x.device: cpu, p.device: cpu, y.device: cpu





Epoch 1: Train Loss = 0.0000, Val Acc = 12.50%
Model saved.


Epoch 2/750:   0%|          | 0/22 [00:02<?, ?it/s]

[Debug] model.device: cuda:0, x.device: cpu, p.device: cpu, y.device: cpu





Epoch 2: Train Loss = 0.0000, Val Acc = 12.50%


Epoch 3/750:   0%|          | 0/22 [00:02<?, ?it/s]

[Debug] model.device: cuda:0, x.device: cpu, p.device: cpu, y.device: cpu





Epoch 3: Train Loss = 0.0000, Val Acc = 12.50%


Epoch 4/750:   0%|          | 0/22 [00:02<?, ?it/s]

[Debug] model.device: cuda:0, x.device: cpu, p.device: cpu, y.device: cpu





Epoch 4: Train Loss = 0.0000, Val Acc = 12.50%


Epoch 5/750:   0%|          | 0/22 [00:02<?, ?it/s]

[Debug] model.device: cuda:0, x.device: cpu, p.device: cpu, y.device: cpu





KeyboardInterrupt: 