Conformer

In [16]:
import torch
import torchaudio
from torch.utils.data import Dataset
from torchaudio.transforms import MelSpectrogram, Resample
import pandas as pd

class AudioDataset(Dataset):
    def __init__(self, csv_file, transform=None, target_length=73):
        self.data = pd.read_csv(csv_file)
        self.file_paths = self.data['file_path'].tolist()
        self.labels = self.data['label'].tolist()
        self.transform = transform
        self.target_length = target_length

    def __len__(self):
        return len(self.file_paths)

    def __getitem__(self, idx):
        audio_file = self.file_paths[idx]
        label = self.labels[idx]
        waveform, sample_rate = torchaudio.load(audio_file)

        target_sample_rate = 16000
        if sample_rate != target_sample_rate:
            waveform = Resample(sample_rate, target_sample_rate)(waveform)

        if self.transform:
            waveform = self.transform(waveform)

        if waveform.size(-1) < self.target_length:
            pad_length = self.target_length - waveform.size(-1)
            waveform = torch.nn.functional.pad(waveform, (0, pad_length))
        elif waveform.size(-1) > self.target_length:
            waveform = waveform[:, :, :self.target_length]
        
        label = torch.tensor(label)  # Convert label to tensor here
        return waveform, label



In [None]:
from torch.utils.data import DataLoader
from torchaudio.transforms import MelSpectrogram

transform = MelSpectrogram(sample_rate=16000, n_mels=512, hop_length=512, n_fft=1024)

dataset = AudioDataset("labeled_data.csv", transform=transform, target_length=73)
from sklearn.model_selection import train_test_split
train_dataset, test_dataset = train_test_split(dataset, test_size=0.2, random_state=42)
train_dataset, val_dataset = train_test_split(train_dataset, test_size=0.1, random_state=42)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)


#25m 0.1s



In [28]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class ConformerBlock(nn.Module):
    def __init__(self, hidden_dim, dropout=0.3):
        super(ConformerBlock, self).__init__()
        self.conv1 = nn.Conv1d(hidden_dim, hidden_dim, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(hidden_dim, hidden_dim, kernel_size=3, padding=1)
        self.attention = nn.MultiheadAttention(hidden_dim, num_heads=4, batch_first=True)
        self.ffn = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim * 4),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim * 4, hidden_dim)
        )
        self.norm1 = nn.LayerNorm(hidden_dim)
        self.norm2 = nn.LayerNorm(hidden_dim)
        self.norm3 = nn.LayerNorm(hidden_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # Conv block
        residual = x
        x = x.transpose(1, 2)
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = x.transpose(1, 2)
        x = self.norm1(x + residual)

        # Self-attention block
        residual = x
        attn_output, _ = self.attention(x, x, x)
        x = self.norm2(x + self.dropout(attn_output))

        # Feedforward block
        residual = x
        ff_output = self.ffn(x)
        x = self.norm3(x + self.dropout(ff_output))

        return x

class ConformerModel(nn.Module):
    def __init__(self, input_dim, num_classes, num_blocks=8, hidden_dim=384, dropout=0.3):
        super(ConformerModel, self).__init__()
        self.input_proj = nn.Linear(input_dim, hidden_dim)
        self.conformer_blocks = nn.ModuleList([
            ConformerBlock(hidden_dim, dropout) for _ in range(num_blocks)
        ])
        self.global_norm = nn.LayerNorm(hidden_dim)
        self.fc_out = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        x = x.transpose(1, 2)  # (B, T, input_dim)
        x = self.input_proj(x)
        for block in self.conformer_blocks:
            x = block(x)
        x = self.global_norm(x)
        x = torch.mean(x, dim=1)
        return self.fc_out(x)


In [29]:
class EarlyStopping:
    def __init__(self, patience=5, min_delta=0):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.best_score = None
        self.early_stop = False

    def __call__(self, val_score):
        if self.best_score is None:
            self.best_score = val_score
        elif val_score < self.best_score + self.min_delta:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = val_score
            self.counter = 0


In [None]:

import torch.optim as optim
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = ConformerModel(input_dim=512, num_classes=5).to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-4)

def train(model, train_loader, optimizer, device):
    model.train()
    total_loss = 0.0
    correct_preds = 0
    total_preds = 0

    for mel_spectrograms, labels in train_loader:
        mel_spectrograms = mel_spectrograms.to(device)
        labels = labels.to(device)
        
        optimizer.zero_grad()

        mel_spectrograms = mel_spectrograms.squeeze(1)
        logits = model(mel_spectrograms)
        loss = F.cross_entropy(logits, labels)
        loss.backward()
        
        optimizer.step()
        
        total_loss += loss.item()
        _, predicted = torch.max(logits, 1)
        correct_preds += (predicted == labels).sum().item()
        total_preds += labels.size(0)
    
    avg_loss = total_loss / len(train_loader)
    accuracy = correct_preds / total_preds * 100
    return avg_loss, accuracy

def validate(model, val_loader, device):
    model.eval()
    total_loss = 0.0
    correct_preds = 0
    total_preds = 0

    with torch.no_grad():
        for mel_spectrograms, labels in val_loader:
            mel_spectrograms = mel_spectrograms.to(device)
            labels = labels.to(device)

            mel_spectrograms = mel_spectrograms.squeeze(1)
            logits = model(mel_spectrograms)
            loss = F.cross_entropy(logits, labels)
            
            total_loss += loss.item()
            _, predicted = torch.max(logits, 1)
            correct_preds += (predicted == labels).sum().item()
            total_preds += labels.size(0)
    
    
    avg_loss = total_loss / len(val_loader)
    accuracy = correct_preds / total_preds * 100
    return avg_loss, accuracy
EPOCHS = 20
best_val_acc = 0.0

early_stopper = EarlyStopping(patience=3, min_delta=0.001)

for epoch in range(EPOCHS):
    loss, train_acc = train(model, train_loader, optimizer, device)
    val_loss, val_acc = validate(model, val_loader, device)

    print(f"Epoch {epoch+1}/{EPOCHS}, Loss: {loss:.4f}, Train Acc: {train_acc:.2f}%, Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.2f}%")

    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save(model.state_dict(), "best_model_Conformer_Crema.pth")
        print("Saved new best model!")
    
    early_stopper(val_acc)
    if early_stopper.early_stop:
        print("Early stopping triggered.")
        break



#1033m 36.1s


Epoch 1/20, Loss: 1.1990, Train Acc: 50.17%, Val Loss: 1.1425, Val Acc: 53.83%
Saved new best model!
Epoch 2/20, Loss: 1.0029, Train Acc: 59.15%, Val Loss: 0.9830, Val Acc: 61.44%
Saved new best model!
Epoch 3/20, Loss: 0.8438, Train Acc: 66.81%, Val Loss: 0.7927, Val Acc: 68.57%
Saved new best model!
Epoch 4/20, Loss: 0.6792, Train Acc: 73.71%, Val Loss: 0.7016, Val Acc: 74.12%
Saved new best model!
Epoch 5/20, Loss: 0.5316, Train Acc: 80.08%, Val Loss: 0.6479, Val Acc: 76.75%
Saved new best model!
Epoch 6/20, Loss: 0.4215, Train Acc: 84.56%, Val Loss: 0.6051, Val Acc: 78.29%
Saved new best model!
Epoch 7/20, Loss: 0.3333, Train Acc: 87.95%, Val Loss: 0.5795, Val Acc: 80.80%
Saved new best model!
Epoch 8/20, Loss: 0.2647, Train Acc: 90.47%, Val Loss: 0.6719, Val Acc: 81.94%
Saved new best model!
Epoch 9/20, Loss: 0.2235, Train Acc: 92.26%, Val Loss: 0.6592, Val Acc: 81.49%
Epoch 10/20, Loss: 0.1952, Train Acc: 93.16%, Val Loss: 0.6904, Val Acc: 81.65%
Epoch 11/20, Loss: 0.1594, Train 

In [None]:
from sklearn.metrics import f1_score, confusion_matrix

def evaluate(model, test_loader, device):
    model.eval()
    correct_preds = 0
    total_preds = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for mel_spectrograms, labels in test_loader:
            mel_spectrograms = mel_spectrograms.to(device)
            labels = labels.to(device)

            mel_spectrograms = mel_spectrograms.squeeze(1) 
            logits = model(mel_spectrograms)

            _, predicted = torch.max(logits, 1)
            correct_preds += (predicted == labels).sum().item()
            total_preds += labels.size(0)

            all_preds.extend(predicted.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    accuracy = correct_preds / total_preds * 100
    f1 = f1_score(all_labels, all_preds, average='macro') * 100
    cm = confusion_matrix(all_labels, all_preds)

    return accuracy, f1, cm


model.load_state_dict(torch.load("best_model_Conformer_Crema.pth"))
accuracy, f1, cm = evaluate(model, test_loader, device)


#7m 0.1s

In [38]:

print(f"Test Accuracy: {accuracy:.2f}%")
print(f"F1 Score: {f1:.2f}%")



label_names = ['angry', 'happy', 'neutral', 'sad', 'fear']
cm_df = pd.DataFrame(cm, 
    index=[f"Actual {label}" for label in label_names],
    columns=[f"Predicted {label}" for label in label_names])


print("Confusion Matrix:")
print(cm_df)

Test Accuracy: 82.17%
F1 Score: 82.10%
Confusion Matrix:
                Predicted angry  Predicted happy  Predicted neutral  \
Actual angry               1110               16                 90   
Actual happy                 47              956                106   
Actual neutral               88               42               1064   
Actual sad                   39               25                 58   
Actual fear                  14               96                 19   

                Predicted sad  Predicted fear  
Actual angry               28               4  
Actual happy               64             110  
Actual neutral             43              28  
Actual sad                902              60  
Actual fear               123            1039  
