In [1]:
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader

In [2]:
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary Python objects, which
# can execute code from a tampered file. It is kept here because the arrays may have
# been saved as object arrays -- confirm, and drop the flag if they are plain numeric.
features = np.load("../datasets/processed_audio/features.npy", allow_pickle=True)
labels = np.load("../datasets/processed_audio/labels.npy", allow_pickle=True)

# Fail fast on a mismatched export: every feature array needs exactly one label.
assert len(features) == len(labels), (
    f"features/labels length mismatch: {len(features)} vs {len(labels)}"
)

In [3]:
# Hold out 20% of the samples for validation. Stratifying on the labels keeps the class
# proportions the same in both splits; the fixed random_state makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)


In [4]:
class AudioDataset(Dataset):
    """Map-style dataset pairing per-sample feature arrays with integer labels.

    Args:
        features: Indexable collection of feature arrays, one entry per sample.
        labels: Indexable collection of class indices, aligned with ``features``.
    """

    def __init__(self, features, labels):
        self.features, self.labels = features, labels

    def __len__(self):
        # The dataset length is defined by the feature collection alone.
        return len(self.features)

    def __getitem__(self, idx):
        """Return ``(x, y)`` for sample ``idx``.

        ``x`` is the feature array as float32 with a new leading axis of size 1
        (the single input channel expected by a Conv2d(1, ...) layer);
        ``y`` is the label as an int64 scalar tensor.
        """
        sample = torch.tensor(self.features[idx], dtype=torch.float32)
        target = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample.unsqueeze(0), target

# Wrap each split so it can be batched by a DataLoader.
train_dataset = AudioDataset(features=X_train, labels=y_train)
val_dataset = AudioDataset(features=X_val, labels=y_val)

# Training batches are reshuffled every epoch. Validation keeps dataset order
# (shuffle=False is the DataLoader default, spelled out here) so that anything
# collected batch-by-batch from val_loader stays aligned with y_val.
train_loader = DataLoader(
    train_dataset,
    batch_size=32,
    shuffle=True,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=32,
    shuffle=False,
)

In [6]:
class AudioCNN(nn.Module):
    """Small two-stage CNN classifier for single-channel 2-D feature maps.

    Layout: (Conv 3x3 -> BatchNorm -> ReLU -> MaxPool 2x2) applied twice, then
    flatten -> Linear(128) -> ReLU -> Dropout(0.3) -> Linear(n_classes).
    ``forward`` returns raw logits (no softmax is applied).

    Args:
        n_classes: Number of output classes.
        input_shape: ``(height, width)`` of one feature map, without the batch
            and channel axes. When omitted, it is taken from axes 1 and 2 of the
            notebook-level ``X_train`` array, which is what the original
            implementation always did. Passing it explicitly lets the model be
            built without that global being defined.

    Raises:
        ValueError: If either spatial dimension is smaller than 4, i.e. too
            small to survive two 2x2 poolings.
    """

    def __init__(self, n_classes, input_shape=None):
        super().__init__()

        if input_shape is None:
            # Backward-compatible default: size the head from the training array.
            input_shape = X_train.shape[1:3]
        height, width = (int(dim) for dim in input_shape)
        if height < 4 or width < 4:
            raise ValueError(
                f"input_shape must be at least 4x4 to be pooled twice, got {(height, width)}"
            )

        self.conv1 = nn.Conv2d(1, 16, kernel_size=(3, 3), padding=1)
        self.bn1 = nn.BatchNorm2d(16)

        # A single stateless pooling layer is shared by both stages (the original
        # assigned self.pool twice with identical arguments).
        self.pool = nn.MaxPool2d(kernel_size=(2, 2))

        self.conv2 = nn.Conv2d(16, 32, kernel_size=(3, 3), padding=1)
        self.bn2 = nn.BatchNorm2d(32)

        # The padded 3x3 convolutions keep the spatial size; each 2x2 pool halves
        # it (rounding down), so two pools leave (height // 4, width // 4).
        flat_features = 32 * (height // 4) * (width // 4)
        self.fc1 = nn.Linear(flat_features, 128)
        self.fc2 = nn.Linear(128, n_classes)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)

    def forward(self, x):
        """Map a ``(batch, 1, height, width)`` tensor to ``(batch, n_classes)`` logits."""
        x = self.pool(self.relu(self.bn1(self.conv1(x))))
        x = self.pool(self.relu(self.bn2(self.conv2(x))))
        x = torch.flatten(x, 1)
        x = self.dropout(self.relu(self.fc1(x)))
        return self.fc2(x)
    
# Five output classes, matching the label values 0-4 present in the loaded dataset.
model = AudioCNN(n_classes=5)

In [7]:
# Sanity check of the training split's shape. By default AudioCNN sizes its first
# fully connected layer from axes 1 and 2 of this array.
print(X_train.shape)

(691, 200, 40)


In [8]:
# Use the GPU when PyTorch can see one; otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# nn.Module.to moves the parameters in place, so the optimizer created below
# references the tensors that live on `device`.
model.to(device)

# Multi-class loss operating on raw logits, and Adam at a learning rate of 1e-3.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
def train_epoch(model, loader, criterion, optimizer, device):
    """Run one optimisation pass over ``loader`` and report mean loss and accuracy.

    Args:
        model: Module to train; it is switched to training mode.
        loader: Iterable yielding ``(inputs, targets)`` batches. It does not
            need a ``.dataset`` attribute.
        criterion: Loss callable taking ``(logits, targets)``. The per-batch
            value is multiplied by the batch size before summing, which assumes
            the criterion returns a batch mean (the nn.CrossEntropyLoss default).
        optimizer: Optimizer that updates ``model``'s parameters.
        device: Device the batches are moved to before the forward pass.

    Returns:
        ``(mean_loss, accuracy)`` averaged over the samples actually processed.
        Both are ``nan`` when the loader yields no samples.
    """
    model.train()

    total_loss = 0.0
    correct = 0
    seen = 0

    for x, y in loader:
        x, y = x.to(device), y.to(device)
        batch_size = x.size(0)

        optimizer.zero_grad()
        output = model(x)
        loss = criterion(output, y)
        loss.backward()
        optimizer.step()

        total_loss += loss.item() * batch_size
        correct += (output.argmax(dim=1) == y).sum().item()
        seen += batch_size

    # Normalise by the samples that were iterated rather than len(loader.dataset):
    # the two differ whenever the loader skips samples (e.g. drop_last=True).
    if seen == 0:
        return float("nan"), float("nan")
    return total_loss / seen, correct / seen

def eval_epoch(model, loader, criterion, device):
    """Evaluate ``model`` on ``loader`` without gradient tracking.

    Args:
        model: Module to evaluate; it is switched to eval mode and left there.
        loader: Iterable yielding ``(inputs, targets)`` batches. It does not
            need a ``.dataset`` attribute.
        criterion: Loss callable taking ``(logits, targets)``. The per-batch
            value is multiplied by the batch size before summing, which assumes
            the criterion returns a batch mean (the nn.CrossEntropyLoss default).
        device: Device the batches are moved to before the forward pass.

    Returns:
        ``(mean_loss, accuracy)`` averaged over the samples actually processed.
        Both are ``nan`` when the loader yields no samples.
    """
    model.eval()

    total_loss = 0.0
    correct = 0
    seen = 0

    with torch.no_grad():
        for x, y in loader:
            x, y = x.to(device), y.to(device)
            batch_size = x.size(0)

            output = model(x)
            loss = criterion(output, y)

            total_loss += loss.item() * batch_size
            correct += (output.argmax(dim=1) == y).sum().item()
            seen += batch_size

    # Normalise by the samples that were iterated rather than len(loader.dataset):
    # the two differ whenever the loader skips samples (e.g. drop_last=True).
    if seen == 0:
        return float("nan"), float("nan")
    return total_loss / seen, correct / seen

In [10]:
num_epochs = 20

for epoch in range(num_epochs):
    # One optimisation pass over the training split, followed by a gradient-free
    # pass over the validation split with the freshly updated weights.
    train_loss, train_acc = train_epoch(
        model, train_loader, criterion, optimizer, device
    )
    val_loss, val_acc = eval_epoch(model, val_loader, criterion, device)

    # `epoch` is zero-based; the log line shows it one-based.
    print(
        f"Epoch {epoch+1}/{num_epochs}, "
        f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, "
        f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}"
    )

Epoch 1/20, Train Loss: 2.0408, Train Acc: 0.2663, Val Loss: 1.5399, Val Acc: 0.2717
Epoch 2/20, Train Loss: 1.4537, Train Acc: 0.3343, Val Loss: 1.2835, Val Acc: 0.4393
Epoch 3/20, Train Loss: 1.3170, Train Acc: 0.4327, Val Loss: 1.2067, Val Acc: 0.4913
Epoch 4/20, Train Loss: 1.2034, Train Acc: 0.5137, Val Loss: 1.1643, Val Acc: 0.5491
Epoch 5/20, Train Loss: 1.0969, Train Acc: 0.5687, Val Loss: 1.1620, Val Acc: 0.4624
Epoch 6/20, Train Loss: 1.0556, Train Acc: 0.5601, Val Loss: 1.0227, Val Acc: 0.5665
Epoch 7/20, Train Loss: 0.9037, Train Acc: 0.6469, Val Loss: 0.9668, Val Acc: 0.5954
Epoch 8/20, Train Loss: 0.7979, Train Acc: 0.6643, Val Loss: 0.9304, Val Acc: 0.6474
Epoch 9/20, Train Loss: 0.7423, Train Acc: 0.7192, Val Loss: 0.8628, Val Acc: 0.6301
Epoch 10/20, Train Loss: 0.6164, Train Acc: 0.7598, Val Loss: 0.8599, Val Acc: 0.6474
Epoch 11/20, Train Loss: 0.5890, Train Acc: 0.7742, Val Loss: 0.7764, Val Acc: 0.6821
Epoch 12/20, Train Loss: 0.4989, Train Acc: 0.8046, Val Loss: 0

In [11]:
# Class balance of the full label array (before the train/validation split).
# Counter is imported in the notebook's top import cell.
print(Counter(labels))

Counter({4: 192, 1: 192, 0: 192, 3: 192, 2: 96})


In [12]:
# Save only the learned parameters (the state_dict), not the module object itself;
# restoring them therefore requires constructing an AudioCNN first.
torch.save(model.state_dict(), "../training/models/audio_cnn.pt")

In [13]:
# Rebuild the network from scratch and restore the weights from the checkpoint file,
# so this cell does not rely on the in-memory model left over from training.
# NOTE(review): on PyTorch versions that support it, consider weights_only=True for
# torch.load, since only a state_dict is being read -- confirm the installed version.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AudioCNN(n_classes=5)
model.load_state_dict(
    torch.load("../training/models/audio_cnn.pt", map_location=device)
)
model.to(device)
model.eval()  # inference mode for dropout and batch-norm

# Per-batch results, accumulated in val_loader order.
all_audio_probs, all_labels = [], []

with torch.no_grad():
    for x, y in val_loader:
        x = x.to(device)
        output = model(x)
        # Turn the logits into class probabilities (each row sums to 1).
        probs = output.softmax(dim=1)
        all_audio_probs.append(probs.cpu())
        # Labels are never moved to `device`; they are stored as yielded by the loader.
        all_labels.append(y)

In [14]:
# Stitch the per-batch lists into single tensors along the sample axis
# (torch.cat concatenates on dim 0 by default).
audio_val_probs = torch.cat(all_audio_probs)
fusion_val_labels = torch.cat(all_labels)

# Hard prediction per sample: the index of the highest class probability.
audio_preds = torch.argmax(audio_val_probs, dim=1)

In [15]:
# Persist the validation-set probabilities, labels and hard predictions.
# NOTE(review): presumably consumed by a later fusion stage, judging by the target
# directory -- confirm against the reader of these files.
torch.save(audio_val_probs, "../datasets/fusion/audio_val_probs.pt")
torch.save(fusion_val_labels, "../datasets/fusion/fusion_val_labels.pt")
torch.save(audio_preds, "../datasets/fusion/audio_val_preds.pt")

# The confirmation names all three files written above; the original message
# omitted audio_val_preds.pt.
print("Saved: audio_val_probs.pt, fusion_val_labels.pt and audio_val_preds.pt")

Saved: audio_val_probs.pt and fusion_val_labels.pt
