In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
import torch
import torch.nn as nn

In [2]:
categories = ['Airplane', 'Alarm', 'Beep/Bleep', 'Bell', 'Bicycle', 'Bird Chirp', 'Bus', 'Car', 'Cat Meow',
        'Chainsaw', 'Clapping', 'Cough', 'Cow Moo', 'Cowbell', 'Crying', 'Dog Bark', 'Doorbell', 'Drip',
        'Drums', 'Fire', 'Footsteps', 'Guitar', 'Hammer', 'Helicopter', 'Hiccup', 'Horn Honk', 'Horse Neigh',
        'Insect Buzz', 'Jackhammer', 'Laughter', 'Lawn Mower', 'Motorcycle', 'Piano', 'Pig Oink', 'Power Drill',
        'Power Saw', 'Rain', 'Rooster Crow', 'Saxophone', 'Sewing Machine', 'Sheep/Goat Bleat', 'Ship/Boat',
        'Shout', 'Singing', 'Siren', 'Sneeze', 'Snoring', 'Speech', 'Stream/River', 'Thunder', 'Train', 'Truck',
        'Trumpet', 'Vacuum Cleaner', 'Violin', 'Washing Machine', 'Waves', 'Wind']

features_dir = r"C:\Users\abdul\Desktop\Uni\ML Pattern Classification\Project\Classification\MLPC2025_classification\audio_features"
labels_dir = r"C:\Users\abdul\Desktop\Uni\ML Pattern Classification\Project\Classification\MLPC2025_classification\labels"

In [3]:
ann_df = pd.read_csv( 'annotations.csv')
meta_df = pd.read_csv('metadata.csv')


train_filename = meta_df.sample(len(meta_df),random_state=42)['filename'].unique()[:int(len(meta_df)*0.7)]
validation_filename = meta_df.sample(len(meta_df),random_state=42)['filename'].unique()[int(len(meta_df)*0.7):int(len(meta_df)*0.9)]
test_filename = meta_df.sample(len(meta_df),random_state=42)['filename'].unique()[int(len(meta_df)*0.9):len(meta_df)]





In [4]:
def aggregate_labels(file_labels):
    __y = []
    for frame_labels in file_labels:
        if(sum(frame_labels) == 0):
            __y.append([0])
        elif(np.count_nonzero(frame_labels) == len(frame_labels)):
             __y.append([1])
        else: #The annotators don't agree on the label
            __y.append([np.random.choice(frame_labels)])
    return __y

In [5]:
import itertools
def read_files(file_names):
    X_train = []
    Y_train = {}
    for c in categories:
        Y_train[c] = []
    for f in file_names: #we are not loading the entire dataset due to processing time
        features = np.load(os.path.join(features_dir , f.split('.')[0] + '.npz'))["melspectrogram"]
        X_train.append(features)
        y = np.load(os.path.join(labels_dir , f.split('.')[0] + '_labels.npz'))
        for c in categories:
            _y = aggregate_labels(y[c])
            Y_train[c].extend(list(itertools.chain.from_iterable(_y)))
    X_train = np.concatenate(X_train)
    return X_train, Y_train


In [6]:
train_x,train_y = read_files(train_filename)
val_x, val_y = read_files(validation_filename)
test_x, test_y = read_files(test_filename)

In [7]:
train_x.shape
val_x.shape
test_x.shape
print(f'shape of train set:{train_x.shape}, shape of val set:{val_x.shape}, shape of test set:{test_x.shape}')
#print(f'shape of train_y set:{train_y.shape}, shape of val_y set:{val_y.shape}, shape of test_y set:{test_y.shape}')

shape of train set:(1078243, 64), shape of val set:(305277, 64), shape of test set:(155057, 64)


In [8]:
from sklearn.preprocessing import StandardScaler

def log_scalling(array):
    epsilon = 1e-10  
    log_array = np.log(array + epsilon)
    return log_array
    
def z_score(array):
    s = StandardScaler()
    scaled_array = s.fit_transform(array)
    return scaled_array,s.mean_,s.scale_

In [9]:
log_train_x,log_validation_x,log_test_x = log_scalling(train_x),log_scalling(val_x),log_scalling(test_x)


scaled_train_x,mu,std = z_score(log_train_x)
scaled_validation_x = (log_validation_x - mu)/std
scaled_test_x = (log_test_x - mu)/std

In [10]:
train_y = np.array([train_y[cls] for cls in categories]).T
val_y = np.array([val_y[cls] for cls in categories]).T
test_y = np.array([test_y[cls] for cls in categories]).T
train_y.shape

(1078243, 58)

In [11]:
class SpectrogramTransformer(nn.Module):
    def __init__(self,mel_bin=64,em_dim=128,num_labels=58 , n_head=4 , num_layer=2,dropout=0.3):
        super().__init__()
        self.input_projection = nn.Linear(mel_bin,em_dim)
        encoder_layer = nn.TransformerEncoderLayer(d_model=em_dim,nhead=n_head,dropout=dropout,batch_first=True)
        self.transformer = nn.TransformerEncoder(encoder_layer,num_layer)
        self.classifier = nn.Linear(em_dim,num_labels)

    def forward(self,x):
        x = self.input_projection(x)
        x = self.transformer(x)
        logits = self.classifier(x)
        return logits  

In [12]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

In [17]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset


device = "cuda"

model = SpectrogramTransformer().to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Datasets
batch_size = 32
scaled_train_x = torch.tensor(scaled_train_x, dtype=torch.float32)
train_y = torch.tensor(train_y, dtype=torch.float32)
scaled_validation_x = torch.tensor(scaled_validation_x, dtype=torch.float32)
val_y = torch.tensor(val_y, dtype=torch.float32)

train_dataset = TensorDataset(scaled_train_x, train_y)
val_dataset = TensorDataset(scaled_validation_x, val_y)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

# Training loop
num_epochs = 4
from torch.nn.functional import sigmoid

for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0
    train_correct = 0
    train_total = 0

    for x_batch, y_batch in train_loader:
        x_batch = x_batch.to(device).float()
        y_batch = y_batch.to(device).float()

        optimizer.zero_grad()
        output = model(x_batch)  # raw logits
        loss = criterion(output, y_batch)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

        # Accuracy: apply sigmoid and threshold
        preds = (sigmoid(output) > 0.5).int()
        targets = y_batch.int()
        train_correct += (preds == targets).sum().item()
        train_total += targets.numel()

    train_accuracy = train_correct / train_total

    # Validation
    model.eval()
    val_loss = 0.0
    val_correct = 0
    val_total = 0

    with torch.no_grad():
        for x_val, y_val in val_loader:
            x_val = x_val.to(device).float()
            y_val = y_val.to(device).float()

            val_output = model(x_val)
            val_loss += criterion(val_output, y_val).item()

            preds = (sigmoid(val_output) > 0.5).int()
            targets = y_val.int()
            val_correct += (preds == targets).sum().item()
            val_total += targets.numel()

    val_accuracy = val_correct / val_total

    print(f"Epoch {epoch+1}/{num_epochs} | "
          f"Train Loss: {train_loss/len(train_loader):.4f} | "
          f"Train Acc: {train_accuracy*100:.2f}% | "
          f"Val Loss: {val_loss/len(val_loader):.4f} | "
          f"Val Acc: {val_accuracy*100:.2f}%")

torch.save(model.state_dict(), 'spectrogram_transformer_weights.pth')
torch.save(model, 'spectrogram_transformer_full_model.pth')

  scaled_train_x = torch.tensor(scaled_train_x, dtype=torch.float32)
  train_y = torch.tensor(train_y, dtype=torch.float32)
  scaled_validation_x = torch.tensor(scaled_validation_x, dtype=torch.float32)
  val_y = torch.tensor(val_y, dtype=torch.float32)


Epoch 1/4 | Train Loss: 0.0724 | Train Acc: 98.01% | Val Loss: 0.0805 | Val Acc: 97.90%
Epoch 2/4 | Train Loss: 0.0639 | Train Acc: 98.13% | Val Loss: 0.0823 | Val Acc: 97.85%
Epoch 3/4 | Train Loss: 0.0605 | Train Acc: 98.20% | Val Loss: 0.0825 | Val Acc: 97.86%
Epoch 4/4 | Train Loss: 0.0584 | Train Acc: 98.24% | Val Loss: 0.0829 | Val Acc: 97.82%


In [15]:
scaled_test_x = torch.tensor(scaled_test_x, dtype=torch.float32)
test_y = torch.tensor(test_y, dtype=torch.float32)

test_dataset = TensorDataset(scaled_test_x, test_y)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

model.eval()

test_correct = 0
test_total = 0

with torch.no_grad():
    for x_test, y_test in test_loader:
        x_test = x_test.to(device).float()
        y_test = y_test.to(device).float()

        output = model(x_test)
        preds = (sigmoid(output) > 0.5).int()
        targets = y_test.int()

        test_correct += (preds == targets).sum().item()
        test_total += targets.numel()

test_accuracy = test_correct / test_total
print(f"Test Accuracy: {test_accuracy*100:.2f}%")

  scaled_test_x = torch.tensor(scaled_test_x, dtype=torch.float32)
  test_y = torch.tensor(test_y, dtype=torch.float32)


Test Accuracy: 97.81%
