In [1]:
import numpy as np
import pandas as pd
import os 

In [2]:
categories = ['Airplane', 'Alarm', 'Beep/Bleep', 'Bell', 'Bicycle', 'Bird Chirp', 'Bus', 'Car', 'Cat Meow',
        'Chainsaw', 'Clapping', 'Cough', 'Cow Moo', 'Cowbell', 'Crying', 'Dog Bark', 'Doorbell', 'Drip',
        'Drums', 'Fire', 'Footsteps', 'Guitar', 'Hammer', 'Helicopter', 'Hiccup', 'Horn Honk', 'Horse Neigh',
        'Insect Buzz', 'Jackhammer', 'Laughter', 'Lawn Mower', 'Motorcycle', 'Piano', 'Pig Oink', 'Power Drill',
        'Power Saw', 'Rain', 'Rooster Crow', 'Saxophone', 'Sewing Machine', 'Sheep/Goat Bleat', 'Ship/Boat',
        'Shout', 'Singing', 'Siren', 'Sneeze', 'Snoring', 'Speech', 'Stream/River', 'Thunder', 'Train', 'Truck',
        'Trumpet', 'Vacuum Cleaner', 'Violin', 'Washing Machine', 'Waves', 'Wind']

In [3]:
ann_df = pd.read_csv(
    'annotations.csv'
)
meta_df = pd.read_csv(
    'metadata.csv'
)
ann_df.shape


(27552, 9)

In [4]:
train_filename = meta_df.sample(len(meta_df),random_state=42)['filename'].unique()[:int(len(meta_df)*0.7)]
validation_filename = meta_df.sample(len(meta_df),random_state=42)['filename'].unique()[int(len(meta_df)*0.7):int(len(meta_df)*0.9)]
test_filename = meta_df.sample(len(meta_df),random_state=42)['filename'].unique()[int(len(meta_df)*0.9):len(meta_df)]




In [5]:
def aggregate_labels(file_labels):
    __y = []
    for frame_labels in file_labels:
        if(sum(frame_labels) == 0):
            __y.append([0])
        elif(np.count_nonzero(frame_labels) == len(frame_labels)):
             __y.append([1])
        else: #The annotators don't agree on the label
            __y.append([np.random.choice(frame_labels)])
    return __y

In [6]:
features_dir = "audio_features"
labels_dir = "labels"

import itertools
def read_files(file_names):
    X_train = []
    Y_train = {c: [] for c in categories}  # Initialize label dictionary
    
    for f in file_names:
        # Load melspectrogram and MFCC features
        features_mel = np.load(os.path.join(features_dir, f.split('.')[0] + '.npz'))["melspectrogram"]
        features_mfcc = np.load(os.path.join(features_dir, f.split('.')[0] + '.npz'))["mfcc"]
        
        
        # Pad or truncate all features to have the same number of timesteps
        min_length = min(features_mel.shape[0], features_mfcc.shape[0])
        features_mel = features_mel[:min_length]
        features_mfcc = features_mfcc[:min_length]
        
        # Concatenate features along feature axis (axis=1)
        features = np.concatenate([features_mel, features_mfcc], axis=1)
        X_train.append(features)
        
        # Load and process labels
        y = np.load(os.path.join(labels_dir, f.split('.')[0] + '_labels.npz'))
        for c in categories:
            _y = aggregate_labels(y[c])
            Y_train[c].extend(list(itertools.chain.from_iterable(_y))[:min_length])  # Match length
    
    X_train = np.concatenate(X_train)
    return X_train, Y_train

In [7]:
from sklearn.preprocessing import StandardScaler
def preprocessing(array):
    s=StandardScaler()
    scaled_array = s.fit_transform(array)
    scalar =s.fit(array)
    return scaled_array,scalar.mean_,scalar.scale_

In [8]:
train_x,train_y = read_files(train_filename)
val_x, val_y = read_files(validation_filename)
test_x, test_y = read_files(test_filename)

In [9]:
scaled_train_x,mu,std =preprocessing(train_x)
scaled_validation_x = (val_x - mu)/std
scaled_test_x = (test_x - mu)/std

In [10]:
train_y = np.array([train_y[cls] for cls in categories]).T
val_y = np.array([val_y[cls] for cls in categories]).T
test_y = np.array([test_y[cls] for cls in categories]).T
train_y.shape

(1078243, 58)

In [11]:
import torch
import torch.nn as nn

class BiLSTMAED(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_classes, num_layers=2, dropout=0.3):
        super(BiLSTMAED, self).__init__()
        self.bilstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
            dropout=dropout
        )
        self.classifier = nn.Sequential(
            nn.Linear(hidden_dim * 2, num_classes)
        )

    def forward(self, x):
        lstm_out, _ = self.bilstm(x)  
        out = self.classifier(lstm_out)  
        return out

In [12]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset


device = "cuda"

model = BiLSTMAED(input_dim=96, hidden_dim=128, num_classes=58).to(device)
criterion = nn.BCEWithLogitsLoss()

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
print(model)

scaled_train_x = torch.tensor(scaled_train_x, dtype=torch.float32)
train_y = torch.tensor(train_y, dtype=torch.float32)
val_x = torch.tensor(val_x, dtype=torch.float32)
val_y = torch.tensor(val_y, dtype=torch.float32)



# Datasets
batch_size = 32
train_dataset = TensorDataset(scaled_train_x, train_y)
val_dataset = TensorDataset(val_x, val_y)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

# Training loop
num_epochs = 4
for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0
    train_correct = 0
    train_total = 0
    
    for x_batch, y_batch in train_loader:
        x_batch = x_batch.to(device).float()
        y_batch = y_batch.to(device).float()

        optimizer.zero_grad()
        output = model(x_batch)  # (B, T, C)
        loss = criterion(output, y_batch)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        
        # Train accuracy calculation
        train_predictions = (output > 0.5).int()
        train_correct += (train_predictions == y_batch.int()).sum().item()
        train_total += y_batch.numel()

    train_accuracy = train_correct / train_total

    # Validation
    model.eval()
    val_loss = 0.0
    val_correct = 0
    val_total = 0

    with torch.no_grad():
        for x_val, y_val in val_loader:
            x_val = x_val.to(device).float()
            y_val = y_val.to(device).float()

            val_output = model(x_val)
            val_loss += criterion(val_output, y_val).item()
            
            # Validation accuracy calculation
            val_predictions = (val_output > 0.5).int()
            val_correct += (val_predictions == y_val.int()).sum().item()
            val_total += y_val.numel()

    val_accuracy = val_correct / val_total

    print(f"Epoch {epoch+1}/{num_epochs} | Train Loss: {train_loss/len(train_loader):.4f} | "
          f"Train Accuracy: {train_accuracy * 100:.2f}% | "
          f"Val Loss: {val_loss/len(val_loader):.4f} | "
          f"Val Accuracy: {val_accuracy * 100:.2f}%")
torch.save(model.state_dict(), "BiLSTM.pth")

BiLSTMAED(
  (bilstm): LSTM(96, 128, num_layers=2, batch_first=True, dropout=0.3, bidirectional=True)
  (classifier): Sequential(
    (0): Linear(in_features=256, out_features=58, bias=True)
  )
)
Epoch 1/4 | Train Loss: 0.0781 | Train Accuracy: 97.96% | Val Loss: 0.0781 | Val Accuracy: 97.96%
Epoch 2/4 | Train Loss: 0.0704 | Train Accuracy: 98.01% | Val Loss: 0.0778 | Val Accuracy: 97.97%
Epoch 3/4 | Train Loss: 0.0677 | Train Accuracy: 98.05% | Val Loss: 0.0779 | Val Accuracy: 97.96%
Epoch 4/4 | Train Loss: 0.0661 | Train Accuracy: 98.07% | Val Loss: 0.0784 | Val Accuracy: 97.97%


In [13]:
# Test set evaluation
test_x = torch.tensor(test_x, dtype=torch.float32)
test_y = torch.tensor(test_y, dtype=torch.float32)
test_dataset = TensorDataset(test_x, test_y)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

model.eval()
test_loss = 0.0
test_correct = 0
test_total = 0

with torch.no_grad():
    for x_test, y_test in test_loader:
        x_test = x_test.to(device).float()
        y_test = y_test.to(device).float()

        test_output = model(x_test)
        test_loss += criterion(test_output, y_test).item()
        
        # Test accuracy calculation
        test_predictions = (test_output > 0.5).int()
        test_correct += (test_predictions == y_test.int()).sum().item()
        test_total += y_test.numel()

test_accuracy = test_correct / test_total
print(f"Test Loss: {test_loss/len(test_loader):.4f} | "
      f"Test Accuracy: {test_accuracy * 100:.2f}%")


Test Loss: 0.0811 | Test Accuracy: 97.93%


In [14]:
# Test set evaluation
test_x = torch.tensor(test_x, dtype=torch.float32)
test_y = torch.tensor(test_y, dtype=torch.float32)
test_dataset = TensorDataset(test_x, test_y)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

model.eval()
all_test_labels = []
all_test_predictions = []

with torch.no_grad():
    for x_test, y_test in test_loader:
        x_test = x_test.to(device).float()
        y_test = y_test.to(device).float()

        test_output = model(x_test)
        test_predictions = (test_output > 0.5).int().cpu().numpy()  # Convert to binary predictions
        
        all_test_labels.extend(y_test.cpu().numpy())     # Store true labels
        all_test_predictions.extend(test_predictions)    # Store predicted labels

# Convert lists to numpy arrays for further calculations
import numpy as np
all_test_labels = np.array(all_test_labels)
all_test_predictions = np.array(all_test_predictions)



  test_x = torch.tensor(test_x, dtype=torch.float32)
  test_y = torch.tensor(test_y, dtype=torch.float32)


In [15]:
# Flatten the arrays for calculation
labels_flat = all_test_labels.flatten()
preds_flat = all_test_predictions.flatten()

# Calculate TP, FP, FN, TN
TP = np.sum((labels_flat == 1) & (preds_flat == 1))
FP = np.sum((labels_flat == 0) & (preds_flat == 1))
FN = np.sum((labels_flat == 1) & (preds_flat == 0))
TN = np.sum((labels_flat == 0) & (preds_flat == 0))

# Accuracy Calculation
accuracy = (TP + TN) / (TP + FP + FN + TN)

# Precision Calculation
precision = TP / (TP + FP) if (TP + FP) > 0 else 0.0

# Recall Calculation
recall = TP / (TP + FN) if (TP + FN) > 0 else 0.0

sensitivity = TP / (TP + FN)
specificity = TN / (TN + FP)
balanced_accuracy = (sensitivity + specificity) / 2
average_balanced_accuracy = balanced_accuracy.mean()

# F1-Score Calculation
f1_score = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0

print(f"Accuracy:          {accuracy:.4f}")
print(f"Precision:         {precision:.4f}")
print(f"Recall:            {recall:.4f}")
print(f"F1 Score:          {f1_score:.4f}")
print(f"average_balanced_accuracy:          {average_balanced_accuracy:.4f}")

Accuracy:          0.9794
Precision:         0.6515
Recall:            0.0741
F1 Score:          0.1331
average_balanced_accuracy:          0.5366
