In [1]:
import os
import numpy as np
# Data Preparation
def load_data(spectrogram_dir, label_dir):
    spectrograms = []
    labels = []

    # Load spectrograms
    spectrogram_files = sorted(os.listdir(spectrogram_dir))
    label_files = sorted(os.listdir(label_dir))

    for spec_file, label_file in zip(spectrogram_files, label_files):
        spectrogram = np.load(os.path.join(spectrogram_dir, spec_file))
        label = np.load(os.path.join(label_dir, label_file)).T 

        # Pad or trim labels
        T_spec = spectrogram.shape[0]
        T_label = label.shape[0]
        if T_label < T_spec:
            diff = T_spec - T_label
            label = np.pad(label, ((0, diff), (0, 0)), mode='constant')
        elif T_label > T_spec:
            label = label[:T_spec]

        spectrograms.append(spectrogram)
        labels.append(label)

    # Convert to numpy arrays
    X = np.vstack(spectrograms)
    Y = np.vstack(labels)

    return X, Y
X_train, Y_train = load_data("spectrograms_train", "timestamps_train")
X_train = np.expand_dims(X_train, axis=-1)
print("Final Shapes:")
print("X_train:", X_train.shape, "Y_train:", Y_train.shape)

Final Shapes:
X_train: (99104, 128, 173, 1) Y_train: (99104, 129)


In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# CNN Model
class InstrumentCNN(nn.Module):
    def __init__(self):
        super(InstrumentCNN, self).__init__()
        
        self.conv_layers = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=(3,3), padding=1),  # (128, 173) -> (128, 173, 32)
            nn.ReLU(),
            nn.BatchNorm2d(32),
            nn.MaxPool2d((2,2)),  # (128, 173) -> (64, 86, 32)

            nn.Conv2d(32, 64, kernel_size=(3,3), padding=1),
            nn.ReLU(),
            nn.BatchNorm2d(64),
            nn.MaxPool2d((2,2)),  # (64, 86) -> (32, 43, 64)

            nn.Conv2d(64, 128, kernel_size=(3,3), padding=1),
            nn.ReLU(),
            nn.BatchNorm2d(128),
            nn.MaxPool2d((2,2))   # (32, 43) -> (16, 21, 128)
        )

        self.fc_layers = nn.Sequential(
            nn.Flatten(),  
            nn.Linear(16 * 21 * 128, 512),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(512, 129),  # Output 129 classes
            nn.Sigmoid()  # Multi-label classification (each instrument independent)
        )

    def forward(self, x):
        x = self.conv_layers(x)
        x = self.fc_layers(x)
        return x

# Initialize Model
model = InstrumentCNN()
print(model)

InstrumentCNN(
  (conv_layers): Sequential(
    (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU()
    (2): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
    (4): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (5): ReLU()
    (6): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (7): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
    (8): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): ReLU()
    (10): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (11): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
  )
  (fc_layers): Sequential(
    (0): Flatten(start_dim=1, end_dim=-1)
    (1): Linear(in_features=43008, out_features=512, bias=

In [3]:
# Convert data to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32).permute(0, 3, 1, 2)  # (N, 1, 128, 173)
Y_train_tensor = torch.tensor(Y_train, dtype=torch.float32)  # (N, 129)

# Create DataLoader
batch_size = 4
train_dataset = TensorDataset(X_train_tensor, Y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Loss and Optimizer
criterion = nn.BCELoss()  # Binary Cross Entropy for multi-label classification
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training Loop
num_epochs = 1
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


InstrumentCNN(
  (conv_layers): Sequential(
    (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU()
    (2): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
    (4): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (5): ReLU()
    (6): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (7): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
    (8): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): ReLU()
    (10): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (11): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
  )
  (fc_layers): Sequential(
    (0): Flatten(start_dim=1, end_dim=-1)
    (1): Linear(in_features=43008, out_features=512, bias=

In [4]:
for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for spectrograms, labels in train_loader:
        spectrograms, labels = spectrograms.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(spectrograms)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss/len(train_loader):.4f}")

print("Training Complete!")

Epoch [1/1], Loss: 0.1188
Training Complete!


In [5]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import torch

# Load Validation Data
X_val, Y_val = load_data("spectrograms_validation", "timestamps_validation")
X_val = np.expand_dims(X_val, axis=-1)  # Add channel dimension

# Convert to PyTorch tensors
X_val_tensor = torch.tensor(X_val, dtype=torch.float32).permute(0, 3, 1, 2)  # (N, 1, 128, 173)
Y_val_tensor = torch.tensor(Y_val, dtype=torch.float32)  # (N, 129)

# Move tensors to device (GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
X_val_tensor, Y_val_tensor = X_val_tensor.to(device), Y_val_tensor.to(device)

# Set model to evaluation mode
model.eval()

# Disable gradient calculations
with torch.no_grad():
    outputs = model(X_val_tensor)
    predictions = (outputs > 0.5).float()  # Convert probabilities to binary labels

# Convert tensors to numpy for sklearn metrics
Y_true = Y_val_tensor.cpu().numpy()
Y_pred = predictions.cpu().numpy()

# Compute evaluation metrics
accuracy = accuracy_score(Y_true.flatten(), Y_pred.flatten())  # Binary accuracy
precision = precision_score(Y_true, Y_pred, average='macro', zero_division=0)
recall = recall_score(Y_true, Y_pred, average='macro', zero_division=0)
f1 = f1_score(Y_true, Y_pred, average='macro', zero_division=0)

# Print results
print("Model Evaluation on Validation Set:")
print(f"Binary Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

OutOfMemoryError: CUDA out of memory. Tried to allocate 55.34 GiB. GPU 0 has a total capacity of 4.00 GiB of which 1.01 GiB is free. Of the allocated memory 2.10 GiB is allocated by PyTorch, and 89.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)