In [1]:
import os
import torch
import torchaudio
import torchaudio.transforms as T
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

class BirdSongDataset(Dataset):
    def __init__(self, df, audio_dir, class_info, transform=None):
        self.df = df
        self.audio_dir = audio_dir
        self.class_info = class_info
        self.transform = transform

    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        filename = self.df.iloc[idx, 0]
        audio_path = os.path.join(self.audio_dir, filename)
        waveform, sample_rate = torchaudio.load(audio_path)
        
        # Getting the labels
        labels = self.df[self.df['filename'] == filename]
        target = torch.zeros(len(self.class_info))
        for _, label in labels.iterrows():
            class_name = label['class']
            target[self.class_info.index(class_name)] = 1.0

        if self.transform:
            waveform = self.transform(waveform)
        
        return waveform, target

# Load csv files
train_csv = pd.read_csv('data/train.csv')
class_info_csv = pd.read_csv('data/class_info.csv')
class_names = class_info_csv['class name'].tolist()

# Split data into train and validation sets
train_df, valid_df = train_test_split(train_csv, test_size=0.1, random_state=42)

# Define transformations (Mel Spectrogram Transformation)
transform = T.MelSpectrogram(sample_rate=44100, n_fft=1024, hop_length=512, n_mels=64)

# Define Dataset objects
train_dataset = BirdSongDataset(train_df, 'data/train/', class_names, transform=transform)
valid_dataset = BirdSongDataset(valid_df, 'data/train/', class_names, transform=transform)

# Define DataLoader objects
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=32, shuffle=False)


In [2]:
from torch.nn.utils.rnn import pad_sequence

# Find the global max length of waveforms in the dataset
global_max_len = max([wf.shape[2] for wf, _ in train_dataset] + [wf.shape[2] for wf, _ in valid_dataset])

def collate_fn(batch):
    # A batch is a list of (waveform, target) pairs
    waveforms, targets = zip(*batch)
    
    # Manually pad the waveforms to the global max length
    padded_waveforms = []
    for wf in waveforms:
        pad_len = global_max_len - wf.shape[2]
        padded_wf = torch.cat([wf, torch.zeros((1, wf.shape[1], pad_len))], axis=2)
        padded_waveforms.append(padded_wf)
        
    waveforms = torch.stack(padded_waveforms)
    targets = torch.stack(targets)
    
    return waveforms, targets

# Update DataLoader objects with the new collate_fn
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
valid_loader = DataLoader(valid_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim

class SimpleCNN(nn.Module):
    def __init__(self, num_classes):
        super(SimpleCNN, self).__init__()
        self.conv1 = nn.Conv2d(1, 16, kernel_size=3, stride=1, padding=1)
        self.relu = nn.ReLU()
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1)
        self.fc1 = nn.Linear(32 * 16 * 108, 128)
        self.fc2 = nn.Linear(128, num_classes)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x):
        x = self.pool(self.relu(self.conv1(x)))
        x = self.pool(self.relu(self.conv2(x)))
        x = x.view(x.size(0), 32 * 16 * 108)
        x = self.relu(self.fc1(x))
        x = self.fc2(x)
        x = self.sigmoid(x)
        return x

In [4]:
# Set up the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Initialize the model
model = SimpleCNN(num_classes=len(class_names)).to(device)


In [5]:
import numpy as np


def find_size(model):
    dummy_x = torch.randn(1, 1, 64, 432).to(device)  # Assuming you have 1080 time frames
    dummy_out = model.pool(model.conv2(model.pool(model.conv1(dummy_x))))
    return int(np.prod(dummy_out.size()))

# Before training
feature_size = find_size(model)
print(feature_size)

55296


In [6]:


# Loss and Optimizer
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training Loop
num_epochs = 30
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for i, (inputs, labels) in enumerate(train_loader):
        # print(inputs.shape)
        inputs, labels = inputs.to(device), labels.to(device)
        
        # Zero the parameter gradients
        optimizer.zero_grad()
        
        # Forward + backward + optimize
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
        
    print(f"Epoch {epoch+1}, Loss: {running_loss / len(train_loader)}")

print('Finished Training')


Epoch 1, Loss: 0.14110128507018088
Epoch 2, Loss: 0.09452031412376807
Epoch 3, Loss: 0.08362709475824466
Epoch 4, Loss: 0.07759899737743231
Epoch 5, Loss: 0.08085088552907109
Epoch 6, Loss: 0.08101275307484544
Epoch 7, Loss: 0.08204633131479988
Epoch 8, Loss: 0.08742077085547723
Epoch 9, Loss: 0.07985209534526802
Epoch 10, Loss: 0.08020822957262401
Epoch 11, Loss: 0.06367546010741394
Epoch 12, Loss: 0.0600907236086921
Epoch 13, Loss: 0.07281116678954837
Epoch 14, Loss: 0.06429234598046885
Epoch 15, Loss: 0.08175168961035804
Epoch 16, Loss: 0.07924278858117759
Epoch 17, Loss: 0.08586080096518764
Epoch 18, Loss: 0.07916261679718277
Epoch 19, Loss: 0.0981386755933412
Epoch 20, Loss: 0.09205889044365344
Epoch 21, Loss: 0.1145830626908439
Epoch 22, Loss: 0.10401691607235429
Epoch 23, Loss: 0.08338128556004869
Epoch 24, Loss: 0.08272552966193941
Epoch 25, Loss: 0.08610309978606752
Epoch 26, Loss: 0.08176129769686108
Epoch 27, Loss: 0.06985028434920913
Epoch 28, Loss: 0.07558340201906573
Epoc

In [7]:
from sklearn.metrics import f1_score

model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for inputs, labels in valid_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        preds = outputs.round()  # Convert to binary: 0 or 1
        
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

f1_macro = f1_score(all_labels, all_preds, average='macro')
print(f"F1 Score (Macro): {f1_macro}")


F1 Score (Macro): 0.5344448031212603


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
