In [16]:
import os
import pandas as pd
import torch
import torchaudio
import torchaudio.transforms as T

class_info_df = pd.read_csv("data/class_info.csv")
class_to_index = {c: i for i, c in enumerate(class_info_df['class name'].values)}


class AudioDataset(torch.utils.data.Dataset):
    def __init__(self, csv_file, root_dir, class_map, transform=None):
        self.annotations = pd.read_csv(csv_file)
        self.root_dir = root_dir
        self.transform = transform
        self.class_map = class_map

    def __len__(self):
        return len(self.annotations)

    def __getitem__(self, index):
        audio_name = os.path.join(self.root_dir, self.annotations.iloc[index, 0])
        waveform, sample_rate = torchaudio.load(audio_name)

        # Transform waveform if provided
        if self.transform:
            waveform = self.transform(waveform)

        # Get the labels from the CSV and convert to multi-hot tensor
        labels = torch.zeros(len(self.class_map))
        for i in range(1, len(self.annotations.columns), 3):  # step by 3 because of filename, start, end, class
            bird_class = self.annotations.iloc[index, i + 2]
            if bird_class in self.class_map:
                labels[self.class_map[bird_class]] = 1

        return waveform, labels

# Define transform to convert audio waveform to MFCC features
transform = T.MFCC(
    sample_rate=44100, 
    n_mfcc=13,
    melkwargs={"n_fft": 400, "hop_length": 160, "n_mels": 23, "center": False}
)

train_dataset = AudioDataset(csv_file="data/train.csv", root_dir="data/train", class_map=class_to_index, transform=transform)
test_dataset = AudioDataset(csv_file="data/test.csv", root_dir="data/test", class_map=class_to_index, transform=transform)


In [17]:
# Data Augmentation

In [18]:
import torch.nn as nn

class BirdNet(nn.Module):
    def __init__(self, num_classes):
        super(BirdNet, self).__init__()

        self.conv1 = nn.Conv2d(1, 16, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1)
        self.fc1 = nn.Linear(288, 512) # Adjusted based on the output shape
        self.fc2 = nn.Linear(512, num_classes)
        self.dropout = nn.Dropout(0.5)
        self.relu = nn.ReLU()
        self.maxpool = nn.MaxPool2d(2)

    def forward(self, x):
        x = self.relu(self.conv1(x))
        x = self.maxpool(x)
        x = self.relu(self.conv2(x))
        x = self.maxpool(x)
        x = x.view(x.size(0), -1)  # flatten
        x = self.dropout(self.relu(self.fc1(x)))
        x = self.fc2(x)
        return x

num_classes = len(train_dataset[0][1])
model = BirdNet(num_classes)


In [20]:
import torch.optim as optim

# Define loss and optimizer
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
epochs = 10
for epoch in range(epochs):
    model.train()
    for waveforms, labels in train_dataset:
        optimizer.zero_grad()
        outputs = model(waveforms)
        loss = criterion(outputs, labels.float())
        loss.backward()
        optimizer.step()

    # Optional: add validation here

print("Training complete!")


RuntimeError: mat1 and mat2 shapes cannot be multiplied (32x1032 and 288x512)