In [15]:
import torch
import torchaudio
import torchaudio.transforms as T
import numpy as np
import scipy.signal
import cv2
import torch.nn as nn

class AudioPreprocessing(nn.Module):
    def __init__(self, sample_rate=22050, n_fft=512, win_length=512, hop_length=int(512 * 0.75)):
        super(AudioPreprocessing, self).__init__()
        self.resampler = T.Resample(orig_freq=44100, new_freq=sample_rate)
        self.spectrogram = T.Spectrogram(n_fft=n_fft, win_length=win_length, hop_length=hop_length, power=None, window_fn=torch.hann_window)
        self.sample_rate = sample_rate
        self.n_fft = n_fft
        self.win_length = win_length
        self.hop_length = hop_length

    def __call__(self, waveform):
        # 1. Resample to 22050Hz
        waveform = self.resampler(waveform)

        # 2. Apply STFT
        spectrogram = self.spectrogram(waveform)

        # 3. Normalize
        spectrogram = torch.abs(spectrogram)
        max_val = spectrogram.max()
        if max_val:
            spectrogram /= max_val

        # 4. Median Clipping
        freq_median = torch.median(spectrogram, dim=2, keepdim=True)[0]
        time_median = torch.median(spectrogram, dim=1, keepdim=True)[0]

        mask = (spectrogram > (3 * freq_median)) & (spectrogram > (3 * time_median))
        spectrogram = torch.where(mask, torch.tensor(1.0).to(spectrogram.device), torch.tensor(0.0).to(spectrogram.device))

        # 5. Image processing techniques
        # Convert to numpy for OpenCV processing
        img = spectrogram.squeeze(0).cpu().numpy()
        kernel = np.ones((3,3),np.uint8)

        # Closing & Dilation
        img = cv2.morphologyEx(img, cv2.MORPH_CLOSE, kernel)
        img = cv2.dilate(img, kernel, iterations=1)

        # Median Filter & Remove small objects
        img = cv2.medianBlur(img, 3)
        num_labels, labels, stats, centroids = cv2.connectedComponentsWithStats(img.astype(np.uint8), 4, cv2.CV_32S)
        for i in range(1, num_labels):
            if stats[i][-1] < 100:  # Change this threshold according to your needs
                img[labels == i] = 0

        # Convert back to torch tensor
        spectrogram = torch.tensor(img).float().unsqueeze(0)

        return spectrogram

In [16]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [17]:
import os
import torch
import torchaudio
import torchaudio.transforms as T
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import torchaudio.transforms as transforms
import torch.nn as nn

class BirdSongDataset(Dataset):
    def __init__(self, df, audio_dir, class_info, transform=None):
        self.df = df
        self.audio_dir = audio_dir
        self.class_info = class_info
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        filename = self.df.iloc[idx, 0]
        audio_path = os.path.join(self.audio_dir, filename)
        waveform, sample_rate = torchaudio.load(audio_path)

        # Getting the labels
        labels = self.df[self.df['filename'] == filename]
        target = torch.zeros(len(self.class_info))
        for _, label in labels.iterrows():
            class_name = label['class']
            target[self.class_info.index(class_name)] = 1.0

        if self.transform:
            waveform = self.transform(waveform)

        return waveform, target

# Load csv files
train_csv = pd.read_csv('/content/drive/MyDrive/DeepLearning/data/train.csv')
class_info_csv = pd.read_csv('/content/drive/MyDrive/DeepLearning/data/class_info.csv')
class_names = class_info_csv['class name'].tolist()

# Split data into train and validation sets
train_df, valid_df = train_test_split(train_csv, test_size=0.1, random_state=42)

# Chain transformations using nn.Sequential
transform = nn.Sequential(
    AudioPreprocessing()
)

train_dataset = BirdSongDataset(train_df, '/content/drive/MyDrive/DeepLearning/data/train/', class_names, transform=transform)
valid_dataset = BirdSongDataset(valid_df, '/content/drive/MyDrive/DeepLearning/data/train/', class_names, transform=transform)


# Define DataLoader objects
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=32, shuffle=False)


In [18]:
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

# Calculate the global max length of waveforms in the dataset
global_max_len = max(max(wf.shape[2] for wf, _ in dataset) for dataset in [train_dataset, valid_dataset])

def collate_fn(batch):
    # A batch is a list of (waveform, target) pairs
    waveforms, targets = zip(*batch)

    # Pad the waveforms to the global max length
    waveforms = [torch.cat([wf, torch.zeros((1, wf.shape[1], global_max_len - wf.shape[2]))], dim=2) for wf in waveforms]

    waveforms = torch.stack(waveforms)
    targets = torch.stack(targets)

    return waveforms, targets

# Update DataLoader objects with the new collate_fn
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn, num_workers=2)
valid_loader = DataLoader(valid_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn, num_workers=2)


In [40]:
import torch
import torch.nn as nn
import torch.optim as optim

class SimpleCNN(nn.Module):
    def __init__(self, num_classes):
        super(SimpleCNN, self).__init__()
        self.conv1 = nn.Conv2d(1, 16, kernel_size=3, stride=1, padding=1)
        self.relu = nn.ReLU()
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1)
        self.fc1 = nn.Linear(32 * 64 * 72, 128)
        self.fc2 = nn.Linear(128, num_classes)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.pool(self.relu(self.conv1(x)))
        x = self.pool(self.relu(self.conv2(x)))
        #print(x.shape)
        x = x.view(x.size(0), 32 * 64 * 72)
        x = self.relu(self.fc1(x))
        x = self.fc2(x)
        x = self.sigmoid(x)
        return x

In [41]:
# Set up the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Initialize the model
model = SimpleCNN(num_classes=len(class_names)).to(device)


In [42]:
# Loss and Optimizer
optimizer = optim.Adam(model.parameters(), lr=0.001)

class SmoothBCELoss(nn.Module):
    def __init__(self, smoothing=0.1):
        super(SmoothBCELoss, self).__init__()
        self.smoothing = smoothing

    def forward(self, pred, target):
        target = target * (1.0 - self.smoothing) + 0.5 * self.smoothing
        loss = nn.BCELoss()(pred, target)
        return loss


criterion = SmoothBCELoss(smoothing=0.1)


num_epochs = 30
for epoch in range(num_epochs):
    # Training
    model.train()
    running_train_loss = 0.0
    for i, (inputs, labels) in enumerate(train_loader):
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()

        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_train_loss += loss.item()

    train_loss = running_train_loss / len(train_loader)

    # Validation
    model.eval()
    running_val_loss = 0.0
    with torch.no_grad():
        for inputs, labels in valid_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            outputs = model(inputs)
            loss = criterion(outputs, labels)

            running_val_loss += loss.item()

    val_loss = running_val_loss / len(valid_loader)

    print(f"Epoch {epoch+1}, Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}")

print('Finished Training')

Epoch 1, Train Loss: 0.2410, Validation Loss: 0.2283
Epoch 2, Train Loss: 0.2141, Validation Loss: 0.2317
Epoch 3, Train Loss: 0.2108, Validation Loss: 0.2282
Epoch 4, Train Loss: 0.2091, Validation Loss: 0.2295
Epoch 5, Train Loss: 0.2085, Validation Loss: 0.2280
Epoch 6, Train Loss: 0.2080, Validation Loss: 0.2296
Epoch 7, Train Loss: 0.2074, Validation Loss: 0.2291
Epoch 8, Train Loss: 0.2071, Validation Loss: 0.2299
Epoch 9, Train Loss: 0.2073, Validation Loss: 0.2286
Epoch 10, Train Loss: 0.2068, Validation Loss: 0.2293
Epoch 11, Train Loss: 0.2067, Validation Loss: 0.2287
Epoch 12, Train Loss: 0.2066, Validation Loss: 0.2290
Epoch 13, Train Loss: 0.2063, Validation Loss: 0.2301
Epoch 14, Train Loss: 0.2063, Validation Loss: 0.2312
Epoch 15, Train Loss: 0.2063, Validation Loss: 0.2295
Epoch 16, Train Loss: 0.2064, Validation Loss: 0.2302
Epoch 17, Train Loss: 0.2063, Validation Loss: 0.2295
Epoch 18, Train Loss: 0.2061, Validation Loss: 0.2284
Epoch 19, Train Loss: 0.2061, Validat

In [43]:
from sklearn.metrics import f1_score

model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for inputs, labels in valid_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        preds = outputs.round()  # Convert to binary: 0 or 1

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

f1_macro = f1_score(all_labels, all_preds, average='samples')
print(f"F1 Score (Samples): {f1_macro}")


F1 Score (Samples): 0.6620396306670816


In [44]:
def test_collate_fn(batch):
    # A batch is a list of (waveform, filename) pairs for the test dataset
    waveforms, filenames = zip(*batch)

    # Pad the waveforms to the global max length
    waveforms = [torch.cat([wf, torch.zeros((1, wf.shape[1], global_max_len - wf.shape[2]))], dim=2) for wf in waveforms]

    waveforms = torch.stack(waveforms)

    return waveforms, filenames


# Loading the test set
class BirdSongTestDataset(Dataset):
    def __init__(self, df, audio_dir, transform=None):
        self.df = df
        self.audio_dir = audio_dir
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        filename = self.df.iloc[idx, 0]
        audio_path = os.path.join(self.audio_dir, filename)
        waveform, sample_rate = torchaudio.load(audio_path)

        if self.transform:
            waveform = self.transform(waveform)

        return waveform, filename

test_csv = pd.read_csv('/content/drive/MyDrive/DeepLearning/data/test.csv')
test_dataset = BirdSongTestDataset(test_csv, '/content/drive/MyDrive/DeepLearning/data/test/', transform=transform)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=test_collate_fn, num_workers=2)

# Make predictions on test set
model.eval()
predictions = {}
with torch.no_grad():
    for inputs, filenames in test_loader:
        inputs = inputs.to(device)
        outputs = model(inputs)
        preds = outputs.round()  # Convert to binary: 0 or 1
        for fname, pred in zip(filenames, preds):
            predictions[fname] = pred.cpu().numpy()

# Convert predictions to submission format
submission_df = pd.DataFrame.from_dict(predictions, orient='index', columns=class_names)
submission_df.reset_index(inplace=True)
submission_df.rename(columns={'index': 'filename'}, inplace=True)
submission_df.to_csv('/content/drive/MyDrive/DeepLearning/submission.csv', index=False)
