In [17]:
import torch
from torch.utils.data import Dataset
import os
import numpy as np
import torchvision.transforms.functional as TF
from torch.utils.data import DataLoader
import torch.nn as nn
import torchvision.models as models
from transformers import ViTForImageClassification

!pip install librosa --quiet

import os
import librosa
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import numpy as np
import os



In [18]:
import os

# List all datasets mounted in Kaggle input directory
print("Available datasets under /kaggle/input:")
print(os.listdir("/kaggle/input/bird-dataset/dataset"))


Available datasets under /kaggle/input:
['greani1', 'thbeup1', 'yebsee1', 'speowl1', 'strowl1', 'blbwre1', 'rufmot1', 'anhing', 'babwar', 'srwswa1', 'eardov1', 'recwoo1', 'gybmar', 'y00678', 'chfmac1', 'neocor', 'yectyr1', 'thlsch3', 'gohman1', 'whfant1', 'purgal2', 'ywcpar', 'crcwoo1', 'whbman1', 'plbwoo1', 'bicwre1', 'ragmac1', 'secfly1', 'gretin1', 'colcha1', 'ruther1', '65448', 'whbant1', 'cocwoo1', 'rugdov', 'cargra1', 'brtpar1', '41663', 'snoegr', 'baymac', 'yecspi2', 'watjac1', 'bkcdon', 'greibi1', 'rtlhum', 'mastit1', 'pavpig2', 'amakin1', 'leagre', 'blcjay1', 'cattyr', 'grbhaw1', 'grnkin']


In [19]:
import os
import shutil
import random

def split_dataset(source_dir, output_dir, train_ratio=0.7, val_ratio=0.15, test_ratio=0.15):
    assert train_ratio + val_ratio + test_ratio == 1.0
    os.makedirs(output_dir, exist_ok=True)

    for species in os.listdir(source_dir):
        species_path = os.path.join(source_dir, species)
        if not os.path.isdir(species_path):
            continue

        files = os.listdir(species_path)
        random.shuffle(files)

        n = len(files)
        n_train = int(train_ratio * n)
        n_val = int(val_ratio * n)
        n_test = n - n_train - n_val

        splits = {
            "train": files[:n_train],
            "val": files[n_train:n_train + n_val],
            "test": files[n_train + n_val:]
        }

        for split_name, split_files in splits.items():
            split_dir = os.path.join(output_dir, split_name, species)
            os.makedirs(split_dir, exist_ok=True)
            for file in split_files:
                src = os.path.join(species_path, file)
                dst = os.path.join(split_dir, file)
                shutil.copy2(src, dst)

        print(f"{species}: {n_train} train, {n_val} val, {n_test} test")

# Run the function with Kaggle paths
split_dataset(
    source_dir="/kaggle/input/bird-dataset/dataset",
    output_dir="/kaggle/working/split-bird-dataset",
    train_ratio=0.7,
    val_ratio=0.15,
    test_ratio=0.15
)


greani1: 39 train, 8 val, 9 test
thbeup1: 53 train, 11 val, 12 test
yebsee1: 68 train, 14 val, 16 test
speowl1: 56 train, 12 val, 13 test
strowl1: 54 train, 11 val, 13 test
blbwre1: 46 train, 9 val, 11 test
rufmot1: 37 train, 8 val, 9 test
anhing: 44 train, 9 val, 11 test
babwar: 58 train, 12 val, 13 test
srwswa1: 55 train, 11 val, 13 test
eardov1: 49 train, 10 val, 12 test
recwoo1: 48 train, 10 val, 11 test
gybmar: 77 train, 16 val, 18 test
y00678: 81 train, 17 val, 18 test
chfmac1: 57 train, 12 val, 13 test
neocor: 44 train, 9 val, 10 test
yectyr1: 46 train, 10 val, 11 test
thlsch3: 36 train, 7 val, 9 test
gohman1: 47 train, 10 val, 11 test
whfant1: 45 train, 9 val, 11 test
purgal2: 77 train, 16 val, 17 test
ywcpar: 51 train, 11 val, 12 test
crcwoo1: 49 train, 10 val, 11 test
whbman1: 83 train, 17 val, 19 test
plbwoo1: 78 train, 16 val, 18 test
bicwre1: 47 train, 10 val, 11 test
ragmac1: 42 train, 9 val, 9 test
secfly1: 82 train, 17 val, 19 test
gretin1: 39 train, 8 val, 9 test
colch

In [20]:
def audio_to_mel(file_path, sr=48000, duration=3, n_mels=64, hop_length=128, n_fft=512):
    y, _ = librosa.load(file_path, sr=sr)
    y = librosa.util.fix_length(y, size=sr * duration)  # pad or trim to 3 sec
    mel = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=n_fft,
                                         hop_length=hop_length, n_mels=n_mels, fmax=15000)
    mel_db = librosa.power_to_db(mel, ref=np.max)
    return mel_db  # Shape should be ~64 x 384


In [21]:
def generate_all_mels(input_dir, output_dir):
    for split in ['train', 'val', 'test']:
        input_split = os.path.join(input_dir, split)
        output_split = os.path.join(output_dir, split)
        os.makedirs(output_split, exist_ok=True)

        for species in tqdm(os.listdir(input_split), desc=f"Processing {split}"):
            input_species_path = os.path.join(input_split, species)
            output_species_path = os.path.join(output_split, species)
            os.makedirs(output_species_path, exist_ok=True)

            for file in os.listdir(input_species_path):
                if not file.endswith(".ogg"):
                    continue

                input_file_path = os.path.join(input_species_path, file)
                output_file_path = os.path.join(output_species_path, file.replace(".ogg", ".npy"))

                try:
                    mel = audio_to_mel(input_file_path)
                    np.save(output_file_path, mel)
                except Exception as e:
                    print(f"Error processing {file}: {e}")

generate_all_mels("/kaggle/working/split-bird-dataset", "/kaggle/working/MelSpectrograms")


Processing train: 100%|██████████| 53/53 [01:51<00:00,  2.10s/it]
Processing val: 100%|██████████| 53/53 [00:22<00:00,  2.33it/s]
Processing test: 100%|██████████| 53/53 [00:25<00:00,  2.06it/s]


In [22]:
base_dir = "/kaggle/working/MelSpectrograms"

train_dir = f'{base_dir}/train'
val_dir   = f'{base_dir}/val'
test_dir  = f'{base_dir}/test'


In [23]:

class ResizeNormalize:
    def __init__(self, size=(224, 224)):
        self.size = size

    def __call__(self, tensor):
        tensor = TF.resize(tensor, self.size)
        tensor = TF.normalize(tensor, mean=[0.5]*3, std=[0.5]*3)
        return tensor

class MelSpectrogramDataset(Dataset):
    def __init__(self, folder_path, class_map, transform=None):
        self.folder_path = folder_path
        self.transform = transform
        self.class_map = class_map  # dict: class_name -> label index

        self.files = []
        for label_name in os.listdir(folder_path):
            class_folder = os.path.join(folder_path, label_name)
            for fname in os.listdir(class_folder):
                if fname.endswith('.npy'):
                    self.files.append((os.path.join(class_folder, fname), self.class_map[label_name]))

    def __len__(self):
        return len(self.files)

    def __getitem__(self, idx):
        path, label = self.files[idx]
        mel = np.load(path)  # shape: [H, W]
        mel = torch.tensor(mel, dtype=torch.float32).unsqueeze(0).repeat(3, 1, 1)  # [3, H, W]
        if self.transform:
            mel = self.transform(mel)
        return mel, label


In [24]:
# Get class labels from subfolders of 'train'
class_names = sorted(os.listdir(train_dir))
class_map = {cls: idx for idx, cls in enumerate(class_names)}

print("Class Map:", class_map)


Class Map: {'41663': 0, '65448': 1, 'amakin1': 2, 'anhing': 3, 'babwar': 4, 'baymac': 5, 'bicwre1': 6, 'bkcdon': 7, 'blbwre1': 8, 'blcjay1': 9, 'brtpar1': 10, 'cargra1': 11, 'cattyr': 12, 'chfmac1': 13, 'cocwoo1': 14, 'colcha1': 15, 'crcwoo1': 16, 'eardov1': 17, 'gohman1': 18, 'grbhaw1': 19, 'greani1': 20, 'greibi1': 21, 'gretin1': 22, 'grnkin': 23, 'gybmar': 24, 'leagre': 25, 'mastit1': 26, 'neocor': 27, 'pavpig2': 28, 'plbwoo1': 29, 'purgal2': 30, 'ragmac1': 31, 'recwoo1': 32, 'rtlhum': 33, 'rufmot1': 34, 'rugdov': 35, 'ruther1': 36, 'secfly1': 37, 'snoegr': 38, 'speowl1': 39, 'srwswa1': 40, 'strowl1': 41, 'thbeup1': 42, 'thlsch3': 43, 'watjac1': 44, 'whbant1': 45, 'whbman1': 46, 'whfant1': 47, 'y00678': 48, 'yebsee1': 49, 'yecspi2': 50, 'yectyr1': 51, 'ywcpar': 52}


In [25]:

transform = ResizeNormalize()

train_data = MelSpectrogramDataset(train_dir, class_map, transform)
val_data   = MelSpectrogramDataset(val_dir, class_map, transform)
test_data  = MelSpectrogramDataset(test_dir, class_map, transform)

train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
val_loader   = DataLoader(val_data, batch_size=32)
test_loader  = DataLoader(test_data, batch_size=32)


In [26]:


num_classes = len(class_map)

# ResNet-50
from torchvision.models import ResNet50_Weights
resnet50 = models.resnet50(weights=ResNet50_Weights.DEFAULT)
resnet50.fc = nn.Linear(resnet50.fc.in_features, num_classes)

# DenseNet-161
from torchvision.models import DenseNet161_Weights
densenet161 = models.densenet161(weights=DenseNet161_Weights.DEFAULT)
densenet161.classifier = nn.Linear(densenet161.classifier.in_features, num_classes)

# ViT-B/16
vit = ViTForImageClassification.from_pretrained(
    "google/vit-base-patch16-224-in21k",
    num_labels=num_classes,
    ignore_mismatched_sizes=True  # required if using custom classes
)


Downloading: "https://download.pytorch.org/models/resnet50-11ad3fa6.pth" to /root/.cache/torch/hub/checkpoints/resnet50-11ad3fa6.pth
100%|██████████| 97.8M/97.8M [00:00<00:00, 164MB/s] 
Downloading: "https://download.pytorch.org/models/densenet161-8d451a50.pth" to /root/.cache/torch/hub/checkpoints/densenet161-8d451a50.pth
100%|██████████| 110M/110M [00:01<00:00, 87.3MB/s] 


config.json:   0%|          | 0.00/502 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device) #cpu
resnet50 = resnet50.to(device)
densenet161 = densenet161.to(device)
vit = vit.to(device)

cuda


In [28]:
import torch
import torch.nn as nn
from tqdm import tqdm

def train_model(model, train_loader, val_loader, epochs=10, save_path="best_model.pth", lr=1e-4, patience=3):
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    best_val_acc = 0.0
    epochs_no_improve = 0

    for epoch in range(epochs):
        model.train()
        total_loss, total_correct = 0, 0

        for inputs, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}"):
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs) if not isinstance(model, ViTForImageClassification) else model(inputs).logits
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item() * inputs.size(0)
            total_correct += (outputs.argmax(1) == labels).sum().item()

        train_acc = total_correct / len(train_loader.dataset)
        avg_loss = total_loss / len(train_loader.dataset)
        print(f"Train Loss: {avg_loss:.4f} | Accuracy: {train_acc:.4f}")

        # Validate
        val_acc = evaluate_model(model, val_loader)

        # Early Stopping
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            epochs_no_improve = 0
            torch.save(model.state_dict(), save_path)
            print(f"✅ New best model saved with Val Accuracy: {val_acc:.4f}")
        else:
            epochs_no_improve += 1
            print(f"⚠️ No improvement for {epochs_no_improve} epoch(s).")

        if epochs_no_improve >= patience:
            print("🛑 Early stopping triggered.")
            break


In [29]:
def evaluate_model(model, data_loader):
    model.eval()
    correct = 0
    total = 0
    device = next(model.parameters()).device

    with torch.no_grad():
        for X, y in data_loader:
            X, y = X.to(device), y.to(device)
            outputs = model(X) if not isinstance(model, ViTForImageClassification) else model(X).logits
            preds = outputs.argmax(1)
            correct += (preds == y).sum().item()
            total += y.size(0)

    acc = correct / total if total > 0 else 0.0
    print(f"Validation Accuracy: {acc:.4f}")
    return acc


In [30]:
train_model(resnet50, train_loader, val_loader, epochs=15,save_path="best_model_resnet50")

Epoch 1/5: 100%|██████████| 97/97 [00:33<00:00,  2.92it/s]


Train Loss: 3.8792 | Accuracy: 0.0686
Validation Accuracy: 0.1553
✅ New best model saved with Val Accuracy: 0.1553


Epoch 2/5: 100%|██████████| 97/97 [00:32<00:00,  3.00it/s]


Train Loss: 2.9395 | Accuracy: 0.3108
Validation Accuracy: 0.3967
✅ New best model saved with Val Accuracy: 0.3967


Epoch 3/5: 100%|██████████| 97/97 [00:32<00:00,  2.94it/s]


Train Loss: 1.8229 | Accuracy: 0.5618
Validation Accuracy: 0.5395
✅ New best model saved with Val Accuracy: 0.5395


Epoch 4/5: 100%|██████████| 97/97 [00:33<00:00,  2.87it/s]


Train Loss: 1.0411 | Accuracy: 0.7704
Validation Accuracy: 0.5381
⚠️ No improvement for 1 epoch(s).


Epoch 5/5: 100%|██████████| 97/97 [00:34<00:00,  2.83it/s]


Train Loss: 0.5347 | Accuracy: 0.8952
Validation Accuracy: 0.5562
✅ New best model saved with Val Accuracy: 0.5562


In [31]:
evaluate_model(resnet50, test_loader)


Validation Accuracy: 0.5620


0.561963190184049

In [32]:
train_model(densenet161, train_loader, val_loader, epochs=10,save_path = "best_model_densenet161")

Epoch 1/10: 100%|██████████| 97/97 [01:11<00:00,  1.35it/s]


Train Loss: 3.5134 | Accuracy: 0.1746
Validation Accuracy: 0.3537
✅ New best model saved with Val Accuracy: 0.3537


Epoch 2/10: 100%|██████████| 97/97 [01:14<00:00,  1.31it/s]


Train Loss: 2.2869 | Accuracy: 0.4932
Validation Accuracy: 0.5479
✅ New best model saved with Val Accuracy: 0.5479


Epoch 3/10: 100%|██████████| 97/97 [01:14<00:00,  1.29it/s]


Train Loss: 1.4421 | Accuracy: 0.7063
Validation Accuracy: 0.6241
✅ New best model saved with Val Accuracy: 0.6241


Epoch 4/10: 100%|██████████| 97/97 [01:15<00:00,  1.29it/s]


Train Loss: 0.8118 | Accuracy: 0.8687
Validation Accuracy: 0.6477
✅ New best model saved with Val Accuracy: 0.6477


Epoch 5/10: 100%|██████████| 97/97 [01:14<00:00,  1.30it/s]


Train Loss: 0.3577 | Accuracy: 0.9576
Validation Accuracy: 0.6824
✅ New best model saved with Val Accuracy: 0.6824


Epoch 6/10: 100%|██████████| 97/97 [01:14<00:00,  1.30it/s]


Train Loss: 0.1566 | Accuracy: 0.9897
Validation Accuracy: 0.7115
✅ New best model saved with Val Accuracy: 0.7115


Epoch 7/10: 100%|██████████| 97/97 [01:14<00:00,  1.30it/s]


Train Loss: 0.0597 | Accuracy: 0.9994
Validation Accuracy: 0.7129
✅ New best model saved with Val Accuracy: 0.7129


Epoch 8/10: 100%|██████████| 97/97 [01:14<00:00,  1.30it/s]


Train Loss: 0.0300 | Accuracy: 1.0000
Validation Accuracy: 0.7129
⚠️ No improvement for 1 epoch(s).


Epoch 9/10: 100%|██████████| 97/97 [01:14<00:00,  1.30it/s]


Train Loss: 0.0186 | Accuracy: 1.0000
Validation Accuracy: 0.7254
✅ New best model saved with Val Accuracy: 0.7254


Epoch 10/10: 100%|██████████| 97/97 [01:14<00:00,  1.29it/s]


Train Loss: 0.0132 | Accuracy: 1.0000
Validation Accuracy: 0.7226
⚠️ No improvement for 1 epoch(s).


In [33]:
evaluate_model(densenet161, test_loader)


Validation Accuracy: 0.7227


0.7226993865030675

In [36]:
train_model(vit, train_loader, val_loader, epochs=10,save_path = "best_model_vit")

Epoch 1/10: 100%|██████████| 97/97 [01:39<00:00,  1.03s/it]


Train Loss: 1.0383 | Accuracy: 0.8862
Validation Accuracy: 0.4730
✅ New best model saved with Val Accuracy: 0.4730


Epoch 2/10: 100%|██████████| 97/97 [01:47<00:00,  1.11s/it]


Train Loss: 0.6520 | Accuracy: 0.9599
Validation Accuracy: 0.4577
⚠️ No improvement for 1 epoch(s).


Epoch 3/10: 100%|██████████| 97/97 [01:50<00:00,  1.14s/it]


Train Loss: 0.3915 | Accuracy: 0.9903
Validation Accuracy: 0.4508
⚠️ No improvement for 2 epoch(s).


Epoch 4/10: 100%|██████████| 97/97 [01:51<00:00,  1.15s/it]


Train Loss: 0.2059 | Accuracy: 0.9987
Validation Accuracy: 0.4563
⚠️ No improvement for 3 epoch(s).
🛑 Early stopping triggered.


In [37]:
evaluate_model(vit, test_loader)


Validation Accuracy: 0.4663


0.4662576687116564