In [1]:
%%time

# --- Install offline packages ---
try:
    import ace_tools_open
except ModuleNotFoundError:
    print('Installing ace tools...')
    !pip install -q /kaggle/input/offline-packages/itables-2.3.0-py3-none-any.whl
    !pip install -q /kaggle/input/offline-packages/ace_tools_open-0.1.0-py3-none-any.whl

try:
    import timm
except ModuleNotFoundError:
    print('Installing timm...')
    !pip install -q /kaggle/input/offline-packages/timm-1.0.15-py3-none-any.whl
    
try:
    import warmup_scheduler
except ModuleNotFoundError:
    print('Installing warmup-scheduler...')
    !pip install -q /kaggle/input/offline-packages/warmup_scheduler-0.3.tar.gz

# --- Core libraries ---
import os
import math
import random
import time
import numpy as np

# --- Data handling ---
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, accuracy_score
from dataclasses import dataclass, field
from sklearn.preprocessing import label_binarize

# --- PyTorch ---
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, ConcatDataset
import torch.nn.functional as F
from torch.optim.lr_scheduler import CosineAnnealingLR
from torchaudio.functional import bandpass_biquad
from warmup_scheduler import GradualWarmupScheduler

# --- Audio processing ---
import torchaudio
import torchaudio.transforms as T

# --- Visualization ---
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Audio, display
from tqdm.notebook import tqdm
import ace_tools_open as tools
import torchvision
from torchvision.ops.focal_loss import sigmoid_focal_loss
import cv2

# --- Parallel and Custom Tools ---
from joblib import Parallel, delayed
from torch.amp import GradScaler, autocast
from concurrent.futures import ProcessPoolExecutor, as_completed
from typing import Optional, List, Tuple
import timm
import tempfile
import gc
import itertools
from glob import glob

Installing ace tools...
Installing warmup-scheduler...
  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for warmup_scheduler (setup.py) ... [?25l[?25hdone
CPU times: user 8.04 s, sys: 1.77 s, total: 9.81 s
Wall time: 25 s


In [4]:
!find /kaggle/working/pseudo_spectrograms -type f -name "*.npy" | wc -l  # 223698

223698


In [5]:
@dataclass
class CFG:
    # General
    LOAD_DATA: bool = True
    seed: int = 69
    debug: bool = False
    device: str = 'cuda' if torch.cuda.is_available() else 'cpu'
    
    ## Data paths ##
    OUTPUT_DIR: str = '/kaggle/working/'
    temporary_dir: str = field(init=False)
    real_spectrogram_dir: str = "/kaggle/input/eda-birdclef2025" #"/kaggle/working/precomputed_spectrograms"
    real_spectrogram_csv_filename: str = "spec_metadata.csv"
    real_spectrograms_metadata_path: str = field(init=False) # "filename", "primary_label")
    pseudo_spectrograms_dir: str = '/kaggle/working/' # "/kaggle/input/pseudo-labeling-birdclef2025"
    pseudo_spectrogram_csv_filename: str = "pseudo_metadata.csv"
    pseudo_spectrograms_metadata_path: str = field(init=False) # "filename", "primary_label"
    
    # Base path to dataset
    data_path: str = '/kaggle/input/birdclef-2025/'
    # Key file paths
    metadata_path: str = field(init=False)
    taxonomy_path: str = field(init=False)
    sample_submission_path: str = field(init=False)
    location_path: str = field(init=False)
    # Audio data directories
    train_data_path: str = field(init=False)
    test_soundscapes_path: str = field(init=False)
    unlabeled_soundscapes_path: str = field(init=False)

    # Audio config
    topDB: int = 80
    FS: int = 32000
    CHUNK_LENGTH: float = 5.0   # seconds
    N_FFT: int = 1024
    HOP_LENGTH: int = 512
    N_MELS: int = 128
    FMIN: int = 50
    FMAX: int = 16000
    POWER: int = 2
    SPEC_DTYPE: str = 'float16'  # for disk saving
    spectrogram_time_frames: int = field(init=False)
    
    # VAD and Filtering
    VAD_ENABLED: bool = False  # Whether to apply Voice Activity Detection
    VAD_THRESHOLD: float = 0.4  # Confidence threshold for Silero VAD
    BANDPASS_LOW: int = 300
    BANDPASS_HIGH: int = 16000
    VISUALIZE_SKIPPED: bool = False  # Set to True to see spectrograms of skipped chunks


    # Training
    BATCH_SIZE: int = 32
    EPOCHS: int = 20
    criterion: str = 'BCEWithLogitsLoss'
    optimizer: str = 'AdamW'
    LEARNING_RATE: float = 1e-3
    weight_decay: float = 1e-5
    scheduler: str = 'CosineAnnealingLR'
    min_lr: float = 1e-6
    n_fold: int = 5
    num_workers: int = 4

    # Augmentation
    augment = True
    aug_prob: float = 0.5
    mixup_alpha: float = 0.4

    # Model
    model_name: str = "efficientnet_b0" # 'efficientnet_b3_pruned', 'efficientnetv2_rw_m', 'efficientvit_l1', 'efficientvit_l2', 'efficientvit_m0'
    pretrained: bool = True
    in_channels: int = 1
    input_directory: str = '/kaggle/input'
    input_model_filename: str = field(init=False)
    output_model_filename: str = field(init=False)
    pretrained_model_weights: str = field(init=False)
    sed_model_weights_path: str = field(init=False)
    num_classes: str = field(init=False)

    # Focal Loss parameters
    alpha: float = 0.25 
    gamma: float = 2.0 
    reduction: str = "mean"
    bce_weight: float = 1.0
    focal_weight: float = 1.0
    secondary_weight: float = .5

    def __post_init__(self):
        self.metadata_path = os.path.join(self.data_path, 'train.csv')
        self.taxonomy_path = os.path.join(self.data_path, 'taxonomy.csv')
        self.sample_submission_path = os.path.join(self.data_path, 'sample_submission.csv')
        self.location_path = os.path.join(self.data_path, 'recording_location.txt')
        self.train_data_path = os.path.join(self.data_path, 'train_audio')
        self.test_soundscapes_path = os.path.join(self.data_path, 'test_soundscapes')
        self.unlabeled_soundscapes_path = os.path.join(self.data_path, 'train_soundscapes')
        self.real_spectrograms_metadata_path = os.path.join(self.real_spectrogram_dir, self.real_spectrogram_csv_filename)
        self.pseudo_spectrograms_metadata_path = os.path.join(self.pseudo_spectrograms_dir, self.pseudo_spectrogram_csv_filename)
        
        self.input_model_filename = f'{self.model_name}_pretrained.pth'
        self.output_model_filename = f'{self.model_name}_sed.pth'
        self.pretrained_model_weights = os.path.join(self.input_directory, "offline-packages", self.input_model_filename)
        self.sed_model_weights_path = os.path.join(self.input_directory, "effnet14", self.output_model_filename)
        self.num_classes = len(pd.read_csv(self.taxonomy_path))

        self.temporary_dir = tempfile.TemporaryDirectory().name
        self.spectrogram_time_frames = int((self.FS * self.CHUNK_LENGTH) // self.HOP_LENGTH + 1)
        if self.debug:
            self.EPOCHS = 2
            print("⚠️ Debug mode is ON. Training only for 2 epochs.")

cfg = CFG()

In [6]:
def set_seed(seed=69):
    """
    Set seed for reproducibility
    """
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


set_seed(cfg.seed)
print(f"Training with device: {cfg.device}")

Training with device: cuda


In [8]:
class EfficientNetSED(nn.Module):
    """
    EfficientNet with a custom SED head for frequency-wise attention.
    
    This model:
    - Uses a pretrained EfficientNet backbone
    - Applies a frequency-wise attention mechanism
    - Outputs class probabilities for multi-class classification
    
    Arguments:
    ----------
    cfg : object
        Configuration object (assumes it's an instance of CFG)
    """
    def __init__(self, cfg):
        super().__init__()
        
        # Store config and device
        self.cfg = cfg
        self.device = torch.device(cfg.device)

        # Create model with the correct architecture
        self.backbone = timm.create_model(cfg.model_name, pretrained=cfg.pretrained)

        # Load weights manually
        checkpoint_path = cfg.pretrained_model_weights
        if checkpoint_path:
            print(f"[INFO] Loading weights from {checkpoint_path}")
            state_dict = torch.load(checkpoint_path, map_location=self.device, weights_only=True)
            if "model" in state_dict:
                state_dict = state_dict["model"]  # In case it's wrapped in 'model' key
            self.backbone.load_state_dict(state_dict)

        # Remove classifier head, we will add our own
        self.feature_dim = self.backbone.classifier.in_features
        self.backbone.classifier = nn.Identity()  # Remove classifier

        # Frequency-wise attention block -> attention mechanism to emphasize important frequency regions.
        self.att_block = nn.Sequential(
            nn.AdaptiveAvgPool2d((None, 1)),          # Mean over frequency bands
            nn.Conv2d(self.feature_dim, self.feature_dim, kernel_size=1),
            nn.Sigmoid()
        )

        # Custom classifier head
        self.classifier = nn.Sequential(
            nn.Conv2d(self.feature_dim, cfg.num_classes, kernel_size=1),
            nn.AdaptiveMaxPool2d((1, 1)),
            nn.Flatten()
        )

    def forward(self, x):
        """
        Forward pass of the model.
        
        Parameters:
        -----------
        x : torch.Tensor
            Input tensor of shape [B, 3, M, T], where:
            - B = Batch size
            - M = Mel bands (frequency bins)
            - T = Time frames

        Returns:
        --------
        torch.Tensor:
            Output tensor of shape [B, num_classes]
        """
        x = x.to(self.device)
        features = self.backbone.forward_features(x)  # EfficientNet backbone [B, C, M', T']
        attn = self.att_block(features)  # Attention on frequency bands [B, C, T', 1]
        features = features * attn       # Apply attention
        
        out = self.classifier(features)  # Classify [B, num_classes]
        return out

# Usage Example:
model = EfficientNetSED(cfg)
model = model.to(cfg.device)

# Show model summary
sample_input = torch.randn(10, 3, cfg.N_MELS, cfg.spectrogram_time_frames).to(cfg.device)  # [Batch, Channels, Mel Bands, Time Frames]
output = model(sample_input)
print(f"Model Output Shape: {output.shape}")  # Should be [10(B), num_classes]

model.safetensors:   0%|          | 0.00/21.4M [00:00<?, ?B/s]

[INFO] Loading weights from /kaggle/input/offline-packages/efficientnet_b0_pretrained.pth
Model Output Shape: torch.Size([10, 206])


In [9]:
class PrecomputedSpectrogramDataset(Dataset):
    """
    PyTorch Dataset for loading precomputed log-mel spectrograms.

    Arguments:
    ----------
    metadata : pd.DataFrame
        DataFrame containing filenames, primary labels, and number of frames.
    spec_dir : str
        Directory where the spectrogram .npy files are saved.
    label_to_index : dict
        Mapping from class name to label index.
    augment : bool
        Whether to apply augmentation (placeholder for now).
    """
    def __init__(self, metadata, spec_dir, augment=False):
        self.metadata = metadata
        self.real_spectrogram_dir = os.path.join(spec_dir, "precomputed_spectograms")
        self.label_to_class, self.label_to_index, _, self.filename_to_secondary_label = get_mappings()
        self.augment = augment
        if self.augment:
            self.augmentor = SpectrogramAugmentor()

    def __len__(self):
        return len(self.metadata)

    def __getitem__(self, idx):
        row = self.metadata.iloc[idx]
        path = os.path.join(self.real_spectrogram_dir, row["filename"])
        
        # Load spectrogram
        spec = np.load(path, mmap_mode="r")
        spec = torch.tensor(spec, dtype=torch.float16).unsqueeze(0)  # [1, M, T]
        
        # Get the class name
        primary_label = row["primary_label"]
        class_name = self.label_to_class.get(primary_label, "Unknown")
        
        # Apply augmentations if enabled
        if self.augment and class_name in {"Aves", "Insecta", "Amphibia", "Mammalia"}:
            spec = self.augmentor.apply_augmentations(spec, class_name)
        
        # Convert to 3-channel image-like format
        spec = spec.repeat(3, 1, 1)
        
        # Multi-hot vector for primary labels
        primary_label_tensor = torch.zeros(len(self.label_to_index), dtype=torch.float16)
        if primary_label in self.label_to_index:
            primary_label_tensor[self.label_to_index[primary_label]] = 1.0
        
        # Multi-hot vector for secondary labels
        secondary_labels = self.filename_to_secondary_label.get(f"{row['filename'].split('_')[0]}.ogg", [])
        secondary_label_tensor = torch.zeros(len(self.label_to_index), dtype=torch.float16)
        for sec_label in secondary_labels:
            if sec_label in self.label_to_index:
                secondary_label_tensor[self.label_to_index[sec_label]] = 0.5
    
        # Combine them (primary gets 1.0 weight, secondary gets 0.5 weight)
        combined_labels = primary_label_tensor + secondary_label_tensor
        combined_labels = torch.clamp(combined_labels, 0, 1)  # Ensure it's only 0 or 1
        
        return {
            "spectrogram": spec, 
            "labels": combined_labels, 
            "filename": row["filename"], 
            "class_name": class_name
        }
def collate_fn(batch, mixup=False, alpha=0.4):
    """
    Custom collate function to handle varying-size spectrograms and apply Mixup if specified.

    Parameters:
    -----------
    batch : list
        List of samples (dict) with spectrogram, label, and filename.
    mixup : bool
        Whether to apply Mixup augmentation to the batch.
    alpha : float
        Alpha parameter for the Beta distribution in Mixup.

    Returns:
    --------
    dict : 
        Dictionary with stacked tensors and filenames.
    """
    if len(batch) == 0:
        return {"spectrograms": None, "labels": None, "filenames": None}
    
    # Extract elements
    specs = [item["spectrogram"] for item in batch]
    labels = [item["labels"] for item in batch]
    filenames = [item["filename"] for item in batch]

    # Stack along the batch dimension
    specs = torch.stack(specs)
    labels = torch.stack(labels)

    # 🚀 Apply Mixup if specified and more than one element exists in the batch
    if mixup and len(batch) > 1:
        indices = torch.randperm(len(batch))
        mixed_specs = []
        mixed_labels = []

        for i in range(0, len(batch) - 1, 2):
            lam = np.random.beta(alpha, alpha)
            spec1, spec2 = specs[i], specs[indices[i]]
            label1, label2 = labels[i], labels[indices[i]]

            mixed_spec = lam * spec1 + (1 - lam) * spec2
            mixed_label = lam * label1 + (1 - lam) * label2

            mixed_specs.append(mixed_spec)
            mixed_labels.append(mixed_label)

        # If the batch size is odd, we append the last sample as it is
        if len(batch) % 2 != 0:
            mixed_specs.append(specs[-1])
            mixed_labels.append(labels[-1])

        specs = torch.stack(mixed_specs)
        labels = torch.stack(mixed_labels)

    return {"spectrograms": specs, "labels": labels, "filenames": filenames}

def get_mappings():
    """
    Creates label-to-class and label-to-index mappings.
    Also maps filenames to their associated secondary labels.
    
    Returns:
    --------
    - label_to_class : dict
        Maps primary labels to their respective class names.
    - label_to_index : dict
        Maps primary labels to unique index values.
    - filename_to_secondary_label : dict
        Maps filenames to lists of secondary labels.
    """
    # Load the datasets
    taxonomy_df = pd.read_csv(cfg.taxonomy_path)
    metadata_df = pd.read_csv(cfg.real_spectrograms_metadata_path)
    train_df = pd.read_csv(cfg.metadata_path)
    
    # Filter out unused labels
    used_labels = set(metadata_df["primary_label"].unique())
    taxonomy_df = taxonomy_df[taxonomy_df['primary_label'].isin(used_labels)]

    # Label to Class Mapping
    label_to_class = taxonomy_df.set_index('primary_label')['class_name'].to_dict()

    # Label to Index Mapping → Now guaranteed to match!
    label_to_index = {label: idx for idx, label in enumerate(sorted(label_to_class.keys()))}
    index_to_label = {idx: label for idx, label in enumerate(sorted(label_to_class.keys()))}

    # 🌟 New Logic to retrieve filename → secondary_labels
    filename_to_secondary_label = {}

    # Iterate over the DataFrame
    for _, row in train_df.iterrows():
        filename = row["filename"]
        secondary_labels = eval(row["secondary_labels"]) if isinstance(row["secondary_labels"], str) and row["secondary_labels"] != "['']" else []

        # Only add if there are secondary labels
        if len(secondary_labels) > 0:
            filename_to_secondary_label[filename] = secondary_labels

    # Display the label maps only once (no need to repeat every call)
    if not hasattr(get_mappings, "_displayed"):
        tools.display_dataframe_to_user(name="Label to Class", dataframe=pd.DataFrame(label_to_class.items(), columns=["Animal Label", "Class Name"]))
        tools.display_dataframe_to_user(name="Label Map", dataframe=pd.DataFrame(label_to_index.items(), columns=["Animal Label", "Index"]))
        tools.display_dataframe_to_user(name="Filename to Secondary Labels", dataframe=pd.DataFrame(list(filename_to_secondary_label.items()), columns=["Filename", "Secondary Labels"]))
        get_mappings._displayed = True
    
    return label_to_class, label_to_index, index_to_label, filename_to_secondary_label


In [10]:
class SpectrogramAugmentor:
    def __init__(self, time_mask_prob=0.3, freq_mask_prob=0.3, mixup_prob=0.3, random_erase_prob=0.5):
        self.time_mask_prob = time_mask_prob
        self.freq_mask_prob = freq_mask_prob
        self.mixup_prob = mixup_prob
        self.random_erase_prob = random_erase_prob

    def time_mask(self, spec, num_masks=2, max_mask_size=20):
        """Apply Time Masking"""
        for _ in range(num_masks):
            t = random.randint(0, spec.size(2) - 1)
            mask_size = random.randint(5, max_mask_size)
            spec[:, :, t:t + mask_size] = 0
        return spec

    def freq_mask(self, spec, num_masks=2, max_mask_size=20):
        """Apply Frequency Masking"""
        for _ in range(num_masks):
            f = random.randint(0, spec.size(1) - 1)
            mask_size = random.randint(5, max_mask_size)
            spec[:, f:f + mask_size, :] = 0
        return spec

    def random_erasing(self, spec, max_rect=20):
        """Apply Random Erasing"""
        if random.random() < self.random_erase_prob:
            x = random.randint(0, spec.size(2) - max_rect)
            y = random.randint(0, spec.size(1) - max_rect)
            w = random.randint(5, max_rect)
            h = random.randint(5, max_rect)
            spec[:, y:y+h, x:x+w] = 0
        return spec

    def mixup(self, spec1, spec2, label1, label2, alpha=0.4):
        """Apply Mixup Augmentation"""
        lam = np.random.beta(alpha, alpha)
        mixed_spec = lam * spec1 + (1 - lam) * spec2
        mixed_label = lam * label1 + (1 - lam) * label2
        return mixed_spec, mixed_label

    def apply_augmentations(self, spec, class_name):
        """
        Conditional augmentations based on class name.
        """
        if class_name == "Aves":  # Birds
            if random.random() < self.time_mask_prob:
                spec = self.time_mask(spec)
            if random.random() < self.freq_mask_prob:
                spec = self.freq_mask(spec)

        elif class_name == "Insecta":  # Insects
            if random.random() < self.freq_mask_prob:
                spec = self.freq_mask(spec, max_mask_size=30)
            if random.random() < self.random_erase_prob:
                #spec = self.random_erasing(spec, max_rect=30)
                pass

        elif class_name == "Amphibia":  # Amphibians
            if random.random() < self.time_mask_prob:
                spec = self.time_mask(spec, max_mask_size=30)
            if random.random() < self.freq_mask_prob:
                spec = self.freq_mask(spec)

        elif class_name == "Mammalia":  # Mammals
            if random.random() < self.time_mask_prob:
                spec = self.time_mask(spec)
            if random.random() < self.freq_mask_prob:
                spec = self.freq_mask(spec)

        return spec


In [11]:
class PseudoLabeledDataset(Dataset):
    """
    Dataset for pseudo-labeled spectrograms with flexible filtering strategies.

    Supports:
    - "none": raw soft labels
    - "hard": hard thresholding
    - "topk": top-k masking
    - "hybrid": combine hard and soft
    - callable: pass a function(labels: np.ndarray) -> np.ndarray
    """
    def __init__(self, metadata_csv, spec_dir, strategy="none", soft_threshold=0.5, hard_threshold=0.8, top_k=1, index_to_label=None):
        self.df = pd.read_csv(metadata_csv)
        self.spec_dir = os.path.join(spec_dir, "pseudo_spectrograms")
        self.strategy = strategy
        self.soft_threshold = soft_threshold
        self.hard_threshold = hard_threshold
        self.top_k = top_k
        self.label_cols = [c for c in self.df.columns if c.startswith("class_")]
        self.num_classes = len(self.label_cols)
        self.index_to_label = index_to_label or {i: f"class_{i}" for i in range(self.num_classes)}
        self.filter_fn = strategy if callable(strategy) else None
        
        # Precompute label stats on filtered labels
        self._label_summary = self._compute_label_summary()


    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        spec_path = os.path.join(self.spec_dir, row["filename"])
        spec = np.load(spec_path).astype(np.float32)
        spec = torch.tensor(spec).unsqueeze(0).repeat(3, 1, 1)

        labels = row[self.label_cols].values.astype(np.float32)

        # Handle strategy
        if self.filter_fn:
            labels = self.filter_fn(labels)

        elif self.strategy == "hard":
            labels = (labels >= self.hard_threshold).astype(np.float32)

        elif self.strategy == "topk":
            topk_idx = labels.argsort()[-self.top_k:]
            mask = np.zeros_like(labels)
            mask[topk_idx] = 1.0
            labels = mask.astype(np.float32)

        elif self.strategy == "hybrid":
            soft_mask = ((labels > self.soft_threshold) & (labels < self.hard_threshold)).astype(np.float32)
            hard_mask = (labels >= self.hard_threshold).astype(np.float32)
            labels = labels * soft_mask + 1.0 * hard_mask

        # else: strategy == "none", keep raw labels

        return {
            "spectrogram": spec,
            "labels": torch.tensor(labels, dtype=torch.float32),
            "filename": row["filename"]
        }

    def _compute_label_summary(self, sample_size=500):
        """
        Samples and summarizes the distribution of label values after filtering.
        """
        summary = {
            "total_samples": len(self.df),
            "avg_labels_per_sample": 0,
            "hard_ones": 0,
            "soft_values": 0,
            "zeros": 0
        }

        sample_indices = np.random.choice(len(self.df), size=min(sample_size, len(self.df)), replace=False)
        label_counts = []

        for idx in sample_indices:
            row = self.df.iloc[idx]
            labels = row[self.label_cols].values.astype(np.float32)

            if self.filter_fn:
                labels = self.filter_fn(labels)
            elif self.strategy == "hard":
                labels = (labels >= self.hard_threshold).astype(np.float32)
            elif self.strategy == "topk":
                topk_idx = labels.argsort()[-self.top_k:]
                mask = np.zeros_like(labels)
                mask[topk_idx] = 1.0
                labels = mask.astype(np.float32)
            elif self.strategy == "hybrid":
                soft_mask = ((labels > self.soft_threshold) & (labels < self.hard_threshold)).astype(np.float32)
                hard_mask = (labels >= self.hard_threshold).astype(np.float32)
                labels = labels * soft_mask + 1.0 * hard_mask

            hard = np.sum(labels == 1.0)
            soft = np.sum((labels > 0) & (labels < 1.0))
            zeros = np.sum(labels == 0.0)

            label_counts.append(hard + soft)
            summary["hard_ones"] += hard
            summary["soft_values"] += soft
            summary["zeros"] += zeros

        summary["avg_labels_per_sample"] = np.mean(label_counts)
        summary["hard_ones"] = int(summary["hard_ones"])
        summary["soft_values"] = int(summary["soft_values"])
        summary["zeros"] = int(summary["zeros"])

        # Class-wise activations
        class_stats = {f"class_{i}": {"hard": 0, "soft": 0, "total": 0} for i in range(self.num_classes)}

        for idx in sample_indices:
            row = self.df.iloc[idx]
            labels = row[self.label_cols].values.astype(np.float32)

            if self.filter_fn:
                labels = self.filter_fn(labels)
            elif self.strategy == "hard":
                labels = (labels >= self.hard_threshold).astype(np.float32)
            elif self.strategy == "topk":
                topk_idx = labels.argsort()[-self.top_k:]
                mask = np.zeros_like(labels)
                mask[topk_idx] = 1.0
                labels = mask.astype(np.float32)
            elif self.strategy == "hybrid":
                soft_mask = ((labels > self.soft_threshold) & (labels < self.hard_threshold)).astype(np.float32)
                hard_mask = (labels >= self.hard_threshold).astype(np.float32)
                labels = labels * soft_mask + 1.0 * hard_mask

            for i, val in enumerate(labels):
                if val >= 1.0:
                    class_stats[f"class_{i}"]["hard"] += 1
                elif val > 0.0:
                    class_stats[f"class_{i}"]["soft"] += 1
                if val > 0.0:
                    class_stats[f"class_{i}"]["total"] += 1

        summary["class_stats"] = class_stats

        return summary

    def __repr__(self):
        s = f"PseudoLabeledDataset(strategy='{self.strategy}', size={len(self)}, classes={self.num_classes})"
        s += "\n\n  Label Summary (approx. over sample):"
        for k, v in self._label_summary.items():
            if k != "class_stats":
                s += f"\n    {k}: {v}"

        p = "No thresholds"
        if self.strategy=="hybrid":
            p = f"hard {self.hard_threshold} + soft {self.soft_threshold}"
        elif self.strategy=="topk":
            p = f"topk = {self.top_k}"
        elif self.strategy=="hard":
            p = f"hard {self.hard_threshold}"
            
        s += f"\n\n  Per-Class Activations ({p}):"
        stats = self._label_summary["class_stats"]
        # Show top 10 most frequent
        s += "\n    Top 10 most frequent species:"
        n=10
        most_frequent_stats = sorted(stats.items(), key=lambda x: x[1]["total"], reverse=True)[:n]
        for k, v in most_frequent_stats:
            label_name = self.index_to_label.get(int(k.replace("class_", "")), k)
            s += f"\n      {label_name}: total={v['total']}, hard={v['hard']}, soft={v['soft']}"

        # Show top 10 less frequent
        s += "\n    Top 10 less frequent species:"
        n=10
        less_frequent_stats = sorted(stats.items(), key=lambda x: x[1]["total"], reverse=False)[:n]
        for k, v in less_frequent_stats:
            label_name = self.index_to_label.get(int(k.replace("class_", "")), k)
            s += f"\n      {label_name}: total={v['total']}, hard={v['hard']}, soft={v['soft']}"
        return s

def get_index_to_label(cfg):
    taxonomy_df = pd.read_csv(cfg.taxonomy_path)
    label_to_index = {label: idx for idx, label in enumerate(sorted(taxonomy_df['primary_label'].unique()))}
    index_to_label = {v: k for k, v in label_to_index.items()}
    return index_to_label

Want to zero out weak predictions, keep the rest as soft?

```python
def soft_but_sparse(labels):
    return np.where(labels > 0.2, labels, 0.0)
```

```python
dataset = PseudoLabeledDataset(
    metadata_csv="pseudo_metadata.csv",
    spec_dir="pseudo_spectrograms",
    strategy=soft_but_sparse
)
```

In [28]:
class CurriculumDatasetWrapper:
    """
    Wraps multiple datasets and switches between them based on epoch thresholds.
    Usage:
        wrapper = CurriculumDatasetWrapper({
            0: dataset_phase1,
            5: dataset_phase2,
            10: dataset_phase3
        })

        for epoch in range(total_epochs):
            train_loader = wrapper.get_dataloader(epoch)
    """
    def __init__(self, phase_datasets: dict, cfg, collate_fn=None):
        """
        phase_datasets: dict {epoch_threshold: dataset}
        e.g., {0: real_only, 5: real_plus_high_conf, 10: all_data}
        """
        self.phases = sorted(phase_datasets.items())  # list of (threshold, dataset)
        self.cfg = cfg
        self.collate_fn = collate_fn or (lambda x: collate_fn(x))  # pass-through if None
        self.current_loader = None
        self.current_phase = None

    def get_dataloader(self, epoch):
        selected_dataset = self.phases[0][1]
        for threshold, dataset in self.phases:
            if epoch == threshold:
                selected_dataset = dataset
    
        if self.current_phase != selected_dataset:
            print(f"[INFO] 🔁 Switching to new curriculum phase at epoch {epoch}")
            
            # ⬇️ Count pseudo vs real samples
            if isinstance(selected_dataset, ConcatDataset):
                num_parts = len(selected_dataset.datasets)
                sizes = [len(d) for d in selected_dataset.datasets]
                print(f"[INFO]   Dataset sizes → Real: {sizes[0]}, Pseudo: {sum(sizes[1:])}")
            else:
                print(f"[INFO]   Dataset size: {len(selected_dataset)}")
    
            self.current_loader = DataLoader(
                selected_dataset,
                batch_size=self.cfg.BATCH_SIZE,
                shuffle=True,
                num_workers=self.cfg.num_workers,
                pin_memory=True,
                collate_fn=self.collate_fn
            )
            self.current_phase = selected_dataset
    
        return self.current_loader



In [29]:
class FocalLossBCE(nn.Module):
    def __init__(
            self,
            alpha: float = 0.25,
            gamma: float = 2,
            reduction: str = "mean",
            bce_weight: float = 1.0,
            focal_weight: float = 1.0,
            secondary_weight: float = 0.5,
    ):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction
        self.bce = nn.BCEWithLogitsLoss(reduction=reduction)
        self.bce_weight = bce_weight
        self.focal_weight = focal_weight
        self.secondary_weight = secondary_weight

    def forward(self, logits, targets):
        # Apply scaling based on presence of primary (1.0) and secondary (0.5) labels
        weight_scale = targets.clone()
        weight_scale[weight_scale == self.secondary_weight] = self.secondary_weight
        weight_scale[weight_scale == 1.0] = 1.0

        focal_loss = sigmoid_focal_loss(
            inputs=logits,
            targets=targets,
            alpha=self.alpha,
            gamma=self.gamma,
            reduction=self.reduction,
        )
        bce_loss = self.bce(logits, targets)
        return (self.bce_weight * bce_loss * weight_scale).mean() + (self.focal_weight * focal_loss).mean()


In [30]:
class GradualWarmupSchedulerV2(GradualWarmupScheduler):
    def __init__(self, optimizer, multiplier, total_epoch, after_scheduler=None):
        super(GradualWarmupSchedulerV2, self).__init__(optimizer, multiplier, total_epoch, after_scheduler)
    def get_lr(self):
        if self.last_epoch > self.total_epoch:
            if self.after_scheduler:
                if not self.finished:
                    self.after_scheduler.base_lrs = [base_lr * self.multiplier for base_lr in self.base_lrs]
                    self.finished = True
                return self.after_scheduler.get_lr()
            return [base_lr * self.multiplier for base_lr in self.base_lrs]
        if self.multiplier == 1.0:
            return [base_lr * (float(self.last_epoch) / self.total_epoch) for base_lr in self.base_lrs]
        else:
            return [base_lr * ((self.multiplier - 1.) * self.last_epoch / self.total_epoch + 1.) for base_lr in self.base_lrs]

In [33]:
class Trainer:
    """
    Trainer class to handle model training and evaluation.
    
    Parameters:
    -----------
    model : nn.Module
        The model to be trained.
    cfg : object
        Configuration object with training parameters.
    criterion : torch.nn.Module
        Loss function.
    optimizer : torch.optim.Optimizer
        Optimizer for model updates.
    scheduler : torch.optim.lr_scheduler
        Learning rate scheduler.
    """
    def __init__(self, model, cfg, criterion, optimizer, scheduler):
        self.model = model.to(cfg.device)
        self.cfg = cfg
        self.criterion = criterion
        self.optimizer = optimizer
        self.scheduler = scheduler
        self.device = cfg.device
        self.history = []
        self.scaler = GradScaler()  # ⚡️ Mixed Precision Support

    def train_one_epoch(self, loader):
        """
        Runs one full epoch of training with tqdm progress bar.
        """
        self.model.train()
        total_loss = 0.0
        all_targets = []
        all_outputs = []

        for batch in tqdm(loader, desc="Training Batch", leave=False):
            inputs = batch["spectrograms"].to(self.device)  # [16, 3, 128, 313]
            targets = batch["labels"].to(self.device)  # [16, 206]
                
            # Only keep primary labels for AUC calculation
            primary_only_targets = (targets == 1).int()  # Convert to float for AUC compatibility

            self.optimizer.zero_grad()
            with autocast(device_type=self.device):
                outputs = self.model(inputs)
                loss = self.criterion(outputs, targets)

            # Scale the loss and backpropagate
            self.scaler.scale(loss).backward()
            self.scaler.step(self.optimizer)
            self.scaler.update()

            total_loss += loss.item() * inputs.size(0)

            # Collect for AUC calculation
            all_targets.append(primary_only_targets.cpu().numpy())
            all_outputs.append(torch.sigmoid(outputs).detach().cpu().numpy())

        # AUC Calculation
        y_true = np.concatenate(all_targets, axis=0) # (40559, 206)
        y_pred = np.concatenate(all_outputs, axis=0)

        #print(y_true.shape, y_true.dtype, y_pred.dtype)
        assert y_true.shape == y_pred.shape
        assert y_true.shape[1] == self.cfg.num_classes
        #assert y_true.dtype == np.int32
        #assert y_pred.dtype == np.float16
        #assert set(np.unique(y_true)).issubset({0, 1})
        
        macro_auc, class_wise = self.calculate_auc(y_true, y_pred)


        return total_loss / len(loader.dataset), macro_auc
        
    def evaluate(self, loader):
        """
        Evaluates the model on the validation set with tqdm progress bar.
        Returns:
            - macro_auc: mean ROC AUC over valid classes
            - per_class_auc: dict of {class_index: AUC}
        """
        self.model.eval()
        all_targets = []
        all_outputs = []
    
        with torch.no_grad():
            for batch in tqdm(loader, desc="Evaluating Batch", leave=False): # dict_keys(['spectrogram', 'labels', 'filename'])
                inputs = batch["spectrogram"].to(self.device)
                targets = batch["labels"].to(self.device).float()
    
                primary_only_targets = (targets == 1).int()
    
                with autocast(device_type=self.device):
                    outputs = self.model(inputs)
                    outputs = torch.sigmoid(outputs).detach().cpu().numpy()
    
                all_targets.append(primary_only_targets.cpu().numpy())
                all_outputs.append(outputs)
    
        y_true = np.concatenate(all_targets, axis=0)
        y_pred = np.concatenate(all_outputs, axis=0)
    
        macro_auc, class_wise = self.calculate_auc(y_true, y_pred)
        return macro_auc, class_wise
            
    def calculate_auc(self, y_true, y_pred):
        """
        Calculates the macro ROC-AUC score.
        
        Parameters:
        -----------
        y_true : np.ndarray
            Ground truth binary labels (multi-hot encoded).
        y_pred : np.ndarray
            Model predictions (probabilities).
        
        Returns:
        --------
        float:
            Macro ROC-AUC score.
        """
        scores = []
        class_scores = {}
        for i in range(y_true.shape[1]):
            if 0 < np.sum(y_true[:, i]) < len(y_true):
                try:
                    auc = roc_auc_score(y_true[:, i], y_pred[:, i])
                    scores.append(auc)
                    class_scores[i] = auc
                except ValueError as e:
                    print(f"[WARNING] ValueError during AUC calculation for class index: {i}")
                    print(e)
                    class_scores[i] = None
            else:
                class_scores[i] = None
        return np.mean(scores) if scores else 0.0, class_scores



    def fit(self, train_loader, val_loader=None, curriculum=None, val_phase_map=None):
        """
        Runs the full training loop with tqdm progress bar.
        """
        best_val_auc = 0.0  # for optional checkpointing
        print(f"[INFO] Starting training... at {time.strftime('%H:%M:%S')}")
        for epoch in tqdm(range(self.cfg.EPOCHS), total=self.cfg.EPOCHS, desc="Training Epoch"):
            start_time = time.time()

            if train_loader is None:
                train_loader = curriculum.get_dataloader(epoch)
                val_loader = val_phase_map.get(epoch, None)

            # Train and Evaluate
            train_loss, train_auc = self.train_one_epoch(train_loader)
            
            if val_loader is not None:
                val_auc, per_class = self.evaluate(val_loader)
                # Record history
                self.history.append((epoch + 1, train_loss, train_auc, val_auc))
                print(f"Epoch {epoch + 1}/{self.cfg.EPOCHS} - "
                      f"Loss: {train_loss:.4f} - "
                      f"Train AUC: {train_auc:.4f} - "
                      f"Val AUC: {val_auc:.4f} - "
                      f"Time: {time.time() - start_time:.1f}s")

                # Print top N underperforming classes (optional)
                underperformers = sorted([(k, v) for k, v in per_class.items() if v is not None], key=lambda x: x[1])[:5]
                print("[INFO] 🔍 Worst classes by AUC:")
                for class_idx, auc in underperformers:
                    print(f"    class_{class_idx}: AUC = {auc:.4f}")
    
                # === Optional checkpointing (uncomment to enable) ===
                if val_auc > best_val_auc:
                    print(f"[INFO] ✅ New best AUC: {val_auc:.4f}, saving checkpoint...")
                    best_val_auc = val_auc
                    torch.save(self.model.state_dict(), os.path.join(self.cfg.OUTPUT_DIR, "best_model.pth"))
            
            else:
                # Record history
                self.history.append((epoch + 1, train_loss, train_auc))
                print(f"Epoch {epoch + 1}/{self.cfg.EPOCHS} - "
                      f"Loss: {train_loss:.4f} - "
                      f"Train AUC: {train_auc:.4f} - "
                      f"Time: {time.time() - start_time:.1f}s")

            # Scheduler step
            self.scheduler.step()
            train_loader = None

        # Display history as a DataFrame
        col_names = ["Epoch", "Train Loss", "Train ROC-AUC", "Val ROC-AUC"] if val_loader else ["Epoch", "Train Loss", "Train ROC-AUC"]
        tools.display_dataframe_to_user(
            name="Training History", 
            dataframe=pd.DataFrame(self.history, columns=col_names)
        )
        self.save_model()

    def save_model(self):
        """
        Saves the model checkpoint.
        """
        checkpoint_path = os.path.join(self.cfg.OUTPUT_DIR, self.cfg.output_model_filename)
        torch.save(self.model.state_dict(), checkpoint_path)
        print(f"[INFO] Model saved to {checkpoint_path}")


In [None]:
# === Load your trained model ===
model = EfficientNetSED(cfg)
print(f"[INFO] Loading weights from {cfg.sed_model_weights_path}")
model.load_state_dict(torch.load(cfg.sed_model_weights_path, map_location=cfg.device))
model.to(cfg.device)

# === Define label mappings ===
_, label_to_index, index_to_label, _ = get_mappings()

# === Define datasets ===
metadata = pd.read_csv(cfg.real_spectrograms_metadata_path)
dataset_real = PrecomputedSpectrogramDataset(metadata, cfg.real_spectrogram_dir, augment=cfg.augment)

dataset_pseudo_80 = PseudoLabeledDataset(
    metadata_csv=cfg.pseudo_spectrograms_metadata_path,
    spec_dir=cfg.pseudo_spectrograms_dir,
    strategy="hybrid",  # or "hard"
    hard_threshold=0.25,
    soft_threshold=0.10,
    index_to_label=index_to_label
)

dataset_pseudo_60 = PseudoLabeledDataset(
    metadata_csv=cfg.pseudo_spectrograms_metadata_path,
    spec_dir=cfg.pseudo_spectrograms_dir,
    strategy="hybrid",  
    hard_threshold=0.18,
    soft_threshold=0.05,
    index_to_label=index_to_label
)

dataset_pseudo_40 = PseudoLabeledDataset(
    metadata_csv=cfg.pseudo_spectrograms_metadata_path,
    spec_dir=cfg.pseudo_spectrograms_dir,
    strategy="hybrid",  
    hard_threshold=0.1,
    soft_threshold=0.01,
    index_to_label=index_to_label
)
# === Combine datasets in curriculum ===
curriculum = CurriculumDatasetWrapper({
    0: dataset_real,
    2: ConcatDataset([dataset_real, dataset_pseudo_80]),
    8: ConcatDataset([dataset_real, dataset_pseudo_60]),
    14: ConcatDataset([dataset_real, dataset_pseudo_40])
}, cfg, collate_fn=lambda x: collate_fn(x))

# Add these to your pipeline setup
val_phase_map = {
    1: DataLoader(dataset_pseudo_80, batch_size=cfg.BATCH_SIZE, shuffle=False, num_workers=cfg.num_workers),
    7: DataLoader(dataset_pseudo_60, batch_size=cfg.BATCH_SIZE, shuffle=False, num_workers=cfg.num_workers),
    6: DataLoader(dataset_pseudo_60, batch_size=cfg.BATCH_SIZE, shuffle=False, num_workers=cfg.num_workers),
    12: DataLoader(dataset_pseudo_40, batch_size=cfg.BATCH_SIZE, shuffle=False, num_workers=cfg.num_workers),
    13: DataLoader(dataset_pseudo_40, batch_size=cfg.BATCH_SIZE, shuffle=False, num_workers=cfg.num_workers),
}
# Loss, Optimizer, and Scheduler
criterion = FocalLossBCE(alpha=cfg.alpha, gamma=cfg.gamma, reduction=cfg.reduction, bce_weight=cfg.bce_weight, focal_weight=cfg.focal_weight)
optimizer = torch.optim.AdamW(model.parameters(), lr=cfg.LEARNING_RATE)
# Define the base scheduler (Cosine Annealing)
cosine_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=cfg.EPOCHS, eta_min=cfg.min_lr)

# Define the warmup scheduler with a 5-epoch warmup
scheduler = GradualWarmupSchedulerV2(
    optimizer,
    multiplier=1,
    total_epoch=5,    # Number of epochs to warm up
    after_scheduler=cosine_scheduler
)
# Initialize trainer
trainer = Trainer(model, cfg, criterion, optimizer, scheduler)
trainer.fit(train_loader=None, curriculum=curriculum, val_phase_map=val_phase_map)

[INFO] Loading weights from /kaggle/input/offline-packages/efficientnet_b0_pretrained.pth
[INFO] Loading weights from /kaggle/input/effnet14/efficientnet_b0_sed.pth
[INFO] Starting training... at 14:35:32


Training Epoch:   0%|          | 0/20 [00:00<?, ?it/s]

[INFO] 🔁 Switching to new curriculum phase at epoch 0
[INFO]   Dataset size: 81117


Training Batch:   0%|          | 0/2535 [00:00<?, ?it/s]

Epoch 1/20 - Loss: 0.0008 - Train AUC: 0.9929 - Time: 263.8s


Training Batch:   0%|          | 0/2535 [00:00<?, ?it/s]

Evaluating Batch:   0%|          | 0/6991 [00:00<?, ?it/s]

In [None]:
class HardNegativeCollector:
    def __init__(self, model, dataset, cfg, threshold=0.5, max_per_class=100, output_csv="hard_negatives.csv"):
        self.model = model.eval()
        self.dataset = dataset
        self.cfg = cfg
        self.threshold = threshold
        self.max_per_class = max_per_class
        self.output_csv = os.path.join(cfg.OUTPUT_DIR, output_csv)

    def collect(self):
        device = self.cfg.device
        loader = DataLoader(self.dataset, batch_size=self.cfg.BATCH_SIZE, shuffle=False,
                            num_workers=self.cfg.num_workers, pin_memory=True)

        hard_examples = []

        with torch.no_grad():
            for batch in tqdm(loader, desc="⛏ Collecting Hard Negatives"):
                inputs = batch["spectrograms"].to(device)
                filenames = batch["filename"]
                true = batch["labels"].cpu().numpy()
                with autocast(device_type=device):
                    pred = self.model(inputs)
                    pred = torch.sigmoid(pred).detach().cpu().numpy()

                for i in range(len(filenames)):
                    filename = filenames[i]
                    y_true = true[i]
                    y_pred = pred[i]

                    # Identify FPs and FNs
                    for class_idx in range(len(y_true)):
                        t, p = y_true[class_idx], y_pred[class_idx]

                        if t == 1 and p < self.threshold:
                            hard_examples.append({
                                "filename": filename,
                                "class_index": class_idx,
                                "error_type": "false_negative",
                                "confidence": p
                            })

                        elif t == 0 and p >= self.threshold:
                            hard_examples.append({
                                "filename": filename,
                                "class_index": class_idx,
                                "error_type": "false_positive",
                                "confidence": p
                            })

        # Build DataFrame
        df = pd.DataFrame(hard_examples)

        # Optionally limit per class
        df_limited = df.groupby(["class_index", "error_type"]).head(self.max_per_class).reset_index(drop=True)

        df_limited.to_csv(self.output_csv, index=False)
        print(f"[INFO] 🔍 Saved {len(df_limited)} hard negatives to {self.output_csv}")
        return df_limited


In [None]:
metadata = pd.read_csv(cfg.real_spectrograms_metadata_path)
dataset_real = PrecomputedSpectrogramDataset(metadata, cfg.real_spectrogram_dir, augment=False)

hnm_collector = HardNegativeCollector(
    model=model,
    dataset=dataset_real,
    cfg=cfg,
    threshold=0.5,
    max_per_class=100,
    output_csv="hard_negatives.csv"
)

hard_df = hnm_collector.collect()


In [None]:
class HardNegativeDataset(Dataset):
    """
    Dataset for training on hard negatives (false positives / false negatives).
    
    Arguments:
    ----------
    hard_csv : str
        Path to the CSV file with hard mistakes (from HardNegativeCollector).
    spec_dir : str
        Directory where the precomputed spectrogram .npy files are stored.
    num_classes : int
        Total number of output classes (206 for this task).
    filter_error_type : str or None
        If specified, filter to only "false_negative" or "false_positive".
    label_weight : float
        Value to assign to the hard mistake class (e.g., 1.0).
    """
    def __init__(self, hard_csv, spec_dir, num_classes, filter_error_type=None, label_weight=1.0):
        self.df = pd.read_csv(hard_csv)
        self.spec_dir = spec_dir
        self.num_classes = num_classes
        self.filter_error_type = filter_error_type
        self.label_weight = label_weight

        if self.filter_error_type:
            self.df = self.df[self.df["error_type"] == self.filter_error_type].reset_index(drop=True)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        spec_path = os.path.join(self.spec_dir, row["filename"])
        spec = np.load(spec_path).astype(np.float32)
        spec = torch.tensor(spec).unsqueeze(0).repeat(3, 1, 1)

        label = torch.zeros(self.num_classes, dtype=torch.float32)
        label[int(row["class_index"])] = self.label_weight

        return {
            "spectrogram": spec,
            "labels": label,
            "filename": row["filename"],
            "class_index": int(row["class_index"]),
            "error_type": row["error_type"]
        }


In [None]:
hard_dataset = HardNegativeDataset(
    hard_csv=os.path.join(cfg.OUTPUT_DIR, "hard_negatives.csv"),
    spec_dir=cfg.real_spectrogram_dir,
    num_classes=cfg.num_classes,
    filter_error_type=None,  # or "false_negative", or "false_positive"
    label_weight=1.0
)

hard_loader = create_dataloader(hard_dataset, cfg, shuffle=True, collate_fn=collate_fn)
