In [1]:
%%time

# --- Install missing packages offline---
try:
    import timm
except ModuleNotFoundError:
    print('Installing timm...')
    !pip install -q /kaggle/input/offline-packages/timm-1.0.15-py3-none-any.whl
    
# --- Core libraries ---
import os
import math
import random
import numpy as np

# --- Data handling ---
import pandas as pd

# --- PyTorch and torchaudio ---
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader, random_split
import torchaudio
import torchaudio.transforms as T
import torch.nn.functional as F

# --- Miscellaneous ---
from tqdm.notebook import tqdm

# --- Custom tools ---
import timm
import time
from dataclasses import dataclass, field

CPU times: user 12.4 s, sys: 2.47 s, total: 14.8 s
Wall time: 20.1 s


In [2]:
@dataclass
class CFG:
    """
    Central configuration for the inference notebook.

    Fields declared with ``field(init=False)`` cannot be passed to the
    constructor; they are derived from the other fields in ``__post_init__``.
    Note that constructing a ``CFG`` reads ``taxonomy.csv`` from ``data_path``
    to determine ``num_classes``, so the file must exist.
    """
    # General
    seed: int = 315
    debug: bool = False
    device: str = 'cuda' if torch.cuda.is_available() else 'cpu'
    num_workers: int = 4

    # Data paths (everything except data_path is derived from data_path)
    data_path: str = '/kaggle/input/birdclef-2025/'
    metadata_path: str = field(init=False)
    taxonomy_path: str = field(init=False)
    sample_submission_path: str = field(init=False)
    test_soundscapes_path: str = field(init=False)
    train_soundscapes_path: str = field(init=False)
    train_data_path: str = field(init=False)

    # Audio config
    topDB: int = 80
    FS: int = 32000               # sample rate in Hz
    CHUNK_LENGTH: float = 5.0     # chunk duration in seconds
    N_FFT: int = 1024
    HOP_LENGTH: int = 512
    N_MELS: int = 128
    FMIN: int = 50
    FMAX: int = 14000
    POWER: int = 2
    SPEC_DTYPE: str = 'float16'
    SPEC_FRAMES: int = field(init=False)    # CHUNK_SAMPLES // HOP_LENGTH + 1
    CHUNK_SAMPLES: int = field(init=False)  # int(FS * CHUNK_LENGTH)

    # Model
    model_name: str = 'efficientnet_b0'
    pretrained: bool = False
    input_directory: str = '/kaggle/input/offline-packages'
    input_model_filename: str = field(init=False)
    output_model_filename: str = field(init=False)
    num_classes: int = field(init=False)    # number of rows in taxonomy.csv

    # Ensemble model weights
    timewise_weights_path: str = '/kaggle/input/effnet28/efficientnet_b0_sed.pth'
    freqwise_weights_path: str = '/kaggle/input/effnet14/efficientnet_b0_sed.pth'
    classifier_weights_path: str = '/kaggle/input/effnet33/efficientnet_b0_sed.pth'

    def __post_init__(self):
        """Populate every derived (``init=False``) field."""
        root = self.data_path
        self.metadata_path = os.path.join(root, 'train.csv')
        self.taxonomy_path = os.path.join(root, 'taxonomy.csv')
        self.sample_submission_path = os.path.join(root, 'sample_submission.csv')
        self.test_soundscapes_path = os.path.join(root, 'test_soundscapes')
        self.train_soundscapes_path = os.path.join(root, 'train_soundscapes')
        self.train_data_path = os.path.join(root, 'train_audio')

        self.input_model_filename = f"{self.model_name}_pretrained.pth"
        self.output_model_filename = f"{self.model_name}_sed.pth"

        # CHUNK_LENGTH is a float, so FS * CHUNK_LENGTH is a float as well.
        # Cast once and derive the frame count from the integer sample count so
        # that SPEC_FRAMES is a real int (it used to come out as 313.0).
        self.CHUNK_SAMPLES = int(self.FS * self.CHUNK_LENGTH)
        self.SPEC_FRAMES = self.CHUNK_SAMPLES // self.HOP_LENGTH + 1

        # One class per taxonomy row
        taxonomy_df = pd.read_csv(self.taxonomy_path)
        self.num_classes = len(taxonomy_df)


## Models

In [3]:
class EfficientNetClassifier(nn.Module):
    """
    Plain clip-level classifier on top of a timm backbone (no SED / attention).

    The backbone named by ``cfg.model_name`` is built with ``timm.create_model``
    and its ``classifier`` attribute is swapped for
    ``BatchNorm1d -> Dropout(0.3) -> Linear(num_classes)``.
    The model returns raw logits; no sigmoid is applied inside it.

    Parameters:
    -----------
    cfg : object
        Configuration object. Fields read by this class:
        - model_name: str (e.g., 'efficientnet_b0')
        - pretrained: bool
        - num_classes: int
        - device: str
    """
    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg
        self.device = torch.device(cfg.device)

        # Build the backbone first, then swap its head in place
        net = timm.create_model(cfg.model_name, pretrained=cfg.pretrained)
        self.backbone = net

        width = net.classifier.in_features
        head_layers = [
            nn.BatchNorm1d(width),
            nn.Dropout(0.3),
            nn.Linear(width, cfg.num_classes),
            # nn.Sigmoid() could be appended here to emit probabilities instead
        ]
        net.classifier = nn.Sequential(*head_layers)

    def forward(self, x):
        """
        Run the backbone (including the replaced head) on a batch.

        Parameters:
        -----------
        x : torch.Tensor
            Input tensor [B, 3, Freq, Time] (e.g., log-mel spectrogram)

        Returns:
        --------
        torch.Tensor:
            Logits of shape [B, num_classes]
        """
        logits = self.backbone(x)
        return logits


In [4]:
class EfficientNetTimeSED(nn.Module):
    """
    timm backbone with a custom SED head that gates features along time.

    This model:
    - Builds the backbone named by ``cfg.model_name`` and removes its classifier
    - Computes a per-time-step sigmoid gate from the frequency-averaged feature map
    - Multiplies the feature map by that gate, then applies a 1x1 conv and a
      global max pool to obtain one score per class

    The returned scores are raw logits; no activation is applied to them here.

    Arguments:
    ----------
    cfg : object
        Configuration object (an instance of CFG). Fields read:
        model_name, pretrained, num_classes, device.
    """
    def __init__(self, cfg):
        super().__init__()

        # Keep the config and the target device around
        self.cfg = cfg
        self.device = torch.device(cfg.device)

        # Backbone without its original classification layer
        net = timm.create_model(cfg.model_name, pretrained=cfg.pretrained)
        self.backbone = net
        self.feature_dim = net.classifier.in_features
        net.classifier = nn.Identity()

        channels = self.feature_dim

        # Time-wise gate: collapse dim 2 (frequency, under the documented
        # [B, 3, M, T] input layout) and keep the last (time) axis,
        # then a 1x1 conv followed by a sigmoid yields weights in [0, 1].
        gate_layers = [
            nn.AdaptiveAvgPool2d((1, None)),                # -> [B, C, 1, T']
            nn.Conv2d(channels, channels, kernel_size=1),   # pointwise mixing of channels
            nn.Sigmoid(),
        ]
        self.att_block = nn.Sequential(*gate_layers)

        # Class scores per position, reduced with a global max, then flattened
        head_layers = [
            nn.Conv2d(channels, cfg.num_classes, kernel_size=1),
            nn.AdaptiveMaxPool2d((1, 1)),
            nn.Flatten(),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, x):
        """
        Forward pass of the model.

        Parameters:
        -----------
        x : torch.Tensor
            Input tensor of shape [B, 3, M, T], where:
            - B = Batch size
            - M = Mel bands (frequency bins)
            - T = Time frames

        Returns:
        --------
        torch.Tensor:
            Logits of shape [B, num_classes]
        """
        # Backbone feature map [B, C, F', T'] (downsampled frequency and time axes)
        fmap = self.backbone.forward_features(x)
        # Gate of shape [B, C, 1, T'], broadcast over the frequency axis
        gate = self.att_block(fmap)
        gated = fmap * gate
        return self.classifier(gated)

In [5]:
class EfficientNetFrequencySED(nn.Module):
    """
    timm backbone with a custom SED head that gates features along frequency.

    This model:
    - Builds the backbone named by ``cfg.model_name`` and removes its classifier
    - Computes a per-frequency-band sigmoid gate from the time-averaged feature map
    - Multiplies the feature map by that gate, then applies a 1x1 conv and a
      global max pool to obtain one score per class

    The returned scores are raw logits; no activation is applied to them here.

    Arguments:
    ----------
    cfg : object
        Configuration object (assumes it's an instance of CFG). Fields read:
        model_name, pretrained, num_classes, device.
    """
    def __init__(self, cfg):
        super().__init__()

        # Keep the config and the target device around
        self.cfg = cfg
        self.device = torch.device(cfg.device)

        # Backbone without its original classification layer
        net = timm.create_model(cfg.model_name, pretrained=cfg.pretrained)
        self.backbone = net
        self.feature_dim = net.classifier.in_features
        net.classifier = nn.Identity()

        channels = self.feature_dim

        # Frequency-wise gate: collapse the last axis (time, under the documented
        # [B, 3, M, T] input layout) and keep dim 2 (frequency),
        # then a 1x1 conv followed by a sigmoid yields weights in [0, 1].
        gate_layers = [
            nn.AdaptiveAvgPool2d((None, 1)),                # -> [B, C, M', 1]
            nn.Conv2d(channels, channels, kernel_size=1),
            nn.Sigmoid(),
        ]
        self.att_block = nn.Sequential(*gate_layers)

        # Class scores per position, reduced with a global max, then flattened
        head_layers = [
            nn.Conv2d(channels, cfg.num_classes, kernel_size=1),
            nn.AdaptiveMaxPool2d((1, 1)),
            nn.Flatten(),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, x):
        """
        Forward pass of the model.

        The input is first moved to ``self.device`` (taken from ``cfg.device``).

        Parameters:
        -----------
        x : torch.Tensor
            Input tensor of shape [B, 3, M, T], where:
            - B = Batch size
            - M = Mel bands (frequency bins)
            - T = Time frames

        Returns:
        --------
        torch.Tensor:
            Logits of shape [B, num_classes]
        """
        batch = x.to(self.device)
        # Backbone feature map [B, C, M', T']
        fmap = self.backbone.forward_features(batch)
        # Gate of shape [B, C, M', 1], broadcast over the time axis
        gate = self.att_block(fmap)
        gated = fmap * gate
        return self.classifier(gated)


In [6]:
class ModelEnsembler(torch.nn.Module):
    """
    Weighted ensemble that sums the outputs of several models.

    Each class in ``model_classes`` is instantiated with ``cfg``, moved to
    ``cfg.device``, loaded from the matching entry in ``weights_paths`` and put
    in eval mode. ``forward`` returns ``sum(weight_i * model_i(x))``.

    Parameters:
    -----------
    cfg : object
        Configuration object; ``cfg.device`` is read here and ``cfg`` is passed
        to every model constructor.
    model_classes : sequence of callables
        Model constructors taking ``cfg`` as their only argument.
    weights_paths : sequence of str
        Checkpoint path for each model, in the same order as ``model_classes``.
        A checkpoint may be a raw state dict or a dict holding it under "model".
    weights : sequence of float, optional
        Blend weight per model. Defaults to a uniform ``1 / n_models``.

    Raises:
    -------
    ValueError
        If no model is given, or if the lengths of ``model_classes``,
        ``weights_paths`` and ``weights`` disagree (``zip`` would otherwise
        silently drop the extra entries).
    """
    def __init__(self, cfg, model_classes, weights_paths, weights=None):
        super().__init__()
        model_classes = list(model_classes)
        weights_paths = list(weights_paths)
        if not model_classes:
            raise ValueError("ModelEnsembler needs at least one model class.")
        if len(model_classes) != len(weights_paths):
            raise ValueError(
                f"Got {len(model_classes)} model classes but {len(weights_paths)} weights paths."
            )

        self.models = torch.nn.ModuleList()
        self.device = cfg.device
        for cls, path in zip(model_classes, weights_paths):
            model = cls(cfg).to(self.device)
            state_dict = torch.load(path, map_location=self.device, weights_only=True)
            if "model" in state_dict:
                state_dict = state_dict["model"]

            # Loading stays non-strict (as before), but mismatches are reported:
            # a silently skipped key means that layer keeps its random init.
            load_result = model.load_state_dict(state_dict, strict=False)
            if load_result.missing_keys or load_result.unexpected_keys:
                print(
                    f"[WARN] {cls.__name__}: checkpoint {path} does not fully match the model "
                    f"(missing keys: {list(load_result.missing_keys)}, "
                    f"unexpected keys: {list(load_result.unexpected_keys)})"
                )

            model.eval()
            self.models.append(model)

        n_models = len(self.models)
        if weights is None or len(weights) == 0:
            # No (or empty) weights -> uniform average, same as the old `weights or ...`
            weights = [1.0 / n_models] * n_models
        elif len(weights) != n_models:
            raise ValueError(
                f"Got {len(weights)} ensemble weights for {n_models} models."
            )
        self.weights = list(weights)

    def forward(self, x):
        """
        Return the weighted sum of every member's output for input ``x``.

        The same ``x`` is passed unchanged to each model, so all members must
        accept the same input and produce outputs of the same shape.
        """
        ensemble_logits = 0
        for model, weight in zip(self.models, self.weights):
            ensemble_logits = ensemble_logits + weight * model(x)
        return ensemble_logits

In [7]:
class AudioPreprocessor:
    """
    Preprocesses audio for EfficientNet inference by generating log-mel spectrograms.

    All settings are taken from the ``cfg`` object passed to the constructor;
    no module-level/global configuration is read.

    Methods:
    --------
    process_file(file_path: str) -> List[torch.Tensor]:
        Processes an audio file and returns a list of spectrogram tensors.
    """

    # Audio longer than this many seconds is truncated when loading
    MAX_DURATION_SEC = 60

    def __init__(self, cfg):
        """
        Parameters:
        -----------
        cfg : object
            Configuration object. Fields read: FS, CHUNK_SAMPLES, N_FFT,
            HOP_LENGTH, N_MELS, FMIN, FMAX, POWER, topDB and (in
            ``process_file``) debug.
        """
        self.cfg = cfg
        self.sample_rate = cfg.FS
        self.chunk_samples = cfg.CHUNK_SAMPLES

        # Torchaudio transforms for spectrogram generation
        self.mel_transform = T.MelSpectrogram(
            sample_rate=cfg.FS,
            n_fft=cfg.N_FFT,
            hop_length=cfg.HOP_LENGTH,
            n_mels=cfg.N_MELS,
            f_min=cfg.FMIN,
            f_max=cfg.FMAX,
            power=cfg.POWER
        )

        self.db_transform = T.AmplitudeToDB(top_db=cfg.topDB)

    def _load_waveform(self, path: str) -> torch.Tensor:
        """
        Load audio, convert it to mono at ``self.sample_rate`` and truncate it
        to ``MAX_DURATION_SEC`` seconds.
        """
        waveform, sr = torchaudio.load(path)
        if waveform.shape[0] > 1:
            waveform = waveform.mean(dim=0, keepdim=True)  # convert to mono

        # Chunking and the mel transform both assume self.sample_rate, so bring
        # files recorded at a different rate in line (no-op when rates match).
        if sr != self.sample_rate:
            waveform = torchaudio.functional.resample(waveform, sr, self.sample_rate)

        max_samples = self.sample_rate * self.MAX_DURATION_SEC
        return waveform[..., :max_samples]

    def _split_into_chunks(self, waveform: torch.Tensor) -> list:
        """
        Splits the waveform into consecutive chunks of ``self.chunk_samples``
        samples, zero-padding the last one on the right if it is short.

        An empty waveform yields an empty list.
        """
        total_samples = waveform.shape[-1]
        chunks = []

        for start in range(0, total_samples, self.chunk_samples):
            chunk = waveform[..., start:start + self.chunk_samples]

            # Only the final chunk can be short; pad it with zeros.
            # (cyclic_pad is available as an alternative padding strategy.)
            missing = self.chunk_samples - chunk.shape[-1]
            if missing > 0:
                chunk = F.pad(chunk, (0, missing))

            chunks.append(chunk)
        return chunks

    def cyclic_pad(self, chunk: torch.Tensor, desired_len: int) -> torch.Tensor:
        """
        Extend a [channels, samples] chunk to ``desired_len`` samples by
        repeating it from the start. An empty chunk has nothing to repeat, so
        zeros are returned instead of dividing by zero.
        """
        if chunk.shape[-1] == 0:
            return chunk.new_zeros(*chunk.shape[:-1], desired_len)
        repeats = (desired_len // chunk.shape[-1]) + 1
        return chunk.repeat(1, repeats)[..., :desired_len]

    def _waveform_to_spec(self, waveform: torch.Tensor) -> torch.Tensor:
        """
        Convert waveform to a normalized log-mel spectrogram.

        The dB spectrogram is standardized with its own mean and std (per call),
        then the leading dimension is squeezed and the result is moved to CPU.
        """
        mel = self.mel_transform(waveform)
        log_mel = self.db_transform(mel)
        norm_spec = (log_mel - log_mel.mean()) / (log_mel.std() + 1e-6)
        return norm_spec.squeeze(0).cpu()  # [n_mels, Time] for a mono [1, samples] input

    def process_file(self, file_path: str) -> list:
        """
        Full pipeline to process an audio file.

        Parameters:
        -----------
        file_path : str
            Path to the audio file.

        Returns:
        --------
        list of torch.Tensor:
            One spectrogram per chunk, each repeated over 3 channels ([3, M, T]).
            The list is empty when the file contains no samples.
        """
        if self.cfg.debug:
            print(f"[INFO] Processing file: {file_path}")
        waveform = self._load_waveform(file_path)
        chunks = self._split_into_chunks(waveform)

        spectrograms = []
        for chunk in tqdm(chunks, desc="Generating Spectrograms", leave=False):
            spec = self._waveform_to_spec(chunk)
            # The image backbone expects 3 input channels -> replicate the spectrogram
            spectrograms.append(spec.unsqueeze(0).repeat(3, 1, 1))  # [3, M, T]

        return spectrograms


In [8]:
class InferencePipeline:
    """
    Inference pipeline: audio files -> per-chunk class probabilities -> submission CSV.

    Methods:
    --------
    run_inference(file_list: List[str]) -> pd.DataFrame:
        Runs inference on a list of audio files and returns predictions in DataFrame format.

    generate_submission(predictions, sample_submission, species_ids, output_path):
        Generates the final submission CSV file.
    """

    def __init__(self, model, preprocessor, cfg):
        """
        Parameters:
        -----------
        model : torch.nn.Module
            Model returning logits of shape [batch, num_classes]; it is moved to
            ``cfg.device`` and switched to eval mode here.
        preprocessor : object
            Object exposing ``process_file(path) -> list of [3, M, T] tensors``.
        cfg : object
            Configuration object. Fields read: device, taxonomy_path, debug and
            (optionally) CHUNK_LENGTH.
        """
        self.model = model
        self.preprocessor = preprocessor
        self.cfg = cfg
        self.device = cfg.device
        self.model.to(self.device)
        self.model.eval()

        # Column order of the prediction frame follows the taxonomy file
        taxonomy = pd.read_csv(self.cfg.taxonomy_path)
        self.label_map = taxonomy["primary_label"].values

    def run_inference(self, file_list):
        """
        Run inference on a list of audio files and collect predictions.

        For each file, all chunks are scored in a single batch, passed through a
        sigmoid, and then multiplied by the file-level mean probability of each
        class. Files that produce no chunks are skipped with a warning.

        Parameters:
        -----------
        file_list : List[str]
            List of file paths for inference.

        Returns:
        --------
        pd.DataFrame:
            DataFrame with row_id and species probabilities for submission.
            ``row_id`` is ``<file stem>_<end second of the chunk>``.
        """
        print("[INFO] Starting inference on soundscapes...")
        # Seconds covered by one chunk; falls back to 5 if the config lacks it
        step_seconds = int(getattr(self.cfg, "CHUNK_LENGTH", 5))
        all_results = []

        for file_path in tqdm(file_list, desc="Running Inference"):
            soundscape_id = os.path.splitext(os.path.basename(file_path))[0]
            if self.cfg.debug:
                print(f"[INFO] Processing {soundscape_id}")

            # Process audio and get spectrogram chunks
            spectrograms = self.preprocessor.process_file(file_path)
            if len(spectrograms) == 0:
                # Nothing to stack or score (e.g. zero-length audio)
                print(f"[WARN] No audio chunks produced for {soundscape_id}; skipping.")
                continue

            # Score every chunk of the file in one forward pass
            batch = torch.stack(spectrograms).to(self.device)  # [n_chunks, 3, M, T]
            with torch.no_grad():
                logits = self.model(batch)
                chunk_predictions = torch.sigmoid(logits).cpu().numpy()  # [n_chunks, num_classes]

            # === Post-Processing: Multiply each chunk prob with mean prob per class ===
            class_mean = chunk_predictions.mean(axis=0)  # [num_classes]
            smoothed = chunk_predictions * class_mean    # [n_chunks, num_classes]

            # One output row per chunk, labelled with the chunk's end time
            for chunk_idx, probs in enumerate(smoothed, start=1):
                row_id = f"{soundscape_id}_{chunk_idx * step_seconds}"
                all_results.append([row_id] + list(probs))

        # Convert to DataFrame
        columns = ["row_id"] + list(self.label_map)
        predictions_df = pd.DataFrame(all_results, columns=columns)

        print(f"[INFO] Inference complete! Processed {len(file_list)} files.")
        return predictions_df

    def generate_submission(self, predictions, sample_submission, species_ids, output_path="submission.csv"):
        """
        Generate submission CSV from predictions.

        Parameters:
        -----------
        predictions : pd.DataFrame
            DataFrame with the inference results (a ``row_id`` column plus one
            column per predicted species).

        sample_submission : str
            Path to the sample submission; only its column order is used.

        species_ids : List[str]
            Species to include. Any species without a column in ``predictions``
            is filled with 0.0.

        output_path : str
            Output path for the final submission file.

        Returns:
        --------
        pd.DataFrame:
            The frame that was written to ``output_path``.
        """
        print("[INFO] Creating submission dataframe...")

        # Load sample submission to get the correct structure
        sample_df = pd.read_csv(sample_submission, index_col='row_id')

        # Keep the requested species (missing ones become 0.0), then put the
        # columns in the exact order of the sample submission.
        submission_df = (
            predictions
            .set_index('row_id')
            .reindex(columns=list(species_ids), fill_value=0.0)
        )
        submission_df = submission_df[sample_df.columns].reset_index()

        # Save to CSV
        submission_df.to_csv(output_path, index=False)

        print(f"[INFO] Submission saved to {output_path}")
        return submission_df


In [9]:

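# Single-model alternative (disabled). It needs `cfg = CFG()` to exist first,
# so move it below the initialization cell before uncommenting.
#model = EfficientNetClassifier(cfg)
#print(f"[INFO] Loading weights from {cfg.classifier_weights_path}")
#model.load_state_dict(torch.load(cfg.classifier_weights_path, map_location=cfg.device, weights_only=True))
#model.eval()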


In [10]:
# === Configuration and audio preprocessing ===
cfg = CFG()
preprocessor = AudioPreprocessor(cfg)

# === Ensemble members: (model class, checkpoint path, blend weight) ===
ensemble_members = [
    (EfficientNetFrequencySED, cfg.freqwise_weights_path, 0.5),
    (EfficientNetTimeSED, cfg.timewise_weights_path, 0.3),
    (EfficientNetClassifier, cfg.classifier_weights_path, 0.2),
]
model_classes = [member[0] for member in ensemble_members]
weights_paths = [member[1] for member in ensemble_members]
ensemble_weights = [member[2] for member in ensemble_members]

# === Report and build the ensemble ===
print("[INFO] Initializing ensemble model...")
for i, (cls, path, wt) in enumerate(ensemble_members):
    print(f" → Model {i}: {cls.__name__}")
    print(f"    - Weights path: {path}")
    print(f"    - Weight in ensemble: {wt}")

ensemble_model = ModelEnsembler(cfg, model_classes, weights_paths, weights=ensemble_weights)
print(f"[INFO] Ensemble model initialized with {len(ensemble_model.models)} models.\n")

# === Inference pipeline wrapping the ensemble ===
inference_pipeline = InferencePipeline(ensemble_model, preprocessor, cfg)

# === Collect the .ogg soundscapes, in sorted order ===
# Point test_path at cfg.train_soundscapes_path instead for a local debug run.
test_path = cfg.test_soundscapes_path
ogg_names = sorted(name for name in os.listdir(test_path) if name.endswith(".ogg"))
test_files = [os.path.join(test_path, name) for name in ogg_names]
print(f"[INFO] Found {len(test_files)} test files for inference.\n")

# === Score every soundscape ===
predictions = inference_pipeline.run_inference(test_files)

# === Write the submission file, one column per taxonomy species ===
taxonomy_df = pd.read_csv(cfg.taxonomy_path)
species_ids = taxonomy_df['primary_label'].tolist()

submission_df = inference_pipeline.generate_submission(
    predictions,
    sample_submission=cfg.sample_submission_path,
    species_ids=species_ids,
    output_path="submission.csv",
)

print("\n[INFO] Sample of final submission:")
print(submission_df.head())


[INFO] Initializing ensemble model...
 → Model 0: EfficientNetFrequencySED
    - Weights path: /kaggle/input/effnet14/efficientnet_b0_sed.pth
    - Weight in ensemble: 0.5
 → Model 1: EfficientNetTimeSED
    - Weights path: /kaggle/input/effnet28/efficientnet_b0_sed.pth
    - Weight in ensemble: 0.3
 → Model 2: EfficientNetClassifier
    - Weights path: /kaggle/input/effnet33/efficientnet_b0_sed.pth
    - Weight in ensemble: 0.2
[INFO] Ensemble model initialized with 3 models.

[INFO] Found 0 test files for inference.

[INFO] Starting inference on soundscapes...


Running Inference: 0it [00:00, ?it/s]

[INFO] Inference complete! Processed 0 files.
[INFO] Creating submission dataframe...
[INFO] Submission saved to submission.csv

[INFO] Sample of final submission:
Empty DataFrame
Columns: [row_id, 1139490, 1192948, 1194042, 126247, 1346504, 134933, 135045, 1462711, 1462737, 1564122, 21038, 21116, 21211, 22333, 22973, 22976, 24272, 24292, 24322, 41663, 41778, 41970, 42007, 42087, 42113, 46010, 47067, 476537, 476538, 48124, 50186, 517119, 523060, 528041, 52884, 548639, 555086, 555142, 566513, 64862, 65336, 65344, 65349, 65373, 65419, 65448, 65547, 65962, 66016, 66531, 66578, 66893, 67082, 67252, 714022, 715170, 787625, 81930, 868458, 963335, amakin1, amekes, ampkin1, anhing, babwar, bafibi1, banana, baymac, bbwduc, bicwre1, bkcdon, bkmtou1, blbgra1, blbwre1, blcant4, blchaw1, blcjay1, blctit1, blhpar1, blkvul, bobfly1, bobher1, brtpar1, bubcur1, bubwre1, bucmot3, bugtan, butsal1, cargra1, cattyr, chbant1, chfmac1, cinbec1, cocher1, cocwoo1, colara1, colcha1, compau, compot1, ...]
Index: