## LinReg ensemble instad of simple mean

Since 4 models were trained on 4 folds, there is no proper validation data for ensemble coefficients creation. Couple of possible workaraunds:
1. Fit ensemble coefficients on all 4 folds simultaneously (on each fold 3 correspondent models will be better, but overall coefficients may be fair)
1. Fit coefs on the unused birdclef23 data

In [3]:
import re
import os
import gc
import sys
import cv2
import math
import numpy as np
import pandas as pd
from glob import glob
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import librosa
import scipy as sci
import timm

import torch
from torch import nn
from torchvision.models import efficientnet

import pytorch_lightning as pl

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
import os
import torchvision
import torch
import pytorch_lightning as pl
import pandas as pd
from tqdm.auto import tqdm
import math
import librosa
import cv2
import torch.nn as nn
from metric import score
import albumentations as A
import numpy as np
from sklearn.model_selection import StratifiedKFold
import torchaudio


In [5]:
class Config:
    # == global config ==
    SEED = 42 # random seed
    DEVICE = 'cuda'  # device to be used
    MIXED_PRECISION = False  # whether to use mixed-16 precision
    # OUTPUT_DIR = '/kaggle/working/'  # output folder
    OUTPUT_DIR = '/data/yaz/birdclef24/out/'  # output folder
    CKPT_ROOT = '/data/yaz/birdclef24/baselinev2/out/'

    use_cache = False
    
    # == data config ==
    # DATA_ROOT = '/kaggle/input/birdclef-2024'  # root folder
    DATA_ROOT = '/data/yaz/birdclef24/data'  # root folder
    # PREPROCESSED_DATA_ROOT = '/kaggle/input/birdclef24-spectrograms-via-cupy'
    # PREPROCESSED_DATA_ROOT = '/data/yaz/birdclef24/data/specs'
    # LOAD_DATA = True  # whether to load data from pre-processed dataset

    image_size = 256

    SR = 32000  # sample rate
    mel_spec_params = {
        "sample_rate": 32000,
        "n_mels": 128,
        "f_min": 20,
        "f_max": 16000,
        "n_fft": 2048,
        "hop_length": 512,
        "normalized": True,
        "center" : True,
        "pad_mode" : "constant",
        "norm" : "slaney",
        "onesided" : True,
        "mel_scale" : "slaney"
    }

    top_db = 80 

    train_period = 5
    val_period = 5
    secondary_coef = 1.0
    train_duration = train_period * mel_spec_params["sample_rate"]
    val_duration = val_period * mel_spec_params["sample_rate"]

    # == model config ==
    MODEL_TYPE = 'efficientnet_b0'  # model type
    
    # == dataset config ==
    # BATCH_SIZE = 128  # batch size of each step
    BATCH_SIZE = 256 # batch size of each step
    N_WORKERS = 4  # number of workers
    # N_WORKERS = 0  # number of workers
    
    # == AUG ==
    USE_XYMASKING = True  # whether use XYMasking
    
    # == training config ==
    N_FOLDS = 4  # n fold
    EPOCHS = 30  # max epochs
    LR = 1e-3  # learning rate
    WEIGHT_DECAY = 1e-5  # weight decay of optimizer

    
    # == other config ==
    VISUALIZE = False # whether to visualize data and batch


config = Config()

In [6]:
print('fix seed')
pl.seed_everything(config.SEED, workers=True)

Seed set to 42


fix seed


42

In [7]:
df = pd.read_csv(config.DATA_ROOT + '/train_metadata.csv')
df["path"] = config.DATA_ROOT + "/train_audio/" + df["filename"]
df["rating"] = np.clip(df["rating"] / df["rating"].max(), 0.1, 1.0)

skf = StratifiedKFold(n_splits=config.N_FOLDS, random_state=config.SEED, shuffle=True)
df['fold'] = -1
for ifold, (train_idx, val_idx) in enumerate(skf.split(X=df, y=df["primary_label"].values)):
    df.loc[val_idx, 'fold'] = ifold

sub = pd.read_csv(config.DATA_ROOT + "/sample_submission.csv")
target_columns = sub.columns.tolist()[1:]
label_list = target_columns
num_classes = len(target_columns)
bird2id = {b: i for i, b in enumerate(target_columns)}

In [8]:
device = torch.device("cuda:4")

In [9]:
def normalize_melspec(X, eps=1e-6):
    mean = X.mean((1, 2), keepdim=True)
    std = X.std((1, 2), keepdim=True)
    Xstd = (X - mean) / (std + eps)

    norm_min, norm_max = (
        Xstd.min(-1)[0].min(-1)[0],
        Xstd.max(-1)[0].max(-1)[0],
    )
    fix_ind = (norm_max - norm_min) > eps * torch.ones_like(
        (norm_max - norm_min)
    )
    V = torch.zeros_like(Xstd)
    if fix_ind.sum():
        V_fix = Xstd[fix_ind]
        norm_max_fix = norm_max[fix_ind, None, None]
        norm_min_fix = norm_min[fix_ind, None, None]
        V_fix = torch.max(
            torch.min(V_fix, norm_max_fix),
            norm_min_fix,
        )
        V_fix = (V_fix - norm_min_fix) / (norm_max_fix - norm_min_fix)
        V[fix_ind] = V_fix
    return V


def read_wav(path):
    wav, org_sr = torchaudio.load(path, normalize=True)
    wav = torchaudio.functional.resample(wav, orig_freq=org_sr, new_freq=config.SR)
    return wav


def crop_start_wav(wav, duration_):
    while wav.size(-1) < duration_:
        wav = torch.cat([wav, wav], dim=1)
    wav = wav[:, :duration_]
    return wav

In [10]:
transforms_val = A.Compose([
    A.Resize(config.image_size, config.image_size),
    A.Normalize()
])

In [15]:
all_waves = []
all_targets = []

# https://www.kaggle.com/code/markwijkhuizen/birdclef-2024-efficientvit-inference
if len(glob(f'{config.DATA_ROOT}/test_soundscapes/*.ogg')) > 0:
    ogg_file_paths = glob(f'{config.DATA_ROOT}/test_soundscapes/*.ogg')
else:
    ogg_file_paths = sorted(glob(f'{config.DATA_ROOT}/train_audio/*/*.ogg'))

for i, file_path in tqdm(enumerate(ogg_file_paths), total=len(ogg_file_paths)):

    if i % 100 != 0:
        continue

    bird = file_path.split("/")[-2]
    target = bird2id[bird]

    row_id = re.search(r'/([^/]+)\.ogg$', file_path).group(1)  # filename
    audio_data = read_wav(file_path)

    SR = config.SR
    SEGMENT_DURATION = 5  # seconds
    SEGMENT_SAMPLES = SR * SEGMENT_DURATION

    # Iterate over the segments
    for j in range(48):
        start_idx = SR * SEGMENT_DURATION * j
        end_idx = SR * SEGMENT_DURATION * (j + 1)
        wave = audio_data[:, start_idx:end_idx]
        
        # Check if the wave is shorter than the required length
        if wave.size(1) < SEGMENT_SAMPLES:
            # Pad the wave with zeros
            padding = SEGMENT_SAMPLES - wave.size(1)
            wave = torch.nn.functional.pad(wave, (0, padding), "constant", 0)

        all_waves.append(wave) 
        all_targets.append(target)

100%|██████████| 24459/24459 [00:08<00:00, 2801.99it/s]


In [16]:
class BirdDatasetInference(torch.utils.data.Dataset):
    def __init__(self, wavs, transform=None, add_secondary_labels=False, mode=None, use_cache=config.use_cache):
        self.wavs = wavs
        self.processed = [False for _ in range(len(self.wavs))]
        self.bird2id = bird2id
        self.num_classes = num_classes
        self.secondary_coef = config.secondary_coef
        self.add_secondary_labels = add_secondary_labels
        self.mel_transform = torchaudio.transforms.MelSpectrogram(**config.mel_spec_params)
        self.db_transform = torchaudio.transforms.AmplitudeToDB(stype='power', top_db=config.top_db)
        self.transform = transform
        self.mode = mode
        self.use_cache = use_cache

    def __len__(self):
        return len(self.wavs)

    def prepare_spec(self, wav):
        mel_spectrogram = normalize_melspec(self.db_transform(self.mel_transform(wav)))
        mel_spectrogram = mel_spectrogram * 255
        mel_spectrogram = mel_spectrogram.expand(3, -1, -1).permute(1, 2, 0).numpy()
        return mel_spectrogram

    def __getitem__(self, idx):

        if not self.processed[idx]:
            spec = self.prepare_spec(self.wavs[idx])

            if self.transform is not None:
                res = self.transform(image=spec)
                spec = res['image'].astype(np.float32)
            else:
                spec = spec.astype(np.float32)

            spec = spec.transpose(2, 0, 1)

            if self.use_cache:
                self.wavs[idx] = spec
                self.processed[idx] = True
        else:
            spec = self.wavs[idx]

        return {"spec": spec}

In [17]:
class EffNet(nn.Module):
    def __init__(self, model_name=config.MODEL_TYPE, num_classes=None) -> None:
        super().__init__()

        self.model = timm.create_model(
            model_name, 
            pretrained=True, 
            in_chans=3, 
            num_classes=num_classes
        )
    
    def forward(self, x):
        # [B, W, H] -> [B, 1, W, H]
        # x = x.unsqueeze(1)
        x = self.model(x)

        return x

In [18]:
class FocalLossBCE():
    pass

class BirdModel(pl.LightningModule):
    
    def __init__(self):
        super().__init__()
        
        # == backbone ==
        self.backbone = EffNet(num_classes=num_classes)
        
        # == loss function ==
        # self.loss_fn = nn.CrossEntropyLoss()
        self.loss_fn = FocalLossBCE() 
        
        # == record ==
        self.validation_step_outputs = []
        
    def forward(self, images):
        return self.backbone(images)
    
    def configure_optimizers(self):
        
        # == define optimizer ==
        model_optimizer = torch.optim.Adam(
            filter(lambda p: p.requires_grad, self.parameters()),
            lr=config.LR,
            weight_decay=config.WEIGHT_DECAY
        )
        
        lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            model_optimizer,
            T_max=config.EPOCHS,
            eta_min=1e-6,
        )
        
        return {
            'optimizer': model_optimizer,
            'lr_scheduler': {
                'scheduler': lr_scheduler,
                'interval': 'epoch',
                'monitor': 'val_loss',
                'frequency': 1
            }
        }
    
    @staticmethod
    def mixup(data, targets, alpha):
        indices = torch.randperm(data.size(0))
        data2 = data[indices]
        targets2 = targets[indices]

        lam = torch.tensor([np.random.beta(alpha, alpha)], device=data.device)
        data = data * lam + data2 * (1 - lam)
        targets = targets * lam + targets2 * (1 - lam)

        return data, targets
    
    def training_step(self, batch, batch_idx):
        
        # == obtain input and target ==
        image, target = batch["spec"], batch["target"]
        image, target = self.mixup(image, target, 0.5)
        
        # == pred ==
        y_pred = self(image)
        
        # == compute loss ==
        train_loss = self.loss_fn(y_pred, target)
        
        # == record ==
        self.log('train_loss', train_loss, prog_bar=True, on_step=True, on_epoch=True)
        
        return train_loss
    
    def validation_step(self, batch, batch_idx):
        
        # == obtain input and target ==
        image, target = batch['spec'], batch['target']
        image = image.to(self.device)
        target = target.to(self.device)
        
        # == pred ==
        with torch.no_grad():
            y_pred = self(image)
            
        self.validation_step_outputs.append({"logits": y_pred, "targets": target})
        
    def train_dataloader(self):
        return self._train_dataloader

    def validation_dataloader(self):
        return self._validation_dataloader
    
    def on_validation_epoch_end(self):
        
        # = merge batch data =
        outputs = self.validation_step_outputs
        
        output_val = nn.Softmax(dim=1)(torch.cat([x['logits'] for x in outputs], dim=0)).cpu().detach()
        target_val = torch.cat([x['targets'] for x in outputs], dim=0).cpu().detach()
        
        # = compute validation loss =
        val_loss = self.loss_fn(output_val, target_val)
        
        # target to one-hot
        # target_val = torch.nn.functional.one_hot(target_val, num_classes)
        
        # = val with ROC AUC =
        gt_df = pd.DataFrame(target_val.numpy().astype(np.float32), columns=label_list)
        pred_df = pd.DataFrame(output_val.numpy().astype(np.float32), columns=label_list)
        
        gt_df['id'] = [f'id_{i}' for i in range(len(gt_df))]
        pred_df['id'] = [f'id_{i}' for i in range(len(pred_df))]
        
        val_score = score(gt_df, pred_df, row_id_column_name='id')
        
        self.log("val_score", val_score, on_epoch=True)
        self.log("val_loss", val_loss, on_epoch=True)
        
        # clear validation outputs
        self.validation_step_outputs = list()
        
        return {'val_loss': val_loss, 'val_score': val_score}

In [19]:
def predict(data_loader, onnx_model):
    pred = []
    for batch in tqdm(data_loader):
        with torch.no_grad():
            x = batch['spec']
            n_pad = 0
            
            # == make sure the batch_size equal to setting
            if x.shape[0] < config.BATCH_SIZE:
                n_pad = config.BATCH_SIZE - x.shape[0]
                zero_tensor = torch.zeros((n_pad, 3, 256, 256))
                x = torch.cat([x, zero_tensor], dim=0)
            
            outputs = onnx_model.run(output_names, {input_names[0]: x.numpy()})[0]
            outputs = sci.special.softmax(outputs[:config.BATCH_SIZE-n_pad, ...], axis=1)
        pred.append(outputs)
    
    return np.concatenate(pred, axis=0)

In [20]:
import onnx
import onnxruntime as ort

In [21]:
onnx_ckpt_list = [
    f"/data/yaz/birdclef24/baselinev2/fold_{f}.onnx" for f in range(4)
]

In [22]:
predictions = []
test_dataset = BirdDatasetInference(all_waves, transform=transforms_val)

for ckpt in onnx_ckpt_list:
    
    # == init ONNX model ==
    onnx_model = onnx.load(ckpt)
    onnx_model_graph = onnx_model.graph
    onnx_session = ort.InferenceSession(onnx_model.SerializeToString())
    
    # == create dataset & dataloader ==
    test_loader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size=config.BATCH_SIZE,
        num_workers=config.N_WORKERS,
        shuffle=False,
        drop_last=False
    )
    
    predictions.append(predict(test_loader, onnx_session))
    gc.collect()

# predictions = np.mean(predictions, axis=0)

  0%|          | 0/46 [00:10<?, ?it/s]


NameError: name 'output_names' is not defined