In [21]:
import re
import os
import gc
import sys
import cv2
import math
import numpy as np
import pandas as pd
from glob import glob
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import librosa
import scipy as sci
import timm

import torch
from torch import nn
from torchvision.models import efficientnet

import pytorch_lightning as pl

In [22]:
import os
import torchvision
import torch
import pytorch_lightning as pl
import pandas as pd
from tqdm.auto import tqdm
import math
import librosa
import cv2
import torch.nn as nn
import albumentations as A
import numpy as np
from sklearn.model_selection import StratifiedKFold
import torchaudio


In [23]:
class Config:
    # == global config ==
    SEED = 42 # random seed
    DEVICE = 'cuda'  # device to be used
    MIXED_PRECISION = False  # whether to use mixed-16 precision
    # OUTPUT_DIR = '/kaggle/working/'  # output folder
    OUTPUT_DIR = '/data/yaz/birdclef24/out/'  # output folder
    # CKPT_ROOT = '/kaggle/input/meleffnetbirdclef24/pytorch/4folds/1'
    CKPT_ROOT = '/data/yaz/birdclef24/baselinev2/out'

    use_cache = False
    
    # == data config ==
    # DATA_ROOT = '/kaggle/input/birdclef-2024'  # root folder
    DATA_ROOT = '/data/yaz/birdclef24/data'  # root folder
    # PREPROCESSED_DATA_ROOT = '/kaggle/input/birdclef24-spectrograms-via-cupy'
    # PREPROCESSED_DATA_ROOT = '/data/yaz/birdclef24/data/specs'
    # LOAD_DATA = True  # whether to load data from pre-processed dataset

    image_size = 256

    SR = 32000  # sample rate
    mel_spec_params = {
        "sample_rate": 32000,
        "n_mels": 128,
        "f_min": 20,
        "f_max": 16000,
        "n_fft": 2048,
        "hop_length": 512,
        "normalized": True,
        "center" : True,
        "pad_mode" : "constant",
        "norm" : "slaney",
        "onesided" : True,
        "mel_scale" : "slaney"
    }

    top_db = 80 

    train_period = 5
    val_period = 5
    secondary_coef = 1.0
    train_duration = train_period * mel_spec_params["sample_rate"]
    val_duration = val_period * mel_spec_params["sample_rate"]

    # == model config ==
    MODEL_TYPE = 'efficientnet_b0'  # model type
    
    # == dataset config ==
    BATCH_SIZE = 128  # batch size of each step
#     BATCH_SIZE = 64 # batch size of each step
    N_WORKERS = 4  # number of workers
#     N_WORKERS = 0  # number of workers
    
    # == AUG ==
    USE_XYMASKING = True  # whether use XYMasking
    
    # == training config ==
    N_FOLDS = 4  # n fold
    EPOCHS = 30  # max epochs
    LR = 1e-3  # learning rate
    WEIGHT_DECAY = 1e-5  # weight decay of optimizer

    
    # == other config ==
    VISUALIZE = False # whether to visualize data and batch


config = Config()

In [24]:
print('fix seed')
pl.seed_everything(config.SEED, workers=True)

Seed set to 42


fix seed


42

In [25]:
df = pd.read_csv(config.DATA_ROOT + '/train_metadata.csv')
df["path"] = config.DATA_ROOT + "/train_audio/" + df["filename"]
df["rating"] = np.clip(df["rating"] / df["rating"].max(), 0.1, 1.0)

skf = StratifiedKFold(n_splits=config.N_FOLDS, random_state=config.SEED, shuffle=True)
df['fold'] = -1
for ifold, (train_idx, val_idx) in enumerate(skf.split(X=df, y=df["primary_label"].values)):
    df.loc[val_idx, 'fold'] = ifold

sub = pd.read_csv(config.DATA_ROOT + "/sample_submission.csv")
target_columns = sub.columns.tolist()[1:]
label_list = target_columns
num_classes = len(target_columns)
bird2id = {b: i for i, b in enumerate(target_columns)}

In [26]:
device = torch.device("cpu")

In [27]:
def normalize_melspec(X, eps=1e-6):
    mean = X.mean((1, 2), keepdim=True)
    std = X.std((1, 2), keepdim=True)
    Xstd = (X - mean) / (std + eps)

    norm_min, norm_max = (
        Xstd.min(-1)[0].min(-1)[0],
        Xstd.max(-1)[0].max(-1)[0],
    )
    fix_ind = (norm_max - norm_min) > eps * torch.ones_like(
        (norm_max - norm_min)
    )
    V = torch.zeros_like(Xstd)
    if fix_ind.sum():
        V_fix = Xstd[fix_ind]
        norm_max_fix = norm_max[fix_ind, None, None]
        norm_min_fix = norm_min[fix_ind, None, None]
        V_fix = torch.max(
            torch.min(V_fix, norm_max_fix),
            norm_min_fix,
        )
        V_fix = (V_fix - norm_min_fix) / (norm_max_fix - norm_min_fix)
        V[fix_ind] = V_fix
    return V


def read_wav(path):
    wav, org_sr = torchaudio.load(path, normalize=True)
    wav = torchaudio.functional.resample(wav, orig_freq=org_sr, new_freq=config.SR)
    return wav


def crop_start_wav(wav, duration_):
    while wav.size(-1) < duration_:
        wav = torch.cat([wav, wav], dim=1)
    wav = wav[:, :duration_]
    return wav

In [28]:
transforms_val = A.Compose([
    A.Resize(config.image_size, config.image_size),
    A.Normalize()
])

In [29]:
class BirdDatasetInference(torch.utils.data.Dataset):
    def __init__(self, specs, transform=None, add_secondary_labels=False, mode=None, use_cache=config.use_cache):
        self.specs = specs
        self.processed = [False for _ in range(len(self.specs))]
        self.bird2id = bird2id
        self.num_classes = num_classes
        self.secondary_coef = config.secondary_coef
        self.add_secondary_labels = add_secondary_labels
        self.mel_transform = torchaudio.transforms.MelSpectrogram(**config.mel_spec_params)
        self.db_transform = torchaudio.transforms.AmplitudeToDB(stype='power', top_db=config.top_db)
        self.transform = transform
        self.mode = mode
        self.use_cache = use_cache

    def __len__(self):
        return len(self.specs)

    def prepare_spec(self, wav):
        mel_spectrogram = normalize_melspec(self.db_transform(self.mel_transform(wav)))
        mel_spectrogram = mel_spectrogram * 255
        mel_spectrogram = mel_spectrogram.expand(3, -1, -1).permute(1, 2, 0).numpy()
        return mel_spectrogram

    def __getitem__(self, idx):

        if not self.processed[idx]:
            spec = self.specs[idx]

            if self.transform is not None:
                res = self.transform(image=spec)
                spec = res['image'].astype(np.float32)
            else:
                spec = spec.astype(np.float32)

            spec = spec.transpose(2, 0, 1)

            if self.use_cache:
                self.specs[idx] = spec
                self.processed[idx] = True
        else:
            spec = self.specs[idx]

        return {"spec": spec}

In [30]:
class EffNet(nn.Module):
    def __init__(self, model_name=config.MODEL_TYPE, num_classes=None) -> None:
        super().__init__()

        self.model = timm.create_model(
            model_name, 
            pretrained=False, 
            in_chans=3, 
            num_classes=num_classes
        )
    
    def forward(self, x):
        # [B, W, H] -> [B, 1, W, H]
        # x = x.unsqueeze(1)
        x = self.model(x)

        return x

In [31]:
class FocalLossBCE():
    pass

class BirdModel(pl.LightningModule):
    
    def __init__(self):
        super().__init__()
        
        # == backbone ==
        self.backbone = EffNet(num_classes=num_classes)
        
        # == loss function ==
        # self.loss_fn = nn.CrossEntropyLoss()
        self.loss_fn = FocalLossBCE() 
        
        # == record ==
        self.validation_step_outputs = []
        
    def forward(self, images):
        return self.backbone(images)
    
    def configure_optimizers(self):
        
        # == define optimizer ==
        model_optimizer = torch.optim.Adam(
            filter(lambda p: p.requires_grad, self.parameters()),
            lr=config.LR,
            weight_decay=config.WEIGHT_DECAY
        )
        
        lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            model_optimizer,
            T_max=config.EPOCHS,
            eta_min=1e-6,
        )
        
        return {
            'optimizer': model_optimizer,
            'lr_scheduler': {
                'scheduler': lr_scheduler,
                'interval': 'epoch',
                'monitor': 'val_loss',
                'frequency': 1
            }
        }
    
    @staticmethod
    def mixup(data, targets, alpha):
        indices = torch.randperm(data.size(0))
        data2 = data[indices]
        targets2 = targets[indices]

        lam = torch.tensor([np.random.beta(alpha, alpha)], device=data.device)
        data = data * lam + data2 * (1 - lam)
        targets = targets * lam + targets2 * (1 - lam)

        return data, targets
    
    def training_step(self, batch, batch_idx):
        
        # == obtain input and target ==
        image, target = batch["spec"], batch["target"]
        image, target = self.mixup(image, target, 0.5)
        
        # == pred ==
        y_pred = self(image)
        
        # == compute loss ==
        train_loss = self.loss_fn(y_pred, target)
        
        # == record ==
        self.log('train_loss', train_loss, prog_bar=True, on_step=True, on_epoch=True)
        
        return train_loss
    
    def validation_step(self, batch, batch_idx):
        
        # == obtain input and target ==
        image, target = batch['spec'], batch['target']
        image = image.to(self.device)
        target = target.to(self.device)
        
        # == pred ==
        with torch.no_grad():
            y_pred = self(image)
            
        self.validation_step_outputs.append({"logits": y_pred, "targets": target})
        
    def train_dataloader(self):
        return self._train_dataloader

    def validation_dataloader(self):
        return self._validation_dataloader
    
    def on_validation_epoch_end(self):
        
        # = merge batch data =
        outputs = self.validation_step_outputs
        
        output_val = nn.Softmax(dim=1)(torch.cat([x['logits'] for x in outputs], dim=0)).cpu().detach()
        target_val = torch.cat([x['targets'] for x in outputs], dim=0).cpu().detach()
        
        # = compute validation loss =
        val_loss = self.loss_fn(output_val, target_val)
        
        # target to one-hot
        # target_val = torch.nn.functional.one_hot(target_val, num_classes)
        
        # = val with ROC AUC =
        gt_df = pd.DataFrame(target_val.numpy().astype(np.float32), columns=label_list)
        pred_df = pd.DataFrame(output_val.numpy().astype(np.float32), columns=label_list)
        
        gt_df['id'] = [f'id_{i}' for i in range(len(gt_df))]
        pred_df['id'] = [f'id_{i}' for i in range(len(pred_df))]
        
        val_score = score(gt_df, pred_df, row_id_column_name='id')
        
        self.log("val_score", val_score, on_epoch=True)
        self.log("val_loss", val_loss, on_epoch=True)
        
        # clear validation outputs
        self.validation_step_outputs = list()
        
        return {'val_loss': val_loss, 'val_score': val_score}

In [77]:
# ckpt_list = [f'{config.CKPT_ROOT}/fold_{i}.ckpt' for i in range(config.N_FOLDS)]
# ckpt_list = [f'{config.CKPT_ROOT}/fold_{i}.ckpt' for i in range(3)]
ckpt_list = [f'{config.CKPT_ROOT}/fold_{i}.ckpt' for i in range(1)]

In [78]:
input_tensor = torch.randn(config.BATCH_SIZE, 3, 256, 256)  # input shape
input_names = ['x']
output_names = ['output']

In [79]:
onnx_ckpt_list = list()
for ckpt_path in ckpt_list:
    ckpt_name = os.path.basename(ckpt_path).split('.')[0]
    # == init model ==
    bird_model = BirdModel()
    
    # == load ckpt ==
    weights = torch.load(ckpt_path, map_location=torch.device('cpu'))['state_dict']
    bird_model.load_state_dict(weights)
    bird_model.eval()
    
    # == convert to onnx ==
    torch.onnx.export(bird_model.backbone, input_tensor, f"{ckpt_name}.onnx", verbose=False, input_names=input_names, output_names=output_names)
    
    onnx_ckpt_list.append(f"{ckpt_name}.onnx")

In [80]:
def predict(data_loader, onnx_model):
    pred = []
    for batch in tqdm(data_loader):
        with torch.no_grad():
            x = batch['spec']
            n_pad = 0
            
            # == make sure the batch_size equal to setting
            if x.shape[0] < config.BATCH_SIZE:
                n_pad = config.BATCH_SIZE - x.shape[0]
                zero_tensor = torch.zeros((n_pad, 3, 256, 256))
                x = torch.cat([x, zero_tensor], dim=0)
            
            outputs = onnx_model.run(output_names, {input_names[0]: x.numpy()})[0]
            outputs = sci.special.softmax(outputs[:config.BATCH_SIZE-n_pad, ...], axis=1)
        pred.append(outputs)
    
    return np.concatenate(pred, axis=0)

In [81]:
# !pip install /kaggle/input/onnxruntime/onnxruntime-1.17.3-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl --no-index --find-links /kaggle/input/onnxruntime

In [82]:
import onnx
import onnxruntime as ort

In [100]:
test_samples = df.sample(1000)
gt_target = torch.nn.functional.one_hot(torch.tensor(test_samples['primary_label'].map(bird2id).array))
gt_target

tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])

In [101]:
mel_transform = torchaudio.transforms.MelSpectrogram(**config.mel_spec_params)
db_transform = torchaudio.transforms.AmplitudeToDB(stype='power', top_db=config.top_db)

def prepare_spec(wav):
    mel_spectrogram = normalize_melspec(db_transform(mel_transform(wav)))
    mel_spectrogram = mel_spectrogram * 255
    mel_spectrogram = mel_spectrogram.expand(3, -1, -1).permute(1, 2, 0).numpy()
    return mel_spectrogram

all_specs = []
sub_names = []
target = []

# https://www.kaggle.com/code/markwijkhuizen/birdclef-2024-efficientvit-inference
if len(glob(f'{config.DATA_ROOT}/test_soundscapes/*.ogg')) > 0:
    ogg_file_paths = glob(f'{config.DATA_ROOT}/test_soundscapes/*.ogg')
else:
#     ogg_file_paths = sorted(glob(f'{config.DATA_ROOT}/unlabeled_soundscapes/*.ogg'))[:1112]
    # ogg_file_paths = sorted(glob(f'{config.DATA_ROOT}/unlabeled_soundscapes/*.ogg'))[:11]
    ogg_file_paths = test_samples['path']

for i, file_path in tqdm(enumerate(ogg_file_paths), total=len(ogg_file_paths)):
    row_id = re.search(r'/([^/]+)\.ogg$', file_path).group(1)  # filename
    audio_data = read_wav(file_path)

    SR = config.SR
    SEGMENT_DURATION = 5  # seconds
    SEGMENT_SAMPLES = SR * SEGMENT_DURATION

    # Iterate over the segments
    num_segments = min(48, audio_data.shape[1] // SEGMENT_SAMPLES)
    for j in range(num_segments):
        start_idx = SR * SEGMENT_DURATION * j
        end_idx = SR * SEGMENT_DURATION * (j + 1)
        wave = audio_data[:, start_idx:end_idx]
        
        # Check if the wave is shorter than the required length
        if wave.size(1) < SEGMENT_SAMPLES:
            # Pad the wave with zeros
            padding = SEGMENT_SAMPLES - wave.size(1)
            wave = torch.nn.functional.pad(wave, (0, padding), "constant", 0)

        all_specs.append(prepare_spec(wave)) 
        sub_names.append(f"{row_id}_{(j+1) * SEGMENT_DURATION}")
        target.append(bird2id[test_samples.iloc[i]['primary_label']])

  0%|          | 0/1000 [00:00<?, ?it/s]

100%|██████████| 1000/1000 [00:53<00:00, 18.52it/s]


In [102]:
gt_target = torch.nn.functional.one_hot(torch.tensor(target))

In [103]:
predictions = []

test_dataset = BirdDatasetInference(all_specs, transform=transforms_val)

for ckpt in onnx_ckpt_list:
    
    # == init ONNX model ==
    onnx_model = onnx.load(ckpt)
    onnx_model_graph = onnx_model.graph
    onnx_session = ort.InferenceSession(onnx_model.SerializeToString())
    
    # == create dataset & dataloader ==
    test_loader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size=config.BATCH_SIZE,
        num_workers=config.N_WORKERS,
        shuffle=False,
        drop_last=False
    )
    
    predictions.append(predict(test_loader, onnx_session))
    gc.collect()

predictions = np.mean(predictions, axis=0)

100%|██████████| 55/55 [01:46<00:00,  1.93s/it]


In [104]:
sub_pred = pd.DataFrame(predictions, columns=label_list)
sub_id = pd.DataFrame({'row_id': sub_names})
sub = pd.concat([sub_id, sub_pred], axis=1)

# sub.iloc[:, 1:] = predictions[:sub.shape[0], :]

sub.to_csv('submission.csv',index=False)
print(f'Submissionn shape: {sub.shape}')
sub

Submissionn shape: (6994, 183)


Unnamed: 0,row_id,asbfly,ashdro1,ashpri1,ashwoo2,asikoe2,asiope1,aspfly1,aspswi1,barfly1,...,whbwoo2,whcbar1,whiter2,whrmun,whtkin2,woosan,wynlau1,yebbab1,yebbul3,zitcis1
0,XC640180_5,0.000035,0.000014,0.000110,0.000021,0.000066,0.000079,0.000034,0.000027,0.000029,...,0.000251,0.000069,0.000187,0.000025,0.000034,0.001884,0.000013,0.000021,0.000026,0.000252
1,XC640180_10,0.000025,0.000043,0.000163,0.000026,0.000126,0.000103,0.000055,0.000056,0.000039,...,0.001304,0.000089,0.000134,0.000033,0.000113,0.002224,0.000026,0.000031,0.000081,0.000404
2,XC116833_5,0.000053,0.002070,0.000145,0.000272,0.000157,0.000063,0.000910,0.000036,0.000078,...,0.000513,0.000124,0.000255,0.000192,0.000267,0.000038,0.000126,0.000015,0.009110,0.000183
3,XC116833_10,0.000202,0.016172,0.000665,0.002815,0.000615,0.000403,0.004676,0.001056,0.000193,...,0.007747,0.000147,0.006842,0.002394,0.003764,0.001863,0.001932,0.000235,0.010245,0.000845
4,XC116833_15,0.000366,0.000200,0.000691,0.002389,0.000195,0.000123,0.009688,0.000103,0.000047,...,0.001120,0.001246,0.000859,0.001063,0.000049,0.000405,0.000504,0.000129,0.010744,0.000642
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6989,XC302516_5,0.000013,0.000039,0.000664,0.000244,0.000250,0.000083,0.000069,0.000038,0.000016,...,0.000011,0.000063,0.000107,0.000059,0.000102,0.000132,0.000069,0.000044,0.000036,0.000536
6990,XC307304_5,0.008240,0.004188,0.000389,0.000897,0.003018,0.000663,0.001774,0.000216,0.002821,...,0.000140,0.000657,0.003587,0.022961,0.004115,0.005641,0.000658,0.001799,0.000192,0.004618
6991,XC500337_5,0.000024,0.000020,0.000035,0.000014,0.000090,0.000015,0.000010,0.000014,0.000006,...,0.000014,0.000030,0.000201,0.000023,0.000007,0.000529,0.000020,0.000166,0.000012,0.000069
6992,XC500337_10,0.000107,0.000062,0.000217,0.000072,0.000439,0.000061,0.000046,0.000335,0.000034,...,0.000115,0.000237,0.000571,0.000147,0.000079,0.003775,0.000101,0.001537,0.000082,0.000475


In [105]:
label_list[target[-1]]

'grnsan'

In [106]:
import pandas as pd
import pandas.api.types

import sklearn.metrics


class ParticipantVisibleError(Exception):
    pass


def score(solution: pd.DataFrame, submission: pd.DataFrame, row_id_column_name: str) -> float:
    '''
    Version of macro-averaged ROC-AUC score that ignores all classes that have no true positive labels.
    '''
    del solution[row_id_column_name]
    del submission[row_id_column_name]

    if not pandas.api.types.is_numeric_dtype(submission.values):
        bad_dtypes = {x: submission[x].dtype  for x in submission.columns if not pandas.api.types.is_numeric_dtype(submission[x])}
        raise ParticipantVisibleError(f'Invalid submission data types found: {bad_dtypes}')

    solution_sums = solution.sum(axis=0)
    scored_columns = list(solution_sums[solution_sums > 0].index.values)
    assert len(scored_columns) > 0

    return sklearn.metrics.roc_auc_score(solution[scored_columns].values, submission[scored_columns].values, average='macro')

In [107]:
# if len(all_specs) < 1000:
if True:
    gt = sub.copy()
    # gt.iloc[:, 1:] = np.random.randint(0, 2, size=gt.iloc[:, 1:].shape)
    gt.iloc[:, 1:] = gt_target.numpy()
    # gt.iloc[-1, 1:] = 0
    # gt.iloc[:, 160:] = 0
    print(score(gt, sub.copy(), "row_id"))

0.980999563257912
