## Import libraries

In [40]:
import os
import logging
import random
import gc
import time
import cv2
import math
import warnings
from pathlib import Path
import soundfile as sf

import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import librosa

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader, Sampler, BatchSampler

from tqdm.auto import tqdm

import timm

warnings.filterwarnings("ignore")
logging.basicConfig(level=logging.ERROR)

## Config

In [41]:
class CFG:
    """Central configuration for data paths, spectrogram settings and training.

    All values are plain class attributes; a single instance (`cfg`) is created
    right after the class and passed around to the rest of the notebook.
    """
    
    seed = 42
    debug = True  
    apex = False
    num_workers = 2
    
    OUTPUT_DIR = '../working/'

    # Pre-computed spectrograms / metadata for the labelled training clips.
    train_datadir = '../input/data-1024-128-256-256/1024_128_256_256'
    train_csv = '../input/data-1024-128-256-256/1024_128_256_256.csv'
    train_oggdir = '../input/train_audio'
    # test_soundscapes = '../Data/test_soundscapes'
    # submission_csv = '../Data/sample_submission.csv'
    # Read by taxonomy_process() to build the species list / label index.
    taxonomy_csv = '../Data/taxonomy.csv'
    # Soundscape (pseudo-labelled) data locations.
    train_ssdir = '../input/SOUNDSCAPE_1024_128_256_256'
    train_sscsv = '../input/SOUNDSCAPE_1024_128_256_256.csv'
    train_sssub = '../input/submission.csv'
    train_ssoggdir = '../input/train_soundscapes'


    # timm backbone name and construction options (see BirdCLEFModel).
    model_name = 'efficientnet_b0'  
    pretrained = True
    in_channels = 1

    LOAD_DATA = False  # then, use on-the-fly spectrogram
    FS = 32000
    TARGET_DURATION = 5.0
    TARGET_SHAPE = (256, 256)       ########### CHANGE!!
    
    # Mel-spectrogram parameters forwarded to librosa in audio2melspec().
    N_FFT = 1024                    ########### CHANGE!!
    HOP_LENGTH = 128  ########## CHANGE!!
    N_MELS = 128  ########## CHANGE!!
    FMIN = 20
    FMAX = 16000
    
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    epochs = 20  
    batch_size = 32  
    criterion = 'BCEWithLogitsLoss'

    n_fold = 5
    selected_folds = [0, 1, 2, 3, 4]   

    optimizer = 'AdamW'
    lr = 5e-4 
    weight_decay = 1e-5
  
    scheduler = 'CosineAnnealingLR'
    min_lr = 1e-6
    # NOTE: evaluated once at class-definition time from `epochs` above (20 -> 5);
    # update_debug_settings() changing `epochs` later does not change T_max.
    T_max = epochs // 4

    # Temperature used by SoundscapeFromNPY: pseudo-label probabilities are
    # converted to logits, divided by T and passed through a sigmoid again.
    ss_label_smooth_temp = 5.0 # when loading dataset, smooth pseudo-labeled data with sigmoid(data/T)

    # Augmentation settings consumed by AugmentationPipeline.
    augmentation = True
    aug_scheduler = 'Constant' # Constant, Ramp, Exp
    aug_weight_x = 0.5
    aug_weight_y = 0.5

    # Mixup settings consumed by MixupPipeline.
    mixup_alpha = [0.5, 4.0]  # float or list of float with size 2 
    # provide : [initial alpha for ss, final alpha for ss]
    # alternatively, just give a float for fixed alpha (treats train data, ss equally)
    mixup_ss = True  # enable mixup with soundscape data
    mixup_scheduler = 'Ramp'  # Constant, Ramp, Exp
    
    def update_debug_settings(self):
        """Shrink the run (2 epochs, fold 2 only) when `debug` is set.

        NOTE(review): not called anywhere in the visible part of this file;
        confirm it is invoked before training starts.
        """
        if self.debug:
            self.epochs = 2
            self.selected_folds = [2]

# Shared configuration instance used by the rest of the notebook.
cfg = CFG()

## Augmentation Pipeline

In [None]:
class AugmentationPipeline:
    """Epoch-scheduled time/frequency masking for spectrogram arrays.

    The pipeline is called with a spectrogram indexed as ``spec[0, freq, time]``
    (only channel 0 is masked) and zeroes random stripes in place.  The chance
    of augmenting a sample is given by :meth:`schedule_p`.

    Args:
        config: object exposing ``augmentation``, ``epochs``, ``aug_scheduler``,
            ``aug_weight_x`` and ``aug_weight_y``.
        current_epo_fn: zero-argument callable returning the current epoch.
        rng: optional ``np.random.RandomState``; a fresh one is created if omitted.
    """

    def __init__(self, config, current_epo_fn, rng=None):
        # Always set these so a disabled pipeline is still a valid object;
        # previously the early return left the instance without attributes.
        self.enabled = bool(config.augmentation)
        self.current_epo_fn = current_epo_fn
        self.rng = np.random.RandomState() if rng is None else rng
        self.aug_delay = 0      # augmentation starts at this epoch
        self.p_max = 0.5        # maximum augmentation probability
        self.exp_decay = 5.0    # decay constant for the 'Exp' scheduler
        if not self.enabled:
            print("Augmentation disabled.")
            return
        self.max_epoch = config.epochs
        self.weight_dict = {
            'xmask': config.aug_weight_x,
            'ymask': config.aug_weight_y,
        }
        self.aug_scheduler = config.aug_scheduler

    def schedule_p(self):
        """Return the augmentation probability for the current epoch.

        Returns 0.0 when disabled or before ``aug_delay``, ``p_max`` after
        ``max_epoch``; in between the value follows the configured scheduler.

        Raises:
            Exception: if the scheduler name is not 'Constant', 'Ramp' or 'Exp'.
        """
        if not self.enabled:
            return 0.0

        D = self.aug_delay
        T = self.max_epoch
        t = self.current_epo_fn()
        p_max = self.p_max

        if t < D:
            return 0.0
        if t > T:
            return p_max

        span = T - D
        if self.aug_scheduler == 'Constant':
            return p_max
        if self.aug_scheduler == 'Ramp':
            # span == 0 would divide by zero; the ramp is already complete then.
            return p_max if span <= 0 else (t - D) / span * p_max
        if self.aug_scheduler == 'Exp':
            c = self.exp_decay
            denom = 1 - np.exp(-span * c)
            if span <= 0 or denom == 0:
                return p_max
            return p_max / denom * (1 - np.exp(-(t - D) * c))

        print(f"Specified {self.aug_scheduler} not defined. Use 'Constant', 'Ramp' and 'Exp'")
        raise Exception("Aug Scheduler type not implemented")

    def _mask_axis(self, spec, axis):
        """Zero 1-2 random stripes of channel 0 along ``axis`` (1=freq, 2=time)."""
        size = spec.shape[axis]
        num_masks = self.rng.randint(1, 3)
        for _ in range(num_masks):
            # Clamp so spectrograms smaller than the stripe do not crash randint.
            width = min(self.rng.randint(5, 20), size)
            # +1 because randint's upper bound is exclusive: the stripe may now
            # reach the last frame/bin, and width == size is valid.
            start = self.rng.randint(0, size - width + 1)
            if axis == 2:
                spec[0, :, start:start + width] = 0
            else:
                spec[0, start:start + width, :] = 0

    def __call__(self, spec):
        """Apply augmentations to spectrogram (in place) and return it."""
        if not self.enabled:
            return spec

        p = self.schedule_p()

        if self.rng.uniform() < p:
            # Time masking: zero a block of consecutive time frames
            # (columns of axis 2, i.e. vertical stripes in the image).
            if self.rng.uniform() < self.weight_dict['xmask']:
                self._mask_axis(spec, axis=2)

            # Frequency masking: zero a block of consecutive frequency bins
            # (rows of axis 1, i.e. horizontal stripes in the image).
            if self.rng.uniform() < self.weight_dict['ymask']:
                self._mask_axis(spec, axis=1)

        return spec

## Mixup Pipeline

In [None]:
class MixupPipeline:
    """Mixes labelled-clip batches with soundscape batches on an epoch schedule.

    Args:
        config: object exposing ``mixup_alpha`` (a number, or a 2-element
            sequence ``[initial, final]``), ``mixup_scheduler`` and ``epochs``.
            An optional ``mixup_p_max`` overrides the maximum mixup probability.
        current_epo_fn: zero-argument callable returning the current epoch.
        rng: optional ``np.random.RandomState``; a fresh one is created if omitted.
    """

    def __init__(self, config, current_epo_fn, rng=None):

        self.mixup_delay = 0
        # NOTE(review): the default of 0 means schedule_p() always returns 0,
        # i.e. mixup is never selected by callers that gate on it. Kept as the
        # default for backward compatibility; set `config.mixup_p_max` to enable.
        self.p_max = getattr(config, 'mixup_p_max', 0)
        self.exp_decay = 5.0

        alpha = config.mixup_alpha
        if isinstance(alpha, (int, float)):
            # Fixed alpha. Integers are accepted too; previously an int fell
            # through to the sequence branch and raised TypeError.
            self.alpha_initial = alpha
            self.alpha_final = alpha
        else:
            self.alpha_initial = alpha[0]
            self.alpha_final = alpha[1]

        # self.mixup_ss = config.mixup_ss # do we need this?

        self.current_epo_fn = current_epo_fn
        self.mixup_scheduler = config.mixup_scheduler
        self.total_epo = config.epochs

        self.rng = np.random.RandomState() if rng is None else rng

    def _epoch_ratio(self):
        """Return training progress (t-D)/(T-D), clamped to [0, 1]."""
        span = self.total_epo - self.mixup_delay
        if span <= 0:
            return 1.0
        ratio = (self.current_epo_fn() - self.mixup_delay) / span
        return min(max(ratio, 0.0), 1.0)

    def mixup_data(self, x_orig, x_ss):
        """Blend ``x_orig`` with ``x_ss`` using a Beta-distributed weight.

        Returns:
            ``(mixed_x, lam)`` where ``mixed_x = lam * x_orig + (1 - lam) * x_ss``
            and ``lam`` is a Python float in [0, 1].
        """
        # Clamped so epochs outside [delay, total] cannot produce negative
        # (invalid) Beta parameters or a zero division.
        epochratio = self._epoch_ratio()

        # Beta parameters are interpolated in opposite directions:
        # alpha_origin goes initial -> final, alpha_ss goes final -> initial.
        # NOTE(review): the CFG comment describes the list as
        # [initial alpha for ss, final alpha for ss], which reads as the
        # opposite direction for alpha_ss - confirm which is intended.
        alpha_origin = self.alpha_initial * (1.0 - epochratio) + self.alpha_final * epochratio
        alpha_ss = self.alpha_initial * epochratio + self.alpha_final * (1.0 - epochratio)

        # float() avoids mixing a numpy scalar into tensor arithmetic.
        lam = float(self.rng.beta(alpha_origin, alpha_ss))

        mixed_x = lam * x_orig + (1 - lam) * x_ss

        return mixed_x, lam

    def mixup_criterion(self, pred, y_orig, y_ss, lam, criterion=None):
        """Return the lam-weighted sum of the loss against both target sets.

        ``criterion`` defaults to ``F.binary_cross_entropy_with_logits``.
        """
        if criterion is None:
            criterion = F.binary_cross_entropy_with_logits

        return lam * criterion(pred, y_orig) + (1 - lam) * criterion(pred, y_ss)

    def schedule_p(self):
        """Return the mixup probability for the current epoch.

        Returns 0.0 before ``mixup_delay``, ``p_max`` after ``total_epo``;
        in between the value follows the configured scheduler.

        Raises:
            Exception: if the scheduler name is not 'Constant', 'Ramp' or 'Exp'.
        """
        D = self.mixup_delay
        T = self.total_epo
        t = self.current_epo_fn()
        p_max = self.p_max

        if t < D:
            return 0.0
        if t > T:
            return p_max

        span = T - D
        if self.mixup_scheduler == 'Constant':
            return p_max
        if self.mixup_scheduler == 'Ramp':
            # span == 0 would divide by zero; the ramp is already complete then.
            return p_max if span <= 0 else (t - D) / span * p_max
        if self.mixup_scheduler == 'Exp':
            c = self.exp_decay
            denom = 1 - np.exp(-span * c)
            if span <= 0 or denom == 0:
                return p_max
            return p_max / denom * (1 - np.exp(-(t - D) * c))

        print(f"Specified {self.mixup_scheduler} not defined. Use 'Constant', 'Ramp' and 'Exp'")
        raise Exception("Aug Scheduler type not implemented")

## Pre-processing 
- used when LOAD_DATA is False

In [None]:
def audio2melspec(audio_data, cfg):
    """Convert audio data to a min-max normalised log-mel spectrogram.

    Args:
        audio_data: 1-D array of audio samples.
        cfg: config providing FS, N_FFT, HOP_LENGTH, N_MELS, FMIN and FMAX.

    Returns:
        2-D array (as produced by ``librosa.feature.melspectrogram``) converted
        to dB relative to its maximum and rescaled to the range [0, 1].
    """
    nan_mask = np.isnan(audio_data)
    if nan_mask.any():
        # Replace NaNs by the mean of the valid samples. If every sample is
        # NaN the mean itself is NaN, so fall back to silence instead of
        # propagating NaNs into the spectrogram.
        fill_value = 0.0 if nan_mask.all() else np.nanmean(audio_data)
        audio_data = np.nan_to_num(audio_data, nan=fill_value)

    mel_spec = librosa.feature.melspectrogram(
        y=audio_data,
        sr=cfg.FS,
        n_fft=cfg.N_FFT,
        hop_length=cfg.HOP_LENGTH,
        n_mels=cfg.N_MELS,
        fmin=cfg.FMIN,
        fmax=cfg.FMAX,
        power=2.0
    )

    mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
    # 1e-8 keeps the division finite for a constant (e.g. silent) spectrogram.
    db_min = mel_spec_db.min()
    mel_spec_norm = (mel_spec_db - db_min) / (mel_spec_db.max() - db_min + 1e-8)

    return mel_spec_norm

def process_audio_file(audio_path, cfg):
    """Process a single audio file to get the mel spectrogram.

    Reads the file, repeats it if shorter than ``cfg.TARGET_DURATION`` seconds,
    takes the centre window of that duration, converts it with
    ``audio2melspec`` and resizes the result to ``cfg.TARGET_SHAPE``.

    Returns:
        float32 array of shape ``cfg.TARGET_SHAPE``, or ``None`` if anything
        fails (the error is printed; this best-effort behaviour is deliberate
        so one bad file does not abort a whole run).
    """
    try:
        #audio_data, _ = librosa.load(audio_path, sr=cfg.FS)
        # NOTE(review): sf.read does not resample; this assumes the files are
        # already sampled at cfg.FS - TODO confirm.
        audio_data, _ = sf.read(audio_path, dtype='float32')

        # Multi-channel files come back as (frames, channels); down-mix to
        # mono so the tiling and spectrogram code below see a 1-D signal.
        if audio_data.ndim > 1:
            audio_data = audio_data.mean(axis=1)

        if len(audio_data) == 0:
            raise ValueError("audio file contains no samples")

        target_samples = int(cfg.TARGET_DURATION * cfg.FS)

        # Too short: repeat the clip until it covers the target window.
        if len(audio_data) < target_samples:
            n_copy = math.ceil(target_samples / len(audio_data))
            audio_data = np.tile(audio_data, n_copy)

        # Extract center 5 seconds
        start_idx = max(0, int(len(audio_data) / 2 - target_samples / 2))
        end_idx = min(len(audio_data), start_idx + target_samples)
        center_audio = audio_data[start_idx:end_idx]

        if len(center_audio) < target_samples:
            center_audio = np.pad(center_audio,
                                  (0, target_samples - len(center_audio)),
                                  mode='constant')

        mel_spec = audio2melspec(center_audio, cfg)

        target_shape = tuple(cfg.TARGET_SHAPE)
        if mel_spec.shape != target_shape:
            # cv2.resize takes dsize as (width, height), i.e. (cols, rows),
            # which is the reverse of a numpy shape.
            dsize = (target_shape[1], target_shape[0])
            mel_spec = cv2.resize(mel_spec, dsize, interpolation=cv2.INTER_LINEAR)

        return mel_spec.astype(np.float32)

    except Exception as e:
        print(f"Error processing {audio_path}: {e}")
        return None

def generate_spectrograms(df, cfg):
    """Generate spectrograms from audio files.

    Args:
        df: DataFrame with ``samplename`` and ``filepath`` columns.
        cfg: config passed through to ``process_audio_file``; when
            ``cfg.debug`` is set only the first 1000 rows are processed.

    Returns:
        dict mapping ``samplename`` to the spectrogram array of every file
        that was processed successfully.
    """
    print("Generating mel spectrograms from audio files...")
    start_time = time.time()

    all_bird_data = {}
    errors = []

    # Count rows by position: the DataFrame index need not be 0..n-1 (e.g.
    # after filtering), so it cannot be used for the debug cap.
    for pos, (_, row) in enumerate(tqdm(df.iterrows(), total=len(df))):
        if cfg.debug and pos >= 1000:
            break

        try:
            samplename = row['samplename']
            filepath = row['filepath']

            mel_spec = process_audio_file(filepath, cfg)

            if mel_spec is not None:
                all_bird_data[samplename] = mel_spec
            else:
                # process_audio_file reports its own error and returns None;
                # record it so the failure count below is accurate.
                errors.append((filepath, "process_audio_file returned None"))

        except Exception as e:
            failed_path = row.get('filepath')
            print(f"Error processing {failed_path}: {e}")
            errors.append((failed_path, str(e)))

    end_time = time.time()
    print(f"Processing completed in {end_time - start_time:.2f} seconds")
    print(f"Successfully processed {len(all_bird_data)} files out of {len(df)}")
    print(f"Failed to process {len(errors)} files")

    return all_bird_data

## functions

- set_seed : sets random, np, torch seed 
- collate_fn : Custom collate function to handle different sized specs
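- taxonomy_process : reads the taxonomy CSV and returns (taxonomy_df, species_ids, num_classes, label_to_idx); factored out because several classes need it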

In [43]:
def set_seed(seed=42):
    """Seed Python, NumPy and PyTorch RNGs and make cuDNN deterministic.

    Also exports ``PYTHONHASHSEED`` with the same value.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)

    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_fn in seeders:
        seed_fn(seed)

    # Trade speed for reproducibility in cuDNN kernels.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

set_seed(cfg.seed)

def collate_fn(batch):
    """Collate sample dicts into one dict of per-key lists / stacked tensors.

    ``None`` samples are dropped; an all-``None`` batch yields ``{}``.
    Tensor ``'target'`` entries are stacked.  Tensor ``'melspec'`` entries are
    stacked only when all shapes agree, otherwise they stay a list.  Every
    other key stays a plain list.
    """
    samples = [sample for sample in batch if sample is not None]
    if not samples:
        return {}

    collated = {name: [] for name in samples[0]}
    for sample in samples:
        for name, value in sample.items():
            collated[name].append(value)

    for name in collated:
        values = collated[name]
        if name == 'target':
            if isinstance(values[0], torch.Tensor):
                collated[name] = torch.stack(values)
        elif name == 'melspec' and isinstance(values[0], torch.Tensor):
            distinct_shapes = {str(tensor.shape) for tensor in values}
            if len(distinct_shapes) == 1:
                collated[name] = torch.stack(values)

    return collated

def taxonomy_process(cfg):
    """
    Load ``cfg.taxonomy_csv`` and derive the class vocabulary from it.

    returns tuple of (taxonomy_df, species_ids, num_classes, label_to_idx)
    where ``species_ids`` is the ``primary_label`` column as a list and
    ``label_to_idx`` maps each label to its position in that list.
    """
    taxonomy_df = pd.read_csv(cfg.taxonomy_csv)
    species_ids = taxonomy_df['primary_label'].tolist()
    num_classes = len(species_ids)
    label_to_idx = dict(zip(species_ids, range(num_classes)))
    return taxonomy_df, species_ids, num_classes, label_to_idx

## BirdCLEFDatasetFromNPY

In [45]:
class BirdCLEFDatasetFromNPY(Dataset):
    """Dataset of pre-computed spectrograms (``.npy``) for labelled clips.

    Args:
        df: metadata frame with ``filename``, ``primary_label`` and
            ``secondary_labels`` columns; ``samplename`` and ``melpath`` are
            derived (and written into ``df``) when missing.
        cfg: config providing ``taxonomy_csv``, ``train_datadir``, ``debug``,
            ``seed`` and ``augmentation``.
        augmentor: optional callable applied to the spectrogram tensor in
            train mode when ``cfg.augmentation`` is set.
        mode: ``"train"`` enables augmentation; any other value disables it.
    """

    def __init__(self, df, cfg, augmentor = None, mode="train"):
        self.df = df
        self.cfg = cfg
        self.mode = mode
        self.augmentor = augmentor

        _ , self.species_ids, self.num_classes, self.label_to_idx = taxonomy_process(cfg)

        if 'samplename' not in self.df.columns:
            # "<first path component>-<file stem>" of the filename column.
            self.df['samplename'] = self.df.filename.map(lambda x: x.split('/')[0] + '-' + x.split('/')[-1].split('.')[0])

        if 'melpath' not in self.df.columns:
            # NOTE(review): no '.npy' suffix is appended here, yet np.load is
            # given this path verbatim - confirm the files are stored without
            # an extension.
            self.df['melpath'] = self.cfg.train_datadir + '/' + self.df['samplename']

        if cfg.debug:
            self.df = self.df.sample(min(1000, len(self.df)), random_state=cfg.seed).reset_index(drop=True)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return a dict with ``melspec``, ``target``, ``filename``, ``melpath``."""
        row = self.df.iloc[idx]
        spec = np.load(row['melpath']).astype(np.float32)

        spec = torch.tensor(spec, dtype=torch.float32).unsqueeze(0)  # Add channel dimension

        # Guard against augmentation being enabled without an augmentor.
        if self.mode == "train" and self.cfg.augmentation and self.augmentor is not None:
            spec = self.augmentor(spec)

        target = self.encode_label(row['primary_label'], row['secondary_labels'])

        return {
            'melspec': spec,
            'target': torch.tensor(target, dtype=torch.float32),
            'filename': row['filename'],
            'melpath' : row['melpath']
        }

    @staticmethod
    def _secondary_labels(label2):
        """Normalise the secondary-label field to a list of labels.

        Accepts a list-like, or a string holding a Python list literal (as
        stored in the CSV).  Anything else (None, NaN, ...) yields ``[]``.
        """
        import ast

        if isinstance(label2, str):
            text = label2.strip()
            if not text:
                return []
            try:
                # literal_eval only parses literals, unlike eval which would
                # execute arbitrary code found in the CSV.
                parsed = ast.literal_eval(text)
            except (ValueError, SyntaxError):
                # Not a literal: treat the raw text as a single label.
                return [text]
            if isinstance(parsed, str):
                return [parsed]
            if isinstance(parsed, (list, tuple, set)):
                return list(parsed)
            return []
        if isinstance(label2, (list, tuple, set, np.ndarray)):
            return list(label2)
        return []

    def encode_label(self, label1, label2=None):
        """Encode label to multi-hot vector.

        ``label1`` is the primary label, ``label2`` the secondary labels (see
        ``_secondary_labels`` for accepted forms).  Labels missing from
        ``label_to_idx`` are ignored.
        """
        target = np.zeros(self.num_classes)
        if label1 in self.label_to_idx:
            target[self.label_to_idx[label1]] = 1.0

        for label in self._secondary_labels(label2):
            if label in self.label_to_idx:
                target[self.label_to_idx[label]] = 1.0

        return target

## SoundscapeFromNPY

In [None]:
class SoundscapeFromNPY(Dataset):
    """Dataset of pre-computed soundscape spectrograms with pseudo-labels.

    ``df`` (metadata) and ``labeldf`` (per-class probabilities) are aligned by
    row position.  Labels are softened with temperature
    ``cfg.ss_label_smooth_temp`` and rows whose largest softened probability
    is below the threshold are dropped at construction time.

    Args:
        df: metadata frame with ``filename`` and ``index`` columns; ``melpath``
            and ``samplename`` are derived (and written into ``df``) if missing.
        labeldf: frame whose columns are species labels; columns not present
            in the taxonomy are ignored.
        cfg: config providing ``taxonomy_csv``, ``train_ssdir``, ``debug``,
            ``seed`` and ``ss_label_smooth_temp``.  An optional
            ``ss_threshold`` sets the filter threshold (default 0.0 keeps
            every row).
        augmentor: stored but currently unused.
        mode: stored but currently unused.
    """

    def __init__(self, df,labeldf, cfg, augmentor = None, mode="train"):
        self.cfg = cfg
        self.mode = mode
        self.augmentor = augmentor

        _, self.species_ids, self.num_classes, self.label_to_idx = taxonomy_process(cfg)

        self.temperature = cfg.ss_label_smooth_temp
        # Previously never defined although _filter_df_by_threshold reads it.
        self.threshold = getattr(cfg, 'ss_threshold', 0.0)

        ## TODO : integrate with train_sssub

        # cfg.train_ssdir = '../input/SOUNDSCAPE_1024_128_256_256
        # filedir = ./Data/train_soundscapes/H29_20230523_194000.ogg
        # filename = H29_20230523_194000.ogg

        if 'melpath' not in df.columns:
            # SSDIR / H*****.ogg-index.npy
            # astype(str) so a numeric 'index' column can be concatenated.
            df['melpath'] = cfg.train_ssdir + '/' + df['filename'] + '-' + df['index'].astype(str) + '.npy'

        if 'samplename' not in df.columns:
            df['samplename'] = df['filename'] + '-' + df['index'].astype(str) + '.npy'

        if cfg.debug:
            # Sample on a positional index so the SAME rows can be taken from
            # labeldf; selecting labels after reset_index() picked the first
            # n label rows instead of the sampled ones.
            picked = df.reset_index(drop=True).sample(min(1000, len(df)), random_state=cfg.seed)
            positions = picked.index.to_numpy()
            self.df = picked.reset_index(drop=True)
            self.ldf = labeldf.iloc[positions].reset_index(drop=True)
        else:
            self.df = df
            self.ldf = labeldf

        self.df, self.ldf = self._filter_df_by_threshold(self.df, self.ldf)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return a dict with ``melspec``, ``target``, ``filename``, ``melpath``."""
        row = self.df.iloc[idx]
        lrow = self.ldf.iloc[idx]
        spec = np.load(row['melpath']).astype(np.float32)

        spec = torch.tensor(spec, dtype=torch.float32).unsqueeze(0)  # Add channel dimension

        #if self.mode == "train" and self.cfg.augmentation :
        #    spec = self.augmentor(spec)
        # should we add augmentation to the Soundscape?

        target = self._parse_label(lrow, self.temperature)

        return {
            'melspec': spec,
            'target': torch.tensor(target, dtype=torch.float32),
            'filename': row['filename'],
            'melpath' : row['melpath']
            }

    def _filter_df_by_threshold(self, df, labeldf):
        """
        Filters df using self.threshold based on max value of parsed label.
        Prints number of kept and dropped samples.

        Returns the filtered ``(df, labeldf)`` pair, both re-indexed from 0.
        """
        filtered_indices = [
            idx for idx in range(len(df))
            if np.max(self._parse_label(labeldf.iloc[idx], self.temperature)) >= self.threshold
        ]

        filtered_df = df.iloc[filtered_indices].reset_index(drop=True)
        filtered_labeldf = labeldf.iloc[filtered_indices].reset_index(drop=True)

        num_total = len(df)
        num_kept = len(filtered_df)

        print(f"Filtered samples with threshold {self.threshold}:")
        print(f"  Kept: {num_kept} / {num_total} samples")

        return filtered_df, filtered_labeldf

    def _prob2tprob(self, target, T, eps=1e-6):
        """
        Args:
            target: np.array of probabilities, values in [0, 1]
            T: temperature (T > 0)
            eps: small constant to prevent log(0)
        Returns:
            softened probabilities in (0, 1)
        """
        # Logit Transformation: log(p / (1 - p))
        logits = np.log((target + eps) / (1 - target + eps))

        # Temperature scaling
        softened_logits = logits / T

        # apply sigmoid again
        softened_probs = 1 / (1 + np.exp(-softened_logits))

        return softened_probs

    def _parse_label(self, lrow, T):
        """
        Build the class-probability vector for one label row (a Series whose
        index holds the column names) and soften it with temperature T.
        Entries whose name is not a known species are skipped.
        """
        target = np.zeros(self.num_classes)

        # A row obtained via .iloc[idx] is a Series: iterate (name, value)
        # pairs rather than the non-existent `.columns`.
        for col, value in lrow.items():
            class_idx = self.label_to_idx.get(col)
            if class_idx is not None:
                target[class_idx] = value

        target = self._prob2tprob(target, T)
        return target



## InfiniteRandomSampler for ss_loader

In [None]:
class InfiniteRandomSampler(Sampler):
    """Sampler that yields uniformly random indices into ``data_source`` forever."""

    def __init__(self, data_source, generator=None):
        self.data_source = data_source
        if not generator:
            generator = torch.Generator()
        self.generator = generator

    def __iter__(self):
        # Never terminates: each step draws one index with replacement.
        while True:
            drawn = torch.randint(
                high=len(self.data_source),
                size=(1,),
                generator=self.generator,
            )
            yield drawn.item()

    def __len__(self):
        # Arbitrarily large number standing in for "infinite".
        return 2**31

def infinite_batch_sampler(dataset, batch_size, generator=None):
    """Return a BatchSampler yielding endless full batches of random indices."""
    return BatchSampler(
        InfiniteRandomSampler(dataset, generator=generator),
        batch_size=batch_size,
        drop_last=True,
    )

## BirdCLEFModel

In [47]:
class BirdCLEFModel(nn.Module):
    """timm backbone followed by a linear multi-label classification head.

    Args:
        cfg: config providing ``model_name``, ``pretrained``, ``in_channels``
            and ``taxonomy_csv``; optionally ``mixup_alpha``.  As a side
            effect ``cfg.num_classes`` is set from the taxonomy.
    """

    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg

        cfg.num_classes = taxonomy_process(cfg)[2]

        self.backbone = timm.create_model(
            cfg.model_name,
            pretrained=cfg.pretrained,
            in_chans=cfg.in_channels,
            drop_rate=0.2,
            drop_path_rate=0.2
        )

        # Strip the backbone's own classifier so it outputs features only.
        if 'efficientnet' in cfg.model_name:
            backbone_out = self.backbone.classifier.in_features
            self.backbone.classifier = nn.Identity()
        elif 'resnet' in cfg.model_name:
            backbone_out = self.backbone.fc.in_features
            self.backbone.fc = nn.Identity()
        else:
            backbone_out = self.backbone.get_classifier().in_features
            self.backbone.reset_classifier(0, '')

        self.pooling = nn.AdaptiveAvgPool2d(1)

        self.feat_dim = backbone_out

        self.classifier = nn.Linear(backbone_out, cfg.num_classes)

        alpha = self._resolve_mixup_alpha(cfg)
        self.mixup_enabled = alpha is not None
        if self.mixup_enabled:
            self.mixup_alpha = alpha

    @staticmethod
    def _resolve_mixup_alpha(cfg):
        """Return the scalar alpha for in-model mixup, or None to disable it.

        ``cfg.mixup_alpha`` may also be a two-element list (the schedule read
        by MixupPipeline); comparing that list with ``> 0`` raised TypeError.
        In-model mixup is only enabled for a positive scalar.
        """
        alpha = getattr(cfg, 'mixup_alpha', None)
        if isinstance(alpha, bool) or not isinstance(alpha, (int, float)):
            return None
        return alpha if alpha > 0 else None

    def forward(self, x, targets=None):
        """Return logits, or ``(logits, loss)`` when in-model mixup is active.

        Mixup is applied only in training mode, when enabled, and when
        ``targets`` is given.
        """
        use_mixup = self.training and self.mixup_enabled and targets is not None

        if use_mixup:
            x, targets_a, targets_b, lam = self.mixup_data(x, targets)
        else:
            targets_a, targets_b, lam = None, None, None

        features = self.backbone(x)

        if isinstance(features, dict):
            features = features['features']

        # Unpooled feature maps: global-average-pool and flatten.
        if len(features.shape) == 4:
            features = self.pooling(features)
            features = features.view(features.size(0), -1)

        logits = self.classifier(features)

        if use_mixup:
            loss = self.mixup_criterion(F.binary_cross_entropy_with_logits,
                                       logits, targets_a, targets_b, lam)
            return logits, loss

        return logits

    def mixup_data(self, x, targets):
        """Applies mixup to the data batch.

        Returns ``(mixed_x, targets, shuffled_targets, lam)`` where samples are
        mixed with a random permutation of the same batch.
        """
        batch_size = x.size(0)

        lam = np.random.beta(self.mixup_alpha, self.mixup_alpha)

        indices = torch.randperm(batch_size).to(x.device)

        mixed_x = lam * x + (1 - lam) * x[indices]

        return mixed_x, targets, targets[indices], lam

    def mixup_criterion(self, criterion, pred, y_a, y_b, lam):
        """Applies mixup to the loss function"""
        return lam * criterion(pred, y_a) + (1 - lam) * criterion(pred, y_b)

In [48]:
def get_optimizer(model, cfg):
    """Build the optimizer named by ``cfg.optimizer`` for ``model``'s parameters.

    Supports 'Adam', 'AdamW' and 'SGD' (momentum 0.9), all configured with
    ``cfg.lr`` and ``cfg.weight_decay``.

    Raises:
        NotImplementedError: for any other optimizer name.
    """
    if cfg.optimizer == 'Adam':
        return optim.Adam(model.parameters(), lr=cfg.lr, weight_decay=cfg.weight_decay)

    if cfg.optimizer == 'AdamW':
        return optim.AdamW(model.parameters(), lr=cfg.lr, weight_decay=cfg.weight_decay)

    if cfg.optimizer == 'SGD':
        return optim.SGD(
            model.parameters(),
            lr=cfg.lr,
            momentum=0.9,
            weight_decay=cfg.weight_decay,
        )

    raise NotImplementedError(f"Optimizer {cfg.optimizer} not implemented")

def get_scheduler(optimizer, cfg, steps_per_epoch=None):
    """Build the LR scheduler named by ``cfg.scheduler``.

    Args:
        optimizer: optimizer whose learning rate is scheduled.
        cfg: config providing ``scheduler`` plus, depending on the choice,
            ``T_max``, ``min_lr``, ``epochs`` and ``lr``.
        steps_per_epoch: number of optimizer steps per epoch; only needed for
            'OneCycleLR'.  If omitted, a module-level ``train_loader`` is used
            when one exists (the previous implicit behaviour).

    Returns:
        The scheduler, or ``None`` for an unrecognised name.

    Raises:
        ValueError: for 'OneCycleLR' when the step count cannot be determined.
    """
    if cfg.scheduler == 'CosineAnnealingLR':
        scheduler = lr_scheduler.CosineAnnealingLR(
            optimizer,
            T_max=cfg.T_max,
            eta_min=cfg.min_lr
        )
    elif cfg.scheduler == 'ReduceLROnPlateau':
        scheduler = lr_scheduler.ReduceLROnPlateau(
            optimizer,
            mode='min',
            factor=0.5,
            patience=2,
            min_lr=cfg.min_lr,
            verbose=True
        )
    elif cfg.scheduler == 'StepLR':
        scheduler = lr_scheduler.StepLR(
            optimizer,
            # At least 1: epochs < 3 (e.g. debug runs) would give step_size 0.
            step_size=max(1, cfg.epochs // 3),
            gamma=0.5
        )
    elif cfg.scheduler == 'OneCycleLR':
        if steps_per_epoch is None:
            # Backward compatible fallback to the notebook-global loader.
            loader = globals().get('train_loader')
            if loader is None:
                raise ValueError(
                    "OneCycleLR needs steps_per_epoch (or a global train_loader)"
                )
            steps_per_epoch = len(loader)
        scheduler = lr_scheduler.OneCycleLR(
            optimizer,
            max_lr=cfg.lr,
            steps_per_epoch=steps_per_epoch,
            epochs=cfg.epochs,
            pct_start=0.1
        )
    elif cfg.scheduler == 'CosineAnnealingLRwithWarmup':
        warmup_epochs = 5
        # (epoch + 1) so the first epoch does not run with a learning rate
        # of exactly 0; the factor reaches 1.0 on the last warmup epoch.
        warmup_scheduler = lr_scheduler.LambdaLR(
            optimizer,
            lr_lambda=lambda epoch: min(1.0, (epoch + 1) / warmup_epochs))
        cosine_scheduler = lr_scheduler.CosineAnnealingLR(
            optimizer,
            T_max=cfg.T_max,
            eta_min=cfg.min_lr
        )
        scheduler = lr_scheduler.SequentialLR(
            optimizer,
            schedulers=[warmup_scheduler, cosine_scheduler],
            milestones=[warmup_epochs]
        )
    else:
        scheduler = None

    return scheduler

def get_criterion(cfg):
    """Return the loss module named by ``cfg.criterion``.

    Only 'BCEWithLogitsLoss' is supported.

    Raises:
        NotImplementedError: for any other criterion name.
    """
    if cfg.criterion != 'BCEWithLogitsLoss':
        raise NotImplementedError(f"Criterion {cfg.criterion} not implemented")

    return nn.BCEWithLogitsLoss()

## Train

In [49]:
def train_one_epoch(model, loader, optimizer, criterion, device, scheduler=None):
    """Train ``model`` for one pass over ``loader``.

    Batches whose ``'melspec'`` is a list (spectrograms of differing shape
    that ``collate_fn`` could not stack) are processed sample by sample with
    gradient accumulation and a single optimizer step.  A ``OneCycleLR``
    scheduler is stepped once per batch; other schedulers are left to the
    caller.

    Returns:
        ``(avg_loss, auc)`` - mean batch loss and the value of
        ``calculate_auc`` over all targets/outputs of the epoch.
    """
    model.train()
    losses = []
    all_targets = []
    all_outputs = []

    pbar = tqdm(enumerate(loader), total=len(loader), desc="Training")

    for _, batch in pbar:

        # collate_fn returns {} when every sample in the batch was None.
        if not batch:
            continue

        if isinstance(batch['melspec'], list):
            specs = batch['melspec']
            n_samples = len(specs)
            batch_outputs = []
            batch_losses = []

            # Zero ONCE per batch: zeroing inside the sample loop discarded
            # every gradient except the last sample's before optimizer.step().
            optimizer.zero_grad()
            for i, spec in enumerate(specs):
                inputs = spec.unsqueeze(0).to(device)
                target = batch['target'][i].unsqueeze(0).to(device)

                output = model(inputs)
                sample_loss = criterion(output, target)
                # Scale so the accumulated gradient is the batch mean.
                (sample_loss / n_samples).backward()

                batch_outputs.append(output.detach().cpu())
                batch_losses.append(sample_loss.item())

            optimizer.step()
            outputs = torch.cat(batch_outputs, dim=0).numpy()
            loss = float(np.mean(batch_losses))
            targets = batch['target'].numpy()

        else:
            inputs = batch['melspec'].to(device)
            targets = batch['target'].to(device)

            optimizer.zero_grad()
            outputs = model(inputs)

            # The model returns (logits, loss) when it applied mixup itself.
            if isinstance(outputs, tuple):
                outputs, loss = outputs
            else:
                loss = criterion(outputs, targets)

            loss.backward()
            optimizer.step()

            outputs = outputs.detach().cpu().numpy()
            targets = targets.detach().cpu().numpy()

        if scheduler is not None and isinstance(scheduler, lr_scheduler.OneCycleLR):
            scheduler.step()

        all_outputs.append(outputs)
        all_targets.append(targets)
        losses.append(loss if isinstance(loss, float) else loss.item())

        pbar.set_postfix({
            'train_loss': np.mean(losses[-10:]) if losses else 0,
            'lr': optimizer.param_groups[0]['lr']
        })

    all_outputs = np.concatenate(all_outputs)
    all_targets = np.concatenate(all_targets)
    auc = calculate_auc(all_targets, all_outputs)
    avg_loss = np.mean(losses)

    return avg_loss, auc

# Treat ss data as normal data, one epo is just concatenation of [train_loader, ss_loader]
def train_1epo_with_ss(model, loader, optimizer, criterion, device, ss_loader, scheduler=None):
    """Train for one epoch on ``loader`` and then on ``ss_loader``.

    The soundscape (pseudo-labelled) batches are treated like ordinary
    training batches and processed after the labelled ones.

    NOTE(review): a ``OneCycleLR`` scheduler is stepped only during the
    soundscape pass (the step in the first loop is commented out) - confirm
    this matches the step count the scheduler was built with.
    NOTE(review): ``ss_loader`` is iterated to exhaustion, so it must be
    finite here.

    Returns:
        ``(avg_loss, auc)`` computed on the labelled ``loader`` batches only,
        mirroring ``train_one_epoch``.  Soundscape losses are shown on the
        progress bar only.
    """
    model.train()
    losses = []
    sslosses = []
    all_targets = []
    all_outputs = []

    def run_batch(batch, collect):
        """One optimizer update; returns (loss, outputs, targets).

        ``outputs``/``targets`` are numpy arrays when ``collect`` is true,
        otherwise ``None``.
        """
        if isinstance(batch['melspec'], list):
            specs = batch['melspec']
            n_samples = len(specs)
            sample_outputs = []
            sample_losses = []

            # Zero ONCE per batch: zeroing inside the sample loop discarded
            # every gradient except the last sample's before optimizer.step().
            optimizer.zero_grad()
            for i, spec in enumerate(specs):
                inputs = spec.unsqueeze(0).to(device)
                target = batch['target'][i].unsqueeze(0).to(device)

                output = model(inputs)
                sample_loss = criterion(output, target)
                # Scale so the accumulated gradient is the batch mean.
                (sample_loss / n_samples).backward()

                if collect:
                    sample_outputs.append(output.detach().cpu())
                sample_losses.append(sample_loss.item())

            optimizer.step()
            loss_value = float(np.mean(sample_losses))
            if not collect:
                return loss_value, None, None
            return (loss_value,
                    torch.cat(sample_outputs, dim=0).numpy(),
                    batch['target'].numpy())

        inputs = batch['melspec'].to(device)
        targets = batch['target'].to(device)

        optimizer.zero_grad()
        outputs = model(inputs)

        # The model returns (logits, loss) when it applied mixup itself.
        if isinstance(outputs, tuple):
            outputs, loss = outputs
        else:
            loss = criterion(outputs, targets)

        loss.backward()
        optimizer.step()

        if not collect:
            return loss.item(), None, None
        return (loss.item(),
                outputs.detach().cpu().numpy(),
                targets.detach().cpu().numpy())

    pbar = tqdm(enumerate(loader), total=len(loader), desc="Training")

    for _, batch in pbar:
        # collate_fn returns {} when every sample in the batch was None.
        if not batch:
            continue

        loss_value, outputs, targets = run_batch(batch, collect=True)

        # if scheduler is not None and isinstance(scheduler, lr_scheduler.OneCycleLR):
        #     scheduler.step()

        all_outputs.append(outputs)
        all_targets.append(targets)
        losses.append(loss_value)

        pbar.set_postfix({
            'train_loss': np.mean(losses[-10:]) if losses else 0,
            'lr': optimizer.param_groups[0]['lr']
        })

    # Enumerating with Soundscapes

    pbarss = tqdm(enumerate(ss_loader), total=len(ss_loader), desc="PsuedoLabel Training")
    for _, batch in pbarss:
        if not batch:
            continue

        loss_value, _, _ = run_batch(batch, collect=False)

        if scheduler is not None and isinstance(scheduler, lr_scheduler.OneCycleLR):
            scheduler.step()

        sslosses.append(loss_value)

        pbarss.set_postfix({
            'train_loss': np.mean(sslosses[-10:]) if sslosses else 0,
            'lr': optimizer.param_groups[0]['lr']
        })

    # Previously the collected outputs/targets were discarded and the function
    # returned None; report the same metrics as train_one_epoch.
    all_outputs = np.concatenate(all_outputs)
    all_targets = np.concatenate(all_targets)
    auc = calculate_auc(all_targets, all_outputs)
    avg_loss = np.mean(losses)

    return avg_loss, auc

# mixup ss with original train data
def train_1epo_with_ss_mixup(model, loader, optimizer, criterion, device, ss_loader, mixuppipeline, rng=None, scheduler=None):
    """Train ``model`` for one epoch, randomly mixing batches with soundscape batches.

    For every batch drawn from ``loader`` one batch is drawn from
    ``ss_loader``. With probability ``mixuppipeline.schedule_p()`` the training
    inputs are mixed with the soundscape inputs and the loss is computed with
    ``mixuppipeline.mixup_criterion``; otherwise the plain ``criterion`` is
    used on the unmixed inputs.

    Args:
        model: Network to train. In the tensor-batch path it may return either
            logits or a ``(logits, loss)`` tuple.
        loader: Iterable of dict batches with ``'melspec'`` and ``'target'``.
            ``'melspec'`` is either a tensor batch or a list of per-sample
            tensors (handled sample by sample).
        optimizer: Optimizer stepped once per batch.
        criterion: Loss used when mixup is not applied.
        device: Device the tensors are moved to.
        ss_loader: Loader yielding soundscape batches with the same keys.
            It is restarted if it runs out before ``loader`` does.
        mixuppipeline: Object providing ``schedule_p()``,
            ``mixup_data(a, b) -> (mixed, lam)`` and
            ``mixup_criterion(output, target_a, target_b, lam)``.
        rng: Optional ``np.random.RandomState`` used for the mixup coin flip;
            a fresh unseeded one is created when omitted.
        scheduler: Optional scheduler; only ``OneCycleLR`` is stepped here
            (once per batch).

    Returns:
        ``(avg_loss, None)``: the mean of the per-batch losses; the second
        slot is a placeholder because no AUC is computed during training.
    """
    model.train()
    losses = []

    if rng is None:
        rng = np.random.RandomState()

    pbar = tqdm(enumerate(loader), total=len(loader), desc="Training")
    ss_iter = iter(ss_loader)

    for _, batch in pbar:
        # A finite soundscape loader may be shorter than the training loader:
        # restart it instead of aborting the epoch with StopIteration.
        try:
            batch_ss = next(ss_iter)
        except StopIteration:
            ss_iter = iter(ss_loader)
            batch_ss = next(ss_iter)

        # Reset gradients once per batch. Doing it per sample (as before)
        # discarded every gradient except the one from the last sample.
        optimizer.zero_grad()

        if isinstance(batch['melspec'], list):
            n_samples = len(batch['melspec'])
            batch_losses = []

            for i, melspec in enumerate(batch['melspec']):
                inputs = melspec.unsqueeze(0).to(device)
                target = batch['target'][i].unsqueeze(0).to(device)

                if rng.uniform() < mixuppipeline.schedule_p():
                    inputs_ss = batch_ss['melspec'][i].unsqueeze(0).to(device)
                    target_ss = batch_ss['target'][i].unsqueeze(0).to(device)

                    mixed_inputs, lam = mixuppipeline.mixup_data(inputs, inputs_ss)
                    output = model(mixed_inputs)
                    loss = mixuppipeline.mixup_criterion(output, target, target_ss, lam)
                else:
                    output = model(inputs)
                    loss = criterion(output, target)

                # Accumulate the batch-mean gradient so the step size is
                # comparable to the tensor-batch path below.
                (loss / n_samples).backward()
                batch_losses.append(loss.item())

            optimizer.step()
            loss = np.mean(batch_losses)

        else:
            inputs = batch['melspec'].to(device)
            targets = batch['target'].to(device)

            # Stay None unless mixup is applied to this batch; the loss
            # selection below relies on that.
            targets_ss = None
            lam = None

            if rng.uniform() < mixuppipeline.schedule_p():
                inputs_ss = batch_ss['melspec'].to(device)
                targets_ss = batch_ss['target'].to(device)

                mixed_inputs, lam = mixuppipeline.mixup_data(inputs, inputs_ss)
                outputs = model(mixed_inputs)
            else:
                outputs = model(inputs)

            if isinstance(outputs, tuple):
                # The model computed its own loss.
                outputs, loss = outputs
            elif targets_ss is not None:
                loss = mixuppipeline.mixup_criterion(outputs, targets, targets_ss, lam)
            else:
                loss = criterion(outputs, targets)

            loss.backward()
            optimizer.step()

        if scheduler is not None and isinstance(scheduler, lr_scheduler.OneCycleLR):
            scheduler.step()

        losses.append(loss if isinstance(loss, float) else loss.item())

        pbar.set_postfix({
            'train_loss': np.mean(losses[-10:]) if losses else 0,
            'lr': optimizer.param_groups[0]['lr']
        })

    avg_loss = np.mean(losses)

    return avg_loss, None  # no training AUC is computed


def validate(model, loader, criterion, device):
    """Evaluate ``model`` on ``loader`` without gradient tracking.

    Each batch is a dict with ``'melspec'`` and ``'target'``. When
    ``'melspec'`` is a list the samples are run one at a time; otherwise the
    whole batch is run in a single forward pass.

    Returns:
        ``(avg_loss, auc)``: the mean of the per-batch losses and the value
        of ``calculate_auc`` over all collected targets and raw outputs.
    """
    model.eval()
    epoch_losses = []
    target_chunks = []
    output_chunks = []

    with torch.no_grad():
        for batch in tqdm(loader, desc="Validation"):
            melspecs = batch['melspec']

            if not isinstance(melspecs, list):
                # Regular tensor batch: one forward pass.
                batch_inputs = melspecs.to(device)
                batch_targets = batch['target'].to(device)

                logits = model(batch_inputs)
                loss = criterion(logits, batch_targets)

                batch_outputs_np = logits.detach().cpu().numpy()
                batch_targets_np = batch_targets.detach().cpu().numpy()
            else:
                # List batch: evaluate sample by sample, then stitch together.
                sample_logits = []
                sample_losses = []

                for idx, melspec in enumerate(melspecs):
                    single_input = melspec.unsqueeze(0).to(device)
                    single_target = batch['target'][idx].unsqueeze(0).to(device)

                    single_output = model(single_input)
                    single_loss = criterion(single_output, single_target)

                    sample_logits.append(single_output.detach().cpu())
                    sample_losses.append(single_loss.item())

                batch_outputs_np = torch.cat(sample_logits, dim=0).numpy()
                loss = np.mean(sample_losses)
                batch_targets_np = batch['target'].numpy()

            output_chunks.append(batch_outputs_np)
            target_chunks.append(batch_targets_np)
            epoch_losses.append(loss if isinstance(loss, float) else loss.item())

    stacked_outputs = np.concatenate(output_chunks)
    stacked_targets = np.concatenate(target_chunks)

    return np.mean(epoch_losses), calculate_auc(stacked_targets, stacked_outputs)

def calculate_auc(targets, outputs):
    """Return the macro-averaged ROC AUC over the classes that can be scored.

    Args:
        targets: 2-D array indexed ``[sample, class]`` of ground-truth labels.
        outputs: Array of raw model outputs with the same layout; a sigmoid
            is applied before scoring.

    Returns:
        Mean per-class AUC over the classes whose target column contains both
        positives and negatives, or ``0.0`` when no class qualifies.
    """
    num_classes = targets.shape[1]
    aucs = []

    probs = 1 / (1 + np.exp(-outputs))

    for i in range(num_classes):
        class_targets = targets[:, i]
        positives = np.sum(class_targets)

        # roc_auc_score raises ValueError when only one class is present, so
        # skip columns that are all-negative *or* all-positive.
        if 0 < positives < class_targets.size:
            aucs.append(roc_auc_score(class_targets, probs[:, i]))

    return np.mean(aucs) if aucs else 0.0

In [50]:
def run_training(df, cfg, ssdf = None, sslabeldf = None):
    """Run stratified K-fold training with soundscape mixup and report validation AUC.

    Works with either pre-computed spectrograms or on-the-fly generation
    (``cfg.LOAD_DATA``). For every fold in ``cfg.selected_folds`` a fresh
    model, optimizer, criterion and scheduler are created, the model is
    trained with ``train_1epo_with_ss_mixup`` and evaluated with ``validate``
    after each epoch.

    Args:
        df: Training metadata. ``primary_label`` is used for stratification;
            ``filename`` is read when ``cfg.LOAD_DATA`` is set and the
            ``filepath`` / ``samplename`` columns are missing (they are then
            added to ``df`` in place).
        cfg: Configuration object. ``cfg.num_classes`` is set here from
            ``taxonomy_process(cfg)``.
        ssdf: Soundscape metadata, passed unchanged to ``SoundscapeFromNPY``.
        sslabeldf: Soundscape labels, passed unchanged to ``SoundscapeFromNPY``.
            NOTE(review): both default to ``None`` but the soundscape loader is
            always built, so presumably they must be supplied -- confirm
            against ``SoundscapeFromNPY``.

    Side effects:
        Writes ``model_fold{fold}.pth`` to the working directory whenever the
        validation AUC improves, and saves the running loss log under
        ``cfg.OUTPUT_DIR`` after every epoch.
    """
    cfg.num_classes = taxonomy_process(cfg)[2]

    if cfg.debug:
        cfg.update_debug_settings()

    if cfg.LOAD_DATA:
        if 'filepath' not in df.columns:
            df['filepath'] = cfg.train_datadir + '/' + df.filename
        if 'samplename' not in df.columns:
            df['samplename'] = df.filename.map(lambda x: x.split('/')[0] + '-' + x.split('/')[-1].split('.')[0])

    skf = StratifiedKFold(n_splits=cfg.n_fold, shuffle=True, random_state=cfg.seed)

    best_scores = []
    trained_folds = []

    # Epoch counter shared (through the closure) by every augmentation and
    # mixup pipeline; it is reset at the start of each fold.
    current_epo = 0

    def current_epo_fn():
        return current_epo

    # The soundscape loader is built once and reused by all folds. Its
    # augmentor has to exist before the dataset, and the dataset before the
    # sampler that draws from it.
    ss_augmentor = AugmentationPipeline(cfg, current_epo_fn, rng=None)
    ss_dataset = SoundscapeFromNPY(ssdf, sslabeldf, cfg, augmentor=ss_augmentor, mode='train')
    batch_sampler = infinite_batch_sampler(
        ss_dataset,
        batch_size=cfg.batch_size,
        generator=torch.Generator().manual_seed(cfg.seed))

    # batch_sampler is mutually exclusive with batch_size / shuffle in
    # DataLoader, so batching and ordering are left entirely to the sampler.
    ss_loader = DataLoader(
        ss_dataset,
        batch_sampler=batch_sampler,
        num_workers=cfg.num_workers,
        pin_memory=True,
        collate_fn=collate_fn
    )

    val_loss_log = np.empty((0, 6))

    for fold, (train_idx, val_idx) in enumerate(skf.split(df, df['primary_label'])):
        if fold not in cfg.selected_folds:
            continue

        print(f'\n{"="*30} Fold {fold} {"="*30}')

        train_df = df.iloc[train_idx].reset_index(drop=True)
        val_df = df.iloc[val_idx].reset_index(drop=True)

        print(f'Training set: {len(train_df)} samples')
        print(f'Validation set: {len(val_df)} samples')

        current_epo = 0

        dataaugmentor = AugmentationPipeline(cfg, current_epo_fn, rng=None)
        mixuppipeline = MixupPipeline(cfg, current_epo_fn, rng=None)

        train_dataset = BirdCLEFDatasetFromNPY(train_df, cfg, augmentor=dataaugmentor, mode='train')
        val_dataset = BirdCLEFDatasetFromNPY(val_df, cfg, augmentor=None, mode='valid')

        train_loader = DataLoader(
            train_dataset,
            batch_size=cfg.batch_size,
            shuffle=True,
            num_workers=cfg.num_workers,
            pin_memory=True,
            collate_fn=collate_fn,
            drop_last=True
        )

        val_loader = DataLoader(
            val_dataset,
            batch_size=cfg.batch_size,
            shuffle=False,
            num_workers=cfg.num_workers,
            pin_memory=True,
            collate_fn=collate_fn
        )

        model = BirdCLEFModel(cfg).to(cfg.device)
        optimizer = get_optimizer(model, cfg)
        criterion = get_criterion(cfg)

        scheduler = get_scheduler(optimizer, cfg)

        best_auc = 0
        best_epoch = 0

        for epoch in range(cfg.epochs):
            print(f"\ncur_epoch = {current_epo}")
            print(f"\nEpoch {epoch+1}/{cfg.epochs}")

            # Everything after `device` is passed by keyword: the sixth
            # positional parameter is `ss_loader`, not `scheduler`.
            train_loss, train_auc = train_1epo_with_ss_mixup(
                model,
                train_loader,
                optimizer,
                criterion,
                cfg.device,
                ss_loader=ss_loader,
                mixuppipeline=mixuppipeline,
                rng=None,
                scheduler=scheduler if isinstance(scheduler, lr_scheduler.OneCycleLR) else None
            )

            val_loss, val_auc = validate(model, val_loader, criterion, cfg.device)

            # OneCycleLR is stepped per batch inside the training function;
            # every other scheduler is stepped once per epoch here.
            if scheduler is not None and not isinstance(scheduler, lr_scheduler.OneCycleLR):
                if isinstance(scheduler, lr_scheduler.ReduceLROnPlateau):
                    scheduler.step(val_loss)
                else:
                    scheduler.step()

            # The mixup training loop does not compute an AUC; log 0.0 instead.
            train_auc = 0.0 if train_auc is None else train_auc

            print(f"Train Loss: {train_loss:.4f}, Train AUC: {train_auc:.4f}")
            print(f"Val Loss: {val_loss:.4f}, Val AUC: {val_auc:.4f}")

            val_loss_log = np.append(val_loss_log, np.array([[fold, epoch, train_loss, train_auc, val_loss, val_auc]]), axis=0)
            file_path = os.path.join(cfg.OUTPUT_DIR, "val_loss_log")
            np.save(file_path, val_loss_log)
            print(f"loss file saved! {file_path}")

            if val_auc > best_auc:
                best_auc = val_auc
                best_epoch = epoch + 1
                print(f"New best AUC: {best_auc:.4f} at epoch {best_epoch}")

                torch.save({
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'scheduler_state_dict': scheduler.state_dict() if scheduler else None,
                    'epoch': epoch,
                    'val_auc': val_auc,
                    'train_auc': train_auc,
                    'cfg': cfg
                }, f"model_fold{fold}.pth")
            current_epo += 1

        best_scores.append(best_auc)
        trained_folds.append(fold)
        print(f"\nBest AUC for fold {fold}: {best_auc:.4f} at epoch {best_epoch}")

        # Clear memory
        del model, optimizer, scheduler, train_loader, val_loader
        torch.cuda.empty_cache()
        gc.collect()

    print("\n" + "="*60)
    print("Cross-Validation Results:")
    # Report against the folds that were actually trained rather than
    # indexing cfg.selected_folds by position.
    for fold, score in zip(trained_folds, best_scores):
        print(f"Fold {fold}: {score:.4f}")
    print(f"Mean AUC: {np.mean(best_scores):.4f}")
    print("="*60)

In [51]:
def checkhparams(cfg):
    """Check that the dataset directory name agrees with the config hyperparameters.

    The third ``/``-separated component of ``cfg.train_datadir`` is expected
    to have the form ``<prefix>-<a>-<b>-<c>-<d>``. The four integers are
    compared, in order, against ``cfg.N_FFT``, ``cfg.N_MELS``,
    ``cfg.TARGET_SHAPE[0]`` and ``cfg.TARGET_SHAPE[1]``.

    Args:
        cfg: Object exposing ``train_datadir``, ``N_FFT``, ``N_MELS``,
            ``TARGET_SHAPE`` and ``HOP_LENGTH``.

    Returns:
        None when the values match (a confirmation is printed).

    Raises:
        ValueError: If the directory name cannot be parsed into four integers,
            or if the parsed values differ from the config.
    """
    path_parts = cfg.train_datadir.split("/")
    if len(path_parts) < 3:
        raise ValueError(
            f"Cannot locate the dataset directory component in {cfg.train_datadir!r}"
        )
    t_datadir = path_parts[2]
    parsed = t_datadir.split("-")[1:]

    # Fail here with a clear message instead of printing the parse error and
    # crashing later on an unrelated IndexError.
    try:
        nfft, nmel, shapex, shapey = (int(value) for value in parsed)
    except ValueError as e:
        raise ValueError(
            f"Cannot parse dataset hyperparameters from directory name {t_datadir!r}"
        ) from e

    expected = (cfg.N_FFT, cfg.N_MELS, cfg.TARGET_SHAPE[0], cfg.TARGET_SHAPE[1])
    if (nfft, nmel, shapex, shapey) == expected:
        print("Config and dataset hyperparameter does match.")
        return

    print(parsed)
    print(cfg.N_FFT, cfg.N_MELS, cfg.TARGET_SHAPE[0], cfg.TARGET_SHAPE[1], cfg.HOP_LENGTH)

    raise ValueError("Config and dataset hyperparameter does not match")


## Main

In [None]:
if __name__ == "__main__":
    import time

    # Read the training metadata plus the soundscape metadata and labels.
    print("\nLoading training data...")
    train_df = pd.read_csv(cfg.train_csv)
    ss_df = pd.read_csv(cfg.train_sscsv)
    ss_label_df = pd.read_csv(cfg.train_sssub)

    # Abort early if the dataset directory does not match the config.
    checkhparams(cfg)

    print("\nStarting training...")
    print(f"LOAD_DATA is set to {cfg.LOAD_DATA}")
    print(
        "Using pre-computed mel spectrograms from NPY file"
        if cfg.LOAD_DATA
        else "Will generate spectrograms on-the-fly during training"
    )

    run_training(train_df, cfg, ss_df, ss_label_df)

    print("\nTraining complete!")

### TO DO :

- Original mixup이 model 내에서 일어나는데, batch단위로 train_one_epoch에서 일어나도록 바꾸기
- model(inputs)의 output 인수가 1개가 되니까 if절 삭제
- ss_data loading시 thresholding 추가