# Import packages

Import all required packages.

In [1]:
import os
import gc
import sys
import cv2
import math
import numpy as np
import pandas as pd
from glob import glob
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold, StratifiedKFold
import librosa
from scipy import signal as sci_signal
import json

import torch
from torch import nn
from torchvision.models import efficientnet

#import tensorflow as tf

import albumentations as albu

import pytorch_lightning as pl
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts
from pytorch_lightning.callbacks import ModelCheckpoint, TQDMProgressBar

# import score function of BirdCLEF
#sys.path.append('/kaggle/input/birdclef-roc-auc')
#sys.path.append('/kaggle/usr/lib/kaggle_metric_utilities')
#from metric import score

In [2]:
# Import for visualization
import matplotlib as mpl
cmap = mpl.cm.get_cmap('coolwarm')
import matplotlib.pyplot as plt
import librosa.display as lid
import IPython.display as ipd
#import cv2

  cmap = mpl.cm.get_cmap('coolwarm')


In [3]:
# Saturated steam

T = torch.tensor(25) #Ceslium

Es = 6.112*torch.e**(17.67*T/(T + 243.5)) # 'hectopascal'
Es.item()

31.674293518066406

In [4]:
# mixing ratio
q = 1.484e-06 #kg/kg
p = 1000  #units.hPa

q = q * 1000
w = q / (1-q)

# Water partial pressure
e = w / (0.622 + w) * p
e

2.383702343092022

In [5]:
# relative humidity
100 * e / Es.item()

7.525668541691085

# Configuration

Hyper-paramters

In [6]:
class config:
    
    # == global config ==
    SEED = 28082015  # random seed
    DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu' # device to be used
    MIXED_PRECISION = False  # whether to use mixed-16 precision
    OUTPUT_DIR = './output/'  # output folder
    
    # == data config ==
    DATA_ROOT = 'E:/PycharmProjects/birdclef24/data'  # root folder
    PREPROCESSED_DATA_ROOT = '/kaggle/input/birdclef24-spectrograms-via-cupy'
    LOAD_DATA = True  # whether to load data from pre-processed dataset

    
    # == model config ==
    MODEL_TYPE = 'efficientnet_b0'  # model type
    
    # == dataset config ==
    BATCH_SIZE = 256  # batch size of each step
    N_WORKERS = 6  # number of workers
    
    
    # == training config ==
    FOLDS = 7  # n fold
    EPOCHS = 200  # max epochs
    LR = 7e-4  # learning rate
    WEIGHT_DECAY = 9e-6  # weight decay of optimizer
    
    # == other config ==
    VISUALIZE = True  # whether to visualize data and batch
    
    
print('fix seed')
pl.seed_everything(config.SEED, workers=True)

CFG = config

fix seed


Seed set to 28082015


In [7]:
class ECA(nn.Module):
    def __init__(self, kernel_size=5):
        super().__init__()
        self.kernel_size = kernel_size
        self.supports_masking = True
        self.conv = nn.Conv1d(in_channels=1, out_channels=1, kernel_size=kernel_size, stride=1, padding="same", bias=False)
    def forward(self, inputs):
        b, c, s = inputs.shape
        
        x = torch.mean(inputs, axis = -1)
        x = x.view(b, 1, c)
        x = self.conv(x)
        x = x.squeeze(1)
        x = nn.Sigmoid()(x)
        x = x[:,:,None]
        return inputs * x


class CausalDWConv1D(nn.Module):
    def __init__(self, 
        kernel_size=17,
        dilation_rate=1,
        use_bias=False,
        in_channels = 64,
        out_channels = 32,       
        depthwise_initializer='glorot_uniform',
        **kwargs):
        super().__init__()
        #self.causal_pad = tf.keras.layers.ZeroPadding1D((dilation_rate*(kernel_size-1),0),name=name + '_pad')
        self.dw_conv = nn.Conv1d(
            in_channels, 
            out_channels, 
            kernel_size, 
            stride=1, 
            padding='same', 
            dilation=dilation_rate, 
            groups=out_channels if kernel_size > 3 else 1,
            bias=False, 
            padding_mode='zeros')

        
    def forward(self, inputs):
        x = self.dw_conv(inputs)
        return x


class Conv1DBlock(nn.Module):
    def __init__(self, 
                 kernel_size=17,
                 channels = 32,
                 expand_channels = 64,
                 drop_rate=0.0,
                ):
        super().__init__()
        self.kernel_size = kernel_size
        self.conv = CausalDWConv1D(
                        kernel_size=kernel_size,
                        dilation_rate=1,
                        use_bias=False,
                        in_channels = expand_channels,
                        out_channels = expand_channels
                    )
        self.dnn_expand = nn.Linear(in_features = channels, 
                                    out_features = expand_channels
                                     )
        self.dnn_project = nn.Linear(in_features = expand_channels, 
                             out_features = channels
                                    )
        self.bn = nn.BatchNorm1d(num_features = expand_channels, eps=0.95)
        self.eca = ECA()
        self.dropout = nn.Dropout(drop_rate)
        self.act = nn.SiLU()

    def forward(self, inputs):
        skip = inputs

        x = inputs.permute([0,2,1])
        x = self.dnn_expand(x)
        
        x = x.permute([0,2,1])
        x = self.act(x)
        x = self.conv(x)
        x = self.bn(x)
        x = self.eca(x)
        
        x = x.permute([0,2,1])
        x = self.dnn_project(x)
        x = x.permute([0,2,1])

        return x + skip


class Conv1DModel(nn.Module):
    def __init__(self, 
                 kernel_size=17,
                 channels = 32,
                 expand_channels = 64,
                 drop_rate=0.0,
                 num_blocks_in_stage = 3,
                 input_len = 32_000*5,
                 n_classes = 182
                ):
        super().__init__()
        self.stem_conv = nn.Linear(in_features = 1, 
                                    out_features = channels
                                     )
        self.stem_bn = nn.BatchNorm1d(num_features = channels, eps=0.95)

        self.ConvStage_1 = nn.ModuleList([
            Conv1DBlock(kernel_size=kernel_size, channels = channels,expand_channels = expand_channels, drop_rate=drop_rate)
                                         for _ in range(num_blocks_in_stage)])
        self.PoolStage_1 = nn.AvgPool1d(kernel_size=(4))
        
        self.ConvStage_2 = nn.ModuleList([
            Conv1DBlock(kernel_size=kernel_size, channels = channels,expand_channels = expand_channels, drop_rate=drop_rate)
                                          for _ in range(num_blocks_in_stage)])
        self.PoolStage_2 = nn.AvgPool1d(kernel_size=(4))

        
        self.ConvStage_3 = nn.ModuleList([
            Conv1DBlock(kernel_size=kernel_size, channels = channels,expand_channels = expand_channels, drop_rate=drop_rate)
                                          for _ in range(num_blocks_in_stage)])
        self.PoolStage_3 = nn.AvgPool1d(kernel_size=(4))

        self.pre_out = nn.Linear(in_features = channels, out_features = n_classes*2)
        self.dropout = nn.Dropout(drop_rate)
        self.out_act = nn.SiLU()
        self.out = nn.Linear(in_features = n_classes*2, out_features = n_classes)
        self.sigmoid = nn.Sigmoid()

        
    def forward(self, inputs):
        
        b, s = inputs.shape
        x = inputs.view(b, s, 1)
        x = self.stem_conv(x)
        x = x.permute([0,2,1])
        x = self.stem_bn(x)

        for block in self.ConvStage_1:
            x = block(x)
        x = self.PoolStage_1(x)

        for block in self.ConvStage_2:
            x = block(x)
        x = self.PoolStage_2(x)

        for block in self.ConvStage_3:
            x = block(x)
        x = self.PoolStage_3(x)

        x = x.mean(axis=2)

        x = self.pre_out(x)
        x = self.dropout(x)
        x = self.out_act(x)
        
        logits = self.out(x)
        probs = self.sigmoid(logits)

        return {
                "clipwise_logits_long": logits,
                "clipwise_pred_long": probs,
            }


        

In [8]:

class FeedFoward(nn.Module):
    """ a simple linear layer followed by a non-linearity """

    def __init__(self, n_embd, dropout):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)
        

class Block(nn.Module):
    """ Transformer block: communication followed by computation """

    def __init__(self, n_embd, n_head, dropout):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        head_size = n_embd // n_head
        self.sa = nn.MultiheadAttention(n_embd, n_head)
        self.ffwd = FeedFoward(n_embd, dropout)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x, q = None):
        if q is not None:
            X = (q, x, x)
        else:
            X = (x, x, x)
        y = self.sa(*X)
        y = y[0]
        
        x = self.ln1(x + y)
        y = self.ffwd(x)
        x = self.ln2(x + y)
        return x


In [9]:
class ConvTransBlock(nn.Module):
    def __init__(self, block_kernels = [5, 3], n_head = 4, channels=16, expand_channels=32, drop_rate = 0.1, att_drop_rate = 0.25, n_features=25):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        self.conv = nn.Sequential(*[
            Conv1DBlock(kernel_size=k, channels = channels,expand_channels = expand_channels, drop_rate=drop_rate)
            for k in block_kernels
        ])

        self.block = Block(n_embd = channels, n_head=n_head, dropout = att_drop_rate)

    def forward(self, x):
        x = self.conv(x)
        x = x.permute([0,2,1])
        x = self.block(x)
        x = x.permute([0,2,1])
        return x

In [10]:
class ConvMixerBlock(nn.Module):
    def __init__(self, kernel_size = 3, channels=16):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        self.dw_conv = nn.Conv1d(
            in_channels = channels, 
            out_channels = channels, 
            kernel_size = kernel_size, 
            stride=1, 
            padding='same', 
            dilation=1, 
            groups=channels,
            bias=False, 
            padding_mode='zeros')

        self.pw_conv = nn.Conv1d(
            in_channels = channels, 
            out_channels = channels, 
            kernel_size = 1, 
            stride=1, 
            padding='same', 
            dilation=1, 
            groups=1,
            bias=False, 
            padding_mode='zeros')

        self.gelu1 = nn.GELU()
        self.gelu2 = nn.GELU()       
        self.bn1 = nn.BatchNorm1d(channels)             
        self.bn2 = nn.BatchNorm1d(channels)     


    def forward(self, x):
        y = x

        x = self.dw_conv(x)
        x = self.gelu1(x)        
        x = self.bn1(x)

        x = y + x
        x = self.pw_conv(x)
        x = self.gelu2(x)        
        x = self.bn2(x)
        
        return x

In [11]:
ConvMixerBlock()(torch.ones([8,16,60])).shape

torch.Size([8, 16, 60])

In [12]:
class SelfAttFeatureExctractor(nn.Module):
    """ Transformer block: communication followed by computation """

    def __init__(self, kernel_size = 3, channels=16, drop_rate = 0.1, n_features=25):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        self.channels = channels
        self.inputConv = nn.ModuleList([nn.Conv1d(in_channels = 1,
                                                out_channels = channels,
                                                kernel_size = kernel_size,
                                                stride=1, 
                                                padding='same') for _ in range(n_features)])
        
        self.projConv = nn.ModuleList([nn.Conv1d(in_channels = channels,
                                                out_channels = channels,
                                                kernel_size = 1,
                                                stride=1, 
                                                padding='same') for _ in range(n_features)])

        self.lns = nn.ModuleList([nn.LayerNorm(channels) for _ in range(n_features)])

        
        
    def forward(self, x):
        x = x.view(-1, 556)
        
        state_t = x[:, 0:60] # - 273
        state_q0001 = x[:, 60:120] #*1_000
        state_q0002 = x[:, 120:180] #*1_000
        state_q0003 = x[:, 180:240] #*1_000
        state_u = x[:, 240:300] #/ 100
        state_v = x[:, 300:360] #/ 100
    
        state_ps = x[:, 360:361] #/ 100_000 - 1
        pbuf_SOLIN = x[:, 361:362] #/ 1000
        pbuf_LHFLX = x[:, 362:363] #/ 1000
        pbuf_SHFLX = x[:, 363:364] #/ 1000
        pbuf_TAUX = x[:, 364:365] #/ 1
        pbuf_TAUY = x[:, 365:366] #/ 1
        pbuf_COSZRS = x[:, 366:367] #/ 1
        cam_in_ALDIF = x[:, 367:368] #/ 1
        cam_in_ALDIR = x[:, 368:369] #/ 1
        cam_in_ASDIF = x[:, 369:370] #/ 1
        cam_in_ASDIR = x[:, 370:371] #/ 1
        cam_in_LWUP = x[:, 371:372] # / 1000
        cam_in_ICEFRAC = x[:, 372:373] #/ 1
        cam_in_LANDFRAC = x[:, 373:374] #/1
        cam_in_OCNFRAC = x[:, 374:375]  #/1
        cam_in_SNOWHLAND = x[:, 375:376]# / 1
    
        pbuf_ozone = x[:, 376:436] #* 100_000
        pbuf_CH4 = x[:, 436:496] #* 100_000
        pbuf_N2O = x[:, 496:556] #* 100_000
            
        inputs_60 = [
                state_t,
                state_q0001,
                state_q0002,
                state_q0003, 
                state_u,
                state_v,
    
                pbuf_ozone,
                pbuf_CH4,
                pbuf_N2O
        ]

        inputs_flat = [            
                torch.repeat_interleave(state_ps, 60, dim=-1),
                torch.repeat_interleave(pbuf_SOLIN, 60, dim=-1),
                torch.repeat_interleave(pbuf_LHFLX, 60, dim=-1),
                torch.repeat_interleave(pbuf_SHFLX, 60, dim=-1),
                torch.repeat_interleave(pbuf_TAUX, 60, dim=-1),
               torch.repeat_interleave(pbuf_TAUY, 60, dim=-1),
                torch.repeat_interleave(pbuf_COSZRS, 60, dim=-1),
                torch.repeat_interleave(cam_in_ALDIF, 60, dim=-1),
                torch.repeat_interleave(cam_in_ALDIR, 60, dim=-1),
               torch.repeat_interleave(cam_in_ASDIF, 60, dim=-1),
                torch.repeat_interleave(cam_in_ASDIR, 60, dim=-1),
                torch.repeat_interleave(cam_in_LWUP, 60, dim=-1),
                torch.repeat_interleave(cam_in_ICEFRAC, 60, dim=-1),
                torch.repeat_interleave(cam_in_LANDFRAC, 60, dim=-1),
                torch.repeat_interleave(cam_in_OCNFRAC, 60, dim=-1),
                torch.repeat_interleave(cam_in_SNOWHLAND, 60, dim=-1),
        ]

        inputs = inputs_60 + inputs_flat
        
        expanded = []
        for i, conv in enumerate(self.inputConv):
            t = inputs[i]
            t = t.view(-1, 1, 60)
            expanded.append(conv(t))

        global_input = torch.cat([t.view(-1, self.channels, 60, 1) for t in expanded], axis = -1)
        global_input = global_input.mean(axis=-1)
        k = global_input#.permute([0,2,1])

        var_attention = []
        for i, feature in enumerate(expanded):
            q = feature.permute([0,2,1]) #B, L, C
            v = self.projConv[i](feature).permute([0,2,1]) #B, L, C
            
            att = nn.Softmax()(torch.matmul(q, k) / self.channels**0.5).permute([0,2,1]) #B, L, L
            
            y = torch.matmul(att, v) #B, L, L * #B, L, C --> #B, L, C
            y = self.lns[i](y)
        
            var_attention.append((y + q).permute([0,2,1])) #B, C, L
        return torch.cat(var_attention, 1)#.permute([0,2,1])


In [13]:
class FeatureExctractor(nn.Module):
    """ Transformer block: communication followed by computation """

    def __init__(self, kernel_size = 7, channels=16, expand_channels=32, drop_rate = 0.1, n_features=25):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        self.Scales60 = nn.ModuleList([nn.Conv1d(in_channels = 1,
                                                out_channels = channels,
                                                kernel_size = 1,
                                                stride=1, 
                                                padding='same') for _ in range(9)])
        
        self.ScalesFlat = nn.ModuleList([nn.Conv1d(in_channels = 1,
                                                out_channels = channels//2,
                                                kernel_size = 1,
                                                stride=1, 
                                                padding='same') for _ in range(16)])
        
        self.ConvExt60 = nn.ModuleList([
            Conv1DBlock(kernel_size=kernel_size, channels = channels,expand_channels = expand_channels, drop_rate=drop_rate)
                                          for _ in range(9)])
        self.ConvExtFlat = nn.ModuleList([
            Conv1DBlock(kernel_size=kernel_size, channels = channels//2,expand_channels = expand_channels//2, drop_rate=drop_rate)
                                          for _ in range(16)])

    def forward(self, x):
        x = x.view(-1, 556)
        
        state_t = x[:, 0:60] - 273
        state_q0001 = x[:, 60:120] *1_000
        state_q0002 = x[:, 120:180] *1_000
        state_q0003 = x[:, 180:240] *1_000
        state_u = x[:, 240:300] / 100
        state_v = x[:, 300:360] / 100
    
        state_ps = x[:, 360:361]/ 100_000 - 1
        pbuf_SOLIN = x[:, 361:362] / 1000
        pbuf_LHFLX = x[:, 362:363] / 1000
        pbuf_SHFLX = x[:, 363:364] / 1000
        pbuf_TAUX = x[:, 364:365] / 1
        pbuf_TAUY = x[:, 365:366] / 1
        pbuf_COSZRS = x[:, 366:367] / 1
        cam_in_ALDIF = x[:, 367:368] / 1
        cam_in_ALDIR = x[:, 368:369] / 1
        cam_in_ASDIF = x[:, 369:370] / 1
        cam_in_ASDIR = x[:, 370:371] / 1
        cam_in_LWUP = x[:, 371:372] / 1000
        cam_in_ICEFRAC = x[:, 372:373] / 1
        cam_in_LANDFRAC = x[:, 373:374] /1
        cam_in_OCNFRAC = x[:, 374:375]  /1
        cam_in_SNOWHLAND = x[:, 375:376] / 1
    
        pbuf_ozone = x[:, 376:436] * 100_000
        pbuf_CH4 = x[:, 436:496] * 100_000
        pbuf_N2O = x[:, 496:556] * 100_000
            
        inputs_60 = [
                state_t,
                state_q0001,
                state_q0002,
                state_q0003, 
                state_u,
                state_v,
    
                pbuf_ozone,
                pbuf_CH4,
                pbuf_N2O
        ]

        inputs_flat = [            
                torch.repeat_interleave(state_ps, 60, dim=-1),
                torch.repeat_interleave(pbuf_SOLIN, 60, dim=-1),
                torch.repeat_interleave(pbuf_LHFLX, 60, dim=-1),
                torch.repeat_interleave(pbuf_SHFLX, 60, dim=-1),
                torch.repeat_interleave(pbuf_TAUX, 60, dim=-1),
               torch.repeat_interleave(pbuf_TAUY, 60, dim=-1),
                torch.repeat_interleave(pbuf_COSZRS, 60, dim=-1),
                torch.repeat_interleave(cam_in_ALDIF, 60, dim=-1),
                torch.repeat_interleave(cam_in_ALDIR, 60, dim=-1),
               torch.repeat_interleave(cam_in_ASDIF, 60, dim=-1),
                torch.repeat_interleave(cam_in_ASDIR, 60, dim=-1),
                torch.repeat_interleave(cam_in_LWUP, 60, dim=-1),
                torch.repeat_interleave(cam_in_ICEFRAC, 60, dim=-1),
                torch.repeat_interleave(cam_in_LANDFRAC, 60, dim=-1),
                torch.repeat_interleave(cam_in_OCNFRAC, 60, dim=-1),
                torch.repeat_interleave(cam_in_SNOWHLAND, 60, dim=-1),
        ]
        
        output = []
        for i, conv in enumerate(self.ConvExt60):
            t = inputs_60[i]
            t = t.view(-1, 1, 60)
            t = self.Scales60[i](t)
            output.append(conv(t))
            
        for i, conv in enumerate(self.ConvExtFlat):
            t = inputs_flat[i]
            t = t.view(-1, 1, 60)
            t = self.ScalesFlat[i](t)
            output.append(conv(t))


        return torch.cat(output, 1)#.permute([0,2,1])


In [14]:
FeatureExctractor()(torch.ones([8,556])).shape

torch.Size([8, 272, 60])

In [15]:
class LEADHead(nn.Module):
    """ Transformer block: communication followed by computation """

    def __init__(self, n_embd):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()

        self.act = nn.SELU()
        self.conv_seq = nn.Conv1d(in_channels = n_embd, out_channels = 6,
                                                kernel_size = 1,
                                                stride=1, 
                                                padding='same')
        
        self.conv_flat = nn.Conv1d(in_channels = n_embd, out_channels = 8,
                                                kernel_size = 1,
                                                stride=1, 
                                                padding='same')

        self.expand = nn.Linear(in_features = n_embd, out_features = n_embd*4)
        self.out = nn.Linear(in_features = n_embd*4, out_features = 368)
        self.drop = nn.Dropout(0.05)
    
    def forward(self, x):

        x = x.permute([0,2,1])
        
        x = self.expand(x)
        #x = self.act(x)
        x = torch.mean(x, axis = 1)
        x = self.drop(x)
        x = self.out(x)

        return x

In [16]:
class LEADHead(nn.Module):
    """ Transformer block: communication followed by computation """

    def __init__(self, n_embd):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()

        self.act = nn.SELU()
        self.conv_seq = nn.Conv1d(in_channels = n_embd, out_channels = 6,
                                                kernel_size = 1,
                                                stride=1, 
                                                padding='same')
        
        self.conv_flat = nn.Conv1d(in_channels = n_embd, out_channels = 8,
                                                kernel_size = 1,
                                                stride=1, 
                                                padding='same')
        
    def forward(self, x):

        #x = x.permute([0,2,1])
        
        p_seq = self.conv_seq(x)
        p_seq = nn.Flatten()(p_seq)
    
        p_flat = self.conv_flat(x)
        p_flat = torch.mean(p_flat, axis = -1)
        
        return torch.cat([p_seq, p_flat], axis= -1)

In [17]:
LEADHead(32)(torch.ones([8,32,60])).shape

torch.Size([8, 368])

In [18]:
# batch_size = 16
# block_size = 256
# max_iters = 5000
# learning_rate = 3e-4
# eval_iters = 100
# n_embd = 384
# n_head = 8
# n_layer = 12
# dropout = 0.2

nn_config = dict(
    n_embd = 256,
    n_head = 4,
    fe_channels = 32, 
    encoder_layers = 3, 
    fe_drop_rate = 0.05,
    att_drop_rate = 0.1,
    n_features = 25,
    bottleneck_k_size = 5,
    block_kernels = [5, 3]
)

    
class LEADModelAtt(nn.Module):
    def __init__(self, n_embd = 64, n_head = 4, encoder_layers = 3, fe_channels=16, fe_drop_rate=0.1, 
                 att_drop_rate=0.2, n_features = 25, bottleneck_k_size = 3, block_kernels = [5, 3]):
        super().__init__()
        self.fe = FeatureExctractor(kernel_size = 7, channels=fe_channels, expand_channels=fe_channels*2, drop_rate = fe_drop_rate, n_features=n_features)
        self.linearStem = nn.Linear(fe_channels*9 + fe_channels//2 * 16, n_embd)
        self.bottleneck = Conv1DBlock(kernel_size=bottleneck_k_size, channels = n_embd, expand_channels = n_embd*2, drop_rate=fe_drop_rate)

        self.blocks = nn.Sequential(*[ConvTransBlock(block_kernels = block_kernels, 
                                                     channels = n_embd, 
                                                     expand_channels = n_embd*2, 
                                                     n_head=n_head, 
                                                     drop_rate = fe_drop_rate, 
                                                     att_drop_rate = att_drop_rate) for _ in range(encoder_layers)])
        
        self.head  = LEADHead(n_embd = n_embd)
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)



    def forward(self, inputs, targets=None):
        #B, T = inputs.shape

        xf = self.fe(inputs)
        xf = xf.permute([0,2,1])
        xf = self.linearStem(xf)
        xf = xf.permute([0,2,1])
        xf = self.bottleneck(xf)
        x = xf#.permute([0,2,1])
        
        x = self.blocks(x)

        out = self.head(x)
        return out

In [19]:
# batch_size = 16
# block_size = 256
# max_iters = 5000
# learning_rate = 3e-4
# eval_iters = 100
# n_embd = 384
# n_head = 8
# n_layer = 12
# dropout = 0.2

nn_config_convmixer = dict(
    n_embd = 256,
    fe_channels = 64, 
    fe_drop_rate = 0.05,
    att_drop_rate = 0.1,
    n_features = 25,
    bottleneck_k_size = 5,
    convmixer_blocks=6
)

    
class LEAPModelConvmixer(nn.Module):
    def __init__(self, n_embd = 64, fe_channels=16, fe_drop_rate=0.1, 
                 att_drop_rate=0.2, n_features = 25, bottleneck_k_size = 3, convmixer_blocks=3):
        super().__init__()
        self.fe = FeatureExctractor(kernel_size = 7, channels=fe_channels, expand_channels=fe_channels*2, drop_rate = fe_drop_rate, n_features=n_features)
        self.linearStem = nn.Linear(fe_channels*9 + fe_channels//2 * 16, n_embd)
        self.bottleneck = Conv1DBlock(kernel_size=bottleneck_k_size, channels = n_embd, expand_channels = n_embd*2, drop_rate=fe_drop_rate)

        self.convmixer_blocks = nn.Sequential(*[
             ConvMixerBlock(channels=n_embd, kernel_size=3)
            for _ in range(convmixer_blocks)
        ])

        self.head  = LEADHead(n_embd = n_embd)
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)



    def forward(self, inputs, targets=None):
        #B, T = inputs.shape

        xf = self.fe(inputs)
        xf = xf.permute([0,2,1])
        xf = self.linearStem(xf)
        xf = xf.permute([0,2,1])
        xf = self.bottleneck(xf)
        x = xf#.permute([0,2,1])
        
        x = self.convmixer_blocks(x)
        
        out = self.head(x)
        return out

In [20]:
# batch_size = 16
# block_size = 256
# max_iters = 5000
# learning_rate = 3e-4
# eval_iters = 100
# n_embd = 384
# n_head = 8
# n_layer = 12
# dropout = 0.2

nn_config_sa = dict(
    n_embd = 192,
    n_head = 4,
    fe_channels = 32, 
    encoder_layers = 2, 
    fe_drop_rate = 0.1,
    att_drop_rate = 0.2,
    n_features = 25,
    bottleneck_k_size = 3,
    block_kernels = [5, 3]
)

    
class LeapModelSelfAtt(nn.Module):
    def __init__(self, n_embd = 64, n_head = 4, encoder_layers = 3, fe_channels=16, fe_drop_rate=0.1, 
                 att_drop_rate=0.2, n_features = 25, bottleneck_k_size = 3, block_kernels = [5, 3]):
        super().__init__()
        #self.fe = FeatureExctractor(kernel_size = 7, channels=fe_channels, expand_channels=fe_channels*2, drop_rate = fe_drop_rate, n_features=n_features)
        self.fe = SelfAttFeatureExctractor(kernel_size = 7, channels=fe_channels, drop_rate = fe_drop_rate, n_features=n_features)
        
        self.linearStem = nn.Linear(fe_channels*n_features, n_embd)
        self.bottleneck = Conv1DBlock(kernel_size=bottleneck_k_size, channels = n_embd, expand_channels = n_embd*2, drop_rate=fe_drop_rate)

        self.blocks = nn.Sequential(*[ConvTransBlock(block_kernels = block_kernels, 
                                                     channels = n_embd, 
                                                     expand_channels = n_embd*2, 
                                                     n_head=n_head, 
                                                     drop_rate = fe_drop_rate, 
                                                     att_drop_rate = att_drop_rate) for _ in range(encoder_layers)])
        
        self.head  = LEADHead(n_embd = n_embd)
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)



    def forward(self, inputs, targets=None):
        #B, T = inputs.shape

        xf = self.fe(inputs)
        xf = xf.permute([0,2,1])
        xf = self.linearStem(xf)
        xf = xf.permute([0,2,1])
        xf = self.bottleneck(xf)
        x = xf#.permute([0,2,1])
        
        x = self.blocks(x)

        out = self.head(x)
        return out

In [21]:
LEAPModelConvmixer(**nn_config_convmixer)(torch.ones([8, 556])).shape

torch.Size([8, 368])

In [22]:
# batch_size = 16
# block_size = 256
# max_iters = 5000
# learning_rate = 3e-4
# eval_iters = 100
# n_embd = 384
# n_head = 8
# n_layer = 12
# dropout = 0.2

nn_config = dict(
    n_embd = 256,
    n_head = 4,
    fe_channels = 64, 
    encoder_layers = 5, 
    fe_drop_rate = 0.05,
    att_drop_rate = 0.1,
    n_features = 25,
    bottleneck_k_size = 5,
    block_kernels = [5, 3],
    out_att_blocks=0
)

    
class LEADModelAtt(nn.Module):
    def __init__(self, n_embd = 64, n_head = 4, encoder_layers = 3, fe_channels=16, fe_drop_rate=0.1, 
                 att_drop_rate=0.2, n_features = 25, bottleneck_k_size = 3, block_kernels = [5, 3], out_att_blocks=3):
        super().__init__()
        self.fe = FeatureExctractor(kernel_size = 7, channels=fe_channels, expand_channels=fe_channels*2, drop_rate = fe_drop_rate, n_features=n_features)
        self.linearStem = nn.Linear(fe_channels*9 + fe_channels//2 * 16, n_embd)
        self.bottleneck = Conv1DBlock(kernel_size=bottleneck_k_size, channels = n_embd, expand_channels = n_embd*2, drop_rate=fe_drop_rate)

        self.blocks = nn.Sequential(*[ConvTransBlock(block_kernels = block_kernels, 
                                                     channels = n_embd, 
                                                     expand_channels = n_embd*2, 
                                                     n_head=n_head, 
                                                     drop_rate = fe_drop_rate, 
                                                     att_drop_rate = att_drop_rate) for _ in range(encoder_layers)])

        self.out_att_blocks = nn.Sequential(*[
             Block(n_embd = n_embd, n_head=n_head, dropout = att_drop_rate)
            for _ in range(out_att_blocks)
        ])

        
        self.head  = LEADHead(n_embd = n_embd)
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)



    def forward(self, inputs, targets=None):
        #B, T = inputs.shape

        xf = self.fe(inputs)
        xf = xf.permute([0,2,1])
        xf = self.linearStem(xf)
        xf = xf.permute([0,2,1])
        xf = self.bottleneck(xf)
        x = xf#.permute([0,2,1])
        
        x = self.blocks(x)

        x = x.permute([0,2,1])
        x = self.out_att_blocks(x)
        x = x.permute([0,2,1])
        
        out = self.head(x)
        return out

In [23]:
fe_channels = 32
fe_channels*9 + fe_channels//2 * 16

544

In [24]:
LEADModelAtt(**nn_config)(torch.ones([8, 556])).shape

torch.Size([8, 368])

In [25]:
import torchvision

class FocalLossBCE(torch.nn.Module):
    def __init__(
            self,
            alpha: float = 0.25,
            gamma: float = 2,
            reduction: str = "mean",
            bce_weight: float = 1.0,
            focal_weight: float = 1.0,
    ):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction
        self.bce = torch.nn.BCEWithLogitsLoss(reduction=reduction) #, pos_weight=sample_weights_420)
        self.bce_weight = bce_weight
        self.focal_weight = focal_weight

    def forward(self, logits, targets):
        focall_loss = torchvision.ops.focal_loss.sigmoid_focal_loss(
            inputs=logits,
            targets=targets,
            alpha=self.alpha,
            gamma=self.gamma,
            reduction=self.reduction,
        )
        bce_loss = self.bce(logits, targets)
        return self.bce_weight * bce_loss + self.focal_weight * focall_loss


criterion = FocalLossBCE(focal_weight=5, alpha = 0.3)

# DATASET

In [26]:
df = pd.read_parquet("train_data_sample.parquet").sample(100000).drop('sample_id', axis=1).reset_index(drop=True)

In [27]:
mean_y = df.iloc[:, 556:].mean().to_numpy()
std_y = df.iloc[:, 556:].std().to_numpy()
std_y = np.clip(std_y, 1e-10, 1e3)

In [28]:
class LEAD_Dataset(torch.utils.data.Dataset):
    def __init__(self, df, augmentation=False, mode='train'):
        if mode == 'train':
            self.df = df.reset_index(drop=True)
        elif mode == 'valid':
            self.df = df.reset_index(drop=True)
        else:
            self.df = df.reset_index(drop=True)
        self.mode = mode
        self.augmentation = augmentation
    
    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        
        x = self.df.iloc[idx, :556].to_numpy()
        y = self.df.iloc[idx, 556:].to_numpy() 
        y = (y - mean_y) / std_y
        
        return torch.tensor(x), torch.tensor(y)

 

In [29]:
LEAD_Dataset(df).__getitem__(3)[1].view(1, -1)

tensor([[ 3.9310e-01,  4.6546e-01,  9.0145e-01,  1.0507e+00,  1.0750e+00,
          1.0396e+00,  9.0132e-01,  7.9084e-01,  7.4044e-01,  7.1175e-01,
          7.2677e-01,  8.0456e-01,  9.5097e-01,  1.0196e+00,  1.0495e+00,
          1.0270e+00,  8.9646e-01,  4.7574e-01,  6.2911e-01, -2.1918e-01,
          1.0749e-01, -4.1529e-01, -1.3455e-01, -1.9661e-01, -1.1210e-01,
         -3.2775e-02, -1.9219e-03, -5.1749e-02, -6.6888e-02, -7.9437e-02,
         -8.0190e-02, -1.0551e-01, -7.3092e-02, -1.2082e-01, -1.8616e-01,
         -1.0781e-01, -8.9888e-02, -1.7816e-01, -3.2381e-01, -5.9465e-01,
         -1.8779e-01, -3.6965e-01, -5.3235e-01, -8.6318e-01, -2.6964e-01,
          2.4821e-01, -1.2117e-01,  1.4580e-02, -1.3717e-01, -4.5144e-01,
         -1.2246e+00, -5.9878e-02, -2.0266e-01, -3.7990e-01, -3.5348e-01,
         -3.1475e-01, -2.5823e-01, -1.1840e-01,  5.8951e-01,  8.0244e-01,
         -4.2185e-06,  1.0310e-06,  4.9206e-07,  1.4455e-07,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.

In [30]:
LEADModelAtt(**nn_config)(torch.cat([LEAD_Dataset(df).__getitem__(3)[0].view(1, -1), LEAD_Dataset(df).__getitem__(5)[0].view(1, -1)]))

tensor([[-1.4763e-01, -1.7441e-01, -1.2951e-01, -1.2872e-01, -4.8616e-02,
         -1.2692e-01, -1.5802e-01, -7.1071e-02,  8.4509e-03, -1.3914e-01,
         -8.4053e-02, -1.0286e-01, -1.2038e-01, -9.4866e-02, -9.6882e-02,
         -1.8872e-01, -9.1128e-02, -1.4886e-02, -1.8602e-01, -4.8317e-02,
         -9.0935e-02, -1.6477e-01, -2.9389e-02,  1.8322e-02, -7.8579e-02,
         -1.2545e-01, -8.8488e-02, -1.0550e-01, -9.1428e-02, -8.4059e-02,
         -1.6925e-01, -2.3096e-01, -1.3539e-01, -1.3797e-01, -6.7603e-02,
         -7.2184e-02, -1.5089e-01, -9.3572e-03, -5.7932e-02, -2.0231e-01,
         -6.4616e-02, -2.0541e-01, -2.2879e-01, -3.0158e-01, -3.5694e-01,
         -2.9445e-01, -3.2000e-01, -3.5246e-01, -3.5334e-01, -3.5246e-01,
         -3.7985e-01, -4.8834e-01, -4.6142e-01, -4.1793e-01, -4.0620e-01,
         -4.2311e-01, -3.9149e-01, -3.5878e-01, -3.2743e-01, -3.0547e-01,
         -9.2148e-02, -6.9875e-02, -2.4380e-01, -1.7994e-01, -2.7905e-01,
         -1.8783e-01, -1.5013e-01, -8.

In [31]:
a = LEAD_Dataset(df).__getitem__(63)[1]
b = LEAD_Dataset(df).__getitem__(9)[1]

In [32]:
a.shape

torch.Size([368])

In [33]:
nn.MSELoss()(torch.tensor(np.expand_dims(a, 0)), torch.tensor(np.expand_dims(b, 0)))

tensor(2.2001)

In [34]:
def r2_score(y_pred:torch.Tensor, y_true:torch.Tensor) -> float:
    """
    Calculate the R^2 (coefficient of determination) regression score.
    
    Parameters
    ----------
    y_pred : torch.Tensor
        The predicted values.
    y_true : torch.Tensor
        The true values.

    Returns
    -------
    float
        The R^2 score, a float value.
    """
    
    ss_res = torch.sum((y_true - y_pred) ** 2)
    ss_tot = torch.sum((y_true - torch.mean(y_true)) ** 2)
    
    r2 = 1 - ss_res / ss_tot
    
    return r2.item()

In [35]:

import torch
#from torcheval.metrics import R2Score 
from torchmetrics.regression import R2Score
metric = R2Score()



class LEADModel(pl.LightningModule):
    
    def __init__(self):
        super().__init__()
        
        # == backbone ==
        #self.backbone = LEADModelAtt(**nn_config).to(config.DEVICE)
        #self.backbone = LEAPModelConvmixer(**nn_config_convmixer).to(config.DEVICE)
        self.backbone = LeapModelSelfAtt(**nn_config_sa).to(config.DEVICE)

        self.loss_fn = nn.MSELoss()
        self.metric = R2Score()
        
        # == record ==
        self.validation_step_outputs = []
        
    def forward(self, images):
        return self.backbone(images)
    
    def configure_optimizers(self):
        
        # == define optimizer ==
        model_optimizer = torch.optim.Adam(
            filter(lambda p: p.requires_grad, self.parameters()),
            lr=config.LR,
            weight_decay=config.WEIGHT_DECAY
        )
        
        # == define learning rate scheduler ==
        lr_scheduler = CosineAnnealingWarmRestarts(
            model_optimizer,
            T_0=config.EPOCHS,
            T_mult=1,
            eta_min=1e-7,
            last_epoch=-1
        )
        
        return {
            'optimizer': model_optimizer,
            'lr_scheduler': {
                'scheduler': lr_scheduler,
                'interval': 'epoch',
                'monitor': 'val_loss',
                'frequency': 1
            }
        }
    
    def training_step(self, batch, batch_idx):
        
        # == obtain input and target ==
        image, target = batch
        image = image.to(self.device).float()
        target = target.to(self.device).float()
        
        # == pred ==
        y_pred = self(image)
        
        # == compute loss ==
        train_loss = self.loss_fn(y_pred, target)
        
        # == record ==
        self.log('train_loss', train_loss, True)
        
        return train_loss
    
    def validation_step(self, batch, batch_idx):
        
        # == obtain input and target ==
        image, target = batch
        image = image.to(self.device).float()
        target = target.to(self.device).float()
        
        # == pred ==
        with torch.no_grad():
            y_pred = self(image)
            
        self.validation_step_outputs.append({"logits": y_pred, "targets": target})
        
    def train_dataloader(self):
        return self._train_dataloader

    def validation_dataloader(self):
        return self._validation_dataloader
    
    def on_epoch_start(self):
        print('\n')

    def on_load_checkpoint(self, checkpoint: dict) -> None:
        state_dict = checkpoint["state_dict"]
        model_state_dict = self.state_dict()
        is_changed = False
        for k in state_dict:
            if k in model_state_dict:
                if state_dict[k].shape != model_state_dict[k].shape:
                    print(f"Skip loading parameter: {k}, "
                                f"required shape: {model_state_dict[k].shape}, "
                                f"loaded shape: {state_dict[k].shape}")
                    state_dict[k] = model_state_dict[k]
                    is_changed = True
            else:
                print(f"Dropping parameter {k}")
                is_changed = True

        if is_changed:
            checkpoint.pop("optimizer_states", None)
    
    def on_validation_epoch_end(self):
        
        # = merge batch data =
        outputs = self.validation_step_outputs
        
        #output_val = nn.Sigmoid()(torch.cat([x['logits'] for x in outputs], dim=0)).cpu().detach()
        #output_val = torch.cat([x['logits'] for x in outputs], dim=0).cpu().detach()
        #target_val = torch.cat([x['targets'] for x in outputs], dim=0).cpu().detach()
        output_val = torch.cat([x['logits'] for x in outputs], dim=0)#.cpu().detach()
        target_val = torch.cat([x['targets'] for x in outputs], dim=0)#.cpu().detach()
        
        
        # = compute validation loss =
        val_loss = self.loss_fn(output_val, target_val)
        # == record ==
        print(f"val_loss: {val_loss}")
        self.log('val_loss', val_loss, True)
        
        val_loss = val_loss.cpu().detach()

    
        #output_val = nn.Sigmoid()(output_val).cpu().detach()
        output_val = output_val.cpu().detach()
        target_val = target_val.cpu().detach()

  
        y = (output_val * std_y) + mean_y
        
        y_pred = target_val
        y_pred[:, std_y < 1e-9] = 0
        y_pred = (y_pred * std_y) + mean_y

        # r2=0
        # for i in range(368):
        #     r2_i = self.metric(y_pred[:, i], y[:, i])
        #     r2 += r2_i
        # val_score  = r2/ 368
        #val_score = self.metric(y_pred, y)

        r2=0
        for i in range(368):
            r2_i = self.metric(output_val[:, i], target_val[:, i])
            if r2_i > 1e-6:
                r2 += r2_i
        val_score  = r2/ 368

        
        
        # self.metric.update(target_val, output_val)
        # val_score = self.metric.compute()
        
        # target to one-hot
        #target_val = torch.nn.functional.one_hot(target_val, len(label_list))
        
        # = val with ROC AUC =
        # gt_df = pd.DataFrame(target_val.numpy().astype(np.float32), columns=label_list)
        # pred_df = pd.DataFrame(output_val.numpy().astype(np.float32), columns=label_list)
        
        # gt_df['id'] = [f'id_{i}' for i in range(len(gt_df))]
        # pred_df['id'] = [f'id_{i}' for i in range(len(pred_df))]
        
        # val_score = score(gt_df.drop(cols_drop_on_val, axis=1), pred_df.drop(cols_drop_on_val, axis=1), row_id_column_name='id')
        
        print(f"val_R2: {val_score}")
        
        self.log("val_R2", val_score, True)
        
        # clear validation outputs
        self.validation_step_outputs = list()
        
        return {'val_loss': val_loss, 'val_R2': val_score}

In [36]:
USE_CHECKPOINT = False
#CHK_PATH = './pretrain_checkpoints/eca_nfnet_l0_fold_0_0.97126.ckpt'


def run_training(fold_id, total_df):
    print('================================================================')
    print(f"==== Running training for fold {fold_id} ====")
    
    # == create dataset and dataloader ==
    train_df = total_df[total_df['fold'] != fold_id].drop('fold', axis=1).copy()
    valid_df = total_df[total_df['fold'] == fold_id].drop('fold', axis=1).copy()
    
    print(f'Train Samples: {len(train_df)}')
    print(f'Valid Samples: {len(valid_df)}')
    
  
    train_ds = LEAD_Dataset(train_df)
    val_ds =  LEAD_Dataset(valid_df)
    #val_ds = WaveAllFileDataset(df=valid_df, name_col="filepath", **val_dataset_config)
    
    
    train_dl = torch.utils.data.DataLoader(
        train_ds,
        batch_size=config.BATCH_SIZE,
        shuffle=True,
        #num_workers=config.N_WORKERS,
        pin_memory=True,
        #persistent_workers=True
    )
    
    val_dl = torch.utils.data.DataLoader(
        val_ds,
        batch_size=config.BATCH_SIZE * 2,
        shuffle=False,
        #num_workers=config.N_WORKERS,
        pin_memory=True,
        #persistent_workers=True
    )
    
    # == init model ==
    if USE_CHECKPOINT:
        model = LEADModel.load_from_checkpoint(CHK_PATH, strict=False)
    else:
        model = LEADModel()
    # == init callback ==
    checkpoint_callback = ModelCheckpoint(monitor='val_loss',
                                          dirpath=config.OUTPUT_DIR,
                                          save_top_k=1,
                                          save_last=True,
                                          save_weights_only=True,
                                          filename=f"fold_{fold_id}",
                                          mode='min')

    callbacks_to_use = [checkpoint_callback, TQDMProgressBar(refresh_rate=1)]

    print(f'trainer')
    # == init trainer ==
    trainer = pl.Trainer(
        max_epochs=config.EPOCHS,
        val_check_interval=1.,
        num_sanity_val_steps=0,
        callbacks=callbacks_to_use,
        enable_model_summary=False,
        accelerator="gpu" if torch.cuda.is_available() else 'auto',
        deterministic=True,
        precision='16-mixed' if config.MIXED_PRECISION else 32,
    )
    
    # == Training ==
    trainer.fit(model, train_dataloaders=train_dl, val_dataloaders=val_dl)
    
    # == Prediction ==
    best_model_path = checkpoint_callback.best_model_path
    weights = torch.load(best_model_path)['state_dict']
    model.load_state_dict(weights)
    
    
    return trainer

In [37]:
#train_df = train_df[train_df.target<30].reset_index(drop=True)

kf = KFold(n_splits=config.FOLDS, shuffle=True, random_state=config.SEED)
df['fold'] = 0
for fold, (train_idx, val_idx) in enumerate(kf.split(df)):
    df.loc[val_idx, 'fold'] = fold
    

In [38]:
#config.EPOCHS = 10
#config.LR = 1e-5

In [39]:


import logging

def disable_logging_during_tests():
    # Store the current log level to restore it later
    original_log_level = logging.getLogger().getEffectiveLevel()

    # Set the log level to a higher level, e.g., WARNING or CRITICAL
    logging.disable(logging.ERROR)

    # Run your tests here

    # Restore the original log level after the tests
    logging.disable(original_log_level)

# Call this function before running your tests
disable_logging_during_tests()



In [None]:
selected_folds = [0,4,5]
    
# training
torch.set_float32_matmul_precision('high')



for f in range(config.FOLDS):
    
    if f not in selected_folds:
        continue
    
    # get validation index
    #val_idx = list(train_df[train_df['fold'] == f].index)
    
    # main loop of f-fold
    trainer = run_training(f, df)
    

    
    # only training one fold
    #break


# for idx, val_score in enumerate(fold_val_score_list):
#     print(f'Fold {idx} Val Score: {val_score:.5f}')

# oof_gt_df = oof_df[['samplename'] + label_list].copy()
# oof_pred_df = oof_df[['samplename'] + pred_cols].copy()
# oof_pred_df.columns = ['samplename'] + label_list
# oof_score = score(oof_gt_df, oof_pred_df, 'samplename')
# print(f'OOF Score: {oof_score:.5f}')

#oof_df.to_csv(f"{config.OUTPUT_DIR}/oof_pred.csv", index=False)

==== Running training for fold 0 ====
Train Samples: 85714
Valid Samples: 14286
trainer


E:\PycharmProjects\birdclef24\venv\Lib\site-packages\pytorch_lightning\trainer\connectors\logger_connector\logger_connector.py:75: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default
E:\PycharmProjects\birdclef24\venv\Lib\site-packages\pytorch_lightning\callbacks\model_checkpoint.py:653: Checkpoint directory E:\PycharmProjects\LEAP\output exists and is not empty.
E:\PycharmProjects\birdclef24\venv\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `D

Training: |                                                                                                   …

  return self._call_impl(*args, **kwargs)


Validation: |                                                                                                 …

val_loss: 0.7267396450042725
val_R2: 0.04140517860651016


Validation: |                                                                                                 …

val_loss: 0.8799475431442261
val_R2: 0.0091019868850708


Validation: |                                                                                                 …

val_loss: 0.8439425826072693
val_R2: 0.01818281225860119


Validation: |                                                                                                 …

val_loss: 0.7283050417900085
val_R2: 0.040166694670915604


Validation: |                                                                                                 …

val_loss: 0.811728298664093
val_R2: 0.033796437084674835


Validation: |                                                                                                 …

val_loss: 0.8380553722381592
val_R2: 0.008417516946792603


Validation: |                                                                                                 …

val_loss: 0.7335361242294312
val_R2: 0.03158706799149513


Validation: |                                                                                                 …

val_loss: 0.7348616123199463
val_R2: 0.03480054438114166


Validation: |                                                                                                 …

val_loss: 0.7427145838737488
val_R2: 0.022251347079873085


Validation: |                                                                                                 …

val_loss: 0.7218511700630188
val_R2: 0.040942803025245667


Validation: |                                                                                                 …

val_loss: 0.7699981927871704
val_R2: 0.0053141978569328785


Validation: |                                                                                                 …

val_loss: 0.7190405130386353
val_R2: 0.04740745201706886


Validation: |                                                                                                 …

val_loss: 0.7888358235359192
val_R2: 0.00562881538644433


Validation: |                                                                                                 …

val_loss: 0.7428426742553711
val_R2: 0.020871788263320923


Validation: |                                                                                                 …

val_loss: 0.7834185361862183


In [None]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(tokenized, split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [None]:
model = LEADModelAtt(**nn_config).to(config.DEVICE)

In [None]:
train_dl = torch.utils.data.DataLoader(
        LEAD_Dataset(df.drop('fold', axis=1)),
        batch_size=config.BATCH_SIZE,
        shuffle=True,
        num_workers=0,
        pin_memory=True,
        #persistent_workers=True
    )

In [None]:
max_iters = 2000
eval_iters = 1000
learning_rate = 3e-4
# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for step in range(max_iters):
    #print(iter)
    # if iter % eval_iters == 0:
    #     losses = estimate_loss()
    #     print(f"step: {iter}, train loss: {losses['train']:.3f}, val loss: {losses['val']:.3f}")

    # sample a batch of data
    #xb, yb = LEAD_Dataset(df.drop('fold', axis=1)).__getitem__(iter)

    xb, yb = next(iter(train_dl))

    # evaluate the loss
    logits = model.forward(xb.to('cuda').float(), yb.to('cuda').float())

    loss = nn.MSELoss()(logits, yb.to('cuda').float())
    
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    print(loss.item())





In [None]:
#config.EPOCHS = 25  # max epochs
#config.LR = 3e-4  # learning rate

In [None]:
%reload_ext tensorboard
%tensorboard --logdir ./lightning_logs/version_0/



In [None]:
import torch
from pytorch_metric_learning.losses import ArcFaceLoss
num_classes = 384
embedding_size = 64

loss_func = ArcFaceLoss(num_classes, embedding_size, margin=28.6, scale=64).to(torch.device('cuda'))

In [None]:
label = torch.range(1, 100)#.view(-1,1)
emb = torch.rand([100, 64])

la  = loss_func(emb, label.long())

In [None]:
la * 0.1

In [None]:
!pip install pytorch-metric-learning