In [1]:
import os, gc, random
import numpy as np
import pandas as pd 
from pathlib import Path
import matplotlib.pyplot as plt
from typing import List, Dict
from tqdm.notebook import tqdm
from time import time, ctime
import warnings

from sklearn.model_selection import KFold, GroupKFold

import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset
from torchvision.transforms import v2
from torch.optim.lr_scheduler import OneCycleLR,  CosineAnnealingWarmRestarts
from torch.optim import Adam, AdamW
from torch.cuda.amp import autocast, GradScaler

from scipy.signal import butter, lfilter, freqz
from scipy.stats import entropy
from scipy.special import rel_entr, softmax

warnings.filterwarnings("ignore")


In [2]:
def get_logger(log_dir, logger_name="train_model.log"):
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger_file = os.path.join(log_dir, logger_name)
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=logger_file, mode="a+")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger


def seed_everything(seed: int):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

In [3]:
class ModelConfig:
    SEED = 20
    SPLIT_ENTROPY = 5.5
    MODEL_NAME = "ResnetGRU_v1_LB048"
    MODEL_BACKBONE = "reset_gru"
    BATCH_SIZE = 32
    EPOCHS = 20
    EARLY_STOP_ROUNDS = 5
    GRADIENT_ACCUMULATION_STEPS = 1
    DROP_RATE = 0.15 # default: 0.1
    DROP_PATH_RATE = 0.25 # default: 0.2
    WEIGHT_DECAY = 0.01
    AMP = True
    PRINT_FREQ = 100
    NUM_WORKERS = 0 
    MAX_GRAD_NORM = 1e7
    REGULARIZATION = 0.15
    RESNET_GRU_BANDPASS = None #(0.5, 20)
    RESNET_GRU_IN_CHANNELS = 8
    RESNET_GRU_KERNELS = [3, 5, 7, 9, 11]
    RESNET_GRU_FIXED_KERNEL_SIZE = 5
    RESNET_GRU_DOWNSAMPLE = 5 # None #5
    RESNET_GRU_HIDDEN_SIZE = 304 #448 #304
    RESNET_GRU_DILATED = False

In [4]:
N_GPU = torch.cuda.device_count()
if N_GPU > 1:
    DEVICE = torch.device("cuda")
    os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
elif N_GPU == 1:
    DEVICE = torch.device("cuda:0")
else:
    DEVICE = torch.device("cpu")

print("Use Device: ", DEVICE)

Use Device:  cuda:0


In [5]:
class KagglePaths:
    OUTPUT_DIR = "/kaggle/working/"
    PRE_LOADED_EEGS = '/kaggle/input/brain-eeg-spectrograms/eeg_specs.npy'
    PRE_LOADED_SPECTROGRAMS = '/kaggle/input/brain-spectrograms/specs.npy'
    TRAIN_CSV = "/kaggle/input/hms-harmful-brain-activity-classification/train.csv"
    TRAIN_EEGS = "/kaggle/input/hms-harmful-brain-activity-classification/train_eegs/"
    TRAIN_SPECTROGRAMS = "/kaggle/input/hms-harmful-brain-activity-classification/train_spectrograms/"
    TEST_CSV = "/kaggle/input/hms-harmful-brain-activity-classification/test.csv"
    TEST_SPECTROGRAMS = "/kaggle/input/hms-harmful-brain-activity-classification/test_spectrograms/"
    TEST_EEGS = "/kaggle/input/hms-harmful-brain-activity-classification/test_eegs/"


class LocalPaths:
    OUTPUT_DIR = "./outputs/"
    PRE_LOADED_EEGS = './inputs/brain-eeg-spectrograms/eeg_specs.npy'
    PRE_LOADED_SPECTROGRAMS = './inputs/brain-spectrograms/specs.npy'
    TRAIN_CSV = "./inputs/hms-harmful-brain-activity-classification/train.csv"
    TRAIN_EEGS = "./inputs/hms-harmful-brain-activity-classification/train_eegs"
    TRAIN_SPECTROGRAMS = "./inputs/hms-harmful-brain-activity-classification/train_spectrograms"
    TEST_CSV = "./inputs/hms-harmful-brain-activity-classification/test.csv"
    TEST_SPECTROGRAMS = "./inputs/hms-harmful-brain-activity-classification/test_spectrograms"
    TEST_EEGS = "./inputs/hms-harmful-brain-activity-classification/test_eegs"

PATHS = KagglePaths if os.path.exists("/kaggle") else LocalPaths

print("Output Dir: ", PATHS.OUTPUT_DIR)

EEG_FEAT_ALL = [
    'Fp1', 'F3', 'C3', 'P3', 
    'F7', 'T3', 'T5', 'O1', 
    'Fz', 'Cz', 'Pz', 'Fp2', 
    'F4', 'C4', 'P4', 'F8', 
    'T4', 'T6', 'O2', 'EKG'
    ]

EEG_FEAT_USE =  ['Fp1','T3','C3','O1','Fp2','C4','T4','O2']
EEG_FEAT_INDEX = {x:y for x,y in zip(EEG_FEAT_USE, range(len(EEG_FEAT_USE)))}

BRAIN_ACTIVITY = ['seizure', 'lpd', 'gpd', 'lrda', 'grda', 'other']
TARGETS = [f"{lb}_vote" for lb in BRAIN_ACTIVITY]
TARGETS_PRED = [f"{lb}_pred" for lb in BRAIN_ACTIVITY]

seed_everything(ModelConfig.SEED)

print(EEG_FEAT_INDEX)

Output Dir:  ./outputs/
{'Fp1': 0, 'T3': 1, 'C3': 2, 'O1': 3, 'Fp2': 4, 'C4': 5, 'T4': 6, 'O2': 7}


In [6]:
logger = get_logger(PATHS.OUTPUT_DIR, f"{ModelConfig.MODEL_NAME}_train.log")

# Load Data

In [7]:
def eeg_from_parquet(parquet_path: str, use_feature=EEG_FEAT_USE, display: bool = False) -> np.ndarray:
    # === Extract full length EEG Sequence ===
    # fill missing values with mean
    # first fill missing values with mean of each column
    # then if all values are missing, fill with 0
    eeg = pd.read_parquet(parquet_path, columns=use_feature)
    eeg = eeg.fillna(eeg.mean(skipna=True)).fillna(0)
    data = eeg.values.astype(np.float32)
    
    rows = len(eeg)
    offset = (rows - 10_000) // 2 # 50 * 200 = 10_000
    data = data[offset:offset+10_000, :]

    if display:
        fig, ax = plt.subplots(len(use_feature), 1, figsize=(10, 2*len(use_feature)), sharex=True)
        
        for i, feat in enumerate(use_feature):
            ax[i].plot(data[:, i], label=feat)
            ax[i].legend()
            ax[i].grid()
       
        name = parquet_path.split('/')[-1].split('.')[0]
        ax[0].set_title(f'EEG {name}',size=16)
        fig.tight_layout()
        plt.show()    
    return data

In [8]:
%%time
CREATE_EEGS = False
ALL_EEG_SIGNALS = {}
eeg_paths = list(Path(PATHS.TRAIN_EEGS).glob('*.parquet'))
preload_eegs_path = Path('./inputs/eegs_full.npy')

if CREATE_EEGS:
    count = 0
    for parquet_path in tqdm(eeg_paths, total=len(eeg_paths)):
        eeg_id = int(parquet_path.stem)
        eeg_path = str(parquet_path)
        data = eeg_from_parquet(eeg_path, display=False)
        ALL_EEG_SIGNALS[eeg_id] = data
        count += 1
    np.save("./inputs/eegs_full.npy", ALL_EEG_SIGNALS)
else:
    ALL_EEG_SIGNALS = np.load(preload_eegs_path, allow_pickle=True).item()

CPU times: user 178 ms, sys: 1.27 s, total: 1.45 s
Wall time: 1.45 s


In [9]:
def gen_non_overlap_samples(df_csv, targets):
    # Reference Discussion:
    # https://www.kaggle.com/competitions/hms-harmful-brain-activity-classification/discussion/467021

    tgt_list = targets.tolist()
    brain_activity = ['seizure', 'lpd', 'gpd', 'lrda', 'grda', 'other']

    agg_dict = {
        'spectrogram_id': 'first',
        'spectrogram_label_offset_seconds': ['min', 'max'],
        'patient_id': 'first',
        'expert_consensus': 'first'
    }

    groupby = df_csv.groupby(['eeg_id'] + tgt_list)
    train = groupby.agg(agg_dict)
    train = train.reset_index()
    train.columns = ['_'.join(col).strip() for col in train.columns.values]
    train.columns = ["eeg_id"] + tgt_list + ['spectrogram_id', 'min', 'max', 'patient_id', 'target']
    
    train['total_votes'] = train[tgt_list].sum(axis=1)
    train[tgt_list] = train[tgt_list].div(train['total_votes'], axis=0)
    
    return train

In [10]:
# # Enhanced Samples Split 

# train_csv = pd.read_csv(PATHS.TRAIN_CSV)
# targets = train_csv.columns[-6:].tolist()

# raw_csv_len = len(train_csv)

# subset_counts = train_csv.groupby(['eeg_id']+targets).size().reset_index(name='subset_counts')
# train_csv = train_csv.merge(subset_counts, on=['eeg_id']+targets, how='left')

# tmp_cols = ['expert_consensus', 'eeg_label_offset_seconds', 'subset_counts']

# def sample_rule(x):
#     if (x['subset_counts'].min() > 3) & ((x['expert_consensus']!='Other').any()):
#         return x['eeg_label_offset_seconds'].sample(n=(x['subset_counts'].min()//3))
#     else:
#         return x['eeg_label_offset_seconds'].sample(n=1)

# train_samples = train_csv.groupby(['eeg_id']+targets)[tmp_cols].apply(sample_rule).reset_index()
# train_samples = train_samples.rename(columns={'eeg_label_offset_seconds': 'eeg_off_seconds'})
# train_samples.drop(columns=['level_7'], inplace=True)

# train_meta = train_csv.groupby(['eeg_id']+targets).agg({
#     'spectrogram_id': 'first',
#     'spectrogram_label_offset_seconds': ['min', 'max'],
#     'eeg_sub_id': 'count',
#     'eeg_label_offset_seconds': ['min', 'max'],
#     'patient_id': 'first',
# }).reset_index()

# agged_cols = [
#     'spectrogram_id', 'min', 'max', 'subset_counts', 'eeg_off_min', 'eeg_off_max', 'patient_id'
# ]
# train_meta.columns = ['eeg_id'] + targets + agged_cols
# train_meta = train_meta[['eeg_id'] + agged_cols + targets]

# train_meta['total_votes'] = train_meta[targets].sum(axis=1)
# train_meta['target'] = train_meta[targets].idxmax(axis=1).apply(lambda x: x.split('_')[0])
# train_meta['fold'] = -1

# K_FOLDS = 5
# kf = KFold(n_splits=K_FOLDS, shuffle=False)
# unique_eegs = train_meta['eeg_id'].unique()
# for fold, (_, valid_idx) in enumerate(kf.split(unique_eegs)):
#     train_meta.loc[train_meta['eeg_id'].isin(unique_eegs[valid_idx]), 'fold'] = fold

# train_all = train_samples.merge(train_meta, on=['eeg_id']+targets, how='left')

# train_all[targets] = train_all[targets].div(train_all['total_votes'], axis=0)

# train_all['stage'] = train_all['total_votes'].apply(lambda x: 1 if x < 10 else 2)

# train_all

In [11]:
# Original Split 

train_csv = pd.read_csv(PATHS.TRAIN_CSV)
targets = train_csv.columns[-6:]

print("targets: ", targets.to_list())

train_csv['total_votes'] = train_csv[targets].sum(axis=1)
train_csv[targets] = train_csv[targets].astype('float32')

targets_prob = [f"{t.split('_')[0]}_prob" for t in targets]
train_csv[targets_prob] = train_csv[targets].div(train_csv['total_votes'], axis=0)
# train_csv['rel_entropy'] = train_csv[targets_prob].apply(lambda row: sum(rel_entr([1/6]*6, row.values+1e-5)), axis=1)
# train_csv['entropy'] = train_csv[targets_prob].apply(lambda row: entropy(row.values), axis=1)

# hard_csv = train_csv[train_csv['entropy'] < ModelConfig.SPLIT_ENTROPY].copy().reset_index(drop=True)
# hard_csv = train_csv[train_csv['entropy'] >= 0.75].copy().reset_index(drop=True)
hard_csv = train_csv[train_csv['total_votes'] >= 6].copy().reset_index(drop=True)


train_all = gen_non_overlap_samples(train_csv, targets)
train_hard = gen_non_overlap_samples(hard_csv, targets)

print("train_all.shape = ", train_all.shape)
print("train_all nan_count: ", train_all.isnull().sum().sum())
display(train_all.head())

print(" ")

print("train_hard.shape = ", train_hard.shape)
print("train_hard nan_count: ", train_hard.isnull().sum().sum())
display(train_hard.head())

targets:  ['seizure_vote', 'lpd_vote', 'gpd_vote', 'lrda_vote', 'grda_vote', 'other_vote']
train_all.shape =  (20183, 13)
train_all nan_count:  0


Unnamed: 0,eeg_id,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote,spectrogram_id,min,max,patient_id,target,total_votes
0,568657,0.0,0.0,0.25,0.0,0.166667,0.583333,789577333,0.0,16.0,20654,Other,12.0
1,582999,0.0,0.857143,0.0,0.071429,0.0,0.071429,1552638400,0.0,38.0,20230,LPD,14.0
2,642382,0.0,0.0,0.0,0.0,0.0,1.0,14960202,1008.0,1032.0,5955,Other,1.0
3,751790,0.0,0.0,1.0,0.0,0.0,0.0,618728447,908.0,908.0,38549,GPD,1.0
4,778705,0.0,0.0,0.0,0.0,0.0,1.0,52296320,0.0,0.0,40955,Other,2.0


 
train_hard.shape =  (6492, 13)
train_hard nan_count:  0


Unnamed: 0,eeg_id,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote,spectrogram_id,min,max,patient_id,target,total_votes
0,568657,0.0,0.0,0.25,0.0,0.166667,0.583333,789577333,0.0,16.0,20654,Other,12.0
1,582999,0.0,0.857143,0.0,0.071429,0.0,0.071429,1552638400,0.0,38.0,20230,LPD,14.0
2,1895581,0.076923,0.0,0.0,0.0,0.076923,0.846154,128369999,1138.0,1138.0,47999,Other,13.0
3,2482631,0.0,0.0,0.133333,0.066667,0.133333,0.666667,978166025,1902.0,1944.0,20606,Other,15.0
4,2521897,0.0,0.0,0.083333,0.083333,0.333333,0.5,673742515,0.0,4.0,62117,Other,12.0


# Dataset

In [12]:
# Functional Utils
def butter_bandpass_filter(data, lowcut, highcut, fs, order=5):
    b, a = butter(order, [lowcut, highcut], fs=fs, btype='band')
    y = lfilter(b, a, data)
    return y

def denoise_filter(x):
    # Sample rate and desired cutoff frequencies (in Hz).
    fs = 200.0
    lowcut = 1.0
    highcut = 25.0
    
    # Filter a noisy signal.
    T = 50
    nsamples = T * fs
    t = np.arange(0, nsamples) / fs
    y = butter_bandpass_filter(x, lowcut, highcut, fs, order=6)
    y = (y + np.roll(y,-1)+ np.roll(y,-2)+ np.roll(y,-3))/4
    y = y[0:-1:4]
    
    return y

def mu_law_encoding(data, mu):
    mu_x = np.sign(data) * np.log(1 + mu * np.abs(data)) / np.log(mu + 1)
    return mu_x

def mu_law_expansion(data, mu):
    s = np.sign(data) * (np.exp(np.abs(data) * np.log(mu + 1)) - 1) / mu
    return s

def quantize_data(data, classes):
    mu_x = mu_law_encoding(data, classes)
    return mu_x #quantized

def butter_lowpass_filter(data, cutoff_freq=20, sampling_rate=200, order=4):
    nyquist = 0.5 * sampling_rate
    normal_cutoff = cutoff_freq / nyquist
    b, a = butter(order, normal_cutoff, btype='low', analog=False)
    filtered_data = lfilter(b, a, data, axis=0)
    return filtered_data


In [13]:
class EEGSeqDataset(Dataset):
    def __init__(self, df, config, eegs, mode='train', verbose=False):
        self.df = df
        self.mode = mode
        self.eegs = eegs
        self.verbose = verbose
        self.downsample = config.RESNET_GRU_DOWNSAMPLE
        self.use_bandpass = config.RESNET_GRU_BANDPASS
    
    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        
        X, y_prob = self.__data_generation(idx)
        
        if self.downsample is not None:
            X = X[::self.downsample,:]
        
        return torch.tensor(X, dtype=torch.float32), torch.tensor(y_prob, dtype=torch.float32)
    
    def __data_generation(self, index):
        row = self.df.iloc[index]
        
        if self.verbose:
            print(f"Row {index}", row[['eeg_id', 'eeg_off_min', 'target']].tolist())

        X = np.zeros((10_000, 8), dtype='float32')
        
        # # start_sec = int((row['eeg_off_min'] + row['eeg_off_max']) // 2)
        # eeg_seq = self.eegs[row.eeg_id]
        # len_seq = eeg_seq.shape[0]
        # start_at = int(row['eeg_off_min']) + (len_seq - 10_000) // 2 
        # # !!! use randomly sampled offset !!!
        # # start_sec = int(row['eeg_off_sample']) 
        # data = eeg_seq[start_at:start_at+10_000, :]
        
        data = self.eegs[row.eeg_id]

        # === Feature engineering ===
        X[:,0] = data[:,EEG_FEAT_INDEX['Fp1']] - data[:,EEG_FEAT_INDEX['T3']]
        X[:,1] = data[:,EEG_FEAT_INDEX['T3']] - data[:,EEG_FEAT_INDEX['O1']]

        X[:,2] = data[:,EEG_FEAT_INDEX['Fp1']] - data[:,EEG_FEAT_INDEX['C3']]
        X[:,3] = data[:,EEG_FEAT_INDEX['C3']] - data[:,EEG_FEAT_INDEX['O1']]

        X[:,4] = data[:,EEG_FEAT_INDEX['Fp2']] - data[:,EEG_FEAT_INDEX['C4']]
        X[:,5] = data[:,EEG_FEAT_INDEX['C4']] - data[:,EEG_FEAT_INDEX['O2']]

        X[:,6] = data[:,EEG_FEAT_INDEX['Fp2']] - data[:,EEG_FEAT_INDEX['T4']]
        X[:,7] = data[:,EEG_FEAT_INDEX['T4']] - data[:,EEG_FEAT_INDEX['O2']]

        # === Standarize ===
        X = np.clip(X,-1024, 1024)
        X = np.nan_to_num(X, nan=0) / 32.0

        # === Butter Low-pass Filter ===
        # ??? change to bandpass filter (low=0.5, hight=20, order=2) ???
        if self.use_bandpass is not None:
            X = butter_lowpass_filter(X, self.use_bandpass[0], self.use_bandpass[1], order=2)
            
        X = butter_lowpass_filter(X) 
        
        if self.mode != 'test':
            y_prob = row[TARGETS].values.astype(np.float32)
        else:
            y_prob = np.zeros(6, dtype='float32')

        return X, y_prob 

In [14]:
# # visualize the dataset
# train_dataset = EEGSeqDataset(train_all, ModelConfig, ALL_EEG_SIGNALS, mode="train")
# train_loader = DataLoader(train_dataset, drop_last=True, batch_size=16, num_workers=4, pin_memory=True, shuffle=False)

# for batch in train_loader:
#     X, y = batch
#     print(f"X shape: {X.shape}")
#     print(f"y shape: {y.shape}")
    
#     fig, axes = plt.subplots(4, 1, figsize=(20, 20))
#     ax_idx = 0
#     for item in np.random.choice(range(X.shape[0]), 4):
#         offset = 0
#         for col in range(X.shape[-1]):
#             if col != 0:
#                 offset -= X[item,:,col].min()
#             axes[ax_idx].plot(np.arange(X.shape[1]), X[item,:,col]+offset, label=f'feature {col+1}')
#             offset += X[item,:,col].max()
#         print(y[item])
#         # axes[ax_idx].set_title(f'Weight = {weights[item]}',size=14)
#         axes[ax_idx].legend()
#         ax_idx += 1
#     fig.tight_layout()
#     plt.show()
#     break

# del train_dataset, train_loader
# torch.cuda.empty_cache()
# gc.collect()

# Model

### Resnet 1D Encoder

In [15]:
class ResNet_1D_Block(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, stride, padding, downsampling, dropout=0.0, dilation=1):
        super(ResNet_1D_Block, self).__init__()
        self.block = nn.Sequential(
            nn.BatchNorm1d(num_features=in_channels),
            nn.Hardswish(), #nn.ReLU(),
            nn.Dropout(p=dropout),
            nn.Conv1d(in_channels, out_channels, kernel_size, stride, padding, dilation=dilation, bias=False),
            nn.BatchNorm1d(num_features=out_channels),
            nn.Hardswish(), #nn.ReLU(),
            nn.Dropout(p=dropout),
            nn.Conv1d(out_channels, out_channels, kernel_size, stride, padding, dilation=dilation, bias=False),
            nn.MaxPool1d(kernel_size=2, stride=2, padding=0)
        )
        self.downsampling = downsampling

    def forward(self, x):
        identity = self.downsampling(x)
        out = self.block(x)
        out += identity
        return out

class SelfAttentionPooling(nn.Module):
    """
    Implementation of SelfAttentionPooling 
    Original Paper: Self-Attention Encoding and Pooling for Speaker Recognition
    https://arxiv.org/pdf/2008.01077v1.pdf
    """
    def __init__(self, input_dim):
        super(SelfAttentionPooling, self).__init__()
        self.W = nn.Linear(input_dim, 1)
        self.softmax = nn.Softmax(dim=1)
        
    def forward(self, batch_rep):
        """
        input:
            batch_rep : size (N, T, H), N: batch size, T: sequence length, H: Hidden dimension
        attention_weight:
            att_w : size (N, T, 1)
        return:
            utter_rep: size (N, H)
        """
        att_w = self.softmax(self.W(batch_rep).squeeze(-1)).unsqueeze(-1)
        utter_rep = torch.sum(batch_rep * att_w, dim=1)

        return utter_rep

class ResNetGRU(nn.Module):
    def __init__(self, config=ModelConfig, num_classes=6):
        super(ResNetGRU, self).__init__()

        self.planes = 24
        self.kernels = config.RESNET_GRU_KERNELS
        self.in_channels = config.RESNET_GRU_IN_CHANNELS
        self.use_dilation = config.RESNET_GRU_DILATED

        fixed_kernel_size = config.RESNET_GRU_FIXED_KERNEL_SIZE
        hidden_size = config.RESNET_GRU_HIDDEN_SIZE

        # Define the separate convolutional layers
        self.parallel_conv = self._make_parallel_conv_layers()
        # Define the ResNet part of the model
        self.resnet_part = self._make_resnet_part(fixed_kernel_size, n_blocks=9)
        # Define the GRU part of the model
        self.rnn = nn.GRU(input_size=self.in_channels, hidden_size=128, num_layers=1, bidirectional=True)
        self.pooling = SelfAttentionPooling(256)
        # Define the final fully connected layer
        self.fc = nn.Linear(in_features=hidden_size, out_features=num_classes)

    def _make_parallel_conv_layers(self):
        return nn.ModuleList([
            nn.Conv1d(
                in_channels=self.in_channels, 
                out_channels=self.planes, 
                kernel_size=kernel_size,
                stride=1, 
                padding=0, 
                bias=False
            ) for kernel_size in self.kernels
        ])

    def _make_resnet_part(self, fixed_kernel_size, n_blocks=9):
        # prepare resnet layers
        downsampling = nn.MaxPool1d(kernel_size=2, stride=2, padding=0)

        if self.use_dilation:
            dilation_rates = [1, 2, 2, 2, 2, 4, 4, 4, 4] #[1] * n_blocks
        else:
            dilation_rates = [1] * n_blocks

        paddings = [fixed_kernel_size//2 * rate for rate in dilation_rates]
        resnet_layers = [
            ResNet_1D_Block(
                in_channels=self.planes, 
                out_channels=self.planes, 
                kernel_size=fixed_kernel_size, 
                stride=1, 
                padding=paddings[i], 
                downsampling=downsampling,
                dropout=0.0,
                dilation=dilation_rates[i])
            for i in range(n_blocks)
        ]
        # return the resnet encoder
        return nn.Sequential(
            nn.BatchNorm1d(num_features=self.planes),
            nn.SiLU(), #nn.ReLU(inplace=False),
            nn.Conv1d(
                in_channels=self.planes, 
                out_channels=self.planes, 
                kernel_size=fixed_kernel_size, 
                stride=2, 
                padding=2, 
                bias=False
            ),
            *resnet_layers,
            nn.BatchNorm1d(num_features=self.planes),
            nn.SiLU(), #nn.ReLU(inplace=False),
            nn.AvgPool1d(kernel_size=6, stride=6, padding=2)
        )
    
    def forward(self, x):
        # extract features using resnet 
        x = x.permute(0, 2, 1)
        out_sep = [conv(x) for conv in self.parallel_conv]
        out = torch.cat(out_sep, dim=2)
        out = self.resnet_part(out)
        out = out.reshape(out.shape[0], -1)
        # extract features using rnn
        rnn_out, _ = self.rnn(x.permute(0, 2, 1))
        new_rnn_h = self.pooling(rnn_out)
        # concatenate the features
        new_out = torch.cat([out, new_rnn_h], dim=1) 
        # total features = 424 = 24*6 + 128*2 
        # pass through the final fully connected layer
        result = self.fc(new_out)  
        
        return result


### Dilated Inception Wavenet Encoder

In [16]:
# from typing import List

# class DilatedInception(nn.Module):
#     def __init__(self, in_channels: int, out_channels: int, kernel_sizes: List[int], dilation: int) -> None:
#         super().__init__()
#         assert out_channels % len(kernel_sizes) == 0, "`out_channels` must be divisible by the number of kernel sizes."
#         hidden_dim = out_channels // len(kernel_sizes)
#         self.convs = nn.ModuleList([
#             nn.Conv1d(in_channels, hidden_dim, k, padding='same', dilation=dilation)
#             for k in kernel_sizes
#         ])

#     def forward(self, x):
#         outputs = [conv(x) for conv in self.convs]
#         out = torch.cat(outputs, dim=1)
#         return out

# class GatedTCN(nn.Module):
#     def __init__(self, in_dim: int, h_dim: int, kernel_sizes: List[int], dilation_factor: int, dropout: float = 0.0) -> None:
#         super().__init__()
#         self.filt = DilatedInception(in_dim, h_dim, kernel_sizes, dilation=dilation_factor)
#         self.gate = DilatedInception(in_dim, h_dim, kernel_sizes, dilation=dilation_factor)
#         self.dropout = nn.Dropout(dropout)

#     def forward(self, x):
#         x_filt = torch.tanh(self.filt(x))
#         x_gate = torch.sigmoid(self.gate(x))
#         h = x_filt * x_gate
#         h = self.dropout(h)
#         return h

# class WaveBlock(nn.Module):
#     def __init__(self, n_layers: int, in_dim: int, h_dim: int, kernel_sizes: List[int]) -> None:
#         super().__init__()
#         self.dilation_rates = [2**i for i in range(n_layers)]
#         self.in_conv = nn.Conv1d(in_dim, h_dim, kernel_size=1)
#         self.gated_tcns = nn.ModuleList([
#             GatedTCN(h_dim, h_dim, kernel_sizes, dilation)
#             for dilation in self.dilation_rates
#         ])
#         self.skip_convs = nn.ModuleList([
#             nn.Conv1d(h_dim, h_dim, kernel_size=1)
#             for _ in range(n_layers)
#             ])
#         self._initialize_weights()

#     def _initialize_weights(self):
#         nn.init.xavier_uniform_(self.in_conv.weight, gain=nn.init.calculate_gain('relu'))
#         nn.init.zeros_(self.in_conv.bias)
#         for conv in self.skip_convs:
#             nn.init.xavier_uniform_(conv.weight, gain=nn.init.calculate_gain('relu'))
#             nn.init.zeros_(conv.bias)

#     def forward(self, x):
#         # x: (B, C, L)
#         x = self.in_conv(x)
#         x_skip = x
#         for gated_tcn, skip_conv in zip(self.gated_tcns, self.skip_convs):
#             x = gated_tcn(x)
#             x = skip_conv(x)
#             x_skip = x_skip + x
#         return x_skip

# class DilatedWaveNet(nn.Module):
#     """WaveNet architecture with dilated inception conv, enhanced with list comprehension for input processing."""

#     def __init__(self, kernel_sizes: List[int]) -> None:
#         super().__init__()
#         self.kernel_sizes = kernel_sizes
        
#         # Initialize wave blocks with specified kernel sizes
#         self.wave_module = nn.Sequential(
#             WaveBlock(9, 8, 128, self.kernel_sizes), #12
#             WaveBlock(6, 128, 256, self.kernel_sizes), #8
#             WaveBlock(3, 256, 512, self.kernel_sizes), #4
#             WaveBlock(1, 512, 512, self.kernel_sizes), #1
#         )
#         self.pool_layer = nn.AdaptiveAvgPool1d(1)

#     def forward(self, x) -> torch.Tensor:
#         # x: (B, L, C)
#         bs, seq_len, n_channels = x.shape
#         x = x.permute(0, 2, 1) # -> (B, C, L)
#         # Process different parts of the input with list comprehension
#         x = self.wave_module(x)
#         x = self.pool_layer(x) # ->(B, 512, 1)
#         x = x.reshape(bs, n_channels, -1).reshape(bs, n_channels//2, 2, 64)
#         features = x.mean(dim=2).reshape(bs, -1) # -> (16, 256)
# #         pooled_outputs = [(x[:, i:i+64] + x[:, i+64:i+128]) / 2 for i in range(0, n_channels, 2)]
# #         # Combine the pooled features and reshape for classification
# #         features = torch.cat(pooled_outputs, dim=1).reshape(bs, -1)
       
#         return features

### Dilated ResNet 1D Encoder

In [17]:
# class ResnetBlock(nn.Module):
#     def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, dilation=1, dropout=0.0):
#         super(ResnetBlock, self).__init__()

#         self.bn1 = nn.BatchNorm1d(in_channels)
#         self.relu1 = nn.ReLU()
#         self.conv1 = nn.Conv1d(
#             in_channels, out_channels, kernel_size, 
#             stride=stride, 
#             padding=dilation*(kernel_size//2), 
#             dilation=dilation, 
#             bias=False)
#         self.drop1 = nn.Dropout(p=dropout)
#         self.bn2 = nn.BatchNorm1d(out_channels)
#         self.relu2 = nn.ReLU()
#         self.drop2 = nn.Dropout(p=dropout)
#         self.conv2 = nn.Conv1d(
#             out_channels, out_channels, kernel_size, 
#             stride=stride, 
#             padding=dilation*(kernel_size//2), 
#             dilation=dilation, 
#             bias=False)
        
#         self.bn3 = nn.BatchNorm1d(out_channels)
#         self.relu3 = nn.ReLU()
#         self.downsample = nn.MaxPool1d(kernel_size=2, stride=2, padding=0)

#     def forward(self, x):
#         identity = x
#         identity = self.downsample(identity)

#         out = self.bn1(x)
#         out = self.relu1(out)
#         out = self.drop1(out)
#         out = self.conv1(out)

#         out = self.bn2(out)
#         out = self.relu2(out)
#         out = self.drop2(out)
#         out = self.conv2(out)

#         out = self.downsample(out)

#         out += identity
#         out = self.bn3(out)
#         out = self.relu3(out)

#         return out

# class DilatedResnet(nn.Module):
#     def __init__(self, in_channels, out_channels, kernel_size, n_layers, expansion_factor=4):
#         super(DilatedResnet, self).__init__()

#         self.in_channels = in_channels
#         self.kernel_size = kernel_size
#         self.h_dim = out_channels // n_layers
        
#         fix_kernel_size = 5
#         self.conv1 = nn.Conv1d(
#             self.in_channels, self.h_dim, kernel_size=fix_kernel_size, stride=1, padding=fix_kernel_size//2
#             )

#         dilation_rates = [expansion_factor**i for i in range(n_layers)]

#         self.blocks = nn.ModuleList([
#             ResnetBlock(self.h_dim, self.h_dim, self.kernel_size, dilation=dilation)
#             for dilation in dilation_rates
#         ])

#     def forward(self, x):
#         x = self.conv1(x)
#         outputs = [ block(x) for block in self.blocks ]
#         output = torch.cat(outputs, dim=1)
        
#         return output

# class DilatedResnetEncoder(nn.Module):
#     def __init__(self, kernel_sizes=[3, 5, 7, 9], in_channels=8, planes=24, dilate_layers=[6,3,1], expansion_factor=4):
#         super(DilatedResnetEncoder, self).__init__()

#         self.in_channels = in_channels
#         self.planes = planes
#         self.kernel_sizes = kernel_sizes
#         self.dilate_layers = dilate_layers # must be 3 layers
#         self.expansion_factor = expansion_factor
        
#         # out_channels = self.planes * self.in_channels
#         # fix_kernel_size = 5
#         # self.conv1 = nn.Conv1d(
#         #     self.in_channels, out_channels, kernel_size=fix_kernel_size, stride=1, padding=fix_kernel_size//2
#         #     )
        
#         self.blocks = nn.ModuleList([
#             self._make_dilated_block(kernel_size)
#             for kernel_size in self.kernel_sizes
#         ])

#         bottleneck_in_channels = self.in_channels * self.planes * self.dilate_layers[1] * self.dilate_layers[2]
#         bottoleneck_out_channels = self.in_channels * self.planes

#         self.bottleneck = nn.Sequential(
#             nn.BatchNorm1d(num_features=bottleneck_in_channels),
#             nn.ReLU(),
#             nn.Conv1d(
#                 in_channels=bottleneck_in_channels,
#                 out_channels=bottoleneck_out_channels,
#                 kernel_size=1,
#                 stride=1,
#                 padding=0,
#                 bias=False
#             )
#         )
        
#         self.pooling = nn.AdaptiveAvgPool1d(1)
#         # self.blocks = nn.ModuleList([
#         #     nn.Sequential(*[
#         #         ResidualBlock(
#         #             out_channels, out_channels, kernel_size, dilation=dilation
#         #         ) for dilation in self.dilate_layers
#         #     ])
#         #     for kernel_size in self.kernel_sizes
#         # ])

#     def _make_dilated_block(self, kernel_size):
#         out_channel_1 = self.in_channels * self.planes
#         block_1 = DilatedResnet(self.in_channels, out_channel_1, kernel_size, self.dilate_layers[0], self.expansion_factor)

#         out_channel_2 = out_channel_1 * self.dilate_layers[1]
#         block_2 = DilatedResnet(out_channel_1, out_channel_2, kernel_size, self.dilate_layers[1], self.expansion_factor)

#         out_channel_3 = out_channel_2 * self.dilate_layers[2]
#         block_3 = DilatedResnet(out_channel_2, out_channel_3, kernel_size, self.dilate_layers[2], self.expansion_factor)

#         return nn.Sequential(block_1, block_2, block_3)
        
    
#     def forward(self, x):
#         # <- # [batch_size, seq_len=2000, in_channels=8]
#         x = x.permute(0, 2, 1)
#         # x = self.conv1(x)
#         outputs = [ block(x) for block in self.blocks ]
#         outputs = [ self.bottleneck(out) for out in outputs ]
#         output = torch.cat(outputs, dim=1)
#         output = self.pooling(output).squeeze(-1)
        
#         return output

In [18]:
train_dataset = EEGSeqDataset(train_all, ModelConfig, ALL_EEG_SIGNALS, mode="train")
train_loader = DataLoader(train_dataset, drop_last=True, batch_size=16, num_workers=4, pin_memory=True, shuffle=False)

model = ResNetGRU(config=ModelConfig, num_classes=6)

model.to(DEVICE)
for i, batch in enumerate(train_loader):
    X, y = batch
    X = X.to(DEVICE)
    y = y.to(DEVICE)
    print(f"X shape: {X.shape}")
    print(f"y shape: {y.shape}")
    
    y_pred = model(X)
    print(y_pred.shape)
    break 

del model, train_dataset, train_loader, X, y
torch.cuda.empty_cache()
gc.collect()

X shape: torch.Size([16, 2000, 8])
y shape: torch.Size([16, 6])
torch.Size([16, 6])


0

In [19]:
!nvidia-smi

Fri Apr  5 14:16:19 2024       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.239.06   Driver Version: 470.239.06   CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:0B:00.0 Off |                  N/A |
| 26%   36C    P2    55W / 260W |   1605MiB / 11019MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# Train

In [20]:

class AverageMeter(object):
    """Computes and stores the average and current value"""

    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count
        
class Trainer:

    def __init__(self, model, config, logger):

        self.model = model
        self.logger = logger
        self.config = config
        
        self.early_stop_rounds = config.EARLY_STOP_ROUNDS
        self.early_stop_counter = 0
        
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.kl_div_loss = nn.KLDivLoss(reduction='batchmean')
        self.ce_loss = nn.CrossEntropyLoss()
        self.gamma = config.REGULARIZATION
        
        # self.criterion = nn.KLDivLoss(reduction="batchmean")
    
    def criterion(self, y_pred, y_true, weights=None, mode='train'):
        kl_loss = self.kl_div_loss(F.log_softmax(y_pred, dim=1), y_true)
        if (self.gamma is not None) & (mode == 'train'):
            softmax_probs = F.softmax(y_pred, dim=1)  # Compute softmax probabilities
            entropy_loss = -(softmax_probs * torch.log(softmax_probs + 1e-9)).sum(dim=1).mean(dim=0) # Compute entropy, add epsilon to avoid log(0)
            return kl_loss - self.gamma * entropy_loss
        else:
            return kl_loss
        
    def train(self, train_loader, valid_loader, from_checkpoint=None):

        self.optimizer = AdamW(self.model.parameters(), lr=8e-3, weight_decay=self.config.WEIGHT_DECAY)

        # CosineAnnealingWarmRestarts( 
        #     self.optimizer,
        #     T_0=20,
        #     eta_min=1e-6,
        #     T_mult=1,
        #     last_epoch=-1
        # )
        self.scheduler =  OneCycleLR(
            self.optimizer,
            max_lr=1e-4,
            epochs=self.config.EPOCHS,
            steps_per_epoch=len(train_loader),
            pct_start=0.1,
            anneal_strategy="cos",
            final_div_factor=100,
        )

        if from_checkpoint is not None:
            self.model.load_state_dict(torch.load(from_checkpoint, map_location=self.device))

        self.model.to(self.device)
        best_weights, best_preds, best_loss = None, None, float("inf")
        loss_records = {"train": [], "valid": []}

        for epoch in range(self.config.EPOCHS):
            start_epoch = time()

            train_loss, _ = self._train_or_valid_epoch(epoch, train_loader, is_train=True)
            valid_loss, valid_preds = self._train_or_valid_epoch(epoch, valid_loader, is_train=False)

            loss_records["train"].append(train_loss)
            loss_records["valid"].append(valid_loss)

            elapsed = time() - start_epoch

            info = f"{'-' * 100}\nEpoch {epoch + 1} - "
            info += f"Average Loss: (train) {train_loss:.4f}; (valid) {valid_loss:.4f} | Time: {elapsed:.2f}s"
            self.logger.info(info)

            if valid_loss < best_loss:
                best_loss = valid_loss
                best_weights = self.model.state_dict()
                best_preds = valid_preds
                self.logger.info(f"Best model found in epoch {epoch + 1} | valid loss: {best_loss:.4f}")
                self.early_stop_counter = 0
            
            else:
                self.early_stop_counter += 1
                if self.early_stop_counter >= self.early_stop_rounds:
                    self.logger.info(f"Early stopping at epoch {epoch + 1}")
                    break

        return best_weights, best_preds, loss_records

    def _train_or_valid_epoch(self, epoch_id, dataloader, is_train=True):

        self.model.train() if is_train else self.model.eval()
        mode = "Train" if is_train else "Valid"

        len_loader = len(dataloader)
        scaler = GradScaler(enabled=self.config.AMP)
        loss_meter, predicts_record = AverageMeter(), []

        start = time()
        pbar = tqdm(dataloader, total=len(dataloader), unit="batch", desc=f"{mode} [{epoch_id}]")
        for step, (X, y) in enumerate(pbar):
            X, y = X.to(self.device), y.to(self.device)

            if is_train:
                with autocast(enabled=self.config.AMP):
                    y_pred = self.model(X)
                    loss = self.criterion(y_pred, y)
                if self.config.GRADIENT_ACCUMULATION_STEPS > 1:
                    loss = loss / self.config.GRADIENT_ACCUMULATION_STEPS
                scaler.scale(loss).backward()
                grad_norm = torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.config.MAX_GRAD_NORM)
                if (step + 1) % self.config.GRADIENT_ACCUMULATION_STEPS == 0:
                    scaler.step(self.optimizer)
                    scaler.update()
                    self.optimizer.zero_grad()
                    self.scheduler.step()
            else:
                with torch.no_grad():
                    y_pred = self.model(X)
                    loss = self.criterion(y_pred, y, mode='valid')
                if self.config.GRADIENT_ACCUMULATION_STEPS > 1:
                    loss = loss / self.config.GRADIENT_ACCUMULATION_STEPS
                
                predicts_record.append(y_pred.to('cpu').numpy())
            
            loss_meter.update(loss.item(), y.size(0))
            end = time()

            if (step % self.config.PRINT_FREQ == 0) or (step == (len_loader - 1)):
                lr = self.scheduler.get_last_lr()[0]
                info = f"Epoch {epoch_id + 1} [{step}/{len_loader}] | {mode} Loss: {loss_meter.avg:.4f}"
                if is_train:
                    info += f" Grad: {grad_norm:.4f} LR: {lr:.4e}"
                info += f" | Elapse: {end - start:.2f}s"
                print(info)

        if not is_train:
            predicts_record = np.concatenate(predicts_record)
            
        return loss_meter.avg, predicts_record


In [21]:
def train_fold(model, fold_id, train_folds, valid_folds, logger, stage=1, checkpoint=None):

    train_dataset = EEGSeqDataset(train_folds, ModelConfig, ALL_EEG_SIGNALS, mode="train")
    valid_dataset = EEGSeqDataset(valid_folds, ModelConfig, ALL_EEG_SIGNALS, mode="valid")

    # ======== DATALOADERS ==========
    loader_kwargs = {
        "batch_size": ModelConfig.BATCH_SIZE,
        "num_workers": ModelConfig.NUM_WORKERS,
        "pin_memory": True,
        "shuffle": False,
    }

    train_loader = DataLoader(train_dataset, drop_last=True, collate_fn=None, **loader_kwargs)
    valid_loader = DataLoader(valid_dataset, drop_last=False, collate_fn=None, **loader_kwargs)

    if checkpoint is not None:
        print(f"Loading model from checkpoint: {checkpoint}")

    trainer = Trainer(model, ModelConfig, logger)
    best_weights, best_preds, loss_records = trainer.train(
        train_loader, valid_loader, from_checkpoint=checkpoint)

    save_model_name = f"{ModelConfig.MODEL_NAME}_fold_{fold_id}_stage_{stage}.pth"
    torch.save(best_weights, os.path.join(PATHS.OUTPUT_DIR, save_model_name))

    del train_dataset, valid_dataset, train_loader, valid_loader
    torch.cuda.empty_cache()
    gc.collect()

    return best_preds, loss_records

In [22]:
def evaluate_oof(oof_df):
    '''
    Evaluate the out-of-fold dataframe using KL Divergence (torch and kaggle)
    '''
    kl_loss = nn.KLDivLoss(reduction="batchmean")
    labels = torch.tensor(oof_df[TARGETS].values.astype('float32'))
    preds = F.log_softmax(
        torch.tensor(oof_df[TARGETS_PRED].values.astype('float32'), requires_grad=False),
        dim=1
    )
    kl_torch = kl_loss(preds, labels).item()

    return kl_torch

In [23]:
from kl_divergence import score as kaggle_score 
from sklearn.metrics import confusion_matrix
import seaborn as sns

TARGET2ID = {'Seizure': 0, 'LPD': 1, 'GPD': 2, 'LRDA': 3, 'GRDA': 4, 'Other': 5}

def calc_kaggle_score(oof_df):
    submission_df = oof_df[['eeg_id']+TARGETS_PRED].copy()
    submission_df.columns = ['eeg_id'] + TARGETS
    solution_df = oof_df[['eeg_id']+TARGETS].copy()
    return kaggle_score(solution_df, submission_df, 'eeg_id')

def analyze_oof(oof_csv):

    kl_criteria = nn.KLDivLoss(reduction='batchmean')
    softmax = nn.Softmax(dim=1)

    oof_df = pd.read_csv(oof_csv)
    oof_df['target_pred'] = oof_df[TARGETS_PRED].apply(lambda x: np.argmax(x), axis=1)
    oof_df['target_id'] = oof_df[TARGETS].apply(lambda x: np.argmax(x), axis=1)
    
    oof_df["kl_loss"] = oof_df.apply(
    lambda row: 
        kl_criteria(
            F.log_softmax(
                    torch.tensor(row[TARGETS_PRED].values.astype(np.float32)).unsqueeze(0)
                , dim=1
                ), 
            torch.tensor(row[TARGETS].values.astype(np.float32))
            ).numpy(),
    axis=1)

    oof_df["kl_loss"] = oof_df['kl_loss'].astype(np.float32)

    oof_df[TARGETS_PRED] = softmax( torch.tensor(oof_df[TARGETS_PRED].values.astype(np.float32)) )

    oof_df.head()

    return oof_df

In [24]:
def prepare_k_fold(df, k_folds=5):

    kf = KFold(n_splits=k_folds, shuffle=True, random_state=ModelConfig.SEED)
    unique_spec_id = df['spectrogram_id'].unique()
    df['fold'] = k_folds

    for fold, (train_index, valid_index) in enumerate(kf.split(unique_spec_id)):
        df.loc[df['spectrogram_id'].isin(unique_spec_id[valid_index]), 'fold'] = fold

    return df

In [25]:
# Major Train Loop
# ================== Logger ==================
logger.info(f"{'*' * 100}")
logger.info(f"Script Start: {ctime()}")
logger.info(f"Model Configurations:")
for key, value in ModelConfig.__dict__.items():
    if not key.startswith("__"):
        logger.info(f"{key}: {value}")
logger.info(f"{'*' * 100}")

# ================== Prepare Training ==================
oof_stage_1, oof_stage_2 = pd.DataFrame(), pd.DataFrame()
loss_history_1, loss_history_2 = [], []
t_start = time()

K_FOLDS = 5
train_all = prepare_k_fold(train_all, k_folds=K_FOLDS)

for fold in range(0, K_FOLDS):
    tik_total = time()
    tik = time()

    valid_folds = train_all[(train_all['fold'] == fold) ].reset_index(drop=True)
    train_folds = train_all[(train_all['fold'] != fold) ].reset_index(drop=True)
    train_size, valid_size = train_folds.shape[0], valid_folds.shape[0]

    # ================== Stage 1: Train ====================
    # model = ResNetGRU(
    #     kernels=ModelConfig.RESNET_GRU_KERNELS, 
    #     in_channels=8, 
    #     fixed_kernel_size=ModelConfig.RESNET_GRU_FIXED_KERNEL_SIZE,
    #     hidden_size=ModelConfig.RESNET_GRU_HIDDEN_SIZE,
    #     num_classes=6
    #     )
    model = ResNetGRU(config=ModelConfig, num_classes=6)

    ## STAGE 1
    logger.info(f"{'=' * 100}\nFold: {fold}\n{'=' * 100}")
    logger.info(f"- Stage 1 | Train: {train_size}; Valid: {valid_size} -")
    valid_predicts, loss_records = train_fold(
        model, fold, train_folds, valid_folds, logger, stage=1, checkpoint=None)

    loss_history_1.append(loss_records)
    valid_folds[TARGETS_PRED] = valid_predicts
    kl_loss_torch = evaluate_oof(valid_folds)
    info = f"{'=' * 100}\nFold {fold} Valid Loss: {kl_loss_torch}\n"
    info += f"Elapse: {(time() - tik) / 60:.2f} min \n{'=' * 100}"
    logger.info(info)

    oof_stage_1 = pd.concat([oof_stage_1, valid_folds], axis=0).reset_index(drop=True)
    oof_stage_1.to_csv(os.path.join(PATHS.OUTPUT_DIR, f"{ModelConfig.MODEL_NAME}_oof_1.csv"), index=False)

    # ================== Stage 2: Train ====================
    tik = time()
    # model = ResNetGRU(
    #     kernels=ModelConfig.RESNET_GRU_KERNELS, 
    #     in_channels=8, 
    #     fixed_kernel_size=ModelConfig.RESNET_GRU_FIXED_KERNEL_SIZE,
    #     hidden_size=ModelConfig.RESNET_GRU_HIDDEN_SIZE,
    #     num_classes=6
    #     )
    model = ResNetGRU(config=ModelConfig, num_classes=6)
    
    train_folds_2 = train_hard[~train_hard['eeg_id'].isin(valid_folds['eeg_id'])].reset_index(drop=True)
    valid_folds_2 = train_hard[ train_hard['eeg_id'].isin(valid_folds['eeg_id'])].reset_index(drop=True)
    train_size = train_folds_2.shape[0]
    valid_size = valid_folds_2.shape[0]
    
    ## STAGE 2
    logger.info(f"- Stage 2 | Train: {train_size}; Valid: {valid_size} -")

    # model_dir = "/home/shiyi/kaggle_hms/outputs/ResnetGRU_Originalsplit/Reg015"
    # checkpoint = list(Path(model_dir).glob(f"*_fold_{fold}_stage_1.pth"))[0]
    checkpoint = list(Path(PATHS.OUTPUT_DIR).glob(f"{ModelConfig.MODEL_NAME}_fold_{fold}_stage_1.pth"))[0]

    valid_predicts, loss_records = train_fold(
        model, fold, train_folds_2, valid_folds_2, logger, stage=2, checkpoint=checkpoint)
    
    loss_history_2.append(loss_records)
    valid_folds_2[TARGETS_PRED] = valid_predicts
    kl_loss_torch = evaluate_oof(valid_folds_2)
    info = f"{'=' * 100}\nFold {fold} Valid Loss: {kl_loss_torch}\n"
    info += f"Elapse: {(time() - tik) / 60:.2f} min \n{'=' * 100}"
    logger.info(info)

    oof_stage_2 = pd.concat([oof_stage_2, valid_folds_2], axis=0).reset_index(drop=True)
    oof_stage_2.to_csv(os.path.join(PATHS.OUTPUT_DIR, f"{ModelConfig.MODEL_NAME}_oof_2.csv"), index=False)

    logger.info(f"Fold {fold} Elapse: {(time() - tik_total) / 60:.2f} min")

info = f"{'=' * 100}\nTraining Complete!\n"
cv_results_1 = evaluate_oof(oof_stage_1)
cv_results_2 = evaluate_oof(oof_stage_2)
info += f"CV Result: Stage 1: {cv_results_1} | Stage 2: {cv_results_2}\n"
info += f"Elapse: {(time() - t_start) / 60:.2f} min \n{'=' * 100}"
logger.info(info)

****************************************************************************************************
Script Start: Fri Apr  5 14:16:28 2024
Model Configurations:
SEED: 20
SPLIT_ENTROPY: 5.5
MODEL_NAME: ResnetGRU_v1_LB048
MODEL_BACKBONE: reset_gru
BATCH_SIZE: 32
EPOCHS: 20
EARLY_STOP_ROUNDS: 5
GRADIENT_ACCUMULATION_STEPS: 1
DROP_RATE: 0.15
DROP_PATH_RATE: 0.25
WEIGHT_DECAY: 0.01
AMP: True
PRINT_FREQ: 100
NUM_WORKERS: 0
MAX_GRAD_NORM: 10000000.0
REGULARIZATION: 0.15
RESNET_GRU_BANDPASS: None
RESNET_GRU_IN_CHANNELS: 8
RESNET_GRU_KERNELS: [3, 5, 7, 9, 11]
RESNET_GRU_FIXED_KERNEL_SIZE: 5
RESNET_GRU_DOWNSAMPLE: 5
RESNET_GRU_HIDDEN_SIZE: 304
RESNET_GRU_DILATED: False
****************************************************************************************************
Fold: 0
- Stage 1 | Train: 16195; Valid: 3988 -


Train [0]:   0%|          | 0/506 [00:00<?, ?batch/s]

Epoch 1 [0/506] | Train Loss: 1.2684 Grad: 80195.3828 LR: 4.0002e-06 | Elapse: 0.23s
Epoch 1 [100/506] | Train Loss: 1.1780 Grad: 91877.4531 LR: 6.3447e-06 | Elapse: 6.03s
Epoch 1 [200/506] | Train Loss: 1.1851 Grad: 78362.3906 LR: 1.3062e-05 | Elapse: 11.85s
Epoch 1 [300/506] | Train Loss: 1.1739 Grad: 52261.8555 LR: 2.3509e-05 | Elapse: 17.65s
Epoch 1 [400/506] | Train Loss: 1.1569 Grad: 65165.1328 LR: 3.6686e-05 | Elapse: 23.46s
Epoch 1 [500/506] | Train Loss: 1.1345 Grad: 50491.7734 LR: 5.1329e-05 | Elapse: 29.27s
Epoch 1 [505/506] | Train Loss: 1.1335 Grad: 49006.5547 LR: 5.2075e-05 | Elapse: 29.56s


Valid [0]:   0%|          | 0/125 [00:00<?, ?batch/s]

Epoch 1 [0/125] | Valid Loss: 1.2688 | Elapse: 0.07s
Epoch 1 [100/125] | Valid Loss: 1.3073 | Elapse: 5.09s


----------------------------------------------------------------------------------------------------
Epoch 1 - Average Loss: (train) 1.1335; (valid) 1.3002 | Time: 35.85s
Best model found in epoch 1 | valid loss: 1.3002


Epoch 1 [124/125] | Valid Loss: 1.3002 | Elapse: 6.28s


Train [1]:   0%|          | 0/506 [00:00<?, ?batch/s]

Epoch 2 [0/506] | Train Loss: 1.0906 Grad: 53102.0742 LR: 5.2224e-05 | Elapse: 0.06s
Epoch 2 [100/506] | Train Loss: 1.0051 Grad: 114642.4766 LR: 6.6890e-05 | Elapse: 5.93s
Epoch 2 [200/506] | Train Loss: 1.0109 Grad: 54453.7422 LR: 8.0129e-05 | Elapse: 11.80s
Epoch 2 [300/506] | Train Loss: 1.0020 Grad: 49530.3711 LR: 9.0674e-05 | Elapse: 17.65s
Epoch 2 [400/506] | Train Loss: 0.9903 Grad: 42100.6680 LR: 9.7515e-05 | Elapse: 23.48s
Epoch 2 [500/506] | Train Loss: 0.9761 Grad: 49098.9688 LR: 9.9996e-05 | Elapse: 29.30s
Epoch 2 [505/506] | Train Loss: 0.9753 Grad: 72887.2188 LR: 1.0000e-04 | Elapse: 29.59s


Valid [1]:   0%|          | 0/125 [00:00<?, ?batch/s]

Epoch 2 [0/125] | Valid Loss: 0.9886 | Elapse: 0.05s
Epoch 2 [100/125] | Valid Loss: 1.1559 | Elapse: 5.03s


----------------------------------------------------------------------------------------------------
Epoch 2 - Average Loss: (train) 0.9753; (valid) 1.1520 | Time: 35.80s
Best model found in epoch 2 | valid loss: 1.1520


Epoch 2 [124/125] | Valid Loss: 1.1520 | Elapse: 6.21s


Train [2]:   0%|          | 0/506 [00:00<?, ?batch/s]

Epoch 3 [0/506] | Train Loss: 1.0148 Grad: 54527.7500 LR: 1.0000e-04 | Elapse: 0.06s
Epoch 3 [100/506] | Train Loss: 0.8859 Grad: 143155.5938 LR: 9.9969e-05 | Elapse: 5.90s
Epoch 3 [200/506] | Train Loss: 0.8889 Grad: 76853.5703 LR: 9.9879e-05 | Elapse: 11.75s
Epoch 3 [300/506] | Train Loss: 0.8759 Grad: 64058.4180 LR: 9.9729e-05 | Elapse: 17.59s
Epoch 3 [400/506] | Train Loss: 0.8652 Grad: 101814.7109 LR: 9.9520e-05 | Elapse: 23.43s
Epoch 3 [500/506] | Train Loss: 0.8544 Grad: 112661.5781 LR: 9.9253e-05 | Elapse: 29.33s
Epoch 3 [505/506] | Train Loss: 0.8536 Grad: 172687.9688 LR: 9.9238e-05 | Elapse: 29.62s


Valid [2]:   0%|          | 0/125 [00:00<?, ?batch/s]

Epoch 3 [0/125] | Valid Loss: 0.8720 | Elapse: 0.05s
Epoch 3 [100/125] | Valid Loss: 1.0324 | Elapse: 5.06s


----------------------------------------------------------------------------------------------------
Epoch 3 - Average Loss: (train) 0.8536; (valid) 1.0284 | Time: 35.88s
Best model found in epoch 3 | valid loss: 1.0284


Epoch 3 [124/125] | Valid Loss: 1.0284 | Elapse: 6.25s


Train [3]:   0%|          | 0/506 [00:00<?, ?batch/s]

Epoch 4 [0/506] | Train Loss: 0.9465 Grad: 107741.2969 LR: 9.9235e-05 | Elapse: 0.06s
Epoch 4 [100/506] | Train Loss: 0.7908 Grad: 168054.0625 LR: 9.8905e-05 | Elapse: 5.92s
Epoch 4 [200/506] | Train Loss: 0.7956 Grad: 95452.7344 LR: 9.8517e-05 | Elapse: 11.76s
Epoch 4 [300/506] | Train Loss: 0.7864 Grad: 108433.1328 LR: 9.8071e-05 | Elapse: 17.64s
Epoch 4 [400/506] | Train Loss: 0.7799 Grad: 142799.2656 LR: 9.7569e-05 | Elapse: 23.51s
Epoch 4 [500/506] | Train Loss: 0.7705 Grad: 185971.6250 LR: 9.7009e-05 | Elapse: 29.36s
Epoch 4 [505/506] | Train Loss: 0.7697 Grad: 243632.0938 LR: 9.6980e-05 | Elapse: 29.66s


Valid [3]:   0%|          | 0/125 [00:00<?, ?batch/s]

Epoch 4 [0/125] | Valid Loss: 0.7766 | Elapse: 0.06s
Epoch 4 [100/125] | Valid Loss: 0.9451 | Elapse: 5.05s


----------------------------------------------------------------------------------------------------
Epoch 4 - Average Loss: (train) 0.7697; (valid) 0.9418 | Time: 35.89s
Best model found in epoch 4 | valid loss: 0.9418


Epoch 4 [124/125] | Valid Loss: 0.9418 | Elapse: 6.23s


Train [4]:   0%|          | 0/506 [00:00<?, ?batch/s]

Epoch 5 [0/506] | Train Loss: 0.8508 Grad: 142093.4844 LR: 9.6974e-05 | Elapse: 0.06s
Epoch 5 [100/506] | Train Loss: 0.7134 Grad: 200724.8281 LR: 9.6355e-05 | Elapse: 5.92s
Epoch 5 [200/506] | Train Loss: 0.7141 Grad: 124650.0859 LR: 9.5682e-05 | Elapse: 11.78s
Epoch 5 [300/506] | Train Loss: 0.7060 Grad: 171952.1875 LR: 9.4954e-05 | Elapse: 17.64s
Epoch 5 [400/506] | Train Loss: 0.7010 Grad: 192114.5469 LR: 9.4172e-05 | Elapse: 23.44s
Epoch 5 [500/506] | Train Loss: 0.6930 Grad: 230700.6406 LR: 9.3338e-05 | Elapse: 29.22s
Epoch 5 [505/506] | Train Loss: 0.6923 Grad: 306685.9062 LR: 9.3295e-05 | Elapse: 29.51s


Valid [4]:   0%|          | 0/125 [00:00<?, ?batch/s]

Epoch 5 [0/125] | Valid Loss: 0.6830 | Elapse: 0.05s
Epoch 5 [100/125] | Valid Loss: 0.8775 | Elapse: 4.99s


----------------------------------------------------------------------------------------------------
Epoch 5 - Average Loss: (train) 0.6923; (valid) 0.8740 | Time: 35.68s
Best model found in epoch 5 | valid loss: 0.8740


Epoch 5 [124/125] | Valid Loss: 0.8740 | Elapse: 6.17s


Train [5]:   0%|          | 0/506 [00:00<?, ?batch/s]

Epoch 6 [0/506] | Train Loss: 0.7521 Grad: 217481.0156 LR: 9.3287e-05 | Elapse: 0.06s
Epoch 6 [100/506] | Train Loss: 0.6364 Grad: 214858.6406 LR: 9.2398e-05 | Elapse: 5.90s
Epoch 6 [200/506] | Train Loss: 0.6395 Grad: 179513.4375 LR: 9.1459e-05 | Elapse: 11.74s
Epoch 6 [300/506] | Train Loss: 0.6331 Grad: 243579.5781 LR: 9.0471e-05 | Elapse: 17.55s
Epoch 6 [400/506] | Train Loss: 0.6307 Grad: 178296.9688 LR: 8.9434e-05 | Elapse: 23.34s
Epoch 6 [500/506] | Train Loss: 0.6247 Grad: 282665.1562 LR: 8.8351e-05 | Elapse: 29.17s
Epoch 6 [505/506] | Train Loss: 0.6240 Grad: 348618.1562 LR: 8.8296e-05 | Elapse: 29.45s


Valid [5]:   0%|          | 0/125 [00:00<?, ?batch/s]

Epoch 6 [0/125] | Valid Loss: 0.6522 | Elapse: 0.06s
Epoch 6 [100/125] | Valid Loss: 0.8300 | Elapse: 4.94s


----------------------------------------------------------------------------------------------------
Epoch 6 - Average Loss: (train) 0.6240; (valid) 0.8277 | Time: 35.55s
Best model found in epoch 6 | valid loss: 0.8277


Epoch 6 [124/125] | Valid Loss: 0.8277 | Elapse: 6.10s


Train [6]:   0%|          | 0/506 [00:00<?, ?batch/s]

Epoch 7 [0/506] | Train Loss: 0.6522 Grad: 261430.0000 LR: 8.8285e-05 | Elapse: 0.06s
Epoch 7 [100/506] | Train Loss: 0.5753 Grad: 273790.9375 LR: 8.7153e-05 | Elapse: 5.83s
Epoch 7 [200/506] | Train Loss: 0.5823 Grad: 232632.1719 LR: 8.5977e-05 | Elapse: 11.55s
Epoch 7 [300/506] | Train Loss: 0.5779 Grad: 312131.6875 LR: 8.4759e-05 | Elapse: 17.34s
Epoch 7 [400/506] | Train Loss: 0.5773 Grad: 207232.3750 LR: 8.3499e-05 | Elapse: 23.13s
Epoch 7 [500/506] | Train Loss: 0.5733 Grad: 336507.2500 LR: 8.2199e-05 | Elapse: 28.98s
Epoch 7 [505/506] | Train Loss: 0.5726 Grad: 408953.1562 LR: 8.2133e-05 | Elapse: 29.27s


Valid [6]:   0%|          | 0/125 [00:00<?, ?batch/s]

Epoch 7 [0/125] | Valid Loss: 0.6340 | Elapse: 0.05s
Epoch 7 [100/125] | Valid Loss: 0.8045 | Elapse: 5.04s


----------------------------------------------------------------------------------------------------
Epoch 7 - Average Loss: (train) 0.5726; (valid) 0.8025 | Time: 35.48s
Best model found in epoch 7 | valid loss: 0.8025


Epoch 7 [124/125] | Valid Loss: 0.8025 | Elapse: 6.21s


Train [7]:   0%|          | 0/506 [00:00<?, ?batch/s]

Epoch 8 [0/506] | Train Loss: 0.5914 Grad: 293529.8438 LR: 8.2120e-05 | Elapse: 0.06s
Epoch 8 [100/506] | Train Loss: 0.5344 Grad: 408541.4062 LR: 8.0780e-05 | Elapse: 5.87s
Epoch 8 [200/506] | Train Loss: 0.5428 Grad: 296973.5000 LR: 7.9403e-05 | Elapse: 11.70s
Epoch 8 [300/506] | Train Loss: 0.5387 Grad: 318389.6250 LR: 7.7991e-05 | Elapse: 17.51s
Epoch 8 [400/506] | Train Loss: 0.5389 Grad: 249331.1094 LR: 7.6546e-05 | Elapse: 23.32s
Epoch 8 [500/506] | Train Loss: 0.5360 Grad: 399929.4688 LR: 7.5070e-05 | Elapse: 29.14s
Epoch 8 [505/506] | Train Loss: 0.5354 Grad: 449000.7812 LR: 7.4995e-05 | Elapse: 29.43s


Valid [7]:   0%|          | 0/125 [00:00<?, ?batch/s]

Epoch 8 [0/125] | Valid Loss: 0.6372 | Elapse: 0.05s
Epoch 8 [100/125] | Valid Loss: 0.7918 | Elapse: 4.98s


----------------------------------------------------------------------------------------------------
Epoch 8 - Average Loss: (train) 0.5354; (valid) 0.7886 | Time: 35.57s
Best model found in epoch 8 | valid loss: 0.7886


Epoch 8 [124/125] | Valid Loss: 0.7886 | Elapse: 6.13s


Train [8]:   0%|          | 0/506 [00:00<?, ?batch/s]

Epoch 9 [0/506] | Train Loss: 0.5614 Grad: 322659.9375 LR: 7.4980e-05 | Elapse: 0.06s
Epoch 9 [100/506] | Train Loss: 0.5018 Grad: 373975.7188 LR: 7.3472e-05 | Elapse: 5.89s
Epoch 9 [200/506] | Train Loss: 0.5112 Grad: 321378.5625 LR: 7.1936e-05 | Elapse: 11.69s
Epoch 9 [300/506] | Train Loss: 0.5073 Grad: 355235.1875 LR: 7.0374e-05 | Elapse: 17.51s
Epoch 9 [400/506] | Train Loss: 0.5073 Grad: 301291.4062 LR: 6.8788e-05 | Elapse: 23.32s
Epoch 9 [500/506] | Train Loss: 0.5052 Grad: 454428.0000 LR: 6.7179e-05 | Elapse: 29.16s
Epoch 9 [505/506] | Train Loss: 0.5045 Grad: 425457.8750 LR: 6.7098e-05 | Elapse: 29.45s


Valid [8]:   0%|          | 0/125 [00:00<?, ?batch/s]

Epoch 9 [0/125] | Valid Loss: 0.6402 | Elapse: 0.06s
Epoch 9 [100/125] | Valid Loss: 0.7847 | Elapse: 5.01s


----------------------------------------------------------------------------------------------------
Epoch 9 - Average Loss: (train) 0.5045; (valid) 0.7813 | Time: 35.64s
Best model found in epoch 9 | valid loss: 0.7813


Epoch 9 [124/125] | Valid Loss: 0.7813 | Elapse: 6.18s


Train [9]:   0%|          | 0/506 [00:00<?, ?batch/s]

Epoch 10 [0/506] | Train Loss: 0.5404 Grad: 351547.0312 LR: 6.7082e-05 | Elapse: 0.06s
Epoch 10 [100/506] | Train Loss: 0.4766 Grad: 345213.4375 LR: 6.5452e-05 | Elapse: 5.87s
Epoch 10 [200/506] | Train Loss: 0.4845 Grad: 323537.2500 LR: 6.3803e-05 | Elapse: 11.68s
Epoch 10 [300/506] | Train Loss: 0.4804 Grad: 345002.1875 LR: 6.2138e-05 | Elapse: 17.48s
Epoch 10 [400/506] | Train Loss: 0.4806 Grad: 323527.0625 LR: 6.0459e-05 | Elapse: 23.29s
Epoch 10 [500/506] | Train Loss: 0.4794 Grad: 480301.7188 LR: 5.8767e-05 | Elapse: 29.10s
Epoch 10 [505/506] | Train Loss: 0.4788 Grad: 420606.3750 LR: 5.8682e-05 | Elapse: 29.39s


Valid [9]:   0%|          | 0/125 [00:00<?, ?batch/s]

Epoch 10 [0/125] | Valid Loss: 0.6448 | Elapse: 0.05s
Epoch 10 [100/125] | Valid Loss: 0.7819 | Elapse: 4.94s


----------------------------------------------------------------------------------------------------
Epoch 10 - Average Loss: (train) 0.4788; (valid) 0.7780 | Time: 35.50s
Best model found in epoch 10 | valid loss: 0.7780


Epoch 10 [124/125] | Valid Loss: 0.7780 | Elapse: 6.11s


Train [10]:   0%|          | 0/506 [00:00<?, ?batch/s]

Epoch 11 [0/506] | Train Loss: 0.5322 Grad: 384572.7500 LR: 5.8665e-05 | Elapse: 0.06s
Epoch 11 [100/506] | Train Loss: 0.4544 Grad: 477156.4062 LR: 5.6962e-05 | Elapse: 5.90s
Epoch 11 [200/506] | Train Loss: 0.4614 Grad: 327621.5000 LR: 5.5251e-05 | Elapse: 11.68s
Epoch 11 [300/506] | Train Loss: 0.4578 Grad: 344887.4688 LR: 5.3534e-05 | Elapse: 17.47s
Epoch 11 [400/506] | Train Loss: 0.4579 Grad: 358464.0000 LR: 5.1813e-05 | Elapse: 23.24s
Epoch 11 [500/506] | Train Loss: 0.4573 Grad: 448427.0312 LR: 5.0089e-05 | Elapse: 29.05s
Epoch 11 [505/506] | Train Loss: 0.4567 Grad: 392904.2812 LR: 5.0003e-05 | Elapse: 29.34s


Valid [10]:   0%|          | 0/125 [00:00<?, ?batch/s]

Epoch 11 [0/125] | Valid Loss: 0.6481 | Elapse: 0.05s
Epoch 11 [100/125] | Valid Loss: 0.7784 | Elapse: 4.96s


----------------------------------------------------------------------------------------------------
Epoch 11 - Average Loss: (train) 0.4567; (valid) 0.7743 | Time: 35.50s
Best model found in epoch 11 | valid loss: 0.7743


Epoch 11 [124/125] | Valid Loss: 0.7743 | Elapse: 6.15s


Train [11]:   0%|          | 0/506 [00:00<?, ?batch/s]

Epoch 12 [0/506] | Train Loss: 0.5248 Grad: 403977.9375 LR: 4.9986e-05 | Elapse: 0.06s
Epoch 12 [100/506] | Train Loss: 0.4366 Grad: 469689.9062 LR: 4.8262e-05 | Elapse: 5.94s
Epoch 12 [200/506] | Train Loss: 0.4429 Grad: 161050.9531 LR: 4.6540e-05 | Elapse: 11.84s
Epoch 12 [300/506] | Train Loss: 0.4395 Grad: 189595.3125 LR: 4.4823e-05 | Elapse: 17.72s
Epoch 12 [400/506] | Train Loss: 0.4396 Grad: 187260.0781 LR: 4.3112e-05 | Elapse: 23.57s
Epoch 12 [500/506] | Train Loss: 0.4389 Grad: 229485.7188 LR: 4.1409e-05 | Elapse: 29.46s
Epoch 12 [505/506] | Train Loss: 0.4384 Grad: 218033.7500 LR: 4.1324e-05 | Elapse: 29.76s


Valid [11]:   0%|          | 0/125 [00:00<?, ?batch/s]

Epoch 12 [0/125] | Valid Loss: 0.6560 | Elapse: 0.06s
Epoch 12 [100/125] | Valid Loss: 0.7758 | Elapse: 5.09s


----------------------------------------------------------------------------------------------------
Epoch 12 - Average Loss: (train) 0.4384; (valid) 0.7719 | Time: 36.03s
Best model found in epoch 12 | valid loss: 0.7719


Epoch 12 [124/125] | Valid Loss: 0.7719 | Elapse: 6.28s


Train [12]:   0%|          | 0/506 [00:00<?, ?batch/s]

Epoch 13 [0/506] | Train Loss: 0.5095 Grad: 417216.2188 LR: 4.1307e-05 | Elapse: 0.06s
Epoch 13 [100/506] | Train Loss: 0.4227 Grad: 483879.3125 LR: 3.9615e-05 | Elapse: 5.95s
Epoch 13 [200/506] | Train Loss: 0.4280 Grad: 162385.4844 LR: 3.7935e-05 | Elapse: 11.83s
Epoch 13 [300/506] | Train Loss: 0.4248 Grad: 190978.4375 LR: 3.6270e-05 | Elapse: 17.68s
Epoch 13 [400/506] | Train Loss: 0.4246 Grad: 192297.6719 LR: 3.4621e-05 | Elapse: 23.56s
Epoch 13 [500/506] | Train Loss: 0.4239 Grad: 245336.6250 LR: 3.2991e-05 | Elapse: 29.44s
Epoch 13 [505/506] | Train Loss: 0.4235 Grad: 219295.6875 LR: 3.2910e-05 | Elapse: 29.74s


Valid [12]:   0%|          | 0/125 [00:00<?, ?batch/s]

Epoch 13 [0/125] | Valid Loss: 0.6573 | Elapse: 0.06s
Epoch 13 [100/125] | Valid Loss: 0.7787 | Elapse: 5.09s


----------------------------------------------------------------------------------------------------
Epoch 13 - Average Loss: (train) 0.4235; (valid) 0.7743 | Time: 36.03s


Epoch 13 [124/125] | Valid Loss: 0.7743 | Elapse: 6.29s


Train [13]:   0%|          | 0/506 [00:00<?, ?batch/s]

Epoch 14 [0/506] | Train Loss: 0.4937 Grad: 413178.5625 LR: 3.2893e-05 | Elapse: 0.06s
Epoch 14 [100/506] | Train Loss: 0.4089 Grad: 442404.2500 LR: 3.1284e-05 | Elapse: 5.93s
Epoch 14 [200/506] | Train Loss: 0.4152 Grad: 161424.9219 LR: 2.9698e-05 | Elapse: 11.81s
Epoch 14 [300/506] | Train Loss: 0.4117 Grad: 195883.5469 LR: 2.8135e-05 | Elapse: 17.69s
Epoch 14 [400/506] | Train Loss: 0.4118 Grad: 203452.6719 LR: 2.6598e-05 | Elapse: 23.56s
Epoch 14 [500/506] | Train Loss: 0.4114 Grad: 261012.3281 LR: 2.5090e-05 | Elapse: 29.41s
Epoch 14 [505/506] | Train Loss: 0.4110 Grad: 250546.7031 LR: 2.5015e-05 | Elapse: 29.70s


Valid [13]:   0%|          | 0/125 [00:00<?, ?batch/s]

Epoch 14 [0/125] | Valid Loss: 0.6742 | Elapse: 0.05s
Epoch 14 [100/125] | Valid Loss: 0.7860 | Elapse: 4.97s


----------------------------------------------------------------------------------------------------
Epoch 14 - Average Loss: (train) 0.4110; (valid) 0.7817 | Time: 35.85s


Epoch 14 [124/125] | Valid Loss: 0.7817 | Elapse: 6.14s


Train [14]:   0%|          | 0/506 [00:00<?, ?batch/s]

Epoch 15 [0/506] | Train Loss: 0.4836 Grad: 437445.1875 LR: 2.5000e-05 | Elapse: 0.06s
Epoch 15 [100/506] | Train Loss: 0.3984 Grad: 448795.0312 LR: 2.3523e-05 | Elapse: 5.87s
Epoch 15 [200/506] | Train Loss: 0.4055 Grad: 164959.5156 LR: 2.2077e-05 | Elapse: 11.66s
Epoch 15 [300/506] | Train Loss: 0.4018 Grad: 201572.6719 LR: 2.0665e-05 | Elapse: 17.44s
Epoch 15 [400/506] | Train Loss: 0.4020 Grad: 199329.3125 LR: 1.9287e-05 | Elapse: 23.20s
Epoch 15 [500/506] | Train Loss: 0.4016 Grad: 265736.7500 LR: 1.7946e-05 | Elapse: 28.95s
Epoch 15 [505/506] | Train Loss: 0.4013 Grad: 250533.9531 LR: 1.7880e-05 | Elapse: 29.25s


Valid [14]:   0%|          | 0/125 [00:00<?, ?batch/s]

Epoch 15 [0/125] | Valid Loss: 0.6668 | Elapse: 0.05s
Epoch 15 [100/125] | Valid Loss: 0.7859 | Elapse: 4.98s


----------------------------------------------------------------------------------------------------
Epoch 15 - Average Loss: (train) 0.4013; (valid) 0.7819 | Time: 35.43s


Epoch 15 [124/125] | Valid Loss: 0.7819 | Elapse: 6.18s


Train [15]:   0%|          | 0/506 [00:00<?, ?batch/s]

Epoch 16 [0/506] | Train Loss: 0.4693 Grad: 435731.2500 LR: 1.7867e-05 | Elapse: 0.06s
Epoch 16 [100/506] | Train Loss: 0.3906 Grad: 418348.7188 LR: 1.6567e-05 | Elapse: 5.95s
Epoch 16 [200/506] | Train Loss: 0.3980 Grad: 187387.9062 LR: 1.5306e-05 | Elapse: 11.83s
Epoch 16 [300/506] | Train Loss: 0.3941 Grad: 209733.6094 LR: 1.4087e-05 | Elapse: 17.70s
Epoch 16 [400/506] | Train Loss: 0.3944 Grad: 206978.6719 LR: 1.2910e-05 | Elapse: 23.59s
Epoch 16 [500/506] | Train Loss: 0.3942 Grad: 268343.4062 LR: 1.1777e-05 | Elapse: 29.45s
Epoch 16 [505/506] | Train Loss: 0.3939 Grad: 232819.5312 LR: 1.1722e-05 | Elapse: 29.74s


Valid [15]:   0%|          | 0/125 [00:00<?, ?batch/s]

Epoch 16 [0/125] | Valid Loss: 0.6539 | Elapse: 0.06s
Epoch 16 [100/125] | Valid Loss: 0.7808 | Elapse: 5.08s


----------------------------------------------------------------------------------------------------
Epoch 16 - Average Loss: (train) 0.3939; (valid) 0.7772 | Time: 36.00s


Epoch 16 [124/125] | Valid Loss: 0.7772 | Elapse: 6.26s


Train [16]:   0%|          | 0/506 [00:00<?, ?batch/s]

Epoch 17 [0/506] | Train Loss: 0.4542 Grad: 437523.4688 LR: 1.1711e-05 | Elapse: 0.06s
Epoch 17 [100/506] | Train Loss: 0.3843 Grad: 429806.3750 LR: 1.0627e-05 | Elapse: 5.86s
Epoch 17 [200/506] | Train Loss: 0.3917 Grad: 366589.3750 LR: 9.5894e-06 | Elapse: 11.71s
Epoch 17 [300/506] | Train Loss: 0.3880 Grad: 434062.6875 LR: 8.6001e-06 | Elapse: 17.55s
Epoch 17 [400/506] | Train Loss: 0.3885 Grad: 399524.8438 LR: 7.6602e-06 | Elapse: 23.39s
Epoch 17 [500/506] | Train Loss: 0.3887 Grad: 521177.2812 LR: 6.7706e-06 | Elapse: 29.20s
Epoch 17 [505/506] | Train Loss: 0.3883 Grad: 404653.5625 LR: 6.7274e-06 | Elapse: 29.50s


Valid [16]:   0%|          | 0/125 [00:00<?, ?batch/s]

Epoch 17 [0/125] | Valid Loss: 0.6332 | Elapse: 0.06s
Epoch 17 [100/125] | Valid Loss: 0.7701 | Elapse: 5.05s


----------------------------------------------------------------------------------------------------
Epoch 17 - Average Loss: (train) 0.3883; (valid) 0.7670 | Time: 35.73s
Best model found in epoch 17 | valid loss: 0.7670


Epoch 17 [124/125] | Valid Loss: 0.7670 | Elapse: 6.23s


Train [17]:   0%|          | 0/506 [00:00<?, ?batch/s]

Epoch 18 [0/506] | Train Loss: 0.4415 Grad: 453314.6562 LR: 6.7188e-06 | Elapse: 0.06s
Epoch 18 [100/506] | Train Loss: 0.3795 Grad: 460740.4375 LR: 5.8838e-06 | Elapse: 5.93s
Epoch 18 [200/506] | Train Loss: 0.3865 Grad: 346845.5312 LR: 5.1013e-06 | Elapse: 11.81s
Epoch 18 [300/506] | Train Loss: 0.3831 Grad: 429666.3438 LR: 4.3722e-06 | Elapse: 17.65s
Epoch 18 [400/506] | Train Loss: 0.3838 Grad: 395700.4688 LR: 3.6975e-06 | Elapse: 23.53s
Epoch 18 [500/506] | Train Loss: 0.3842 Grad: 495364.7812 LR: 3.0778e-06 | Elapse: 29.45s
Epoch 18 [505/506] | Train Loss: 0.3838 Grad: 374180.7500 LR: 3.0483e-06 | Elapse: 29.75s


Valid [17]:   0%|          | 0/125 [00:00<?, ?batch/s]

Epoch 18 [0/125] | Valid Loss: 0.6213 | Elapse: 0.06s
Epoch 18 [100/125] | Valid Loss: 0.7641 | Elapse: 5.13s


----------------------------------------------------------------------------------------------------
Epoch 18 - Average Loss: (train) 0.3838; (valid) 0.7610 | Time: 36.07s
Best model found in epoch 18 | valid loss: 0.7610


Epoch 18 [124/125] | Valid Loss: 0.7610 | Elapse: 6.32s


Train [18]:   0%|          | 0/506 [00:00<?, ?batch/s]

Epoch 19 [0/506] | Train Loss: 0.4324 Grad: 466297.4375 LR: 3.0424e-06 | Elapse: 0.06s
Epoch 19 [100/506] | Train Loss: 0.3765 Grad: 486212.8438 LR: 2.4819e-06 | Elapse: 5.96s
Epoch 19 [200/506] | Train Loss: 0.3827 Grad: 342458.9688 LR: 1.9780e-06 | Elapse: 11.86s
Epoch 19 [300/506] | Train Loss: 0.3794 Grad: 421612.6875 LR: 1.5313e-06 | Elapse: 17.71s
Epoch 19 [400/506] | Train Loss: 0.3804 Grad: 402213.8750 LR: 1.1422e-06 | Elapse: 23.46s
Epoch 19 [500/506] | Train Loss: 0.3809 Grad: 475659.3125 LR: 8.1133e-07 | Elapse: 29.26s
Epoch 19 [505/506] | Train Loss: 0.3806 Grad: 366938.5312 LR: 7.9632e-07 | Elapse: 29.55s


Valid [18]:   0%|          | 0/125 [00:00<?, ?batch/s]

Epoch 19 [0/125] | Valid Loss: 0.6210 | Elapse: 0.05s
Epoch 19 [100/125] | Valid Loss: 0.7642 | Elapse: 4.96s


----------------------------------------------------------------------------------------------------
Epoch 19 - Average Loss: (train) 0.3806; (valid) 0.7608 | Time: 35.67s
Best model found in epoch 19 | valid loss: 0.7608


Epoch 19 [124/125] | Valid Loss: 0.7608 | Elapse: 6.12s


Train [19]:   0%|          | 0/506 [00:00<?, ?batch/s]

Epoch 20 [0/506] | Train Loss: 0.4238 Grad: 457406.4375 LR: 7.9333e-07 | Elapse: 0.06s
Epoch 20 [100/506] | Train Loss: 0.3742 Grad: 483715.2500 LR: 5.2448e-07 | Elapse: 5.84s
Epoch 20 [200/506] | Train Loss: 0.3802 Grad: 169096.0312 LR: 3.1452e-07 | Elapse: 11.61s
Epoch 20 [300/506] | Train Loss: 0.3771 Grad: 210287.4844 LR: 1.6368e-07 | Elapse: 17.38s
Epoch 20 [400/506] | Train Loss: 0.3784 Grad: 199132.4531 LR: 7.2154e-08 | Elapse: 23.21s
Epoch 20 [500/506] | Train Loss: 0.3790 Grad: 234816.3750 LR: 4.0048e-08 | Elapse: 29.08s
Epoch 20 [505/506] | Train Loss: 0.3787 Grad: 185101.1875 LR: 4.0003e-08 | Elapse: 29.37s


Valid [19]:   0%|          | 0/125 [00:00<?, ?batch/s]

Epoch 20 [0/125] | Valid Loss: 0.6222 | Elapse: 0.05s
Epoch 20 [100/125] | Valid Loss: 0.7651 | Elapse: 5.05s


----------------------------------------------------------------------------------------------------
Epoch 20 - Average Loss: (train) 0.3787; (valid) 0.7616 | Time: 35.62s
Fold 0 Valid Loss: 0.7607576847076416
Elapse: 11.92 min 
- Stage 2 | Train: 5215; Valid: 1277 -


Epoch 20 [124/125] | Valid Loss: 0.7616 | Elapse: 6.24s
Loading model from checkpoint: outputs/ResnetGRU_v1_LB048_fold_0_stage_1.pth


Train [0]:   0%|          | 0/162 [00:00<?, ?batch/s]

Epoch 1 [0/162] | Train Loss: 0.3661 Grad: 593422.6250 LR: 4.0023e-06 | Elapse: 0.07s
Epoch 1 [100/162] | Train Loss: 0.3350 Grad: 254626.6562 LR: 2.5357e-05 | Elapse: 5.95s
Epoch 1 [161/162] | Train Loss: 0.3279 Grad: 330593.0000 LR: 5.2233e-05 | Elapse: 9.54s


Valid [0]:   0%|          | 0/40 [00:00<?, ?batch/s]

Epoch 1 [0/40] | Valid Loss: 0.4260 | Elapse: 0.06s


----------------------------------------------------------------------------------------------------
Epoch 1 - Average Loss: (train) 0.3279; (valid) 0.5163 | Time: 11.56s
Best model found in epoch 1 | valid loss: 0.5163


Epoch 1 [39/40] | Valid Loss: 0.5163 | Elapse: 2.01s


Train [1]:   0%|          | 0/162 [00:00<?, ?batch/s]

Epoch 2 [0/162] | Train Loss: 0.2025 Grad: 477145.7188 LR: 5.2700e-05 | Elapse: 0.06s
Epoch 2 [100/162] | Train Loss: 0.2761 Grad: 396974.6875 LR: 9.2056e-05 | Elapse: 5.93s
Epoch 2 [161/162] | Train Loss: 0.2713 Grad: 303789.8750 LR: 1.0000e-04 | Elapse: 9.47s


Valid [1]:   0%|          | 0/40 [00:00<?, ?batch/s]

Epoch 2 [0/40] | Valid Loss: 0.4043 | Elapse: 0.06s


----------------------------------------------------------------------------------------------------
Epoch 2 - Average Loss: (train) 0.2713; (valid) 0.4910 | Time: 11.50s
Best model found in epoch 2 | valid loss: 0.4910


Epoch 2 [39/40] | Valid Loss: 0.4910 | Elapse: 2.03s


Train [2]:   0%|          | 0/162 [00:00<?, ?batch/s]

Epoch 3 [0/162] | Train Loss: 0.1622 Grad: 363285.7812 LR: 1.0000e-04 | Elapse: 0.06s
Epoch 3 [100/162] | Train Loss: 0.2351 Grad: 333028.8750 LR: 9.9699e-05 | Elapse: 5.85s
Epoch 3 [161/162] | Train Loss: 0.2309 Grad: 434766.6250 LR: 9.9231e-05 | Elapse: 9.37s


Valid [2]:   0%|          | 0/40 [00:00<?, ?batch/s]

Epoch 3 [0/40] | Valid Loss: 0.3919 | Elapse: 0.05s


----------------------------------------------------------------------------------------------------
Epoch 3 - Average Loss: (train) 0.2309; (valid) 0.4788 | Time: 11.33s
Best model found in epoch 3 | valid loss: 0.4788


Epoch 3 [39/40] | Valid Loss: 0.4788 | Elapse: 1.96s


Train [3]:   0%|          | 0/162 [00:00<?, ?batch/s]

Epoch 4 [0/162] | Train Loss: 0.1403 Grad: 319835.6562 LR: 9.9222e-05 | Elapse: 0.06s
Epoch 4 [100/162] | Train Loss: 0.2065 Grad: 320664.2188 LR: 9.7992e-05 | Elapse: 5.85s
Epoch 4 [161/162] | Train Loss: 0.2033 Grad: 415324.2188 LR: 9.6967e-05 | Elapse: 9.37s


Valid [3]:   0%|          | 0/40 [00:00<?, ?batch/s]

Epoch 4 [0/40] | Valid Loss: 0.3897 | Elapse: 0.06s


----------------------------------------------------------------------------------------------------
Epoch 4 - Average Loss: (train) 0.2033; (valid) 0.4760 | Time: 11.34s
Best model found in epoch 4 | valid loss: 0.4760


Epoch 4 [39/40] | Valid Loss: 0.4760 | Elapse: 1.97s


Train [4]:   0%|          | 0/162 [00:00<?, ?batch/s]

Epoch 5 [0/162] | Train Loss: 0.1277 Grad: 322941.9062 LR: 9.6949e-05 | Elapse: 0.06s
Epoch 5 [100/162] | Train Loss: 0.1873 Grad: 307812.0312 LR: 9.4828e-05 | Elapse: 5.80s
Epoch 5 [161/162] | Train Loss: 0.1846 Grad: 446240.5938 LR: 9.3277e-05 | Elapse: 9.30s


Valid [4]:   0%|          | 0/40 [00:00<?, ?batch/s]

Epoch 5 [0/40] | Valid Loss: 0.3986 | Elapse: 0.05s


----------------------------------------------------------------------------------------------------
Epoch 5 - Average Loss: (train) 0.1846; (valid) 0.4762 | Time: 11.26s


Epoch 5 [39/40] | Valid Loss: 0.4762 | Elapse: 1.96s


Train [5]:   0%|          | 0/162 [00:00<?, ?batch/s]

Epoch 6 [0/162] | Train Loss: 0.1088 Grad: 294581.9688 LR: 9.3250e-05 | Elapse: 0.06s
Epoch 6 [100/162] | Train Loss: 0.1724 Grad: 299174.4375 LR: 9.0302e-05 | Elapse: 5.87s
Epoch 6 [161/162] | Train Loss: 0.1698 Grad: 399720.8750 LR: 8.8272e-05 | Elapse: 9.41s


Valid [5]:   0%|          | 0/40 [00:00<?, ?batch/s]

Epoch 6 [0/40] | Valid Loss: 0.3920 | Elapse: 0.05s


----------------------------------------------------------------------------------------------------
Epoch 6 - Average Loss: (train) 0.1698; (valid) 0.4816 | Time: 11.40s


Epoch 6 [39/40] | Valid Loss: 0.4816 | Elapse: 1.98s


Train [6]:   0%|          | 0/162 [00:00<?, ?batch/s]

Epoch 7 [0/162] | Train Loss: 0.1028 Grad: 323307.8438 LR: 8.8238e-05 | Elapse: 0.06s
Epoch 7 [100/162] | Train Loss: 0.1581 Grad: 291224.8750 LR: 8.4553e-05 | Elapse: 5.88s
Epoch 7 [161/162] | Train Loss: 0.1550 Grad: 366607.1875 LR: 8.2105e-05 | Elapse: 9.43s


Valid [6]:   0%|          | 0/40 [00:00<?, ?batch/s]

Epoch 7 [0/40] | Valid Loss: 0.3935 | Elapse: 0.06s


----------------------------------------------------------------------------------------------------
Epoch 7 - Average Loss: (train) 0.1550; (valid) 0.4796 | Time: 11.42s


Epoch 7 [39/40] | Valid Loss: 0.4796 | Elapse: 1.99s


Train [7]:   0%|          | 0/162 [00:00<?, ?batch/s]

Epoch 8 [0/162] | Train Loss: 0.0917 Grad: 307416.5000 LR: 8.2064e-05 | Elapse: 0.06s
Epoch 8 [100/162] | Train Loss: 0.1446 Grad: 356699.0000 LR: 7.7754e-05 | Elapse: 5.90s
Epoch 8 [161/162] | Train Loss: 0.1421 Grad: 310882.4375 LR: 7.4963e-05 | Elapse: 9.44s


Valid [7]:   0%|          | 0/40 [00:00<?, ?batch/s]

Epoch 8 [0/40] | Valid Loss: 0.3930 | Elapse: 0.05s


----------------------------------------------------------------------------------------------------
Epoch 8 - Average Loss: (train) 0.1421; (valid) 0.4856 | Time: 11.41s


Epoch 8 [39/40] | Valid Loss: 0.4856 | Elapse: 1.96s


Train [8]:   0%|          | 0/162 [00:00<?, ?batch/s]

Epoch 9 [0/162] | Train Loss: 0.0859 Grad: 341965.7812 LR: 7.4917e-05 | Elapse: 0.06s
Epoch 9 [100/162] | Train Loss: 0.1376 Grad: 416422.4688 LR: 7.0112e-05 | Elapse: 5.86s
Epoch 9 [161/162] | Train Loss: 0.1342 Grad: 335053.6250 LR: 6.7064e-05 | Elapse: 9.41s


Valid [8]:   0%|          | 0/40 [00:00<?, ?batch/s]

Epoch 9 [0/40] | Valid Loss: 0.3881 | Elapse: 0.06s


----------------------------------------------------------------------------------------------------
Epoch 9 - Average Loss: (train) 0.1342; (valid) 0.4828 | Time: 11.39s
Early stopping at epoch 9
Fold 0 Valid Loss: 0.4760229289531708
Elapse: 1.71 min 
Fold 0 Elapse: 13.63 min
Fold: 1
- Stage 1 | Train: 16297; Valid: 3886 -


Epoch 9 [39/40] | Valid Loss: 0.4828 | Elapse: 1.97s


Train [0]:   0%|          | 0/509 [00:00<?, ?batch/s]

Epoch 1 [0/509] | Train Loss: 1.1793 Grad: 69410.4766 LR: 4.0002e-06 | Elapse: 0.07s
Epoch 1 [100/509] | Train Loss: 1.1798 Grad: 92523.1406 LR: 6.3173e-06 | Elapse: 5.92s
Epoch 1 [200/509] | Train Loss: 1.1841 Grad: 90920.8359 LR: 1.2959e-05 | Elapse: 11.73s
Epoch 1 [300/509] | Train Loss: 1.1747 Grad: 52713.0156 LR: 2.3297e-05 | Elapse: 17.49s
Epoch 1 [400/509] | Train Loss: 1.1617 Grad: 67290.0703 LR: 3.6352e-05 | Elapse: 23.24s
Epoch 1 [500/509] | Train Loss: 1.1426 Grad: 53618.8516 LR: 5.0888e-05 | Elapse: 28.99s
Epoch 1 [508/509] | Train Loss: 1.1405 Grad: 55252.0859 LR: 5.2074e-05 | Elapse: 29.45s


Valid [0]:   0%|          | 0/122 [00:00<?, ?batch/s]

Epoch 1 [0/122] | Valid Loss: 1.2659 | Elapse: 0.05s
Epoch 1 [100/122] | Valid Loss: 1.3081 | Elapse: 4.95s


----------------------------------------------------------------------------------------------------
Epoch 1 - Average Loss: (train) 1.1405; (valid) 1.3078 | Time: 35.42s
Best model found in epoch 1 | valid loss: 1.3078


Epoch 1 [121/122] | Valid Loss: 1.3078 | Elapse: 5.96s


Train [1]:   0%|          | 0/509 [00:00<?, ?batch/s]

Epoch 2 [0/509] | Train Loss: 1.0079 Grad: 53654.5859 LR: 5.2222e-05 | Elapse: 0.06s
Epoch 2 [100/509] | Train Loss: 1.0134 Grad: 106202.3906 LR: 6.6805e-05 | Elapse: 5.81s
Epoch 2 [200/509] | Train Loss: 1.0155 Grad: 63047.9414 LR: 7.9985e-05 | Elapse: 11.63s
Epoch 2 [300/509] | Train Loss: 1.0051 Grad: 51056.9414 LR: 9.0517e-05 | Elapse: 17.51s
Epoch 2 [400/509] | Train Loss: 0.9947 Grad: 75958.4375 LR: 9.7402e-05 | Elapse: 23.37s
Epoch 2 [500/509] | Train Loss: 0.9805 Grad: 57373.7930 LR: 9.9989e-05 | Elapse: 29.25s
Epoch 2 [508/509] | Train Loss: 0.9792 Grad: 74742.3750 LR: 1.0000e-04 | Elapse: 29.72s


Valid [1]:   0%|          | 0/122 [00:00<?, ?batch/s]

Epoch 2 [0/122] | Valid Loss: 1.0424 | Elapse: 0.06s
Epoch 2 [100/122] | Valid Loss: 1.1434 | Elapse: 5.03s


----------------------------------------------------------------------------------------------------
Epoch 2 - Average Loss: (train) 0.9792; (valid) 1.1426 | Time: 35.77s
Best model found in epoch 2 | valid loss: 1.1426


Epoch 2 [121/122] | Valid Loss: 1.1426 | Elapse: 6.06s


Train [2]:   0%|          | 0/509 [00:00<?, ?batch/s]

Epoch 3 [0/509] | Train Loss: 0.9545 Grad: 67669.6484 LR: 1.0000e-04 | Elapse: 0.06s
Epoch 3 [100/509] | Train Loss: 0.8937 Grad: 133077.8594 LR: 9.9969e-05 | Elapse: 5.88s
Epoch 3 [200/509] | Train Loss: 0.8948 Grad: 90648.4688 LR: 9.9880e-05 | Elapse: 11.77s
Epoch 3 [300/509] | Train Loss: 0.8824 Grad: 85809.8906 LR: 9.9732e-05 | Elapse: 17.59s
Epoch 3 [400/509] | Train Loss: 0.8693 Grad: 131071.8359 LR: 9.9526e-05 | Elapse: 23.43s
Epoch 3 [500/509] | Train Loss: 0.8539 Grad: 98700.2344 LR: 9.9261e-05 | Elapse: 29.25s
Epoch 3 [508/509] | Train Loss: 0.8525 Grad: 160402.2500 LR: 9.9238e-05 | Elapse: 29.72s


Valid [2]:   0%|          | 0/122 [00:00<?, ?batch/s]

Epoch 3 [0/122] | Valid Loss: 0.9456 | Elapse: 0.06s
Epoch 3 [100/122] | Valid Loss: 0.9765 | Elapse: 5.02s


----------------------------------------------------------------------------------------------------
Epoch 3 - Average Loss: (train) 0.8525; (valid) 0.9781 | Time: 35.76s
Best model found in epoch 3 | valid loss: 0.9781


Epoch 3 [121/122] | Valid Loss: 0.9781 | Elapse: 6.03s


Train [3]:   0%|          | 0/509 [00:00<?, ?batch/s]

Epoch 4 [0/509] | Train Loss: 0.8009 Grad: 106448.2656 LR: 9.9235e-05 | Elapse: 0.06s
Epoch 4 [100/509] | Train Loss: 0.7724 Grad: 232273.6406 LR: 9.8907e-05 | Elapse: 5.93s
Epoch 4 [200/509] | Train Loss: 0.7743 Grad: 102067.2969 LR: 9.8522e-05 | Elapse: 11.78s
Epoch 4 [300/509] | Train Loss: 0.7690 Grad: 148286.0781 LR: 9.8080e-05 | Elapse: 17.63s
Epoch 4 [400/509] | Train Loss: 0.7630 Grad: 144000.2969 LR: 9.7581e-05 | Elapse: 23.51s
Epoch 4 [500/509] | Train Loss: 0.7542 Grad: 132777.4688 LR: 9.7027e-05 | Elapse: 29.43s
Epoch 4 [508/509] | Train Loss: 0.7533 Grad: 200525.3125 LR: 9.6980e-05 | Elapse: 29.90s


Valid [3]:   0%|          | 0/122 [00:00<?, ?batch/s]

Epoch 4 [0/122] | Valid Loss: 0.9021 | Elapse: 0.06s
Epoch 4 [100/122] | Valid Loss: 0.8885 | Elapse: 5.02s


----------------------------------------------------------------------------------------------------
Epoch 4 - Average Loss: (train) 0.7533; (valid) 0.8902 | Time: 35.94s
Best model found in epoch 4 | valid loss: 0.8902


Epoch 4 [121/122] | Valid Loss: 0.8902 | Elapse: 6.04s


Train [4]:   0%|          | 0/509 [00:00<?, ?batch/s]

Epoch 5 [0/509] | Train Loss: 0.7206 Grad: 150494.5938 LR: 9.6974e-05 | Elapse: 0.07s
Epoch 5 [100/509] | Train Loss: 0.6963 Grad: 249340.6719 LR: 9.6359e-05 | Elapse: 5.95s
Epoch 5 [200/509] | Train Loss: 0.6987 Grad: 147700.2812 LR: 9.5690e-05 | Elapse: 11.80s
Epoch 5 [300/509] | Train Loss: 0.6979 Grad: 167487.0781 LR: 9.4967e-05 | Elapse: 17.64s
Epoch 5 [400/509] | Train Loss: 0.6940 Grad: 161822.1875 LR: 9.4191e-05 | Elapse: 23.46s
Epoch 5 [500/509] | Train Loss: 0.6877 Grad: 158788.5625 LR: 9.3364e-05 | Elapse: 29.30s
Epoch 5 [508/509] | Train Loss: 0.6868 Grad: 236574.7344 LR: 9.3295e-05 | Elapse: 29.77s


Valid [4]:   0%|          | 0/122 [00:00<?, ?batch/s]

Epoch 5 [0/122] | Valid Loss: 0.8514 | Elapse: 0.06s
Epoch 5 [100/122] | Valid Loss: 0.8236 | Elapse: 5.01s


----------------------------------------------------------------------------------------------------
Epoch 5 - Average Loss: (train) 0.6868; (valid) 0.8250 | Time: 35.81s
Best model found in epoch 5 | valid loss: 0.8250


Epoch 5 [121/122] | Valid Loss: 0.8250 | Elapse: 6.04s


Train [5]:   0%|          | 0/509 [00:00<?, ?batch/s]

Epoch 6 [0/509] | Train Loss: 0.6302 Grad: 208906.9531 LR: 9.3287e-05 | Elapse: 0.06s
Epoch 6 [100/509] | Train Loss: 0.6305 Grad: 313859.2188 LR: 9.2404e-05 | Elapse: 5.89s
Epoch 6 [200/509] | Train Loss: 0.6325 Grad: 153518.8594 LR: 9.1471e-05 | Elapse: 11.70s
Epoch 6 [300/509] | Train Loss: 0.6359 Grad: 185078.3750 LR: 9.0489e-05 | Elapse: 17.52s
Epoch 6 [400/509] | Train Loss: 0.6332 Grad: 185347.0312 LR: 8.9460e-05 | Elapse: 23.31s
Epoch 6 [500/509] | Train Loss: 0.6292 Grad: 190019.6406 LR: 8.8384e-05 | Elapse: 29.06s
Epoch 6 [508/509] | Train Loss: 0.6284 Grad: 251891.5469 LR: 8.8296e-05 | Elapse: 29.53s


Valid [5]:   0%|          | 0/122 [00:00<?, ?batch/s]

Epoch 6 [0/122] | Valid Loss: 0.7678 | Elapse: 0.05s
Epoch 6 [100/122] | Valid Loss: 0.7711 | Elapse: 4.97s


----------------------------------------------------------------------------------------------------
Epoch 6 - Average Loss: (train) 0.6284; (valid) 0.7723 | Time: 35.50s
Best model found in epoch 6 | valid loss: 0.7723


Epoch 6 [121/122] | Valid Loss: 0.7723 | Elapse: 5.97s


Train [6]:   0%|          | 0/509 [00:00<?, ?batch/s]

Epoch 7 [0/509] | Train Loss: 0.5701 Grad: 247719.0000 LR: 8.8285e-05 | Elapse: 0.06s
Epoch 7 [100/509] | Train Loss: 0.5780 Grad: 392204.2500 LR: 8.7160e-05 | Elapse: 5.90s
Epoch 7 [200/509] | Train Loss: 0.5798 Grad: 200495.4062 LR: 8.5992e-05 | Elapse: 11.71s
Epoch 7 [300/509] | Train Loss: 0.5858 Grad: 227940.8438 LR: 8.4781e-05 | Elapse: 17.49s
Epoch 7 [400/509] | Train Loss: 0.5839 Grad: 207104.8281 LR: 8.3529e-05 | Elapse: 23.25s
Epoch 7 [500/509] | Train Loss: 0.5816 Grad: 237092.8750 LR: 8.2238e-05 | Elapse: 29.02s
Epoch 7 [508/509] | Train Loss: 0.5809 Grad: 281906.6250 LR: 8.2133e-05 | Elapse: 29.48s


Valid [6]:   0%|          | 0/122 [00:00<?, ?batch/s]

Epoch 7 [0/122] | Valid Loss: 0.7391 | Elapse: 0.05s
Epoch 7 [100/122] | Valid Loss: 0.7397 | Elapse: 4.95s


----------------------------------------------------------------------------------------------------
Epoch 7 - Average Loss: (train) 0.5809; (valid) 0.7403 | Time: 35.43s
Best model found in epoch 7 | valid loss: 0.7403


Epoch 7 [121/122] | Valid Loss: 0.7403 | Elapse: 5.95s


Train [7]:   0%|          | 0/509 [00:00<?, ?batch/s]

Epoch 8 [0/509] | Train Loss: 0.5185 Grad: 303157.2500 LR: 8.2120e-05 | Elapse: 0.06s
Epoch 8 [100/509] | Train Loss: 0.5361 Grad: 359751.6562 LR: 8.0788e-05 | Elapse: 5.83s
Epoch 8 [200/509] | Train Loss: 0.5379 Grad: 230786.9688 LR: 7.9420e-05 | Elapse: 11.65s
Epoch 8 [300/509] | Train Loss: 0.5455 Grad: 296062.2500 LR: 7.8017e-05 | Elapse: 17.41s
Epoch 8 [400/509] | Train Loss: 0.5441 Grad: 199862.6406 LR: 7.6581e-05 | Elapse: 23.17s
Epoch 8 [500/509] | Train Loss: 0.5430 Grad: 260365.6875 LR: 7.5114e-05 | Elapse: 28.95s
Epoch 8 [508/509] | Train Loss: 0.5423 Grad: 340483.7188 LR: 7.4995e-05 | Elapse: 29.41s


Valid [7]:   0%|          | 0/122 [00:00<?, ?batch/s]

Epoch 8 [0/122] | Valid Loss: 0.7298 | Elapse: 0.06s
Epoch 8 [100/122] | Valid Loss: 0.7254 | Elapse: 4.99s


----------------------------------------------------------------------------------------------------
Epoch 8 - Average Loss: (train) 0.5423; (valid) 0.7259 | Time: 35.41s
Best model found in epoch 8 | valid loss: 0.7259


Epoch 8 [121/122] | Valid Loss: 0.7259 | Elapse: 6.00s


Train [8]:   0%|          | 0/509 [00:00<?, ?batch/s]

Epoch 9 [0/509] | Train Loss: 0.4890 Grad: 365742.0938 LR: 7.4980e-05 | Elapse: 0.06s
Epoch 9 [100/509] | Train Loss: 0.5040 Grad: 330425.4375 LR: 7.3481e-05 | Elapse: 5.90s
Epoch 9 [200/509] | Train Loss: 0.5051 Grad: 258849.5312 LR: 7.1954e-05 | Elapse: 11.72s
Epoch 9 [300/509] | Train Loss: 0.5124 Grad: 338355.4375 LR: 7.0402e-05 | Elapse: 17.56s
Epoch 9 [400/509] | Train Loss: 0.5107 Grad: 214271.2656 LR: 6.8825e-05 | Elapse: 23.38s
Epoch 9 [500/509] | Train Loss: 0.5102 Grad: 302050.3125 LR: 6.7227e-05 | Elapse: 29.19s
Epoch 9 [508/509] | Train Loss: 0.5096 Grad: 385796.7500 LR: 6.7098e-05 | Elapse: 29.66s


Valid [8]:   0%|          | 0/122 [00:00<?, ?batch/s]

Epoch 9 [0/122] | Valid Loss: 0.7275 | Elapse: 0.06s
Epoch 9 [100/122] | Valid Loss: 0.7167 | Elapse: 4.99s


----------------------------------------------------------------------------------------------------
Epoch 9 - Average Loss: (train) 0.5096; (valid) 0.7167 | Time: 35.66s
Best model found in epoch 9 | valid loss: 0.7167


Epoch 9 [121/122] | Valid Loss: 0.7167 | Elapse: 5.99s


Train [9]:   0%|          | 0/509 [00:00<?, ?batch/s]

Epoch 10 [0/509] | Train Loss: 0.4648 Grad: 418563.3125 LR: 6.7082e-05 | Elapse: 0.06s
Epoch 10 [100/509] | Train Loss: 0.4768 Grad: 271099.7812 LR: 6.5461e-05 | Elapse: 5.84s
Epoch 10 [200/509] | Train Loss: 0.4773 Grad: 296733.3125 LR: 6.3823e-05 | Elapse: 11.61s
Epoch 10 [300/509] | Train Loss: 0.4845 Grad: 179545.7656 LR: 6.2168e-05 | Elapse: 17.38s
Epoch 10 [400/509] | Train Loss: 0.4825 Grad: 119956.2188 LR: 6.0499e-05 | Elapse: 23.13s
Epoch 10 [500/509] | Train Loss: 0.4823 Grad: 170726.9531 LR: 5.8817e-05 | Elapse: 28.88s
Epoch 10 [508/509] | Train Loss: 0.4818 Grad: 191448.4219 LR: 5.8682e-05 | Elapse: 29.34s


Valid [9]:   0%|          | 0/122 [00:00<?, ?batch/s]

Epoch 10 [0/122] | Valid Loss: 0.7185 | Elapse: 0.05s
Epoch 10 [100/122] | Valid Loss: 0.7126 | Elapse: 4.95s


----------------------------------------------------------------------------------------------------
Epoch 10 - Average Loss: (train) 0.4818; (valid) 0.7117 | Time: 35.30s
Best model found in epoch 10 | valid loss: 0.7117


Epoch 10 [121/122] | Valid Loss: 0.7117 | Elapse: 5.96s


Train [10]:   0%|          | 0/509 [00:00<?, ?batch/s]

Epoch 11 [0/509] | Train Loss: 0.4354 Grad: 418568.5625 LR: 5.8665e-05 | Elapse: 0.06s
Epoch 11 [100/509] | Train Loss: 0.4537 Grad: 249907.0000 LR: 5.6972e-05 | Elapse: 5.83s
Epoch 11 [200/509] | Train Loss: 0.4539 Grad: 151304.6094 LR: 5.5272e-05 | Elapse: 11.61s
Epoch 11 [300/509] | Train Loss: 0.4612 Grad: 228233.0469 LR: 5.3565e-05 | Elapse: 17.40s
Epoch 11 [400/509] | Train Loss: 0.4593 Grad: 124595.0391 LR: 5.1853e-05 | Elapse: 23.17s
Epoch 11 [500/509] | Train Loss: 0.4593 Grad: 188009.1875 LR: 5.0140e-05 | Elapse: 28.93s
Epoch 11 [508/509] | Train Loss: 0.4589 Grad: 216001.2188 LR: 5.0003e-05 | Elapse: 29.38s


Valid [10]:   0%|          | 0/122 [00:00<?, ?batch/s]

Epoch 11 [0/122] | Valid Loss: 0.7278 | Elapse: 0.05s
Epoch 11 [100/122] | Valid Loss: 0.7151 | Elapse: 4.97s


----------------------------------------------------------------------------------------------------
Epoch 11 - Average Loss: (train) 0.4589; (valid) 0.7143 | Time: 35.36s


Epoch 11 [121/122] | Valid Loss: 0.7143 | Elapse: 5.98s


Train [11]:   0%|          | 0/509 [00:00<?, ?batch/s]

Epoch 12 [0/509] | Train Loss: 0.4150 Grad: 414473.9375 LR: 4.9986e-05 | Elapse: 0.06s
Epoch 12 [100/509] | Train Loss: 0.4324 Grad: 113931.4766 LR: 4.8272e-05 | Elapse: 5.82s
Epoch 12 [200/509] | Train Loss: 0.4342 Grad: 189202.9844 LR: 4.6561e-05 | Elapse: 11.59s
Epoch 12 [300/509] | Train Loss: 0.4412 Grad: 267092.4688 LR: 4.4854e-05 | Elapse: 17.38s
Epoch 12 [400/509] | Train Loss: 0.4394 Grad: 130323.6562 LR: 4.3152e-05 | Elapse: 23.17s
Epoch 12 [500/509] | Train Loss: 0.4394 Grad: 208410.8906 LR: 4.1459e-05 | Elapse: 28.95s
Epoch 12 [508/509] | Train Loss: 0.4391 Grad: 247866.1250 LR: 4.1324e-05 | Elapse: 29.42s


Valid [11]:   0%|          | 0/122 [00:00<?, ?batch/s]

Epoch 12 [0/122] | Valid Loss: 0.7513 | Elapse: 0.06s
Epoch 12 [100/122] | Valid Loss: 0.7211 | Elapse: 5.10s


----------------------------------------------------------------------------------------------------
Epoch 12 - Average Loss: (train) 0.4391; (valid) 0.7207 | Time: 35.57s


Epoch 12 [121/122] | Valid Loss: 0.7207 | Elapse: 6.14s


Train [12]:   0%|          | 0/509 [00:00<?, ?batch/s]

Epoch 13 [0/509] | Train Loss: 0.3915 Grad: 384790.0000 LR: 4.1307e-05 | Elapse: 0.06s
Epoch 13 [100/509] | Train Loss: 0.4161 Grad: 211454.7656 LR: 3.9625e-05 | Elapse: 5.96s
Epoch 13 [200/509] | Train Loss: 0.4179 Grad: 965024.5625 LR: 3.7955e-05 | Elapse: 11.83s
Epoch 13 [300/509] | Train Loss: 0.4245 Grad: 478166.1250 LR: 3.6300e-05 | Elapse: 17.71s
Epoch 13 [400/509] | Train Loss: 0.4224 Grad: 264149.9688 LR: 3.4660e-05 | Elapse: 23.59s
Epoch 13 [500/509] | Train Loss: 0.4228 Grad: 424831.3125 LR: 3.3039e-05 | Elapse: 29.46s
Epoch 13 [508/509] | Train Loss: 0.4225 Grad: 615422.5000 LR: 3.2910e-05 | Elapse: 29.92s


Valid [12]:   0%|          | 0/122 [00:00<?, ?batch/s]

Epoch 13 [0/122] | Valid Loss: 0.7739 | Elapse: 0.05s
Epoch 13 [100/122] | Valid Loss: 0.7336 | Elapse: 5.05s


----------------------------------------------------------------------------------------------------
Epoch 13 - Average Loss: (train) 0.4225; (valid) 0.7342 | Time: 36.01s


Epoch 13 [121/122] | Valid Loss: 0.7342 | Elapse: 6.09s


Train [13]:   0%|          | 0/509 [00:00<?, ?batch/s]

Epoch 14 [0/509] | Train Loss: 0.3692 Grad: 356557.8125 LR: 3.2894e-05 | Elapse: 0.06s
Epoch 14 [100/509] | Train Loss: 0.4003 Grad: 119309.8203 LR: 3.1294e-05 | Elapse: 5.93s
Epoch 14 [200/509] | Train Loss: 0.4027 Grad: 297859.4688 LR: 2.9716e-05 | Elapse: 11.79s
Epoch 14 [300/509] | Train Loss: 0.4089 Grad: 237865.0469 LR: 2.8163e-05 | Elapse: 17.68s
Epoch 14 [400/509] | Train Loss: 0.4073 Grad: 140659.1562 LR: 2.6635e-05 | Elapse: 23.55s
Epoch 14 [500/509] | Train Loss: 0.4081 Grad: 215797.0000 LR: 2.5134e-05 | Elapse: 29.45s
Epoch 14 [508/509] | Train Loss: 0.4079 Grad: 349521.1875 LR: 2.5015e-05 | Elapse: 29.92s


Valid [13]:   0%|          | 0/122 [00:00<?, ?batch/s]

Epoch 14 [0/122] | Valid Loss: 0.7819 | Elapse: 0.06s
Epoch 14 [100/122] | Valid Loss: 0.7425 | Elapse: 5.05s


----------------------------------------------------------------------------------------------------
Epoch 14 - Average Loss: (train) 0.4079; (valid) 0.7445 | Time: 35.99s


Epoch 14 [121/122] | Valid Loss: 0.7445 | Elapse: 6.07s


Train [14]:   0%|          | 0/509 [00:00<?, ?batch/s]

Epoch 15 [0/509] | Train Loss: 0.3534 Grad: 365969.9688 LR: 2.5000e-05 | Elapse: 0.06s
Epoch 15 [100/509] | Train Loss: 0.3880 Grad: 119525.3281 LR: 2.3532e-05 | Elapse: 5.94s
Epoch 15 [200/509] | Train Loss: 0.3905 Grad: 222382.5312 LR: 2.2094e-05 | Elapse: 11.80s
Epoch 15 [300/509] | Train Loss: 0.3963 Grad: 260401.3594 LR: 2.0690e-05 | Elapse: 17.67s
Epoch 15 [400/509] | Train Loss: 0.3949 Grad: 151630.3281 LR: 1.9320e-05 | Elapse: 23.53s
Epoch 15 [500/509] | Train Loss: 0.3964 Grad: 260798.7656 LR: 1.7985e-05 | Elapse: 29.42s
Epoch 15 [508/509] | Train Loss: 0.3961 Grad: 331158.6250 LR: 1.7880e-05 | Elapse: 29.88s


Valid [14]:   0%|          | 0/122 [00:00<?, ?batch/s]

Epoch 15 [0/122] | Valid Loss: 0.7609 | Elapse: 0.05s
Epoch 15 [100/122] | Valid Loss: 0.7331 | Elapse: 5.07s


----------------------------------------------------------------------------------------------------
Epoch 15 - Average Loss: (train) 0.3961; (valid) 0.7354 | Time: 35.99s
Early stopping at epoch 15
Fold 1 Valid Loss: 0.7116912603378296
Elapse: 8.92 min 
- Stage 2 | Train: 5248; Valid: 1244 -


Epoch 15 [121/122] | Valid Loss: 0.7354 | Elapse: 6.11s
Loading model from checkpoint: outputs/ResnetGRU_v1_LB048_fold_1_stage_1.pth


Train [0]:   0%|          | 0/164 [00:00<?, ?batch/s]

Epoch 1 [0/164] | Train Loss: 0.4731 Grad: 844993.0625 LR: 4.0022e-06 | Elapse: 0.07s
Epoch 1 [100/164] | Train Loss: 0.3414 Grad: 303061.4375 LR: 2.4879e-05 | Elapse: 5.99s
Epoch 1 [163/164] | Train Loss: 0.3304 Grad: 371534.6875 LR: 5.2231e-05 | Elapse: 9.74s


Valid [0]:   0%|          | 0/39 [00:00<?, ?batch/s]

Epoch 1 [0/39] | Valid Loss: 0.4658 | Elapse: 0.06s


----------------------------------------------------------------------------------------------------
Epoch 1 - Average Loss: (train) 0.3304; (valid) 0.5135 | Time: 11.73s
Best model found in epoch 1 | valid loss: 0.5135


Epoch 1 [38/39] | Valid Loss: 0.5135 | Elapse: 1.99s


Train [1]:   0%|          | 0/164 [00:00<?, ?batch/s]

Epoch 2 [0/164] | Train Loss: 0.3493 Grad: 659771.3750 LR: 5.2692e-05 | Elapse: 0.06s
Epoch 2 [100/164] | Train Loss: 0.2825 Grad: 563418.3125 LR: 9.1734e-05 | Elapse: 6.00s
Epoch 2 [163/164] | Train Loss: 0.2747 Grad: 546782.7500 LR: 1.0000e-04 | Elapse: 9.74s


Valid [1]:   0%|          | 0/39 [00:00<?, ?batch/s]

Epoch 2 [0/39] | Valid Loss: 0.4805 | Elapse: 0.06s


----------------------------------------------------------------------------------------------------
Epoch 2 - Average Loss: (train) 0.2747; (valid) 0.4909 | Time: 11.71s
Best model found in epoch 2 | valid loss: 0.4909


Epoch 2 [38/39] | Valid Loss: 0.4909 | Elapse: 1.97s


Train [2]:   0%|          | 0/164 [00:00<?, ?batch/s]

Epoch 3 [0/164] | Train Loss: 0.2736 Grad: 503929.5312 LR: 1.0000e-04 | Elapse: 0.06s
Epoch 3 [100/164] | Train Loss: 0.2372 Grad: 534429.1250 LR: 9.9706e-05 | Elapse: 5.99s
Epoch 3 [163/164] | Train Loss: 0.2301 Grad: 488043.6875 LR: 9.9231e-05 | Elapse: 9.67s


Valid [2]:   0%|          | 0/39 [00:00<?, ?batch/s]

Epoch 3 [0/39] | Valid Loss: 0.4786 | Elapse: 0.06s


----------------------------------------------------------------------------------------------------
Epoch 3 - Average Loss: (train) 0.2301; (valid) 0.4860 | Time: 11.61s
Best model found in epoch 3 | valid loss: 0.4860


Epoch 3 [38/39] | Valid Loss: 0.4860 | Elapse: 1.94s


Train [3]:   0%|          | 0/164 [00:00<?, ?batch/s]

Epoch 4 [0/164] | Train Loss: 0.2250 Grad: 388162.5312 LR: 9.9222e-05 | Elapse: 0.06s
Epoch 4 [100/164] | Train Loss: 0.2055 Grad: 474863.4062 LR: 9.8011e-05 | Elapse: 5.91s
Epoch 4 [163/164] | Train Loss: 0.2008 Grad: 434401.6875 LR: 9.6968e-05 | Elapse: 9.58s


Valid [3]:   0%|          | 0/39 [00:00<?, ?batch/s]

Epoch 4 [0/39] | Valid Loss: 0.4715 | Elapse: 0.05s


----------------------------------------------------------------------------------------------------
Epoch 4 - Average Loss: (train) 0.2008; (valid) 0.4829 | Time: 11.52s
Best model found in epoch 4 | valid loss: 0.4829


Epoch 4 [38/39] | Valid Loss: 0.4829 | Elapse: 1.94s


Train [4]:   0%|          | 0/164 [00:00<?, ?batch/s]

Epoch 5 [0/164] | Train Loss: 0.1904 Grad: 358780.3438 LR: 9.6949e-05 | Elapse: 0.06s
Epoch 5 [100/164] | Train Loss: 0.1847 Grad: 417525.6250 LR: 9.4857e-05 | Elapse: 5.91s
Epoch 5 [163/164] | Train Loss: 0.1807 Grad: 372016.7188 LR: 9.3277e-05 | Elapse: 9.59s


Valid [4]:   0%|          | 0/39 [00:00<?, ?batch/s]

Epoch 5 [0/39] | Valid Loss: 0.4546 | Elapse: 0.05s


----------------------------------------------------------------------------------------------------
Epoch 5 - Average Loss: (train) 0.1807; (valid) 0.4837 | Time: 11.53s


Epoch 5 [38/39] | Valid Loss: 0.4837 | Elapse: 1.94s


Train [5]:   0%|          | 0/164 [00:00<?, ?batch/s]

Epoch 6 [0/164] | Train Loss: 0.1624 Grad: 335529.5312 LR: 9.3251e-05 | Elapse: 0.06s
Epoch 6 [100/164] | Train Loss: 0.1676 Grad: 377167.5312 LR: 9.0342e-05 | Elapse: 5.90s
Epoch 6 [163/164] | Train Loss: 0.1643 Grad: 384231.4062 LR: 8.8273e-05 | Elapse: 9.58s


Valid [5]:   0%|          | 0/39 [00:00<?, ?batch/s]

Epoch 6 [0/39] | Valid Loss: 0.4471 | Elapse: 0.05s


----------------------------------------------------------------------------------------------------
Epoch 6 - Average Loss: (train) 0.1643; (valid) 0.4837 | Time: 11.53s


Epoch 6 [38/39] | Valid Loss: 0.4837 | Elapse: 1.94s


Train [6]:   0%|          | 0/164 [00:00<?, ?batch/s]

Epoch 7 [0/164] | Train Loss: 0.1506 Grad: 354350.1562 LR: 8.8238e-05 | Elapse: 0.06s
Epoch 7 [100/164] | Train Loss: 0.1540 Grad: 364490.1875 LR: 8.4601e-05 | Elapse: 5.92s
Epoch 7 [163/164] | Train Loss: 0.1509 Grad: 349594.5625 LR: 8.2106e-05 | Elapse: 9.60s


Valid [6]:   0%|          | 0/39 [00:00<?, ?batch/s]

Epoch 7 [0/39] | Valid Loss: 0.4417 | Elapse: 0.05s


----------------------------------------------------------------------------------------------------
Epoch 7 - Average Loss: (train) 0.1509; (valid) 0.4845 | Time: 11.55s


Epoch 7 [38/39] | Valid Loss: 0.4845 | Elapse: 1.95s


Train [7]:   0%|          | 0/164 [00:00<?, ?batch/s]

Epoch 8 [0/164] | Train Loss: 0.1324 Grad: 333563.8125 LR: 8.2065e-05 | Elapse: 0.06s
Epoch 8 [100/164] | Train Loss: 0.1414 Grad: 326394.4375 LR: 7.7810e-05 | Elapse: 5.92s
Epoch 8 [163/164] | Train Loss: 0.1378 Grad: 294935.5625 LR: 7.4964e-05 | Elapse: 9.63s


Valid [7]:   0%|          | 0/39 [00:00<?, ?batch/s]

Epoch 8 [0/39] | Valid Loss: 0.4422 | Elapse: 0.05s


----------------------------------------------------------------------------------------------------
Epoch 8 - Average Loss: (train) 0.1378; (valid) 0.4865 | Time: 11.59s


Epoch 8 [38/39] | Valid Loss: 0.4865 | Elapse: 1.96s


Train [8]:   0%|          | 0/164 [00:00<?, ?batch/s]

Epoch 9 [0/164] | Train Loss: 0.1220 Grad: 343690.4062 LR: 7.4918e-05 | Elapse: 0.07s
Epoch 9 [100/164] | Train Loss: 0.1304 Grad: 344828.9062 LR: 7.0174e-05 | Elapse: 5.90s
Epoch 9 [163/164] | Train Loss: 0.1273 Grad: 431970.7500 LR: 6.7064e-05 | Elapse: 9.58s


Valid [8]:   0%|          | 0/39 [00:00<?, ?batch/s]

Epoch 9 [0/39] | Valid Loss: 0.4307 | Elapse: 0.05s


----------------------------------------------------------------------------------------------------
Epoch 9 - Average Loss: (train) 0.1273; (valid) 0.4884 | Time: 11.54s
Early stopping at epoch 9
Fold 1 Valid Loss: 0.4829496145248413
Elapse: 1.74 min 
Fold 1 Elapse: 10.66 min
Fold: 2
- Stage 1 | Train: 16059; Valid: 4124 -


Epoch 9 [38/39] | Valid Loss: 0.4884 | Elapse: 1.96s


Train [0]:   0%|          | 0/501 [00:00<?, ?batch/s]

Epoch 1 [0/501] | Train Loss: 1.1643 Grad: 69165.1562 LR: 4.0002e-06 | Elapse: 0.07s
Epoch 1 [100/501] | Train Loss: 1.1853 Grad: 92249.0312 LR: 6.3914e-06 | Elapse: 5.92s
Epoch 1 [200/501] | Train Loss: 1.1902 Grad: 74951.2734 LR: 1.3238e-05 | Elapse: 11.77s
Epoch 1 [300/501] | Train Loss: 1.1796 Grad: 81393.1719 LR: 2.3872e-05 | Elapse: 17.61s
Epoch 1 [400/501] | Train Loss: 1.1650 Grad: 54243.9375 LR: 3.7253e-05 | Elapse: 23.51s
Epoch 1 [500/501] | Train Loss: 1.1443 Grad: 61172.1406 LR: 5.2075e-05 | Elapse: 29.37s


Valid [0]:   0%|          | 0/129 [00:00<?, ?batch/s]

Epoch 1 [0/129] | Valid Loss: 1.2066 | Elapse: 0.06s
Epoch 1 [100/129] | Valid Loss: 1.2773 | Elapse: 5.05s


----------------------------------------------------------------------------------------------------
Epoch 1 - Average Loss: (train) 1.1443; (valid) 1.2764 | Time: 35.82s
Best model found in epoch 1 | valid loss: 1.2764


Epoch 1 [128/129] | Valid Loss: 1.2764 | Elapse: 6.44s


Train [1]:   0%|          | 0/501 [00:00<?, ?batch/s]

Epoch 2 [0/501] | Train Loss: 1.0055 Grad: 51859.8789 LR: 5.2226e-05 | Elapse: 0.06s
Epoch 2 [100/501] | Train Loss: 1.0266 Grad: 57134.0547 LR: 6.7033e-05 | Elapse: 5.94s
Epoch 2 [200/501] | Train Loss: 1.0293 Grad: 72883.2422 LR: 8.0372e-05 | Elapse: 11.83s
Epoch 2 [300/501] | Train Loss: 1.0162 Grad: 61675.7383 LR: 9.0939e-05 | Elapse: 17.73s
Epoch 2 [400/501] | Train Loss: 1.0071 Grad: 46217.4023 LR: 9.7702e-05 | Elapse: 23.54s
Epoch 2 [500/501] | Train Loss: 0.9930 Grad: 78389.1406 LR: 1.0000e-04 | Elapse: 29.29s


Valid [1]:   0%|          | 0/129 [00:00<?, ?batch/s]

Epoch 2 [0/129] | Valid Loss: 1.0113 | Elapse: 0.05s
Epoch 2 [100/129] | Valid Loss: 1.1727 | Elapse: 4.96s


----------------------------------------------------------------------------------------------------
Epoch 2 - Average Loss: (train) 0.9930; (valid) 1.1724 | Time: 35.64s
Best model found in epoch 2 | valid loss: 1.1724


Epoch 2 [128/129] | Valid Loss: 1.1724 | Elapse: 6.35s


Train [2]:   0%|          | 0/501 [00:00<?, ?batch/s]

Epoch 3 [0/501] | Train Loss: 0.8875 Grad: 50919.2930 LR: 1.0000e-04 | Elapse: 0.06s
Epoch 3 [100/501] | Train Loss: 0.9188 Grad: 112362.1562 LR: 9.9968e-05 | Elapse: 5.90s
Epoch 3 [200/501] | Train Loss: 0.9204 Grad: 77045.0859 LR: 9.9876e-05 | Elapse: 11.72s
Epoch 3 [300/501] | Train Loss: 0.9014 Grad: 101666.7422 LR: 9.9724e-05 | Elapse: 17.52s
Epoch 3 [400/501] | Train Loss: 0.8890 Grad: 96002.7812 LR: 9.9511e-05 | Elapse: 23.26s
Epoch 3 [500/501] | Train Loss: 0.8728 Grad: 124664.9062 LR: 9.9238e-05 | Elapse: 29.02s


Valid [2]:   0%|          | 0/129 [00:00<?, ?batch/s]

Epoch 3 [0/129] | Valid Loss: 0.9058 | Elapse: 0.06s
Epoch 3 [100/129] | Valid Loss: 1.0518 | Elapse: 5.02s


----------------------------------------------------------------------------------------------------
Epoch 3 - Average Loss: (train) 0.8728; (valid) 1.0538 | Time: 35.43s
Best model found in epoch 3 | valid loss: 1.0538


Epoch 3 [128/129] | Valid Loss: 1.0538 | Elapse: 6.40s


Train [3]:   0%|          | 0/501 [00:00<?, ?batch/s]

Epoch 4 [0/501] | Train Loss: 0.7823 Grad: 96998.8281 LR: 9.9235e-05 | Elapse: 0.06s
Epoch 4 [100/501] | Train Loss: 0.8000 Grad: 185075.1719 LR: 9.8901e-05 | Elapse: 5.89s
Epoch 4 [200/501] | Train Loss: 0.8018 Grad: 111904.7109 LR: 9.8509e-05 | Elapse: 11.72s
Epoch 4 [300/501] | Train Loss: 0.7869 Grad: 112767.6484 LR: 9.8057e-05 | Elapse: 17.54s
Epoch 4 [400/501] | Train Loss: 0.7790 Grad: 127650.0781 LR: 9.7547e-05 | Elapse: 23.36s
Epoch 4 [500/501] | Train Loss: 0.7670 Grad: 184311.3281 LR: 9.6980e-05 | Elapse: 29.25s


Valid [3]:   0%|          | 0/129 [00:00<?, ?batch/s]

Epoch 4 [0/129] | Valid Loss: 0.8350 | Elapse: 0.06s
Epoch 4 [100/129] | Valid Loss: 0.9519 | Elapse: 4.98s


----------------------------------------------------------------------------------------------------
Epoch 4 - Average Loss: (train) 0.7670; (valid) 0.9552 | Time: 35.59s
Best model found in epoch 4 | valid loss: 0.9552


Epoch 4 [128/129] | Valid Loss: 0.9552 | Elapse: 6.34s


Train [4]:   0%|          | 0/501 [00:00<?, ?batch/s]

Epoch 5 [0/501] | Train Loss: 0.6882 Grad: 145122.7188 LR: 9.6974e-05 | Elapse: 0.06s
Epoch 5 [100/501] | Train Loss: 0.7159 Grad: 172939.0938 LR: 9.6349e-05 | Elapse: 5.84s
Epoch 5 [200/501] | Train Loss: 0.7183 Grad: 142902.0625 LR: 9.5668e-05 | Elapse: 11.66s
Epoch 5 [300/501] | Train Loss: 0.7081 Grad: 146032.7969 LR: 9.4931e-05 | Elapse: 17.48s
Epoch 5 [400/501] | Train Loss: 0.7028 Grad: 182895.2031 LR: 9.4140e-05 | Elapse: 23.26s
Epoch 5 [500/501] | Train Loss: 0.6935 Grad: 284014.5625 LR: 9.3295e-05 | Elapse: 29.03s


Valid [4]:   0%|          | 0/129 [00:00<?, ?batch/s]

Epoch 5 [0/129] | Valid Loss: 0.7869 | Elapse: 0.05s
Epoch 5 [100/129] | Valid Loss: 0.8857 | Elapse: 4.94s


----------------------------------------------------------------------------------------------------
Epoch 5 - Average Loss: (train) 0.6935; (valid) 0.8900 | Time: 35.33s
Best model found in epoch 5 | valid loss: 0.8900


Epoch 5 [128/129] | Valid Loss: 0.8900 | Elapse: 6.30s


Train [5]:   0%|          | 0/501 [00:00<?, ?batch/s]

Epoch 6 [0/501] | Train Loss: 0.6284 Grad: 178026.7344 LR: 9.3287e-05 | Elapse: 0.06s
Epoch 6 [100/501] | Train Loss: 0.6485 Grad: 197400.6875 LR: 9.2389e-05 | Elapse: 5.85s
Epoch 6 [200/501] | Train Loss: 0.6510 Grad: 181837.7500 LR: 9.1440e-05 | Elapse: 11.63s
Epoch 6 [300/501] | Train Loss: 0.6432 Grad: 215369.5312 LR: 9.0440e-05 | Elapse: 17.47s
Epoch 6 [400/501] | Train Loss: 0.6390 Grad: 206390.5938 LR: 8.9392e-05 | Elapse: 23.24s
Epoch 6 [500/501] | Train Loss: 0.6315 Grad: 343411.3438 LR: 8.8296e-05 | Elapse: 29.03s


Valid [5]:   0%|          | 0/129 [00:00<?, ?batch/s]

Epoch 6 [0/129] | Valid Loss: 0.7255 | Elapse: 0.05s
Epoch 6 [100/129] | Valid Loss: 0.8315 | Elapse: 4.95s


----------------------------------------------------------------------------------------------------
Epoch 6 - Average Loss: (train) 0.6315; (valid) 0.8375 | Time: 35.35s
Best model found in epoch 6 | valid loss: 0.8375


Epoch 6 [128/129] | Valid Loss: 0.8375 | Elapse: 6.32s


Train [6]:   0%|          | 0/501 [00:00<?, ?batch/s]

Epoch 7 [0/501] | Train Loss: 0.5894 Grad: 218227.4375 LR: 8.8285e-05 | Elapse: 0.06s
Epoch 7 [100/501] | Train Loss: 0.5949 Grad: 279976.3750 LR: 8.7141e-05 | Elapse: 5.88s
Epoch 7 [200/501] | Train Loss: 0.5981 Grad: 221898.9375 LR: 8.5953e-05 | Elapse: 11.69s
Epoch 7 [300/501] | Train Loss: 0.5922 Grad: 240851.1562 LR: 8.4721e-05 | Elapse: 17.51s
Epoch 7 [400/501] | Train Loss: 0.5894 Grad: 226390.4688 LR: 8.3448e-05 | Elapse: 23.28s
Epoch 7 [500/501] | Train Loss: 0.5837 Grad: 344993.8750 LR: 8.2133e-05 | Elapse: 29.04s


Valid [6]:   0%|          | 0/129 [00:00<?, ?batch/s]

Epoch 7 [0/129] | Valid Loss: 0.6785 | Elapse: 0.05s
Epoch 7 [100/129] | Valid Loss: 0.8030 | Elapse: 5.03s


----------------------------------------------------------------------------------------------------
Epoch 7 - Average Loss: (train) 0.5837; (valid) 0.8098 | Time: 35.46s
Best model found in epoch 7 | valid loss: 0.8098


Epoch 7 [128/129] | Valid Loss: 0.8098 | Elapse: 6.41s


Train [7]:   0%|          | 0/501 [00:00<?, ?batch/s]

Epoch 8 [0/501] | Train Loss: 0.5730 Grad: 261837.0156 LR: 8.2120e-05 | Elapse: 0.06s
Epoch 8 [100/501] | Train Loss: 0.5551 Grad: 329673.9375 LR: 8.0766e-05 | Elapse: 5.81s
Epoch 8 [200/501] | Train Loss: 0.5585 Grad: 273346.0625 LR: 7.9375e-05 | Elapse: 11.66s
Epoch 8 [300/501] | Train Loss: 0.5537 Grad: 256957.7656 LR: 7.7948e-05 | Elapse: 17.49s
Epoch 8 [400/501] | Train Loss: 0.5516 Grad: 238785.9062 LR: 7.6488e-05 | Elapse: 23.29s
Epoch 8 [500/501] | Train Loss: 0.5469 Grad: 380429.6875 LR: 7.4995e-05 | Elapse: 29.09s


Valid [7]:   0%|          | 0/129 [00:00<?, ?batch/s]

Epoch 8 [0/129] | Valid Loss: 0.6534 | Elapse: 0.05s
Epoch 8 [100/129] | Valid Loss: 0.7865 | Elapse: 5.00s


----------------------------------------------------------------------------------------------------
Epoch 8 - Average Loss: (train) 0.5469; (valid) 0.7939 | Time: 35.46s
Best model found in epoch 8 | valid loss: 0.7939


Epoch 8 [128/129] | Valid Loss: 0.7939 | Elapse: 6.36s


Train [8]:   0%|          | 0/501 [00:00<?, ?batch/s]

Epoch 9 [0/501] | Train Loss: 0.5425 Grad: 297699.9062 LR: 7.4980e-05 | Elapse: 0.06s
Epoch 9 [100/501] | Train Loss: 0.5218 Grad: 333373.8438 LR: 7.3457e-05 | Elapse: 5.82s
Epoch 9 [200/501] | Train Loss: 0.5262 Grad: 296808.0312 LR: 7.1905e-05 | Elapse: 11.61s
Epoch 9 [300/501] | Train Loss: 0.5225 Grad: 264322.2188 LR: 7.0326e-05 | Elapse: 17.38s
Epoch 9 [400/501] | Train Loss: 0.5205 Grad: 249217.1719 LR: 6.8723e-05 | Elapse: 23.18s
Epoch 9 [500/501] | Train Loss: 0.5163 Grad: 386530.0000 LR: 6.7098e-05 | Elapse: 28.98s


Valid [8]:   0%|          | 0/129 [00:00<?, ?batch/s]

Epoch 9 [0/129] | Valid Loss: 0.6340 | Elapse: 0.05s
Epoch 9 [100/129] | Valid Loss: 0.7783 | Elapse: 5.00s


----------------------------------------------------------------------------------------------------
Epoch 9 - Average Loss: (train) 0.5163; (valid) 0.7859 | Time: 35.36s
Best model found in epoch 9 | valid loss: 0.7859


Epoch 9 [128/129] | Valid Loss: 0.7859 | Elapse: 6.38s


Train [9]:   0%|          | 0/501 [00:00<?, ?batch/s]

Epoch 10 [0/501] | Train Loss: 0.5171 Grad: 377319.9688 LR: 6.7081e-05 | Elapse: 0.06s
Epoch 10 [100/501] | Train Loss: 0.4917 Grad: 383122.3125 LR: 6.5435e-05 | Elapse: 5.90s
Epoch 10 [200/501] | Train Loss: 0.4974 Grad: 320017.2812 LR: 6.3770e-05 | Elapse: 11.72s
Epoch 10 [300/501] | Train Loss: 0.4946 Grad: 286101.3438 LR: 6.2088e-05 | Elapse: 17.53s
Epoch 10 [400/501] | Train Loss: 0.4930 Grad: 266061.3750 LR: 6.0391e-05 | Elapse: 23.32s
Epoch 10 [500/501] | Train Loss: 0.4893 Grad: 426692.2188 LR: 5.8682e-05 | Elapse: 29.13s


Valid [9]:   0%|          | 0/129 [00:00<?, ?batch/s]

Epoch 10 [0/129] | Valid Loss: 0.6280 | Elapse: 0.06s
Epoch 10 [100/129] | Valid Loss: 0.7760 | Elapse: 4.99s


----------------------------------------------------------------------------------------------------
Epoch 10 - Average Loss: (train) 0.4893; (valid) 0.7836 | Time: 35.52s
Best model found in epoch 10 | valid loss: 0.7836


Epoch 10 [128/129] | Valid Loss: 0.7836 | Elapse: 6.38s


Train [10]:   0%|          | 0/501 [00:00<?, ?batch/s]

Epoch 11 [0/501] | Train Loss: 0.4941 Grad: 458205.9375 LR: 5.8665e-05 | Elapse: 0.06s
Epoch 11 [100/501] | Train Loss: 0.4660 Grad: 346321.3438 LR: 5.6945e-05 | Elapse: 5.98s
Epoch 11 [200/501] | Train Loss: 0.4723 Grad: 335475.7188 LR: 5.5217e-05 | Elapse: 11.87s
Epoch 11 [300/501] | Train Loss: 0.4706 Grad: 304074.0312 LR: 5.3482e-05 | Elapse: 17.75s
Epoch 11 [400/501] | Train Loss: 0.4699 Grad: 279756.5938 LR: 5.1743e-05 | Elapse: 23.60s
Epoch 11 [500/501] | Train Loss: 0.4663 Grad: 456661.4062 LR: 5.0003e-05 | Elapse: 29.45s


Valid [10]:   0%|          | 0/129 [00:00<?, ?batch/s]

Epoch 11 [0/129] | Valid Loss: 0.6255 | Elapse: 0.05s
Epoch 11 [100/129] | Valid Loss: 0.7759 | Elapse: 5.03s


----------------------------------------------------------------------------------------------------
Epoch 11 - Average Loss: (train) 0.4663; (valid) 0.7832 | Time: 35.87s
Best model found in epoch 11 | valid loss: 0.7832


Epoch 11 [128/129] | Valid Loss: 0.7832 | Elapse: 6.41s


Train [11]:   0%|          | 0/501 [00:00<?, ?batch/s]

Epoch 12 [0/501] | Train Loss: 0.4748 Grad: 479602.2188 LR: 4.9985e-05 | Elapse: 0.06s
Epoch 12 [100/501] | Train Loss: 0.4449 Grad: 457880.2812 LR: 4.8244e-05 | Elapse: 5.95s
Epoch 12 [200/501] | Train Loss: 0.4513 Grad: 361366.8438 LR: 4.6506e-05 | Elapse: 11.81s
Epoch 12 [300/501] | Train Loss: 0.4507 Grad: 277206.9688 LR: 4.4771e-05 | Elapse: 17.65s
Epoch 12 [400/501] | Train Loss: 0.4503 Grad: 297676.3438 LR: 4.3043e-05 | Elapse: 23.46s
Epoch 12 [500/501] | Train Loss: 0.4468 Grad: 448860.6562 LR: 4.1324e-05 | Elapse: 29.32s


Valid [11]:   0%|          | 0/129 [00:00<?, ?batch/s]

Epoch 12 [0/129] | Valid Loss: 0.6157 | Elapse: 0.05s
Epoch 12 [100/129] | Valid Loss: 0.7756 | Elapse: 5.02s


----------------------------------------------------------------------------------------------------
Epoch 12 - Average Loss: (train) 0.4468; (valid) 0.7830 | Time: 35.73s
Best model found in epoch 12 | valid loss: 0.7830


Epoch 12 [128/129] | Valid Loss: 0.7830 | Elapse: 6.40s


Train [12]:   0%|          | 0/501 [00:00<?, ?batch/s]

Epoch 13 [0/501] | Train Loss: 0.4679 Grad: 463101.3125 LR: 4.1307e-05 | Elapse: 0.06s
Epoch 13 [100/501] | Train Loss: 0.4272 Grad: 776869.2500 LR: 3.9598e-05 | Elapse: 5.90s
Epoch 13 [200/501] | Train Loss: 0.4337 Grad: 389003.0938 LR: 3.7902e-05 | Elapse: 11.75s
Epoch 13 [300/501] | Train Loss: 0.4335 Grad: 248919.4062 LR: 3.6220e-05 | Elapse: 17.60s
Epoch 13 [400/501] | Train Loss: 0.4332 Grad: 322751.4688 LR: 3.4555e-05 | Elapse: 23.42s
Epoch 13 [500/501] | Train Loss: 0.4300 Grad: 451475.0625 LR: 3.2909e-05 | Elapse: 29.26s


Valid [12]:   0%|          | 0/129 [00:00<?, ?batch/s]

Epoch 13 [0/129] | Valid Loss: 0.6083 | Elapse: 0.05s
Epoch 13 [100/129] | Valid Loss: 0.7773 | Elapse: 5.02s


----------------------------------------------------------------------------------------------------
Epoch 13 - Average Loss: (train) 0.4300; (valid) 0.7842 | Time: 35.67s


Epoch 13 [128/129] | Valid Loss: 0.7842 | Elapse: 6.41s


Train [13]:   0%|          | 0/501 [00:00<?, ?batch/s]

Epoch 14 [0/501] | Train Loss: 0.4615 Grad: 465948.0625 LR: 3.2893e-05 | Elapse: 0.06s
Epoch 14 [100/501] | Train Loss: 0.4130 Grad: 396299.1250 LR: 3.1268e-05 | Elapse: 5.88s
Epoch 14 [200/501] | Train Loss: 0.4194 Grad: 404624.3125 LR: 2.9666e-05 | Elapse: 11.72s
Epoch 14 [300/501] | Train Loss: 0.4197 Grad: 242286.8438 LR: 2.8088e-05 | Elapse: 17.59s
Epoch 14 [400/501] | Train Loss: 0.4195 Grad: 353097.5625 LR: 2.6537e-05 | Elapse: 23.46s
Epoch 14 [500/501] | Train Loss: 0.4164 Grad: 424852.9688 LR: 2.5015e-05 | Elapse: 29.37s


Valid [13]:   0%|          | 0/129 [00:00<?, ?batch/s]

Epoch 14 [0/129] | Valid Loss: 0.6132 | Elapse: 0.06s
Epoch 14 [100/129] | Valid Loss: 0.7819 | Elapse: 5.04s


----------------------------------------------------------------------------------------------------
Epoch 14 - Average Loss: (train) 0.4164; (valid) 0.7888 | Time: 35.80s


Epoch 14 [128/129] | Valid Loss: 0.7888 | Elapse: 6.42s


Train [14]:   0%|          | 0/501 [00:00<?, ?batch/s]

Epoch 15 [0/501] | Train Loss: 0.4509 Grad: 449130.0938 LR: 2.5000e-05 | Elapse: 0.06s
Epoch 15 [100/501] | Train Loss: 0.4031 Grad: 563441.0625 LR: 2.3508e-05 | Elapse: 5.90s
Epoch 15 [200/501] | Train Loss: 0.4095 Grad: 422241.9688 LR: 2.2048e-05 | Elapse: 11.76s
Epoch 15 [300/501] | Train Loss: 0.4092 Grad: 220113.0938 LR: 2.0623e-05 | Elapse: 17.65s
Epoch 15 [400/501] | Train Loss: 0.4089 Grad: 365379.3750 LR: 1.9233e-05 | Elapse: 23.46s
Epoch 15 [500/501] | Train Loss: 0.4062 Grad: 426586.3438 LR: 1.7880e-05 | Elapse: 29.26s


Valid [14]:   0%|          | 0/129 [00:00<?, ?batch/s]

Epoch 15 [0/129] | Valid Loss: 0.6156 | Elapse: 0.06s
Epoch 15 [100/129] | Valid Loss: 0.7840 | Elapse: 4.94s


----------------------------------------------------------------------------------------------------
Epoch 15 - Average Loss: (train) 0.4062; (valid) 0.7909 | Time: 35.56s


Epoch 15 [128/129] | Valid Loss: 0.7909 | Elapse: 6.30s


Train [15]:   0%|          | 0/501 [00:00<?, ?batch/s]

Epoch 16 [0/501] | Train Loss: 0.4444 Grad: 445666.0312 LR: 1.7867e-05 | Elapse: 0.06s
Epoch 16 [100/501] | Train Loss: 0.3955 Grad: 364561.4062 LR: 1.6554e-05 | Elapse: 5.80s
Epoch 16 [200/501] | Train Loss: 0.4016 Grad: 461645.0000 LR: 1.5281e-05 | Elapse: 11.54s
Epoch 16 [300/501] | Train Loss: 0.4009 Grad: 240433.7812 LR: 1.4050e-05 | Elapse: 17.24s
Epoch 16 [400/501] | Train Loss: 0.4008 Grad: 390066.1250 LR: 1.2864e-05 | Elapse: 22.95s
Epoch 16 [500/501] | Train Loss: 0.3983 Grad: 429118.9375 LR: 1.1722e-05 | Elapse: 28.68s


Valid [15]:   0%|          | 0/129 [00:00<?, ?batch/s]

Epoch 16 [0/129] | Valid Loss: 0.6069 | Elapse: 0.05s
Epoch 16 [100/129] | Valid Loss: 0.7817 | Elapse: 4.92s


----------------------------------------------------------------------------------------------------
Epoch 16 - Average Loss: (train) 0.3983; (valid) 0.7892 | Time: 34.96s


Epoch 16 [128/129] | Valid Loss: 0.7892 | Elapse: 6.28s


Train [16]:   0%|          | 0/501 [00:00<?, ?batch/s]

Epoch 17 [0/501] | Train Loss: 0.4387 Grad: 454773.9375 LR: 1.1711e-05 | Elapse: 0.06s
Epoch 17 [100/501] | Train Loss: 0.3904 Grad: 357592.5938 LR: 1.0616e-05 | Elapse: 5.80s
Epoch 17 [200/501] | Train Loss: 0.3963 Grad: 700681.3750 LR: 9.5690e-06 | Elapse: 11.56s
Epoch 17 [300/501] | Train Loss: 0.3954 Grad: 255800.9375 LR: 8.5711e-06 | Elapse: 17.31s
Epoch 17 [400/501] | Train Loss: 0.3955 Grad: 407495.4375 LR: 7.6235e-06 | Elapse: 22.97s
Epoch 17 [500/501] | Train Loss: 0.3934 Grad: 399049.5312 LR: 6.7273e-06 | Elapse: 28.68s


Valid [16]:   0%|          | 0/129 [00:00<?, ?batch/s]

Epoch 17 [0/129] | Valid Loss: 0.5997 | Elapse: 0.05s
Epoch 17 [100/129] | Valid Loss: 0.7785 | Elapse: 4.94s


----------------------------------------------------------------------------------------------------
Epoch 17 - Average Loss: (train) 0.3934; (valid) 0.7868 | Time: 34.98s
Early stopping at epoch 17
Fold 2 Valid Loss: 0.7829928398132324
Elapse: 10.06 min 


Epoch 17 [128/129] | Valid Loss: 0.7868 | Elapse: 6.30s


- Stage 2 | Train: 5070; Valid: 1422 -


Loading model from checkpoint: outputs/ResnetGRU_v1_LB048_fold_2_stage_1.pth


Train [0]:   0%|          | 0/158 [00:00<?, ?batch/s]

Epoch 1 [0/158] | Train Loss: 0.5660 Grad: 854411.8125 LR: 4.0024e-06 | Elapse: 0.06s
Epoch 1 [100/158] | Train Loss: 0.3738 Grad: 469991.0000 LR: 2.6361e-05 | Elapse: 5.82s
Epoch 1 [157/158] | Train Loss: 0.3567 Grad: 406150.2812 LR: 5.2239e-05 | Elapse: 9.11s


Valid [0]:   0%|          | 0/45 [00:00<?, ?batch/s]

Epoch 1 [0/45] | Valid Loss: 0.4418 | Elapse: 0.06s


----------------------------------------------------------------------------------------------------
Epoch 1 - Average Loss: (train) 0.3567; (valid) 0.5688 | Time: 11.30s
Best model found in epoch 1 | valid loss: 0.5688


Epoch 1 [44/45] | Valid Loss: 0.5688 | Elapse: 2.19s


Train [1]:   0%|          | 0/158 [00:00<?, ?batch/s]

Epoch 2 [0/158] | Train Loss: 0.4592 Grad: 658347.8750 LR: 5.2718e-05 | Elapse: 0.06s
Epoch 2 [100/158] | Train Loss: 0.3056 Grad: 338311.8750 LR: 9.2706e-05 | Elapse: 5.84s
Epoch 2 [157/158] | Train Loss: 0.2921 Grad: 257837.0938 LR: 1.0000e-04 | Elapse: 9.13s


Valid [1]:   0%|          | 0/45 [00:00<?, ?batch/s]

Epoch 2 [0/45] | Valid Loss: 0.3484 | Elapse: 0.05s


----------------------------------------------------------------------------------------------------
Epoch 2 - Average Loss: (train) 0.2921; (valid) 0.5093 | Time: 11.32s
Best model found in epoch 2 | valid loss: 0.5093


Epoch 2 [44/45] | Valid Loss: 0.5093 | Elapse: 2.18s


Train [2]:   0%|          | 0/158 [00:00<?, ?batch/s]

Epoch 3 [0/158] | Train Loss: 0.3860 Grad: 567232.5625 LR: 1.0000e-04 | Elapse: 0.06s
Epoch 3 [100/158] | Train Loss: 0.2534 Grad: 500141.4688 LR: 9.9683e-05 | Elapse: 5.73s
Epoch 3 [157/158] | Train Loss: 0.2444 Grad: 343914.7500 LR: 9.9231e-05 | Elapse: 9.01s


Valid [2]:   0%|          | 0/45 [00:00<?, ?batch/s]

Epoch 3 [0/45] | Valid Loss: 0.3424 | Elapse: 0.05s


----------------------------------------------------------------------------------------------------
Epoch 3 - Average Loss: (train) 0.2444; (valid) 0.5016 | Time: 11.20s
Best model found in epoch 3 | valid loss: 0.5016


Epoch 3 [44/45] | Valid Loss: 0.5016 | Elapse: 2.18s


Train [3]:   0%|          | 0/158 [00:00<?, ?batch/s]

Epoch 4 [0/158] | Train Loss: 0.3197 Grad: 455687.1250 LR: 9.9221e-05 | Elapse: 0.06s
Epoch 4 [100/158] | Train Loss: 0.2219 Grad: 478701.4688 LR: 9.7953e-05 | Elapse: 5.81s
Epoch 4 [157/158] | Train Loss: 0.2149 Grad: 306076.0312 LR: 9.6967e-05 | Elapse: 9.09s


Valid [3]:   0%|          | 0/45 [00:00<?, ?batch/s]

Epoch 4 [0/45] | Valid Loss: 0.3377 | Elapse: 0.05s


----------------------------------------------------------------------------------------------------
Epoch 4 - Average Loss: (train) 0.2149; (valid) 0.5001 | Time: 11.27s
Best model found in epoch 4 | valid loss: 0.5001


Epoch 4 [44/45] | Valid Loss: 0.5001 | Elapse: 2.18s


Train [4]:   0%|          | 0/158 [00:00<?, ?batch/s]

Epoch 5 [0/158] | Train Loss: 0.2846 Grad: 431568.4688 LR: 9.6948e-05 | Elapse: 0.06s
Epoch 5 [100/158] | Train Loss: 0.2001 Grad: 467030.0000 LR: 9.4766e-05 | Elapse: 5.83s
Epoch 5 [157/158] | Train Loss: 0.1944 Grad: 263292.8438 LR: 9.3276e-05 | Elapse: 9.18s


Valid [4]:   0%|          | 0/45 [00:00<?, ?batch/s]

Epoch 5 [0/45] | Valid Loss: 0.3361 | Elapse: 0.06s


----------------------------------------------------------------------------------------------------
Epoch 5 - Average Loss: (train) 0.1944; (valid) 0.5003 | Time: 11.40s


Epoch 5 [44/45] | Valid Loss: 0.5003 | Elapse: 2.22s


Train [5]:   0%|          | 0/158 [00:00<?, ?batch/s]

Epoch 6 [0/158] | Train Loss: 0.2509 Grad: 410532.8750 LR: 9.3249e-05 | Elapse: 0.06s
Epoch 6 [100/158] | Train Loss: 0.1834 Grad: 438894.4375 LR: 9.0220e-05 | Elapse: 5.93s
Epoch 6 [157/158] | Train Loss: 0.1784 Grad: 294537.5312 LR: 8.8271e-05 | Elapse: 9.26s


Valid [5]:   0%|          | 0/45 [00:00<?, ?batch/s]

Epoch 6 [0/45] | Valid Loss: 0.3237 | Elapse: 0.06s


----------------------------------------------------------------------------------------------------
Epoch 6 - Average Loss: (train) 0.1784; (valid) 0.5027 | Time: 11.49s


Epoch 6 [44/45] | Valid Loss: 0.5027 | Elapse: 2.23s


Train [6]:   0%|          | 0/158 [00:00<?, ?batch/s]

Epoch 7 [0/158] | Train Loss: 0.2212 Grad: 396004.3125 LR: 8.8236e-05 | Elapse: 0.06s
Epoch 7 [100/158] | Train Loss: 0.1686 Grad: 356918.5000 LR: 8.4452e-05 | Elapse: 5.91s
Epoch 7 [157/158] | Train Loss: 0.1644 Grad: 423849.3125 LR: 8.2104e-05 | Elapse: 9.25s


Valid [6]:   0%|          | 0/45 [00:00<?, ?batch/s]

Epoch 7 [0/45] | Valid Loss: 0.3310 | Elapse: 0.06s


----------------------------------------------------------------------------------------------------
Epoch 7 - Average Loss: (train) 0.1644; (valid) 0.5052 | Time: 11.47s


Epoch 7 [44/45] | Valid Loss: 0.5052 | Elapse: 2.22s


Train [7]:   0%|          | 0/158 [00:00<?, ?batch/s]

Epoch 8 [0/158] | Train Loss: 0.2091 Grad: 407337.9062 LR: 8.2062e-05 | Elapse: 0.06s
Epoch 8 [100/158] | Train Loss: 0.1569 Grad: 372094.2500 LR: 7.7638e-05 | Elapse: 5.91s
Epoch 8 [157/158] | Train Loss: 0.1532 Grad: 607934.7500 LR: 7.4962e-05 | Elapse: 9.25s


Valid [7]:   0%|          | 0/45 [00:00<?, ?batch/s]

Epoch 8 [0/45] | Valid Loss: 0.3326 | Elapse: 0.05s


----------------------------------------------------------------------------------------------------
Epoch 8 - Average Loss: (train) 0.1532; (valid) 0.5120 | Time: 11.47s


Epoch 8 [44/45] | Valid Loss: 0.5120 | Elapse: 2.22s


Train [8]:   0%|          | 0/158 [00:00<?, ?batch/s]

Epoch 9 [0/158] | Train Loss: 0.1904 Grad: 385475.9375 LR: 7.4914e-05 | Elapse: 0.06s
Epoch 9 [100/158] | Train Loss: 0.1455 Grad: 306060.2188 LR: 6.9985e-05 | Elapse: 5.93s
Epoch 9 [157/158] | Train Loss: 0.1413 Grad: 488520.3750 LR: 6.7062e-05 | Elapse: 9.29s


Valid [8]:   0%|          | 0/45 [00:00<?, ?batch/s]

Epoch 9 [0/45] | Valid Loss: 0.3375 | Elapse: 0.06s


----------------------------------------------------------------------------------------------------
Epoch 9 - Average Loss: (train) 0.1413; (valid) 0.5190 | Time: 11.52s
Early stopping at epoch 9
Fold 2 Valid Loss: 0.5001062750816345
Elapse: 1.71 min 
Fold 2 Elapse: 11.77 min
Fold: 3
- Stage 1 | Train: 16337; Valid: 3846 -


Epoch 9 [44/45] | Valid Loss: 0.5190 | Elapse: 2.23s


Train [0]:   0%|          | 0/510 [00:00<?, ?batch/s]

Epoch 1 [0/510] | Train Loss: 1.1912 Grad: 60922.4375 LR: 4.0002e-06 | Elapse: 0.07s
Epoch 1 [100/510] | Train Loss: 1.1562 Grad: 67459.1172 LR: 6.3083e-06 | Elapse: 5.89s
Epoch 1 [200/510] | Train Loss: 1.1493 Grad: 79558.8359 LR: 1.2925e-05 | Elapse: 11.76s
Epoch 1 [300/510] | Train Loss: 1.1402 Grad: 57920.4609 LR: 2.3227e-05 | Elapse: 17.61s
Epoch 1 [400/510] | Train Loss: 1.1282 Grad: 55223.5273 LR: 3.6241e-05 | Elapse: 23.45s
Epoch 1 [500/510] | Train Loss: 1.1120 Grad: 44679.5508 LR: 5.0742e-05 | Elapse: 29.27s
Epoch 1 [509/510] | Train Loss: 1.1108 Grad: 65284.4102 LR: 5.2074e-05 | Elapse: 29.79s


Valid [0]:   0%|          | 0/121 [00:00<?, ?batch/s]

Epoch 1 [0/121] | Valid Loss: 1.1624 | Elapse: 0.06s
Epoch 1 [100/121] | Valid Loss: 1.2900 | Elapse: 5.01s


----------------------------------------------------------------------------------------------------
Epoch 1 - Average Loss: (train) 1.1108; (valid) 1.2859 | Time: 35.77s
Best model found in epoch 1 | valid loss: 1.2859


Epoch 1 [120/121] | Valid Loss: 1.2859 | Elapse: 5.97s


Train [1]:   0%|          | 0/510 [00:00<?, ?batch/s]

Epoch 2 [0/510] | Train Loss: 1.0882 Grad: 45621.6016 LR: 5.2222e-05 | Elapse: 0.06s
Epoch 2 [100/510] | Train Loss: 1.0158 Grad: 55301.7188 LR: 6.6777e-05 | Elapse: 5.87s
Epoch 2 [200/510] | Train Loss: 1.0119 Grad: 65561.6562 LR: 7.9938e-05 | Elapse: 11.70s
Epoch 2 [300/510] | Train Loss: 1.0017 Grad: 57075.3242 LR: 9.0464e-05 | Elapse: 17.52s
Epoch 2 [400/510] | Train Loss: 0.9879 Grad: 54085.3789 LR: 9.7364e-05 | Elapse: 23.33s
Epoch 2 [500/510] | Train Loss: 0.9740 Grad: 55806.6289 LR: 9.9985e-05 | Elapse: 29.13s
Epoch 2 [509/510] | Train Loss: 0.9733 Grad: 83727.1641 LR: 1.0000e-04 | Elapse: 29.66s


Valid [1]:   0%|          | 0/121 [00:00<?, ?batch/s]

Epoch 2 [0/121] | Valid Loss: 1.0870 | Elapse: 0.06s
Epoch 2 [100/121] | Valid Loss: 1.1592 | Elapse: 4.99s


----------------------------------------------------------------------------------------------------
Epoch 2 - Average Loss: (train) 0.9733; (valid) 1.1545 | Time: 35.60s
Best model found in epoch 2 | valid loss: 1.1545


Epoch 2 [120/121] | Valid Loss: 1.1545 | Elapse: 5.94s


Train [2]:   0%|          | 0/510 [00:00<?, ?batch/s]

Epoch 3 [0/510] | Train Loss: 1.0122 Grad: 63871.3359 LR: 1.0000e-04 | Elapse: 0.06s
Epoch 3 [100/510] | Train Loss: 0.8866 Grad: 123873.0625 LR: 9.9970e-05 | Elapse: 5.91s
Epoch 3 [200/510] | Train Loss: 0.8779 Grad: 128024.0469 LR: 9.9881e-05 | Elapse: 11.76s
Epoch 3 [300/510] | Train Loss: 0.8681 Grad: 119070.2891 LR: 9.9733e-05 | Elapse: 17.61s
Epoch 3 [400/510] | Train Loss: 0.8566 Grad: 83093.4453 LR: 9.9528e-05 | Elapse: 23.36s
Epoch 3 [500/510] | Train Loss: 0.8467 Grad: 76686.2969 LR: 9.9264e-05 | Elapse: 29.12s
Epoch 3 [509/510] | Train Loss: 0.8464 Grad: 122237.2344 LR: 9.9238e-05 | Elapse: 29.64s


Valid [2]:   0%|          | 0/121 [00:00<?, ?batch/s]

Epoch 3 [0/121] | Valid Loss: 1.0273 | Elapse: 0.05s
Epoch 3 [100/121] | Valid Loss: 1.0454 | Elapse: 4.93s


----------------------------------------------------------------------------------------------------
Epoch 3 - Average Loss: (train) 0.8464; (valid) 1.0380 | Time: 35.52s
Best model found in epoch 3 | valid loss: 1.0380


Epoch 3 [120/121] | Valid Loss: 1.0380 | Elapse: 5.88s


Train [3]:   0%|          | 0/510 [00:00<?, ?batch/s]

Epoch 4 [0/510] | Train Loss: 0.9136 Grad: 90044.3750 LR: 9.9235e-05 | Elapse: 0.06s
Epoch 4 [100/510] | Train Loss: 0.7850 Grad: 119312.9766 LR: 9.8908e-05 | Elapse: 5.82s
Epoch 4 [200/510] | Train Loss: 0.7820 Grad: 198884.6562 LR: 9.8524e-05 | Elapse: 11.57s
Epoch 4 [300/510] | Train Loss: 0.7772 Grad: 178877.9531 LR: 9.8083e-05 | Elapse: 17.33s
Epoch 4 [400/510] | Train Loss: 0.7699 Grad: 122933.9219 LR: 9.7585e-05 | Elapse: 23.08s
Epoch 4 [500/510] | Train Loss: 0.7625 Grad: 108803.6797 LR: 9.7032e-05 | Elapse: 28.84s
Epoch 4 [509/510] | Train Loss: 0.7622 Grad: 164645.3594 LR: 9.6980e-05 | Elapse: 29.36s


Valid [3]:   0%|          | 0/121 [00:00<?, ?batch/s]

Epoch 4 [0/121] | Valid Loss: 0.9820 | Elapse: 0.05s
Epoch 4 [100/121] | Valid Loss: 0.9644 | Elapse: 4.93s


----------------------------------------------------------------------------------------------------
Epoch 4 - Average Loss: (train) 0.7622; (valid) 0.9548 | Time: 35.23s
Best model found in epoch 4 | valid loss: 0.9548


Epoch 4 [120/121] | Valid Loss: 0.9548 | Elapse: 5.87s


Train [4]:   0%|          | 0/510 [00:00<?, ?batch/s]

Epoch 5 [0/510] | Train Loss: 0.8449 Grad: 119922.6094 LR: 9.6974e-05 | Elapse: 0.06s
Epoch 5 [100/510] | Train Loss: 0.7126 Grad: 146948.7500 LR: 9.6361e-05 | Elapse: 5.83s
Epoch 5 [200/510] | Train Loss: 0.7089 Grad: 201684.2188 LR: 9.5693e-05 | Elapse: 11.58s
Epoch 5 [300/510] | Train Loss: 0.7072 Grad: 205515.3750 LR: 9.4972e-05 | Elapse: 17.33s
Epoch 5 [400/510] | Train Loss: 0.7003 Grad: 167222.4062 LR: 9.4198e-05 | Elapse: 23.09s
Epoch 5 [500/510] | Train Loss: 0.6937 Grad: 155976.7188 LR: 9.3372e-05 | Elapse: 28.84s
Epoch 5 [509/510] | Train Loss: 0.6934 Grad: 187854.5469 LR: 9.3295e-05 | Elapse: 29.37s


Valid [4]:   0%|          | 0/121 [00:00<?, ?batch/s]

Epoch 5 [0/121] | Valid Loss: 0.9000 | Elapse: 0.05s
Epoch 5 [100/121] | Valid Loss: 0.9040 | Elapse: 4.93s


----------------------------------------------------------------------------------------------------
Epoch 5 - Average Loss: (train) 0.6934; (valid) 0.8934 | Time: 35.24s
Best model found in epoch 5 | valid loss: 0.8934


Epoch 5 [120/121] | Valid Loss: 0.8934 | Elapse: 5.87s


Train [5]:   0%|          | 0/510 [00:00<?, ?batch/s]

Epoch 6 [0/510] | Train Loss: 0.7605 Grad: 150443.5625 LR: 9.3287e-05 | Elapse: 0.06s
Epoch 6 [100/510] | Train Loss: 0.6400 Grad: 104186.1875 LR: 9.2405e-05 | Elapse: 5.80s
Epoch 6 [200/510] | Train Loss: 0.6382 Grad: 125514.0000 LR: 9.1474e-05 | Elapse: 11.54s
Epoch 6 [300/510] | Train Loss: 0.6382 Grad: 114495.9375 LR: 9.0495e-05 | Elapse: 17.28s
Epoch 6 [400/510] | Train Loss: 0.6314 Grad: 116677.0938 LR: 8.9468e-05 | Elapse: 22.93s
Epoch 6 [500/510] | Train Loss: 0.6262 Grad: 106457.0000 LR: 8.8395e-05 | Elapse: 28.68s
Epoch 6 [509/510] | Train Loss: 0.6260 Grad: 130048.4688 LR: 8.8296e-05 | Elapse: 29.19s


Valid [5]:   0%|          | 0/121 [00:00<?, ?batch/s]

Epoch 6 [0/121] | Valid Loss: 0.8482 | Elapse: 0.05s
Epoch 6 [100/121] | Valid Loss: 0.8497 | Elapse: 4.93s


----------------------------------------------------------------------------------------------------
Epoch 6 - Average Loss: (train) 0.6260; (valid) 0.8398 | Time: 35.06s
Best model found in epoch 6 | valid loss: 0.8398


Epoch 6 [120/121] | Valid Loss: 0.8398 | Elapse: 5.87s


Train [6]:   0%|          | 0/510 [00:00<?, ?batch/s]

Epoch 7 [0/510] | Train Loss: 0.6866 Grad: 186655.5625 LR: 8.8285e-05 | Elapse: 0.06s
Epoch 7 [100/510] | Train Loss: 0.5795 Grad: 251785.7344 LR: 8.7162e-05 | Elapse: 5.82s
Epoch 7 [200/510] | Train Loss: 0.5818 Grad: 276788.5625 LR: 8.5996e-05 | Elapse: 11.59s
Epoch 7 [300/510] | Train Loss: 0.5836 Grad: 264578.8750 LR: 8.4788e-05 | Elapse: 17.35s
Epoch 7 [400/510] | Train Loss: 0.5779 Grad: 248411.5469 LR: 8.3539e-05 | Elapse: 23.12s
Epoch 7 [500/510] | Train Loss: 0.5746 Grad: 258656.8125 LR: 8.2251e-05 | Elapse: 28.85s
Epoch 7 [509/510] | Train Loss: 0.5745 Grad: 292422.5625 LR: 8.2133e-05 | Elapse: 29.37s


Valid [6]:   0%|          | 0/121 [00:00<?, ?batch/s]

Epoch 7 [0/121] | Valid Loss: 0.8326 | Elapse: 0.05s
Epoch 7 [100/121] | Valid Loss: 0.8175 | Elapse: 4.92s


----------------------------------------------------------------------------------------------------
Epoch 7 - Average Loss: (train) 0.5745; (valid) 0.8072 | Time: 35.23s
Best model found in epoch 7 | valid loss: 0.8072


Epoch 7 [120/121] | Valid Loss: 0.8072 | Elapse: 5.86s


Train [7]:   0%|          | 0/510 [00:00<?, ?batch/s]

Epoch 8 [0/510] | Train Loss: 0.6245 Grad: 204376.7500 LR: 8.2120e-05 | Elapse: 0.06s
Epoch 8 [100/510] | Train Loss: 0.5372 Grad: 148932.5156 LR: 8.0791e-05 | Elapse: 5.81s
Epoch 8 [200/510] | Train Loss: 0.5413 Grad: 145995.0781 LR: 7.9425e-05 | Elapse: 11.56s
Epoch 8 [300/510] | Train Loss: 0.5436 Grad: 148303.7656 LR: 7.8025e-05 | Elapse: 17.30s
Epoch 8 [400/510] | Train Loss: 0.5384 Grad: 135781.6094 LR: 7.6592e-05 | Elapse: 23.06s
Epoch 8 [500/510] | Train Loss: 0.5366 Grad: 138623.3438 LR: 7.5128e-05 | Elapse: 28.82s
Epoch 8 [509/510] | Train Loss: 0.5365 Grad: 172017.5156 LR: 7.4995e-05 | Elapse: 29.34s


Valid [7]:   0%|          | 0/121 [00:00<?, ?batch/s]

Epoch 8 [0/121] | Valid Loss: 0.8177 | Elapse: 0.06s
Epoch 8 [100/121] | Valid Loss: 0.8000 | Elapse: 4.94s


----------------------------------------------------------------------------------------------------
Epoch 8 - Average Loss: (train) 0.5365; (valid) 0.7894 | Time: 35.22s
Best model found in epoch 8 | valid loss: 0.7894


Epoch 8 [120/121] | Valid Loss: 0.7894 | Elapse: 5.88s


Train [8]:   0%|          | 0/510 [00:00<?, ?batch/s]

Epoch 9 [0/510] | Train Loss: 0.5777 Grad: 212821.1094 LR: 7.4980e-05 | Elapse: 0.06s
Epoch 9 [100/510] | Train Loss: 0.5042 Grad: 355092.5938 LR: 7.3484e-05 | Elapse: 5.83s
Epoch 9 [200/510] | Train Loss: 0.5098 Grad: 350463.8750 LR: 7.1961e-05 | Elapse: 11.63s
Epoch 9 [300/510] | Train Loss: 0.5123 Grad: 156534.9375 LR: 7.0411e-05 | Elapse: 17.40s
Epoch 9 [400/510] | Train Loss: 0.5075 Grad: 142440.9375 LR: 6.8838e-05 | Elapse: 23.18s
Epoch 9 [500/510] | Train Loss: 0.5069 Grad: 144911.4688 LR: 6.7243e-05 | Elapse: 28.92s
Epoch 9 [509/510] | Train Loss: 0.5067 Grad: 187289.7812 LR: 6.7098e-05 | Elapse: 29.44s


Valid [8]:   0%|          | 0/121 [00:00<?, ?batch/s]

Epoch 9 [0/121] | Valid Loss: 0.8059 | Elapse: 0.05s
Epoch 9 [100/121] | Valid Loss: 0.7888 | Elapse: 4.92s


----------------------------------------------------------------------------------------------------
Epoch 9 - Average Loss: (train) 0.5067; (valid) 0.7775 | Time: 35.30s
Best model found in epoch 9 | valid loss: 0.7775


Epoch 9 [120/121] | Valid Loss: 0.7775 | Elapse: 5.86s


Train [9]:   0%|          | 0/510 [00:00<?, ?batch/s]

Epoch 10 [0/510] | Train Loss: 0.5503 Grad: 230878.6094 LR: 6.7082e-05 | Elapse: 0.06s
Epoch 10 [100/510] | Train Loss: 0.4776 Grad: 385637.0000 LR: 6.5465e-05 | Elapse: 5.83s
Epoch 10 [200/510] | Train Loss: 0.4843 Grad: 324759.2188 LR: 6.3829e-05 | Elapse: 11.63s
Epoch 10 [300/510] | Train Loss: 0.4866 Grad: 166641.9375 LR: 6.2178e-05 | Elapse: 17.51s
Epoch 10 [400/510] | Train Loss: 0.4822 Grad: 154544.2188 LR: 6.0512e-05 | Elapse: 23.36s
Epoch 10 [500/510] | Train Loss: 0.4820 Grad: 150641.3750 LR: 5.8834e-05 | Elapse: 29.23s
Epoch 10 [509/510] | Train Loss: 0.4819 Grad: 204075.5781 LR: 5.8682e-05 | Elapse: 29.75s


Valid [9]:   0%|          | 0/121 [00:00<?, ?batch/s]

Epoch 10 [0/121] | Valid Loss: 0.7973 | Elapse: 0.05s
Epoch 10 [100/121] | Valid Loss: 0.7825 | Elapse: 4.94s


----------------------------------------------------------------------------------------------------
Epoch 10 - Average Loss: (train) 0.4819; (valid) 0.7711 | Time: 35.64s
Best model found in epoch 10 | valid loss: 0.7711


Epoch 10 [120/121] | Valid Loss: 0.7711 | Elapse: 5.88s


Train [10]:   0%|          | 0/510 [00:00<?, ?batch/s]

Epoch 11 [0/510] | Train Loss: 0.5299 Grad: 270963.6875 LR: 5.8665e-05 | Elapse: 0.06s
Epoch 11 [100/510] | Train Loss: 0.4553 Grad: 399028.8750 LR: 5.6976e-05 | Elapse: 5.83s
Epoch 11 [200/510] | Train Loss: 0.4629 Grad: 351466.1875 LR: 5.5278e-05 | Elapse: 11.59s
Epoch 11 [300/510] | Train Loss: 0.4654 Grad: 370532.5625 LR: 5.3575e-05 | Elapse: 17.36s
Epoch 11 [400/510] | Train Loss: 0.4611 Grad: 339240.6875 LR: 5.1867e-05 | Elapse: 23.11s
Epoch 11 [500/510] | Train Loss: 0.4614 Grad: 310982.2500 LR: 5.0157e-05 | Elapse: 28.89s
Epoch 11 [509/510] | Train Loss: 0.4613 Grad: 465318.2188 LR: 5.0003e-05 | Elapse: 29.41s


Valid [10]:   0%|          | 0/121 [00:00<?, ?batch/s]

Epoch 11 [0/121] | Valid Loss: 0.7974 | Elapse: 0.05s
Epoch 11 [100/121] | Valid Loss: 0.7812 | Elapse: 4.99s


----------------------------------------------------------------------------------------------------
Epoch 11 - Average Loss: (train) 0.4613; (valid) 0.7695 | Time: 35.35s
Best model found in epoch 11 | valid loss: 0.7695


Epoch 11 [120/121] | Valid Loss: 0.7695 | Elapse: 5.94s


Train [11]:   0%|          | 0/510 [00:00<?, ?batch/s]

Epoch 12 [0/510] | Train Loss: 0.5183 Grad: 296958.2500 LR: 4.9986e-05 | Elapse: 0.06s
Epoch 12 [100/510] | Train Loss: 0.4368 Grad: 421843.0625 LR: 4.8276e-05 | Elapse: 5.84s
Epoch 12 [200/510] | Train Loss: 0.4452 Grad: 339342.3438 LR: 4.6568e-05 | Elapse: 11.60s
Epoch 12 [300/510] | Train Loss: 0.4474 Grad: 379254.3125 LR: 4.4864e-05 | Elapse: 17.36s
Epoch 12 [400/510] | Train Loss: 0.4434 Grad: 360490.8750 LR: 4.3166e-05 | Elapse: 23.13s
Epoch 12 [500/510] | Train Loss: 0.4438 Grad: 337286.7188 LR: 4.1476e-05 | Elapse: 28.91s
Epoch 12 [509/510] | Train Loss: 0.4437 Grad: 497610.5625 LR: 4.1324e-05 | Elapse: 29.43s


Valid [11]:   0%|          | 0/121 [00:00<?, ?batch/s]

Epoch 12 [0/121] | Valid Loss: 0.7899 | Elapse: 0.05s
Epoch 12 [100/121] | Valid Loss: 0.7812 | Elapse: 4.95s


----------------------------------------------------------------------------------------------------
Epoch 12 - Average Loss: (train) 0.4437; (valid) 0.7691 | Time: 35.33s
Best model found in epoch 12 | valid loss: 0.7691


Epoch 12 [120/121] | Valid Loss: 0.7691 | Elapse: 5.90s


Train [12]:   0%|          | 0/510 [00:00<?, ?batch/s]

Epoch 13 [0/510] | Train Loss: 0.5068 Grad: 299582.8750 LR: 4.1307e-05 | Elapse: 0.06s
Epoch 13 [100/510] | Train Loss: 0.4215 Grad: 422721.3125 LR: 3.9629e-05 | Elapse: 5.85s
Epoch 13 [200/510] | Train Loss: 0.4303 Grad: 319945.0938 LR: 3.7962e-05 | Elapse: 11.63s
Epoch 13 [300/510] | Train Loss: 0.4329 Grad: 408359.8125 LR: 3.6309e-05 | Elapse: 17.43s
Epoch 13 [400/510] | Train Loss: 0.4289 Grad: 360179.1875 LR: 3.4673e-05 | Elapse: 23.22s
Epoch 13 [500/510] | Train Loss: 0.4291 Grad: 359406.3438 LR: 3.3054e-05 | Elapse: 28.99s
Epoch 13 [509/510] | Train Loss: 0.4290 Grad: 493494.7188 LR: 3.2910e-05 | Elapse: 29.50s


Valid [12]:   0%|          | 0/121 [00:00<?, ?batch/s]

Epoch 13 [0/121] | Valid Loss: 0.7838 | Elapse: 0.05s
Epoch 13 [100/121] | Valid Loss: 0.7844 | Elapse: 4.95s


----------------------------------------------------------------------------------------------------
Epoch 13 - Average Loss: (train) 0.4290; (valid) 0.7721 | Time: 35.40s


Epoch 13 [120/121] | Valid Loss: 0.7721 | Elapse: 5.89s


Train [13]:   0%|          | 0/510 [00:00<?, ?batch/s]

Epoch 14 [0/510] | Train Loss: 0.4945 Grad: 282226.1250 LR: 3.2894e-05 | Elapse: 0.06s
Epoch 14 [100/510] | Train Loss: 0.4096 Grad: 427398.6875 LR: 3.1297e-05 | Elapse: 5.82s
Epoch 14 [200/510] | Train Loss: 0.4186 Grad: 418905.4062 LR: 2.9723e-05 | Elapse: 11.58s
Epoch 14 [300/510] | Train Loss: 0.4211 Grad: 441375.1562 LR: 2.8172e-05 | Elapse: 17.35s
Epoch 14 [400/510] | Train Loss: 0.4170 Grad: 354981.9688 LR: 2.6646e-05 | Elapse: 23.11s
Epoch 14 [500/510] | Train Loss: 0.4171 Grad: 368083.3750 LR: 2.5149e-05 | Elapse: 28.87s
Epoch 14 [509/510] | Train Loss: 0.4171 Grad: 530054.2500 LR: 2.5015e-05 | Elapse: 29.39s


Valid [13]:   0%|          | 0/121 [00:00<?, ?batch/s]

Epoch 14 [0/121] | Valid Loss: 0.7867 | Elapse: 0.05s
Epoch 14 [100/121] | Valid Loss: 0.7884 | Elapse: 4.93s


----------------------------------------------------------------------------------------------------
Epoch 14 - Average Loss: (train) 0.4171; (valid) 0.7764 | Time: 35.27s


Epoch 14 [120/121] | Valid Loss: 0.7764 | Elapse: 5.87s


Train [14]:   0%|          | 0/510 [00:00<?, ?batch/s]

Epoch 15 [0/510] | Train Loss: 0.4877 Grad: 307781.7500 LR: 2.5000e-05 | Elapse: 0.06s
Epoch 15 [100/510] | Train Loss: 0.3997 Grad: 408305.1875 LR: 2.3535e-05 | Elapse: 5.81s
Epoch 15 [200/510] | Train Loss: 0.4090 Grad: 453461.6875 LR: 2.2100e-05 | Elapse: 11.56s
Epoch 15 [300/510] | Train Loss: 0.4111 Grad: 447959.4062 LR: 2.0698e-05 | Elapse: 17.33s
Epoch 15 [400/510] | Train Loss: 0.4071 Grad: 348717.0000 LR: 1.9330e-05 | Elapse: 23.11s
Epoch 15 [500/510] | Train Loss: 0.4073 Grad: 388739.5000 LR: 1.7998e-05 | Elapse: 28.86s
Epoch 15 [509/510] | Train Loss: 0.4073 Grad: 617421.8125 LR: 1.7880e-05 | Elapse: 29.38s


Valid [14]:   0%|          | 0/121 [00:00<?, ?batch/s]

Epoch 15 [0/121] | Valid Loss: 0.7929 | Elapse: 0.05s
Epoch 15 [100/121] | Valid Loss: 0.7876 | Elapse: 4.93s


----------------------------------------------------------------------------------------------------
Epoch 15 - Average Loss: (train) 0.4073; (valid) 0.7754 | Time: 35.25s


Epoch 15 [120/121] | Valid Loss: 0.7754 | Elapse: 5.87s


Train [15]:   0%|          | 0/510 [00:00<?, ?batch/s]

Epoch 16 [0/510] | Train Loss: 0.4800 Grad: 336854.9375 LR: 1.7867e-05 | Elapse: 0.06s
Epoch 16 [100/510] | Train Loss: 0.3926 Grad: 412147.5938 LR: 1.6577e-05 | Elapse: 5.86s
Epoch 16 [200/510] | Train Loss: 0.4014 Grad: 387771.1250 LR: 1.5326e-05 | Elapse: 11.64s
Epoch 16 [300/510] | Train Loss: 0.4031 Grad: 427498.2188 LR: 1.4115e-05 | Elapse: 17.43s
Epoch 16 [400/510] | Train Loss: 0.3995 Grad: 339261.6875 LR: 1.2946e-05 | Elapse: 23.24s
Epoch 16 [500/510] | Train Loss: 0.3999 Grad: 402729.5000 LR: 1.1821e-05 | Elapse: 29.05s
Epoch 16 [509/510] | Train Loss: 0.3999 Grad: 684848.0625 LR: 1.1722e-05 | Elapse: 29.57s


Valid [15]:   0%|          | 0/121 [00:00<?, ?batch/s]

Epoch 16 [0/121] | Valid Loss: 0.7963 | Elapse: 0.05s
Epoch 16 [100/121] | Valid Loss: 0.7833 | Elapse: 4.94s


----------------------------------------------------------------------------------------------------
Epoch 16 - Average Loss: (train) 0.3999; (valid) 0.7708 | Time: 35.45s


Epoch 16 [120/121] | Valid Loss: 0.7708 | Elapse: 5.88s


Train [16]:   0%|          | 0/510 [00:00<?, ?batch/s]

Epoch 17 [0/510] | Train Loss: 0.4755 Grad: 385755.8750 LR: 1.1711e-05 | Elapse: 0.06s
Epoch 17 [100/510] | Train Loss: 0.3860 Grad: 423177.6875 LR: 1.0635e-05 | Elapse: 5.82s
Epoch 17 [200/510] | Train Loss: 0.3948 Grad: 412214.2188 LR: 9.6054e-06 | Elapse: 11.50s
Epoch 17 [300/510] | Train Loss: 0.3967 Grad: 434398.5938 LR: 8.6230e-06 | Elapse: 17.17s
Epoch 17 [400/510] | Train Loss: 0.3933 Grad: 346305.0000 LR: 7.6890e-06 | Elapse: 22.85s
Epoch 17 [500/510] | Train Loss: 0.3940 Grad: 404521.5938 LR: 6.8046e-06 | Elapse: 28.51s
Epoch 17 [509/510] | Train Loss: 0.3940 Grad: 657405.4375 LR: 6.7275e-06 | Elapse: 29.02s


Valid [16]:   0%|          | 0/121 [00:00<?, ?batch/s]

Epoch 17 [0/121] | Valid Loss: 0.7946 | Elapse: 0.05s
Epoch 17 [100/121] | Valid Loss: 0.7808 | Elapse: 4.92s


----------------------------------------------------------------------------------------------------
Epoch 17 - Average Loss: (train) 0.3940; (valid) 0.7683 | Time: 34.89s
Best model found in epoch 17 | valid loss: 0.7683


Epoch 17 [120/121] | Valid Loss: 0.7683 | Elapse: 5.86s


Train [17]:   0%|          | 0/510 [00:00<?, ?batch/s]

Epoch 18 [0/510] | Train Loss: 0.4732 Grad: 397394.7812 LR: 6.7190e-06 | Elapse: 0.06s
Epoch 18 [100/510] | Train Loss: 0.3803 Grad: 434960.5938 LR: 5.8903e-06 | Elapse: 5.76s
Epoch 18 [200/510] | Train Loss: 0.3892 Grad: 485821.6875 LR: 5.1133e-06 | Elapse: 11.48s
Epoch 18 [300/510] | Train Loss: 0.3917 Grad: 457358.5625 LR: 4.3889e-06 | Elapse: 17.21s
Epoch 18 [400/510] | Train Loss: 0.3885 Grad: 340633.0625 LR: 3.7179e-06 | Elapse: 22.95s
Epoch 18 [500/510] | Train Loss: 0.3893 Grad: 402321.2812 LR: 3.1011e-06 | Elapse: 28.73s
Epoch 18 [509/510] | Train Loss: 0.3893 Grad: 610717.3125 LR: 3.0483e-06 | Elapse: 29.25s


Valid [17]:   0%|          | 0/121 [00:00<?, ?batch/s]

Epoch 18 [0/121] | Valid Loss: 0.7895 | Elapse: 0.05s
Epoch 18 [100/121] | Valid Loss: 0.7795 | Elapse: 4.98s


----------------------------------------------------------------------------------------------------
Epoch 18 - Average Loss: (train) 0.3893; (valid) 0.7671 | Time: 35.18s
Best model found in epoch 18 | valid loss: 0.7671


Epoch 18 [120/121] | Valid Loss: 0.7671 | Elapse: 5.93s


Train [18]:   0%|          | 0/510 [00:00<?, ?batch/s]

Epoch 19 [0/510] | Train Loss: 0.4671 Grad: 372038.7500 LR: 3.0425e-06 | Elapse: 0.06s
Epoch 19 [100/510] | Train Loss: 0.3765 Grad: 435571.7812 LR: 2.4862e-06 | Elapse: 5.86s
Epoch 19 [200/510] | Train Loss: 0.3855 Grad: 679221.5625 LR: 1.9856e-06 | Elapse: 11.68s
Epoch 19 [300/510] | Train Loss: 0.3879 Grad: 473599.2188 LR: 1.5412e-06 | Elapse: 17.49s
Epoch 19 [400/510] | Train Loss: 0.3852 Grad: 341168.7500 LR: 1.1536e-06 | Elapse: 23.30s
Epoch 19 [500/510] | Train Loss: 0.3860 Grad: 396767.2188 LR: 8.2325e-07 | Elapse: 29.08s
Epoch 19 [509/510] | Train Loss: 0.3860 Grad: 581818.7500 LR: 7.9634e-07 | Elapse: 29.60s


Valid [18]:   0%|          | 0/121 [00:00<?, ?batch/s]

Epoch 19 [0/121] | Valid Loss: 0.7866 | Elapse: 0.05s
Epoch 19 [100/121] | Valid Loss: 0.7792 | Elapse: 4.92s


----------------------------------------------------------------------------------------------------
Epoch 19 - Average Loss: (train) 0.3860; (valid) 0.7670 | Time: 35.46s
Best model found in epoch 19 | valid loss: 0.7670


Epoch 19 [120/121] | Valid Loss: 0.7670 | Elapse: 5.86s


Train [19]:   0%|          | 0/510 [00:00<?, ?batch/s]

Epoch 20 [0/510] | Train Loss: 0.4640 Grad: 370561.9375 LR: 7.9338e-07 | Elapse: 0.06s
Epoch 20 [100/510] | Train Loss: 0.3741 Grad: 439385.0312 LR: 5.2640e-07 | Elapse: 5.83s
Epoch 20 [200/510] | Train Loss: 0.3833 Grad: 779277.4375 LR: 3.1738e-07 | Elapse: 11.63s
Epoch 20 [300/510] | Train Loss: 0.3857 Grad: 460105.1562 LR: 1.6657e-07 | Elapse: 17.43s
Epoch 20 [400/510] | Train Loss: 0.3832 Grad: 343395.4688 LR: 7.4133e-08 | Elapse: 23.23s
Epoch 20 [500/510] | Train Loss: 0.3843 Grad: 391990.2500 LR: 4.0187e-08 | Elapse: 29.06s
Epoch 20 [509/510] | Train Loss: 0.3842 Grad: 560095.3125 LR: 4.0003e-08 | Elapse: 29.58s


Valid [19]:   0%|          | 0/121 [00:00<?, ?batch/s]

Epoch 20 [0/121] | Valid Loss: 0.7844 | Elapse: 0.05s
Epoch 20 [100/121] | Valid Loss: 0.7789 | Elapse: 5.00s


----------------------------------------------------------------------------------------------------
Epoch 20 - Average Loss: (train) 0.3842; (valid) 0.7667 | Time: 35.54s
Best model found in epoch 20 | valid loss: 0.7667
Fold 3 Valid Loss: 0.7667206525802612
Elapse: 11.78 min 


Epoch 20 [120/121] | Valid Loss: 0.7667 | Elapse: 5.95s


- Stage 2 | Train: 5266; Valid: 1226 -


Loading model from checkpoint: outputs/ResnetGRU_v1_LB048_fold_3_stage_1.pth


Train [0]:   0%|          | 0/164 [00:00<?, ?batch/s]

Epoch 1 [0/164] | Train Loss: 0.2902 Grad: 535465.8750 LR: 4.0022e-06 | Elapse: 0.07s
Epoch 1 [100/164] | Train Loss: 0.3348 Grad: 236197.8438 LR: 2.4879e-05 | Elapse: 5.91s
Epoch 1 [163/164] | Train Loss: 0.3253 Grad: 150339.5312 LR: 5.2231e-05 | Elapse: 9.57s


Valid [0]:   0%|          | 0/39 [00:00<?, ?batch/s]

Epoch 1 [0/39] | Valid Loss: 0.5833 | Elapse: 0.06s


----------------------------------------------------------------------------------------------------
Epoch 1 - Average Loss: (train) 0.3253; (valid) 0.5017 | Time: 11.48s
Best model found in epoch 1 | valid loss: 0.5017


Epoch 1 [38/39] | Valid Loss: 0.5017 | Elapse: 1.91s


Train [1]:   0%|          | 0/164 [00:00<?, ?batch/s]

Epoch 2 [0/164] | Train Loss: 0.2407 Grad: 434919.9375 LR: 5.2692e-05 | Elapse: 0.06s
Epoch 2 [100/164] | Train Loss: 0.2711 Grad: 143971.1406 LR: 9.1734e-05 | Elapse: 5.87s
Epoch 2 [163/164] | Train Loss: 0.2680 Grad: 154434.4688 LR: 1.0000e-04 | Elapse: 9.53s


Valid [1]:   0%|          | 0/39 [00:00<?, ?batch/s]

Epoch 2 [0/39] | Valid Loss: 0.5289 | Elapse: 0.06s


----------------------------------------------------------------------------------------------------
Epoch 2 - Average Loss: (train) 0.2680; (valid) 0.4705 | Time: 11.42s
Best model found in epoch 2 | valid loss: 0.4705


Epoch 2 [38/39] | Valid Loss: 0.4705 | Elapse: 1.89s


Train [2]:   0%|          | 0/164 [00:00<?, ?batch/s]

Epoch 3 [0/164] | Train Loss: 0.1902 Grad: 342110.3438 LR: 1.0000e-04 | Elapse: 0.06s
Epoch 3 [100/164] | Train Loss: 0.2309 Grad: 272212.4688 LR: 9.9706e-05 | Elapse: 5.85s
Epoch 3 [163/164] | Train Loss: 0.2296 Grad: 278964.7812 LR: 9.9231e-05 | Elapse: 9.50s


Valid [2]:   0%|          | 0/39 [00:00<?, ?batch/s]

Epoch 3 [0/39] | Valid Loss: 0.5107 | Elapse: 0.05s


----------------------------------------------------------------------------------------------------
Epoch 3 - Average Loss: (train) 0.2296; (valid) 0.4680 | Time: 11.40s
Best model found in epoch 3 | valid loss: 0.4680


Epoch 3 [38/39] | Valid Loss: 0.4680 | Elapse: 1.89s


Train [3]:   0%|          | 0/164 [00:00<?, ?batch/s]

Epoch 4 [0/164] | Train Loss: 0.1692 Grad: 324733.9375 LR: 9.9222e-05 | Elapse: 0.06s
Epoch 4 [100/164] | Train Loss: 0.2030 Grad: 243679.7031 LR: 9.8011e-05 | Elapse: 5.85s
Epoch 4 [163/164] | Train Loss: 0.2037 Grad: 282995.0312 LR: 9.6968e-05 | Elapse: 9.51s


Valid [3]:   0%|          | 0/39 [00:00<?, ?batch/s]

Epoch 4 [0/39] | Valid Loss: 0.4989 | Elapse: 0.06s


----------------------------------------------------------------------------------------------------
Epoch 4 - Average Loss: (train) 0.2037; (valid) 0.4690 | Time: 11.43s


Epoch 4 [38/39] | Valid Loss: 0.4690 | Elapse: 1.92s


Train [4]:   0%|          | 0/164 [00:00<?, ?batch/s]

Epoch 5 [0/164] | Train Loss: 0.1486 Grad: 307473.4375 LR: 9.6949e-05 | Elapse: 0.06s
Epoch 5 [100/164] | Train Loss: 0.1843 Grad: 250811.2969 LR: 9.4857e-05 | Elapse: 5.91s
Epoch 5 [163/164] | Train Loss: 0.1854 Grad: 291252.5938 LR: 9.3277e-05 | Elapse: 9.57s


Valid [4]:   0%|          | 0/39 [00:00<?, ?batch/s]

Epoch 5 [0/39] | Valid Loss: 0.4943 | Elapse: 0.05s


----------------------------------------------------------------------------------------------------
Epoch 5 - Average Loss: (train) 0.1854; (valid) 0.4725 | Time: 11.47s


Epoch 5 [38/39] | Valid Loss: 0.4725 | Elapse: 1.89s


Train [5]:   0%|          | 0/164 [00:00<?, ?batch/s]

Epoch 6 [0/164] | Train Loss: 0.1280 Grad: 305145.1250 LR: 9.3251e-05 | Elapse: 0.06s
Epoch 6 [100/164] | Train Loss: 0.1686 Grad: 224907.6719 LR: 9.0342e-05 | Elapse: 5.84s
Epoch 6 [163/164] | Train Loss: 0.1698 Grad: 300902.5312 LR: 8.8273e-05 | Elapse: 9.49s


Valid [5]:   0%|          | 0/39 [00:00<?, ?batch/s]

Epoch 6 [0/39] | Valid Loss: 0.4879 | Elapse: 0.05s


----------------------------------------------------------------------------------------------------
Epoch 6 - Average Loss: (train) 0.1698; (valid) 0.4752 | Time: 11.39s


Epoch 6 [38/39] | Valid Loss: 0.4752 | Elapse: 1.89s


Train [6]:   0%|          | 0/164 [00:00<?, ?batch/s]

Epoch 7 [0/164] | Train Loss: 0.1121 Grad: 290608.6562 LR: 8.8238e-05 | Elapse: 0.06s
Epoch 7 [100/164] | Train Loss: 0.1554 Grad: 327105.3750 LR: 8.4601e-05 | Elapse: 5.87s
Epoch 7 [163/164] | Train Loss: 0.1568 Grad: 293169.8125 LR: 8.2106e-05 | Elapse: 9.53s


Valid [6]:   0%|          | 0/39 [00:00<?, ?batch/s]

Epoch 7 [0/39] | Valid Loss: 0.4880 | Elapse: 0.05s


----------------------------------------------------------------------------------------------------
Epoch 7 - Average Loss: (train) 0.1568; (valid) 0.4813 | Time: 11.43s


Epoch 7 [38/39] | Valid Loss: 0.4813 | Elapse: 1.89s


Train [7]:   0%|          | 0/164 [00:00<?, ?batch/s]

Epoch 8 [0/164] | Train Loss: 0.0978 Grad: 284911.6250 LR: 8.2065e-05 | Elapse: 0.06s
Epoch 8 [100/164] | Train Loss: 0.1439 Grad: 268008.1875 LR: 7.7810e-05 | Elapse: 5.88s
Epoch 8 [163/164] | Train Loss: 0.1454 Grad: 323816.1250 LR: 7.4964e-05 | Elapse: 9.56s


Valid [7]:   0%|          | 0/39 [00:00<?, ?batch/s]

Epoch 8 [0/39] | Valid Loss: 0.4980 | Elapse: 0.05s


----------------------------------------------------------------------------------------------------
Epoch 8 - Average Loss: (train) 0.1454; (valid) 0.4823 | Time: 11.45s
Early stopping at epoch 8
Fold 3 Valid Loss: 0.46801629662513733
Elapse: 1.53 min 
Fold 3 Elapse: 13.31 min
Fold: 4
- Stage 1 | Train: 15844; Valid: 4339 -


Epoch 8 [38/39] | Valid Loss: 0.4823 | Elapse: 1.89s


Train [0]:   0%|          | 0/495 [00:00<?, ?batch/s]

Epoch 1 [0/495] | Train Loss: 1.1378 Grad: 70959.9141 LR: 4.0002e-06 | Elapse: 0.07s
Epoch 1 [100/495] | Train Loss: 1.1913 Grad: 61949.5859 LR: 6.4492e-06 | Elapse: 5.86s
Epoch 1 [200/495] | Train Loss: 1.1947 Grad: 78135.1406 LR: 1.3456e-05 | Elapse: 11.63s
Epoch 1 [300/495] | Train Loss: 1.1840 Grad: 85308.5859 LR: 2.4319e-05 | Elapse: 17.36s
Epoch 1 [400/495] | Train Loss: 1.1695 Grad: 91638.7500 LR: 3.7952e-05 | Elapse: 23.08s
Epoch 1 [494/495] | Train Loss: 1.1498 Grad: 58506.9141 LR: 5.2076e-05 | Elapse: 28.47s


Valid [0]:   0%|          | 0/136 [00:00<?, ?batch/s]

Epoch 1 [0/136] | Valid Loss: 1.3279 | Elapse: 0.05s
Epoch 1 [100/136] | Valid Loss: 1.3359 | Elapse: 5.00s


----------------------------------------------------------------------------------------------------
Epoch 1 - Average Loss: (train) 1.1498; (valid) 1.3308 | Time: 35.20s
Best model found in epoch 1 | valid loss: 1.3308


Epoch 1 [135/136] | Valid Loss: 1.3308 | Elapse: 6.73s


Train [1]:   0%|          | 0/495 [00:00<?, ?batch/s]

Epoch 2 [0/495] | Train Loss: 0.9929 Grad: 46580.5859 LR: 5.2229e-05 | Elapse: 0.06s
Epoch 2 [100/495] | Train Loss: 1.0253 Grad: 52452.1836 LR: 6.7209e-05 | Elapse: 5.87s
Epoch 2 [200/495] | Train Loss: 1.0213 Grad: 61218.9922 LR: 8.0668e-05 | Elapse: 11.67s
Epoch 2 [300/495] | Train Loss: 1.0113 Grad: 70403.7578 LR: 9.1258e-05 | Elapse: 17.48s
Epoch 2 [400/495] | Train Loss: 0.9999 Grad: 113860.7266 LR: 9.7921e-05 | Elapse: 23.29s
Epoch 2 [494/495] | Train Loss: 0.9851 Grad: 86356.7578 LR: 1.0000e-04 | Elapse: 28.77s


Valid [1]:   0%|          | 0/136 [00:00<?, ?batch/s]

Epoch 2 [0/136] | Valid Loss: 1.1792 | Elapse: 0.06s
Epoch 2 [100/136] | Valid Loss: 1.1962 | Elapse: 5.00s


----------------------------------------------------------------------------------------------------
Epoch 2 - Average Loss: (train) 0.9851; (valid) 1.1905 | Time: 35.48s
Best model found in epoch 2 | valid loss: 1.1905


Epoch 2 [135/136] | Valid Loss: 1.1905 | Elapse: 6.70s


Train [2]:   0%|          | 0/495 [00:00<?, ?batch/s]

Epoch 3 [0/495] | Train Loss: 0.9398 Grad: 60034.8203 LR: 1.0000e-04 | Elapse: 0.06s
Epoch 3 [100/495] | Train Loss: 0.9006 Grad: 82733.0938 LR: 9.9968e-05 | Elapse: 5.77s
Epoch 3 [200/495] | Train Loss: 0.8941 Grad: 70377.2188 LR: 9.9873e-05 | Elapse: 11.54s
Epoch 3 [300/495] | Train Loss: 0.8781 Grad: 119018.0938 LR: 9.9717e-05 | Elapse: 17.31s
Epoch 3 [400/495] | Train Loss: 0.8644 Grad: 180631.2500 LR: 9.9499e-05 | Elapse: 23.08s
Epoch 3 [494/495] | Train Loss: 0.8488 Grad: 144917.6562 LR: 9.9238e-05 | Elapse: 28.51s


Valid [2]:   0%|          | 0/136 [00:00<?, ?batch/s]

Epoch 3 [0/136] | Valid Loss: 1.0467 | Elapse: 0.05s
Epoch 3 [100/136] | Valid Loss: 1.0552 | Elapse: 4.93s


----------------------------------------------------------------------------------------------------
Epoch 3 - Average Loss: (train) 0.8488; (valid) 1.0521 | Time: 35.13s
Best model found in epoch 3 | valid loss: 1.0521


Epoch 3 [135/136] | Valid Loss: 1.0521 | Elapse: 6.62s


Train [3]:   0%|          | 0/495 [00:00<?, ?batch/s]

Epoch 4 [0/495] | Train Loss: 0.7873 Grad: 91446.3203 LR: 9.9235e-05 | Elapse: 0.06s
Epoch 4 [100/495] | Train Loss: 0.7704 Grad: 134947.4375 LR: 9.8897e-05 | Elapse: 5.83s
Epoch 4 [200/495] | Train Loss: 0.7695 Grad: 131077.9375 LR: 9.8498e-05 | Elapse: 11.61s
Epoch 4 [300/495] | Train Loss: 0.7595 Grad: 182545.6875 LR: 9.8039e-05 | Elapse: 17.39s
Epoch 4 [400/495] | Train Loss: 0.7514 Grad: 145455.5469 LR: 9.7521e-05 | Elapse: 23.17s
Epoch 4 [494/495] | Train Loss: 0.7391 Grad: 198775.9688 LR: 9.6980e-05 | Elapse: 28.60s


Valid [3]:   0%|          | 0/136 [00:00<?, ?batch/s]

Epoch 4 [0/136] | Valid Loss: 0.9027 | Elapse: 0.06s
Epoch 4 [100/136] | Valid Loss: 0.9600 | Elapse: 4.93s


----------------------------------------------------------------------------------------------------
Epoch 4 - Average Loss: (train) 0.7391; (valid) 0.9598 | Time: 35.23s
Best model found in epoch 4 | valid loss: 0.9598


Epoch 4 [135/136] | Valid Loss: 0.9598 | Elapse: 6.62s


Train [4]:   0%|          | 0/495 [00:00<?, ?batch/s]

Epoch 5 [0/495] | Train Loss: 0.6659 Grad: 132883.4375 LR: 9.6974e-05 | Elapse: 0.06s
Epoch 5 [100/495] | Train Loss: 0.6659 Grad: 160521.1250 LR: 9.6341e-05 | Elapse: 5.89s
Epoch 5 [200/495] | Train Loss: 0.6683 Grad: 183549.7031 LR: 9.5650e-05 | Elapse: 11.70s
Epoch 5 [300/495] | Train Loss: 0.6618 Grad: 182059.4062 LR: 9.4903e-05 | Elapse: 17.49s
Epoch 5 [400/495] | Train Loss: 0.6569 Grad: 155586.5000 LR: 9.4100e-05 | Elapse: 23.29s
Epoch 5 [494/495] | Train Loss: 0.6479 Grad: 248058.7344 LR: 9.3295e-05 | Elapse: 28.74s


Valid [4]:   0%|          | 0/136 [00:00<?, ?batch/s]

Epoch 5 [0/136] | Valid Loss: 0.8227 | Elapse: 0.05s
Epoch 5 [100/136] | Valid Loss: 0.8604 | Elapse: 4.99s


----------------------------------------------------------------------------------------------------
Epoch 5 - Average Loss: (train) 0.6479; (valid) 0.8633 | Time: 35.44s
Best model found in epoch 5 | valid loss: 0.8633


Epoch 5 [135/136] | Valid Loss: 0.8633 | Elapse: 6.70s


Train [5]:   0%|          | 0/495 [00:00<?, ?batch/s]

Epoch 6 [0/495] | Train Loss: 0.5741 Grad: 167056.0781 LR: 9.3286e-05 | Elapse: 0.06s
Epoch 6 [100/495] | Train Loss: 0.5872 Grad: 217529.7812 LR: 9.2377e-05 | Elapse: 5.88s
Epoch 6 [200/495] | Train Loss: 0.5946 Grad: 274266.7500 LR: 9.1416e-05 | Elapse: 11.70s
Epoch 6 [300/495] | Train Loss: 0.5931 Grad: 208260.8906 LR: 9.0403e-05 | Elapse: 17.52s
Epoch 6 [400/495] | Train Loss: 0.5915 Grad: 197804.2969 LR: 8.9340e-05 | Elapse: 23.34s
Epoch 6 [494/495] | Train Loss: 0.5855 Grad: 291614.0312 LR: 8.8296e-05 | Elapse: 28.81s


Valid [5]:   0%|          | 0/136 [00:00<?, ?batch/s]

Epoch 6 [0/136] | Valid Loss: 0.7574 | Elapse: 0.05s
Epoch 6 [100/136] | Valid Loss: 0.8207 | Elapse: 4.99s


----------------------------------------------------------------------------------------------------
Epoch 6 - Average Loss: (train) 0.5855; (valid) 0.8233 | Time: 35.52s
Best model found in epoch 6 | valid loss: 0.8233


Epoch 6 [135/136] | Valid Loss: 0.8233 | Elapse: 6.70s


Train [6]:   0%|          | 0/495 [00:00<?, ?batch/s]

Epoch 7 [0/495] | Train Loss: 0.5294 Grad: 185826.6406 LR: 8.8284e-05 | Elapse: 0.06s
Epoch 7 [100/495] | Train Loss: 0.5427 Grad: 254067.0938 LR: 8.7127e-05 | Elapse: 5.86s
Epoch 7 [200/495] | Train Loss: 0.5501 Grad: 282778.0312 LR: 8.5924e-05 | Elapse: 11.63s
Epoch 7 [300/495] | Train Loss: 0.5503 Grad: 238189.9688 LR: 8.4676e-05 | Elapse: 17.38s
Epoch 7 [400/495] | Train Loss: 0.5497 Grad: 236894.5781 LR: 8.3384e-05 | Elapse: 23.14s
Epoch 7 [494/495] | Train Loss: 0.5445 Grad: 350609.3125 LR: 8.2133e-05 | Elapse: 28.55s


Valid [6]:   0%|          | 0/136 [00:00<?, ?batch/s]

Epoch 7 [0/136] | Valid Loss: 0.7049 | Elapse: 0.06s
Epoch 7 [100/136] | Valid Loss: 0.7940 | Elapse: 4.95s


----------------------------------------------------------------------------------------------------
Epoch 7 - Average Loss: (train) 0.5445; (valid) 0.7962 | Time: 35.20s
Best model found in epoch 7 | valid loss: 0.7962


Epoch 7 [135/136] | Valid Loss: 0.7962 | Elapse: 6.64s


Train [7]:   0%|          | 0/495 [00:00<?, ?batch/s]

Epoch 8 [0/495] | Train Loss: 0.4968 Grad: 233954.6094 LR: 8.2120e-05 | Elapse: 0.06s
Epoch 8 [100/495] | Train Loss: 0.5076 Grad: 293312.0625 LR: 8.0749e-05 | Elapse: 5.82s
Epoch 8 [200/495] | Train Loss: 0.5162 Grad: 296192.0312 LR: 7.9340e-05 | Elapse: 11.59s
Epoch 8 [300/495] | Train Loss: 0.5172 Grad: 262805.1562 LR: 7.7895e-05 | Elapse: 17.36s
Epoch 8 [400/495] | Train Loss: 0.5169 Grad: 299510.4688 LR: 7.6416e-05 | Elapse: 23.13s
Epoch 8 [494/495] | Train Loss: 0.5123 Grad: 387731.9688 LR: 7.4995e-05 | Elapse: 28.57s


Valid [7]:   0%|          | 0/136 [00:00<?, ?batch/s]

Epoch 8 [0/136] | Valid Loss: 0.6906 | Elapse: 0.05s
Epoch 8 [100/136] | Valid Loss: 0.7784 | Elapse: 4.95s


----------------------------------------------------------------------------------------------------
Epoch 8 - Average Loss: (train) 0.5123; (valid) 0.7805 | Time: 35.22s
Best model found in epoch 8 | valid loss: 0.7805


Epoch 8 [135/136] | Valid Loss: 0.7805 | Elapse: 6.64s


Train [8]:   0%|          | 0/495 [00:00<?, ?batch/s]

Epoch 9 [0/495] | Train Loss: 0.4744 Grad: 280279.3125 LR: 7.4979e-05 | Elapse: 0.06s
Epoch 9 [100/495] | Train Loss: 0.4783 Grad: 310649.5000 LR: 7.3437e-05 | Elapse: 5.84s
Epoch 9 [200/495] | Train Loss: 0.4869 Grad: 311071.8125 LR: 7.1866e-05 | Elapse: 11.61s
Epoch 9 [300/495] | Train Loss: 0.4884 Grad: 282645.6562 LR: 7.0268e-05 | Elapse: 17.38s
Epoch 9 [400/495] | Train Loss: 0.4884 Grad: 344117.6562 LR: 6.8645e-05 | Elapse: 23.16s
Epoch 9 [494/495] | Train Loss: 0.4843 Grad: 428960.3750 LR: 6.7098e-05 | Elapse: 28.57s


Valid [8]:   0%|          | 0/136 [00:00<?, ?batch/s]

Epoch 9 [0/136] | Valid Loss: 0.6764 | Elapse: 0.05s
Epoch 9 [100/136] | Valid Loss: 0.7692 | Elapse: 4.94s


----------------------------------------------------------------------------------------------------
Epoch 9 - Average Loss: (train) 0.4843; (valid) 0.7704 | Time: 35.22s
Best model found in epoch 9 | valid loss: 0.7704


Epoch 9 [135/136] | Valid Loss: 0.7704 | Elapse: 6.64s


Train [9]:   0%|          | 0/495 [00:00<?, ?batch/s]

Epoch 10 [0/495] | Train Loss: 0.4511 Grad: 321741.6875 LR: 6.7081e-05 | Elapse: 0.06s
Epoch 10 [100/495] | Train Loss: 0.4527 Grad: 307429.1250 LR: 6.5414e-05 | Elapse: 5.83s
Epoch 10 [200/495] | Train Loss: 0.4618 Grad: 304311.7812 LR: 6.3729e-05 | Elapse: 11.61s
Epoch 10 [300/495] | Train Loss: 0.4639 Grad: 311464.1875 LR: 6.2026e-05 | Elapse: 17.43s
Epoch 10 [400/495] | Train Loss: 0.4643 Grad: 381521.3125 LR: 6.0308e-05 | Elapse: 23.21s
Epoch 10 [494/495] | Train Loss: 0.4602 Grad: 424881.9688 LR: 5.8682e-05 | Elapse: 28.65s


Valid [9]:   0%|          | 0/136 [00:00<?, ?batch/s]

Epoch 10 [0/136] | Valid Loss: 0.6807 | Elapse: 0.05s
Epoch 10 [100/136] | Valid Loss: 0.7611 | Elapse: 4.96s


----------------------------------------------------------------------------------------------------
Epoch 10 - Average Loss: (train) 0.4602; (valid) 0.7625 | Time: 35.31s
Best model found in epoch 10 | valid loss: 0.7625


Epoch 10 [135/136] | Valid Loss: 0.7625 | Elapse: 6.66s


Train [10]:   0%|          | 0/495 [00:00<?, ?batch/s]

Epoch 11 [0/495] | Train Loss: 0.4380 Grad: 373436.0000 LR: 5.8664e-05 | Elapse: 0.06s
Epoch 11 [100/495] | Train Loss: 0.4328 Grad: 326146.3438 LR: 5.6924e-05 | Elapse: 5.82s
Epoch 11 [200/495] | Train Loss: 0.4410 Grad: 312188.2812 LR: 5.5174e-05 | Elapse: 11.62s
Epoch 11 [300/495] | Train Loss: 0.4434 Grad: 354561.5312 LR: 5.3419e-05 | Elapse: 17.39s
Epoch 11 [400/495] | Train Loss: 0.4440 Grad: 423039.9062 LR: 5.1659e-05 | Elapse: 23.16s
Epoch 11 [494/495] | Train Loss: 0.4397 Grad: 453616.9062 LR: 5.0002e-05 | Elapse: 28.57s


Valid [10]:   0%|          | 0/136 [00:00<?, ?batch/s]

Epoch 11 [0/136] | Valid Loss: 0.7062 | Elapse: 0.05s
Epoch 11 [100/136] | Valid Loss: 0.7568 | Elapse: 4.93s


----------------------------------------------------------------------------------------------------
Epoch 11 - Average Loss: (train) 0.4397; (valid) 0.7587 | Time: 35.19s
Best model found in epoch 11 | valid loss: 0.7587


Epoch 11 [135/136] | Valid Loss: 0.7587 | Elapse: 6.62s


Train [11]:   0%|          | 0/495 [00:00<?, ?batch/s]

Epoch 12 [0/495] | Train Loss: 0.4347 Grad: 412531.7812 LR: 4.9985e-05 | Elapse: 0.06s
Epoch 12 [100/495] | Train Loss: 0.4149 Grad: 339387.3438 LR: 4.8223e-05 | Elapse: 5.81s
Epoch 12 [200/495] | Train Loss: 0.4232 Grad: 314717.7188 LR: 4.6463e-05 | Elapse: 11.40s
Epoch 12 [300/495] | Train Loss: 0.4259 Grad: 402967.3750 LR: 4.4708e-05 | Elapse: 16.99s
Epoch 12 [400/495] | Train Loss: 0.4270 Grad: 465792.3438 LR: 4.2959e-05 | Elapse: 22.76s
Epoch 12 [494/495] | Train Loss: 0.4228 Grad: 393615.2500 LR: 4.1324e-05 | Elapse: 28.09s


Valid [11]:   0%|          | 0/136 [00:00<?, ?batch/s]

Epoch 12 [0/136] | Valid Loss: 0.7248 | Elapse: 0.05s
Epoch 12 [100/136] | Valid Loss: 0.7573 | Elapse: 4.94s


----------------------------------------------------------------------------------------------------
Epoch 12 - Average Loss: (train) 0.4228; (valid) 0.7591 | Time: 34.73s


Epoch 12 [135/136] | Valid Loss: 0.7591 | Elapse: 6.63s


Train [12]:   0%|          | 0/495 [00:00<?, ?batch/s]

Epoch 13 [0/495] | Train Loss: 0.4251 Grad: 442117.0312 LR: 4.1306e-05 | Elapse: 0.06s
Epoch 13 [100/495] | Train Loss: 0.4005 Grad: 346860.8125 LR: 3.9577e-05 | Elapse: 5.76s
Epoch 13 [200/495] | Train Loss: 0.4084 Grad: 321197.3438 LR: 3.7860e-05 | Elapse: 11.44s
Epoch 13 [300/495] | Train Loss: 0.4109 Grad: 386141.8125 LR: 3.6159e-05 | Elapse: 17.11s
Epoch 13 [400/495] | Train Loss: 0.4120 Grad: 481603.5938 LR: 3.4475e-05 | Elapse: 22.79s
Epoch 13 [494/495] | Train Loss: 0.4080 Grad: 417299.2812 LR: 3.2909e-05 | Elapse: 28.13s


Valid [12]:   0%|          | 0/136 [00:00<?, ?batch/s]

Epoch 13 [0/136] | Valid Loss: 0.7408 | Elapse: 0.05s
Epoch 13 [100/136] | Valid Loss: 0.7608 | Elapse: 4.94s


----------------------------------------------------------------------------------------------------
Epoch 13 - Average Loss: (train) 0.4080; (valid) 0.7629 | Time: 34.77s


Epoch 13 [135/136] | Valid Loss: 0.7629 | Elapse: 6.63s


Train [13]:   0%|          | 0/495 [00:00<?, ?batch/s]

Epoch 14 [0/495] | Train Loss: 0.4109 Grad: 451507.1250 LR: 3.2893e-05 | Elapse: 0.06s
Epoch 14 [100/495] | Train Loss: 0.3883 Grad: 357327.1562 LR: 3.1248e-05 | Elapse: 5.84s
Epoch 14 [200/495] | Train Loss: 0.3958 Grad: 333830.5625 LR: 2.9627e-05 | Elapse: 11.61s
Epoch 14 [300/495] | Train Loss: 0.3986 Grad: 418684.2812 LR: 2.8031e-05 | Elapse: 17.40s
Epoch 14 [400/495] | Train Loss: 0.4002 Grad: 491880.7812 LR: 2.6463e-05 | Elapse: 23.17s
Epoch 14 [494/495] | Train Loss: 0.3965 Grad: 415571.1250 LR: 2.5015e-05 | Elapse: 28.60s


Valid [13]:   0%|          | 0/136 [00:00<?, ?batch/s]

Epoch 14 [0/136] | Valid Loss: 0.7317 | Elapse: 0.05s
Epoch 14 [100/136] | Valid Loss: 0.7639 | Elapse: 4.93s


----------------------------------------------------------------------------------------------------
Epoch 14 - Average Loss: (train) 0.3965; (valid) 0.7656 | Time: 35.22s


Epoch 14 [135/136] | Valid Loss: 0.7656 | Elapse: 6.62s


Train [14]:   0%|          | 0/495 [00:00<?, ?batch/s]

Epoch 15 [0/495] | Train Loss: 0.3939 Grad: 456592.9688 LR: 2.4999e-05 | Elapse: 0.06s
Epoch 15 [100/495] | Train Loss: 0.3790 Grad: 350363.1875 LR: 2.3490e-05 | Elapse: 5.85s
Epoch 15 [200/495] | Train Loss: 0.3854 Grad: 353319.6250 LR: 2.2013e-05 | Elapse: 11.64s
Epoch 15 [300/495] | Train Loss: 0.3887 Grad: 517433.7188 LR: 2.0571e-05 | Elapse: 17.41s
Epoch 15 [400/495] | Train Loss: 0.3910 Grad: 521955.9375 LR: 1.9166e-05 | Elapse: 23.17s
Epoch 15 [494/495] | Train Loss: 0.3877 Grad: 463358.0625 LR: 1.7880e-05 | Elapse: 28.63s


Valid [14]:   0%|          | 0/136 [00:00<?, ?batch/s]

Epoch 15 [0/136] | Valid Loss: 0.7327 | Elapse: 0.05s
Epoch 15 [100/136] | Valid Loss: 0.7682 | Elapse: 4.97s


----------------------------------------------------------------------------------------------------
Epoch 15 - Average Loss: (train) 0.3877; (valid) 0.7695 | Time: 35.30s


Epoch 15 [135/136] | Valid Loss: 0.7695 | Elapse: 6.66s


Train [15]:   0%|          | 0/495 [00:00<?, ?batch/s]

Epoch 16 [0/495] | Train Loss: 0.3827 Grad: 477873.0000 LR: 1.7866e-05 | Elapse: 0.06s
Epoch 16 [100/495] | Train Loss: 0.3715 Grad: 354409.0625 LR: 1.6538e-05 | Elapse: 5.85s
Epoch 16 [200/495] | Train Loss: 0.3772 Grad: 395348.2812 LR: 1.5250e-05 | Elapse: 11.63s
Epoch 16 [300/495] | Train Loss: 0.3803 Grad: 552057.3750 LR: 1.4006e-05 | Elapse: 17.41s
Epoch 16 [400/495] | Train Loss: 0.3832 Grad: 278614.6250 LR: 1.2807e-05 | Elapse: 23.19s
Epoch 16 [494/495] | Train Loss: 0.3804 Grad: 241199.2188 LR: 1.1722e-05 | Elapse: 28.62s


Valid [15]:   0%|          | 0/136 [00:00<?, ?batch/s]

Epoch 16 [0/136] | Valid Loss: 0.7381 | Elapse: 0.06s
Epoch 16 [100/136] | Valid Loss: 0.7729 | Elapse: 4.95s


----------------------------------------------------------------------------------------------------
Epoch 16 - Average Loss: (train) 0.3804; (valid) 0.7733 | Time: 35.27s
Early stopping at epoch 16
Fold 4 Valid Loss: 0.7586690187454224
Elapse: 9.39 min 


Epoch 16 [135/136] | Valid Loss: 0.7733 | Elapse: 6.65s


- Stage 2 | Train: 5169; Valid: 1323 -


Loading model from checkpoint: outputs/ResnetGRU_v1_LB048_fold_4_stage_1.pth


Train [0]:   0%|          | 0/161 [00:00<?, ?batch/s]

Epoch 1 [0/161] | Train Loss: 0.3045 Grad: 529901.0625 LR: 4.0023e-06 | Elapse: 0.07s
Epoch 1 [100/161] | Train Loss: 0.3404 Grad: 706079.0000 LR: 2.5602e-05 | Elapse: 5.89s
Epoch 1 [160/161] | Train Loss: 0.3202 Grad: 156530.9219 LR: 5.2235e-05 | Elapse: 9.38s


Valid [0]:   0%|          | 0/42 [00:00<?, ?batch/s]

Epoch 1 [0/42] | Valid Loss: 0.5255 | Elapse: 0.06s


----------------------------------------------------------------------------------------------------
Epoch 1 - Average Loss: (train) 0.3202; (valid) 0.5380 | Time: 11.45s
Best model found in epoch 1 | valid loss: 0.5380


Epoch 1 [41/42] | Valid Loss: 0.5380 | Elapse: 2.06s


Train [1]:   0%|          | 0/161 [00:00<?, ?batch/s]

Epoch 2 [0/161] | Train Loss: 0.2508 Grad: 462955.3125 LR: 5.2705e-05 | Elapse: 0.07s
Epoch 2 [100/161] | Train Loss: 0.2718 Grad: 373347.8125 LR: 9.2218e-05 | Elapse: 5.92s
Epoch 2 [160/161] | Train Loss: 0.2592 Grad: 121705.5234 LR: 1.0000e-04 | Elapse: 9.47s


Valid [1]:   0%|          | 0/42 [00:00<?, ?batch/s]

Epoch 2 [0/42] | Valid Loss: 0.4834 | Elapse: 0.05s


----------------------------------------------------------------------------------------------------
Epoch 2 - Average Loss: (train) 0.2592; (valid) 0.5006 | Time: 11.56s
Best model found in epoch 2 | valid loss: 0.5006


Epoch 2 [41/42] | Valid Loss: 0.5006 | Elapse: 2.09s


Train [2]:   0%|          | 0/161 [00:00<?, ?batch/s]

Epoch 3 [0/161] | Train Loss: 0.2102 Grad: 356393.4062 LR: 1.0000e-04 | Elapse: 0.06s
Epoch 3 [100/161] | Train Loss: 0.2279 Grad: 498247.1250 LR: 9.9695e-05 | Elapse: 5.95s
Epoch 3 [160/161] | Train Loss: 0.2185 Grad: 242740.7031 LR: 9.9231e-05 | Elapse: 9.47s


Valid [2]:   0%|          | 0/42 [00:00<?, ?batch/s]

Epoch 3 [0/42] | Valid Loss: 0.4703 | Elapse: 0.05s


----------------------------------------------------------------------------------------------------
Epoch 3 - Average Loss: (train) 0.2185; (valid) 0.4967 | Time: 11.55s
Best model found in epoch 3 | valid loss: 0.4967


Epoch 3 [41/42] | Valid Loss: 0.4967 | Elapse: 2.08s


Train [3]:   0%|          | 0/161 [00:00<?, ?batch/s]

Epoch 4 [0/161] | Train Loss: 0.2004 Grad: 423230.3750 LR: 9.9222e-05 | Elapse: 0.06s
Epoch 4 [100/161] | Train Loss: 0.2001 Grad: 514279.2500 LR: 9.7982e-05 | Elapse: 5.92s
Epoch 4 [160/161] | Train Loss: 0.1936 Grad: 247983.7344 LR: 9.6967e-05 | Elapse: 9.43s


Valid [3]:   0%|          | 0/42 [00:00<?, ?batch/s]

Epoch 4 [0/42] | Valid Loss: 0.4785 | Elapse: 0.05s


----------------------------------------------------------------------------------------------------
Epoch 4 - Average Loss: (train) 0.1936; (valid) 0.4970 | Time: 11.50s


Epoch 4 [41/42] | Valid Loss: 0.4970 | Elapse: 2.07s


Train [4]:   0%|          | 0/161 [00:00<?, ?batch/s]

Epoch 5 [0/161] | Train Loss: 0.1744 Grad: 360317.0000 LR: 9.6949e-05 | Elapse: 0.06s
Epoch 5 [100/161] | Train Loss: 0.1796 Grad: 495203.3125 LR: 9.4813e-05 | Elapse: 5.93s
Epoch 5 [160/161] | Train Loss: 0.1741 Grad: 245975.1719 LR: 9.3277e-05 | Elapse: 9.44s


Valid [4]:   0%|          | 0/42 [00:00<?, ?batch/s]

Epoch 5 [0/42] | Valid Loss: 0.4774 | Elapse: 0.06s


----------------------------------------------------------------------------------------------------
Epoch 5 - Average Loss: (train) 0.1741; (valid) 0.4944 | Time: 11.52s
Best model found in epoch 5 | valid loss: 0.4944


Epoch 5 [41/42] | Valid Loss: 0.4944 | Elapse: 2.07s


Train [5]:   0%|          | 0/161 [00:00<?, ?batch/s]

Epoch 6 [0/161] | Train Loss: 0.1576 Grad: 349445.7188 LR: 9.3250e-05 | Elapse: 0.06s
Epoch 6 [100/161] | Train Loss: 0.1642 Grad: 442408.1562 LR: 9.0282e-05 | Elapse: 5.93s
Epoch 6 [160/161] | Train Loss: 0.1592 Grad: 231404.8125 LR: 8.8272e-05 | Elapse: 9.45s


Valid [5]:   0%|          | 0/42 [00:00<?, ?batch/s]

Epoch 6 [0/42] | Valid Loss: 0.4854 | Elapse: 0.05s


----------------------------------------------------------------------------------------------------
Epoch 6 - Average Loss: (train) 0.1592; (valid) 0.4973 | Time: 11.52s


Epoch 6 [41/42] | Valid Loss: 0.4973 | Elapse: 2.07s


Train [6]:   0%|          | 0/161 [00:00<?, ?batch/s]

Epoch 7 [0/161] | Train Loss: 0.1323 Grad: 328263.5938 LR: 8.8237e-05 | Elapse: 0.06s
Epoch 7 [100/161] | Train Loss: 0.1509 Grad: 437924.0000 LR: 8.4528e-05 | Elapse: 5.99s
Epoch 7 [160/161] | Train Loss: 0.1468 Grad: 228293.7188 LR: 8.2105e-05 | Elapse: 9.53s


Valid [6]:   0%|          | 0/42 [00:00<?, ?batch/s]

Epoch 7 [0/42] | Valid Loss: 0.4836 | Elapse: 0.06s


----------------------------------------------------------------------------------------------------
Epoch 7 - Average Loss: (train) 0.1468; (valid) 0.4975 | Time: 11.62s


Epoch 7 [41/42] | Valid Loss: 0.4975 | Elapse: 2.09s


Train [7]:   0%|          | 0/161 [00:00<?, ?batch/s]

Epoch 8 [0/161] | Train Loss: 0.1271 Grad: 395002.6250 LR: 8.2063e-05 | Elapse: 0.06s
Epoch 8 [100/161] | Train Loss: 0.1419 Grad: 431699.7188 LR: 7.7725e-05 | Elapse: 5.95s
Epoch 8 [160/161] | Train Loss: 0.1374 Grad: 207642.9844 LR: 7.4963e-05 | Elapse: 9.48s


Valid [7]:   0%|          | 0/42 [00:00<?, ?batch/s]

Epoch 8 [0/42] | Valid Loss: 0.4878 | Elapse: 0.05s


----------------------------------------------------------------------------------------------------
Epoch 8 - Average Loss: (train) 0.1374; (valid) 0.5005 | Time: 11.58s


Epoch 8 [41/42] | Valid Loss: 0.5005 | Elapse: 2.09s


Train [8]:   0%|          | 0/161 [00:00<?, ?batch/s]

Epoch 9 [0/161] | Train Loss: 0.1077 Grad: 346575.1562 LR: 7.4916e-05 | Elapse: 0.06s
Epoch 9 [100/161] | Train Loss: 0.1318 Grad: 448134.5000 LR: 7.0081e-05 | Elapse: 5.97s
Epoch 9 [160/161] | Train Loss: 0.1276 Grad: 223301.1250 LR: 6.7063e-05 | Elapse: 9.52s


Valid [8]:   0%|          | 0/42 [00:00<?, ?batch/s]

Epoch 9 [0/42] | Valid Loss: 0.4831 | Elapse: 0.06s


----------------------------------------------------------------------------------------------------
Epoch 9 - Average Loss: (train) 0.1276; (valid) 0.5002 | Time: 11.62s


Epoch 9 [41/42] | Valid Loss: 0.5002 | Elapse: 2.10s


Train [9]:   0%|          | 0/161 [00:00<?, ?batch/s]

Epoch 10 [0/161] | Train Loss: 0.1158 Grad: 526689.6875 LR: 6.7012e-05 | Elapse: 0.06s
Epoch 10 [100/161] | Train Loss: 0.1227 Grad: 506529.1250 LR: 6.1827e-05 | Elapse: 5.96s
Epoch 10 [160/161] | Train Loss: 0.1191 Grad: 212251.1250 LR: 5.8646e-05 | Elapse: 9.50s


Valid [9]:   0%|          | 0/42 [00:00<?, ?batch/s]

Epoch 10 [0/42] | Valid Loss: 0.4817 | Elapse: 0.06s


----------------------------------------------------------------------------------------------------
Epoch 10 - Average Loss: (train) 0.1191; (valid) 0.5028 | Time: 11.60s
Early stopping at epoch 10
Fold 4 Valid Loss: 0.4943685233592987
Elapse: 1.93 min 
Fold 4 Elapse: 11.32 min
Training Complete!
CV Result: Stage 1: 0.7565410733222961 | Stage 2: 0.48485204577445984
Elapse: 60.70 min 


Epoch 10 [41/42] | Valid Loss: 0.5028 | Elapse: 2.10s


In [None]:
# plot loss history
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5), sharey=True)

for i, loss in enumerate(loss_history_1):
    ax1.plot(loss['train'], marker="*", ls="-", label=f"Fold {i} Train")
    ax1.plot(loss['valid'], marker="o", ls=":", label=f"Fold {i} Valid")

for i, loss in enumerate(loss_history_2):
    ax2.plot(loss['train'], marker="*", ls="-", label=f"Fold {i} Train")
    ax2.plot(loss['valid'], marker="o", ls=":", label=f"Fold {i} Valid")

ax1.set_title("Stage 1 Loss")
ax2.set_title("Stage 2 Loss")

for ax in (ax1, ax2):
    ax.set_xlabel("Epochs")
    ax.set_ylabel("Loss")
    ax.legend()
    ax.grid(True)

fig.tight_layout()
fig.savefig(Path(PATHS.OUTPUT_DIR) / f"{ModelConfig.MODEL_NAME}_loss_history.png")
plt.show()

In [None]:
csv_path = f'./outputs/{ModelConfig.MODEL_NAME}_oof_1.csv'
print("CSV Path: ", csv_path)

oof_df = analyze_oof(csv_path)

print("Kaggle Score: ", calc_kaggle_score(oof_df))
print("Average KL Loss: ", oof_df["kl_loss"].mean())

display(oof_df.head())

# plot confusion matrix
cm = confusion_matrix(oof_df['target_id'], oof_df['target_pred']) # (y_true, y_pred)
cm = cm / cm.sum(axis=1)[:, np.newaxis]

fig = plt.figure(figsize=(6, 6))
sns.heatmap(cm, annot=True, cmap='Blues', xticklabels=TARGET2ID.keys(), yticklabels=TARGET2ID.keys())
plt.xlabel('Predicted', fontsize=12)
plt.ylabel('True', fontsize=12)
plt.title(csv_path.split('/')[-1].split('.')[0], fontsize=12)
fig.tight_layout()
fig.savefig(f"./outputs/{csv_path.split('/')[-1].split('.')[0]}_CM.png")
plt.show()

In [None]:
fig, axes = plt.subplots(5, 5, figsize=(15, 15), sharex=True, sharey=True)
oof_samples = oof_df.sample(axes.size)

for i, ax in enumerate(axes.flatten()):
    row = oof_samples.iloc[i]
    x = np.arange(6)
    ax.plot(x, row[TARGETS].T, marker="o", ls="-", label="True")
    ax.plot(x, row[TARGETS_PRED].T, marker="*", ls="--", label="Predicted")
    ax.set_title(f"{row['target']} | KL Loss: {row['kl_loss']:.4f}")
    ax.legend()
    
fig.tight_layout()
fig.savefig(f"./outputs/{csv_path.split('/')[-1].split('.')[0]}_samples.png")
plt.show()

In [None]:
csv_path = f'./outputs/{ModelConfig.MODEL_NAME}_oof_2.csv'
print("CSV Path: ", csv_path)

oof_df = analyze_oof(csv_path)

print("Kaggle Score: ", calc_kaggle_score(oof_df))
print("Average KL Loss: ", oof_df["kl_loss"].mean())

display(oof_df.head())

# plot confusion matrix
cm = confusion_matrix(oof_df['target_id'], oof_df['target_pred']) # (y_true, y_pred)
cm = cm / cm.sum(axis=1)[:, np.newaxis]

fig = plt.figure(figsize=(6, 6))
sns.heatmap(cm, annot=True, cmap='Blues', xticklabels=TARGET2ID.keys(), yticklabels=TARGET2ID.keys())
plt.xlabel('Predicted', fontsize=12)
plt.ylabel('True', fontsize=12)
plt.title(csv_path.split('/')[-1].split('.')[0], fontsize=12)
fig.tight_layout()
fig.savefig(f"./outputs/{csv_path.split('/')[-1].split('.')[0]}_CM.png")
plt.show()

In [None]:
fig, axes = plt.subplots(5, 5, figsize=(15, 15), sharex=True, sharey=True)
oof_samples = oof_df.sample(axes.size)

for i, ax in enumerate(axes.flatten()):
    row = oof_samples.iloc[i]
    x = np.arange(6)
    ax.plot(x, row[TARGETS].T, marker="o", ls="-", label="True")
    ax.plot(x, row[TARGETS_PRED].T, marker="*", ls="--", label="Predicted")
    ax.set_title(f"{row['target']} | KL Loss: {row['kl_loss']:.4f}")
    ax.legend()
    
fig.tight_layout()
fig.savefig(f"./outputs/{csv_path.split('/')[-1].split('.')[0]}_samples.png")
plt.show()

In [None]:
oof_stage_2_full = pd.DataFrame()

for fold in range(1):

    valid_folds = train_all[train_all['fold'] == fold].reset_index(drop=True)

    # predict labels using stage-2 models
    model = ResNetGRU(
        kernels=ModelConfig.RESNET_GRU_KERNELS, 
        in_channels=8, 
        fixed_kernel_size=ModelConfig.RESNET_GRU_FIXED_KERNEL_SIZE,
        hidden_size=ModelConfig.RESNET_GRU_HIDDEN_SIZE,
        num_classes=6
        )
    
    check_point = os.path.join(
        PATHS.OUTPUT_DIR,
        f"{ModelConfig.MODEL_NAME}_fold_{fold}_stage_2.pth"
    )

    model.load_state_dict(torch.load(check_point, map_location=DEVICE))

    loader_kwargs = {
        "batch_size": ModelConfig.BATCH_SIZE,
        "num_workers": ModelConfig.NUM_WORKERS,
        "pin_memory": True,
        "shuffle": False,
    }

    valid_dataset = EEGSeqDataset(
        valid_folds, ModelConfig, ALL_EEG_SIGNALS, mode="valid", downsample=ModelConfig.RESNET_GRU_DOWNSAMPLE)
    valid_loader = DataLoader(valid_dataset, drop_last=False, collate_fn=None, **loader_kwargs)

    model.to(DEVICE)
    model.eval()

    valid_predicts = []
    with torch.no_grad():
        for X, y in valid_loader:
            X = X.to(DEVICE)
            y_pred = model(X)
            valid_predicts.append(y_pred.to('cpu').numpy())

    valid_predicts = np.concatenate(valid_predicts)
    valid_folds[TARGETS_PRED] = valid_predicts
    oof_stage_2_full = pd.concat([oof_stage_2, valid_folds], axis=0).reset_index(drop=True)

    del valid_dataset, valid_loader
    torch.cuda.empty_cache()
    gc.collect()

    oof_stage_2_full.to_csv(os.path.join(PATHS.OUTPUT_DIR, f"{ModelConfig.MODEL_NAME}_oof_2_full.csv"), index=False)

cv_results = evaluate_oof(oof_stage_2_full)
logger.info(f"{'=' * 100}\nCV Result (Stage 2 Full): {cv_results}\n{'=' * 100}")


Reg = 0.15, Downsample = 0, CV Result (Stage 2 Full): 0.639643669128418

In [None]:
csv_path = f'./outputs/Resnet_SeqGRU_ChrisNO_NoReg_oof_2_full.csv'
print("CSV Path: ", csv_path)

oof_df = analyze_oof(csv_path)

print("Kaggle Score: ", calc_kaggle_score(oof_df))
print("Average KL Loss: ", oof_df["kl_loss"].mean())

display(oof_df.head())

# plot confusion matrix
cm = confusion_matrix(oof_df['target_id'], oof_df['target_pred']) # (y_true, y_pred)
cm = cm / cm.sum(axis=1)[:, np.newaxis]

fig = plt.figure(figsize=(6, 6))
sns.heatmap(cm, annot=True, cmap='Blues', xticklabels=TARGET2ID.keys(), yticklabels=TARGET2ID.keys())
plt.xlabel('Predicted', fontsize=12)
plt.ylabel('True', fontsize=12)
plt.title(csv_path.split('/')[-1].split('.')[0], fontsize=12)
fig.tight_layout()
fig.savefig(f"./outputs/{csv_path.split('/')[-1].split('.')[0]}_CM.png")
plt.show()