In [1]:
import pandas as pd 
import numpy as np 
from scipy.stats import entropy
import matplotlib.pyplot as plt

from engine_hms_trainer import *
from engine_hms_model import JobConfig, ModelConfig

import torch
from torch import nn
import torch.nn.functional as F

import warnings
warnings.filterwarnings('ignore')

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [2]:
# Seed every RNG up front so the run below is repeatable.
seed_everything(JobConfig.SEED)

# Alternative backbone kept for reference (EfficientNet-B2). To use it, apply
# these overrides instead of the ViT-MAE ones below:
#   EPOCHS = 6
#   MODEL_BACKBONE = 'tf_efficientnet_b2'
#   MODEL_NAME = "ENet_b2_mlp"
#   AUGMENT = True
#   USE_KAGGLE_SPECTROGRAMS = True
#   USE_EEG_SPECTROGRAMS = True
#   REGULARIZATION = None
#   AUGMENTATIONS = ['xy_masking']

# Active experiment: ViT-MAE backbone. Each key is written onto ModelConfig
# as a class attribute, in the order listed here.
vit_mae_overrides = {
    "EPOCHS": 6,
    "MODEL_BACKBONE": 'vit_mae_base',
    "MODEL_NAME": "ViTMAE_base",
    "AUGMENT": True,
    "USE_KAGGLE_SPECTROGRAMS": True,
    "USE_EEG_SPECTROGRAMS": True,
    "REGULARIZATION": None,
    "AUGMENTATIONS": ['xy_masking'],
    "MAE_PRETRAINED_WEIGHTS": "./outputs/vit_mae_pretraining/best_mae_model.pth",
}
for option_name, option_value in vit_mae_overrides.items():
    setattr(ModelConfig, option_name, option_value)

# Build the predictor only after all overrides are in place, so it sees the
# final configuration.
hms_predictor = HMSPredictor(JobConfig, ModelConfig)

****************************************************************************************************
Script Start: Tue Mar 12 21:26:25 2024
Model Configurations:
MODEL_NAME: ViTMAE_base
MODEL_BACKBONE: vit_mae_base
BATCH_SIZE: 16
EPOCHS: 6
GRADIENT_ACCUMULATION_STEPS: 2
DROP_RATE: 0.15
DROP_PATH_RATE: 0.25
WEIGHT_DECAY: 0.01
REGULARIZATION: None
USE_KAGGLE_SPECTROGRAMS: True
USE_EEG_SPECTROGRAMS: True
AMP: True
AUGMENT: True
AUGMENTATIONS: ['xy_masking']
PRINT_FREQ: 50
FREEZE: False
NUM_FROZEN_LAYERS: 0
NUM_WORKERS: 0
MAX_GRAD_NORM: 10000000.0
MAE_PRETRAINED_WEIGHTS: ./outputs/vit_mae_pretraining/best_mae_model.pth
MAE_HIDDEN_DROPOUT_PROB: 0.05
MAE_ATTENTION_DROPOUT_PROB: 0.05
****************************************************************************************************


In [3]:
# Load the two training splits plus the spectrogram / EEG collections that
# the training call consumes.
train_easy, train_hard, all_specs, all_eegs = hms_predictor.load_train_data()

# Row/column counts: easy split first, then hard split.
print(train_easy.shape, train_hard.shape, sep="\n")

# Total number of missing cells per split (0 means no NaN anywhere).
print(
    train_easy.isna().sum().sum(),
    train_hard.isna().sum().sum(),
    sep="\n",
)

# Preview the first rows of each split, separated by a blank-ish line.
display(train_easy.head())
print(" ")
display(train_hard.head())

(13996, 12)
(6187, 12)
0
0


Unnamed: 0,eeg_id,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote,spectrogram_id,min,max,patient_id,target
0,642382,0.0,0.0,0.0,0.0,0.0,1.0,14960202,1008.0,1032.0,5955,Other
1,751790,0.0,0.0,1.0,0.0,0.0,0.0,618728447,908.0,908.0,38549,GPD
2,778705,0.0,0.0,0.0,0.0,0.0,1.0,52296320,0.0,0.0,40955,Other
3,1629671,1.0,0.0,0.0,0.0,0.0,0.0,2036345030,0.0,160.0,37481,Seizure
4,2061593,0.0,0.0,0.0,0.0,0.0,1.0,320962633,1450.0,1450.0,23828,Other


 


Unnamed: 0,eeg_id,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote,spectrogram_id,min,max,patient_id,target
0,568657,0.0,0.0,0.25,0.0,0.166667,0.583333,789577333,0.0,16.0,20654,Other
1,582999,0.0,0.857143,0.0,0.071429,0.0,0.071429,1552638400,0.0,38.0,20230,LPD
2,1895581,0.076923,0.0,0.0,0.0,0.076923,0.846154,128369999,1138.0,1138.0,47999,Other
3,2482631,0.0,0.0,0.133333,0.066667,0.133333,0.666667,978166025,1902.0,1944.0,20606,Other
4,2521897,0.0,0.0,0.083333,0.083333,0.333333,0.5,673742515,0.0,4.0,62117,Other


In [4]:
# Launch training on the easy/hard splits with the loaded spectrogram and EEG
# data. Judging by the log this cell emits, each fold runs a "First Stage"
# followed by a "Second Stage" that starts from the stage-1 checkpoint.
# NOTE(review): which split feeds which stage is not visible here — confirm
# in engine_hms_trainer.
hms_predictor.train_folds(train_easy, train_hard, all_specs, all_eegs)

Fold: 0 || Valid size 6867 
- First Stage 


Train:   0%|          | 0/566 [00:00<?, ?batch/s]

Epoch: [1][0/566]Elapsed 0.82s | Loss: 0.8747 Grad: inf LR: 4.0000e-06
Epoch: [1][50/566]Elapsed 3.97s | Loss: 0.7918 Grad: 22935.7832 LR: 5.2855e-06
Epoch: [1][100/566]Elapsed 7.05s | Loss: 0.7631 Grad: 24325.8926 LR: 9.0731e-06
Epoch: [1][150/566]Elapsed 10.14s | Loss: 0.7482 Grad: 22231.9473 LR: 1.5160e-05
Epoch: [1][200/566]Elapsed 13.23s | Loss: 0.7361 Grad: 26599.8477 LR: 2.3220e-05
Epoch: [1][250/566]Elapsed 16.31s | Loss: 0.7207 Grad: 42741.4062 LR: 3.2822e-05
Epoch: [1][300/566]Elapsed 19.39s | Loss: 0.7094 Grad: 32611.2246 LR: 4.3451e-05
Epoch: [1][350/566]Elapsed 22.47s | Loss: 0.6982 Grad: 73515.8047 LR: 5.4537e-05
Epoch: [1][400/566]Elapsed 25.55s | Loss: 0.6836 Grad: 39497.6602 LR: 6.5488e-05
Epoch: [1][450/566]Elapsed 28.65s | Loss: 0.6715 Grad: 40204.4180 LR: 7.5717e-05
Epoch: [1][500/566]Elapsed 31.74s | Loss: 0.6606 Grad: 35129.3281 LR: 8.4675e-05
Epoch: [1][550/566]Elapsed 34.83s | Loss: 0.6480 Grad: 37107.0977 LR: 9.1883e-05
Epoch: [1][565/566]Elapsed 35.78s | Loss:

Valid:   0%|          | 0/430 [00:00<?, ?batch/s]

Epoch: [1][0/430]Elapsed 0.05s | Loss: 0.6996
Epoch: [1][50/430]Elapsed 2.45s | Loss: 0.4806
Epoch: [1][100/430]Elapsed 4.93s | Loss: 0.4930
Epoch: [1][150/430]Elapsed 7.52s | Loss: 0.4981
Epoch: [1][200/430]Elapsed 10.16s | Loss: 0.5050
Epoch: [1][250/430]Elapsed 12.80s | Loss: 0.5113
Epoch: [1][300/430]Elapsed 15.47s | Loss: 0.5092
Epoch: [1][350/430]Elapsed 18.14s | Loss: 0.4909
Epoch: [1][400/430]Elapsed 20.79s | Loss: 0.4790


----------------------------------------------------------------------------------------------------
Epoch 1 - Average Train Loss: 0.6456 | Average Valid Loss: 0.4745 | Time: 58.19s
Best model found in epoch 1 | valid loss: 0.4745


Train:   0%|          | 0/566 [00:00<?, ?batch/s]

Epoch: [2][0/566]Elapsed 0.09s | Loss: 0.4640 Grad: 108062.1641 LR: 9.3754e-05
Epoch: [2][50/566]Elapsed 3.25s | Loss: 0.5252 Grad: 82015.2344 LR: 9.8078e-05
Epoch: [2][100/566]Elapsed 6.39s | Loss: 0.5169 Grad: 54894.3672 LR: 9.9935e-05
Epoch: [2][150/566]Elapsed 9.53s | Loss: 0.5254 Grad: 72788.4219 LR: 9.9990e-05
Epoch: [2][200/566]Elapsed 12.68s | Loss: 0.5206 Grad: 78987.0781 LR: 9.9948e-05
Epoch: [2][250/566]Elapsed 15.79s | Loss: 0.5151 Grad: 83810.3906 LR: 9.9873e-05
Epoch: [2][300/566]Elapsed 18.94s | Loss: 0.5121 Grad: 49754.5430 LR: 9.9765e-05
Epoch: [2][350/566]Elapsed 22.28s | Loss: 0.5050 Grad: 86065.4141 LR: 9.9624e-05
Epoch: [2][400/566]Elapsed 25.57s | Loss: 0.4976 Grad: 101469.9375 LR: 9.9450e-05
Epoch: [2][450/566]Elapsed 28.81s | Loss: 0.4947 Grad: 94379.7734 LR: 9.9244e-05
Epoch: [2][500/566]Elapsed 32.02s | Loss: 0.4923 Grad: 73346.4766 LR: 9.9006e-05
Epoch: [2][550/566]Elapsed 35.29s | Loss: 0.4877 Grad: 62362.7266 LR: 9.8734e-05
Epoch: [2][565/566]Elapsed 36.28s

Valid:   0%|          | 0/430 [00:00<?, ?batch/s]

Epoch: [2][0/430]Elapsed 0.05s | Loss: 0.5795
Epoch: [2][50/430]Elapsed 2.48s | Loss: 0.4550
Epoch: [2][100/430]Elapsed 5.03s | Loss: 0.4523
Epoch: [2][150/430]Elapsed 7.60s | Loss: 0.4600
Epoch: [2][200/430]Elapsed 10.24s | Loss: 0.4675
Epoch: [2][250/430]Elapsed 12.89s | Loss: 0.4772
Epoch: [2][300/430]Elapsed 15.53s | Loss: 0.4778
Epoch: [2][350/430]Elapsed 18.17s | Loss: 0.4603
Epoch: [2][400/430]Elapsed 20.81s | Loss: 0.4501


----------------------------------------------------------------------------------------------------
Epoch 2 - Average Train Loss: 0.4869 | Average Valid Loss: 0.4471 | Time: 58.72s
Best model found in epoch 2 | valid loss: 0.4471


Train:   0%|          | 0/566 [00:00<?, ?batch/s]

Epoch: [3][0/566]Elapsed 0.07s | Loss: 0.4731 Grad: 116059.0000 LR: 9.8641e-05
Epoch: [3][50/566]Elapsed 3.24s | Loss: 0.4559 Grad: 123311.7969 LR: 9.8327e-05
Epoch: [3][100/566]Elapsed 6.41s | Loss: 0.4545 Grad: 70717.9766 LR: 9.7982e-05
Epoch: [3][150/566]Elapsed 9.63s | Loss: 0.4511 Grad: 99300.1094 LR: 9.7605e-05
Epoch: [3][200/566]Elapsed 12.83s | Loss: 0.4473 Grad: 78394.3281 LR: 9.7197e-05
Epoch: [3][250/566]Elapsed 15.98s | Loss: 0.4445 Grad: 78396.6953 LR: 9.6757e-05
Epoch: [3][300/566]Elapsed 19.11s | Loss: 0.4395 Grad: 74771.9219 LR: 9.6286e-05
Epoch: [3][350/566]Elapsed 22.32s | Loss: 0.4370 Grad: 73368.2656 LR: 9.5785e-05
Epoch: [3][400/566]Elapsed 25.47s | Loss: 0.4330 Grad: 69953.0625 LR: 9.5254e-05
Epoch: [3][450/566]Elapsed 28.66s | Loss: 0.4339 Grad: 95176.1875 LR: 9.4693e-05
Epoch: [3][500/566]Elapsed 31.82s | Loss: 0.4319 Grad: 105103.2031 LR: 9.4102e-05
Epoch: [3][550/566]Elapsed 35.09s | Loss: 0.4290 Grad: 71859.0547 LR: 9.3483e-05
Epoch: [3][565/566]Elapsed 36.08

Valid:   0%|          | 0/430 [00:00<?, ?batch/s]

Epoch: [3][0/430]Elapsed 0.06s | Loss: 0.5622
Epoch: [3][50/430]Elapsed 2.50s | Loss: 0.4093
Epoch: [3][100/430]Elapsed 5.06s | Loss: 0.4205
Epoch: [3][150/430]Elapsed 7.69s | Loss: 0.4238
Epoch: [3][200/430]Elapsed 10.33s | Loss: 0.4318
Epoch: [3][250/430]Elapsed 12.97s | Loss: 0.4416
Epoch: [3][300/430]Elapsed 15.62s | Loss: 0.4420
Epoch: [3][350/430]Elapsed 18.24s | Loss: 0.4328
Epoch: [3][400/430]Elapsed 20.87s | Loss: 0.4276


----------------------------------------------------------------------------------------------------
Epoch 3 - Average Train Loss: 0.4288 | Average Valid Loss: 0.4280 | Time: 58.58s
Best model found in epoch 3 | valid loss: 0.4280


Train:   0%|          | 0/566 [00:00<?, ?batch/s]

Epoch: [4][0/566]Elapsed 0.07s | Loss: 0.4076 Grad: 111009.5391 LR: 9.3278e-05
Epoch: [4][50/566]Elapsed 3.26s | Loss: 0.3934 Grad: 153407.2344 LR: 9.2621e-05
Epoch: [4][100/566]Elapsed 6.37s | Loss: 0.4016 Grad: 74265.4609 LR: 9.1935e-05
Epoch: [4][150/566]Elapsed 9.52s | Loss: 0.3976 Grad: 97032.6250 LR: 9.1222e-05
Epoch: [4][200/566]Elapsed 12.65s | Loss: 0.3909 Grad: 69647.6016 LR: 9.0481e-05
Epoch: [4][250/566]Elapsed 15.77s | Loss: 0.3913 Grad: 66268.3750 LR: 8.9714e-05
Epoch: [4][300/566]Elapsed 18.91s | Loss: 0.3896 Grad: 54785.0508 LR: 8.8921e-05
Epoch: [4][350/566]Elapsed 22.03s | Loss: 0.3890 Grad: 74064.5000 LR: 8.8101e-05
Epoch: [4][400/566]Elapsed 25.15s | Loss: 0.3877 Grad: 80592.6953 LR: 8.7257e-05
Epoch: [4][450/566]Elapsed 28.29s | Loss: 0.3898 Grad: 79990.7578 LR: 8.6388e-05
Epoch: [4][500/566]Elapsed 31.42s | Loss: 0.3879 Grad: 59564.3320 LR: 8.5495e-05
Epoch: [4][550/566]Elapsed 34.59s | Loss: 0.3865 Grad: 54265.1367 LR: 8.4579e-05
Epoch: [4][565/566]Elapsed 35.59s

Valid:   0%|          | 0/430 [00:00<?, ?batch/s]

Epoch: [4][0/430]Elapsed 0.05s | Loss: 0.3506
Epoch: [4][50/430]Elapsed 2.49s | Loss: 0.3693
Epoch: [4][100/430]Elapsed 5.03s | Loss: 0.3668
Epoch: [4][150/430]Elapsed 7.65s | Loss: 0.3737
Epoch: [4][200/430]Elapsed 10.29s | Loss: 0.3763
Epoch: [4][250/430]Elapsed 12.94s | Loss: 0.3797
Epoch: [4][300/430]Elapsed 15.58s | Loss: 0.3836
Epoch: [4][350/430]Elapsed 18.20s | Loss: 0.3791
Epoch: [4][400/430]Elapsed 20.84s | Loss: 0.3765


----------------------------------------------------------------------------------------------------
Epoch 4 - Average Train Loss: 0.3865 | Average Valid Loss: 0.3764 | Time: 58.04s
Best model found in epoch 4 | valid loss: 0.3764


Train:   0%|          | 0/566 [00:00<?, ?batch/s]

Epoch: [5][0/566]Elapsed 0.07s | Loss: 0.3122 Grad: 107142.1094 LR: 8.4281e-05
Epoch: [5][50/566]Elapsed 3.23s | Loss: 0.3554 Grad: 114332.5000 LR: 8.3335e-05
Epoch: [5][100/566]Elapsed 6.36s | Loss: 0.3542 Grad: 178480.1094 LR: 8.2366e-05
Epoch: [5][150/566]Elapsed 9.58s | Loss: 0.3580 Grad: 186818.8438 LR: 8.1377e-05
Epoch: [5][200/566]Elapsed 12.73s | Loss: 0.3598 Grad: 102867.5234 LR: 8.0366e-05
Epoch: [5][250/566]Elapsed 15.87s | Loss: 0.3580 Grad: 199508.7031 LR: 7.9336e-05
Epoch: [5][300/566]Elapsed 19.00s | Loss: 0.3549 Grad: 105267.5625 LR: 7.8286e-05
Epoch: [5][350/566]Elapsed 22.17s | Loss: 0.3516 Grad: 156397.6719 LR: 7.7218e-05
Epoch: [5][400/566]Elapsed 25.35s | Loss: 0.3504 Grad: 152162.6875 LR: 7.6131e-05
Epoch: [5][450/566]Elapsed 28.61s | Loss: 0.3530 Grad: 145160.5469 LR: 7.5028e-05
Epoch: [5][500/566]Elapsed 31.92s | Loss: 0.3512 Grad: 137351.6719 LR: 7.3908e-05
Epoch: [5][550/566]Elapsed 35.22s | Loss: 0.3492 Grad: 164307.5625 LR: 7.2772e-05
Epoch: [5][565/566]Elap

Valid:   0%|          | 0/430 [00:00<?, ?batch/s]

Epoch: [5][0/430]Elapsed 0.05s | Loss: 0.4736
Epoch: [5][50/430]Elapsed 2.49s | Loss: 0.3973
Epoch: [5][100/430]Elapsed 5.06s | Loss: 0.3936
Epoch: [5][150/430]Elapsed 7.68s | Loss: 0.3998
Epoch: [5][200/430]Elapsed 10.33s | Loss: 0.4063
Epoch: [5][250/430]Elapsed 12.99s | Loss: 0.4137
Epoch: [5][300/430]Elapsed 15.63s | Loss: 0.4144
Epoch: [5][350/430]Elapsed 18.27s | Loss: 0.4173
Epoch: [5][400/430]Elapsed 20.91s | Loss: 0.4202


----------------------------------------------------------------------------------------------------
Epoch 5 - Average Train Loss: 0.3499 | Average Valid Loss: 0.4242 | Time: 58.77s


Train:   0%|          | 0/566 [00:00<?, ?batch/s]

Epoch: [6][0/566]Elapsed 0.07s | Loss: 0.3904 Grad: 198883.2344 LR: 7.2405e-05
Epoch: [6][50/566]Elapsed 3.28s | Loss: 0.3441 Grad: 118852.8125 LR: 7.1249e-05
Epoch: [6][100/566]Elapsed 6.49s | Loss: 0.3341 Grad: 125197.4844 LR: 7.0080e-05
Epoch: [6][150/566]Elapsed 9.82s | Loss: 0.3324 Grad: 216145.7656 LR: 6.8897e-05
Epoch: [6][200/566]Elapsed 13.12s | Loss: 0.3339 Grad: 99724.0312 LR: 6.7702e-05
Epoch: [6][250/566]Elapsed 16.41s | Loss: 0.3378 Grad: 71736.7344 LR: 6.6495e-05
Epoch: [6][300/566]Elapsed 19.61s | Loss: 0.3359 Grad: 62118.9453 LR: 6.5277e-05
Epoch: [6][350/566]Elapsed 22.76s | Loss: 0.3313 Grad: 87350.3984 LR: 6.4049e-05
Epoch: [6][400/566]Elapsed 25.91s | Loss: 0.3280 Grad: 70426.3203 LR: 6.2812e-05
Epoch: [6][450/566]Elapsed 29.05s | Loss: 0.3297 Grad: 100007.1719 LR: 6.1566e-05
Epoch: [6][500/566]Elapsed 32.22s | Loss: 0.3293 Grad: 61087.6719 LR: 6.0313e-05
Epoch: [6][550/566]Elapsed 35.40s | Loss: 0.3289 Grad: 77279.3203 LR: 5.9053e-05
Epoch: [6][565/566]Elapsed 36.

Valid:   0%|          | 0/430 [00:00<?, ?batch/s]

Epoch: [6][0/430]Elapsed 0.06s | Loss: 0.3129
Epoch: [6][50/430]Elapsed 2.55s | Loss: 0.3919
Epoch: [6][100/430]Elapsed 5.13s | Loss: 0.3938
Epoch: [6][150/430]Elapsed 7.73s | Loss: 0.3984
Epoch: [6][200/430]Elapsed 10.37s | Loss: 0.4034
Epoch: [6][250/430]Elapsed 13.01s | Loss: 0.4044
Epoch: [6][300/430]Elapsed 15.66s | Loss: 0.4083
Epoch: [6][350/430]Elapsed 18.29s | Loss: 0.4098
Epoch: [6][400/430]Elapsed 20.94s | Loss: 0.4095


----------------------------------------------------------------------------------------------------
Epoch 6 - Average Train Loss: 0.3292 | Average Valid Loss: 0.4112 | Time: 58.91s
Fold 0 Valid Loss: (Easy) 0.7672 | (Hard) 0.7163
Elapse: 5.88 min 
- Second Stage 
Use Checkpoint: ViTMAE_base_fold_0_stage_1.pth


Train:   0%|          | 0/265 [00:00<?, ?batch/s]

Epoch: [1][0/265]Elapsed 0.06s | Loss: 0.2725 Grad: 173023.3125 LR: 4.0000e-06
Epoch: [1][50/265]Elapsed 3.35s | Loss: 0.3291 Grad: 76777.6016 LR: 9.8092e-06
Epoch: [1][100/265]Elapsed 6.66s | Loss: 0.2985 Grad: 68261.5547 LR: 2.5831e-05
Epoch: [1][150/265]Elapsed 9.94s | Loss: 0.2845 Grad: 64605.3398 LR: 4.8186e-05
Epoch: [1][200/265]Elapsed 13.25s | Loss: 0.2738 Grad: 59358.2461 LR: 7.1465e-05
Epoch: [1][250/265]Elapsed 16.50s | Loss: 0.2639 Grad: 96406.3047 LR: 9.0032e-05
Epoch: [1][264/265]Elapsed 17.39s | Loss: 0.2612 Grad: 76558.0781 LR: 9.3727e-05


Valid:   0%|          | 0/430 [00:00<?, ?batch/s]

Epoch: [1][0/430]Elapsed 0.05s | Loss: 0.4224
Epoch: [1][50/430]Elapsed 2.50s | Loss: 0.4411
Epoch: [1][100/430]Elapsed 5.07s | Loss: 0.4388
Epoch: [1][150/430]Elapsed 7.66s | Loss: 0.4418
Epoch: [1][200/430]Elapsed 10.30s | Loss: 0.4343
Epoch: [1][250/430]Elapsed 12.97s | Loss: 0.4338
Epoch: [1][300/430]Elapsed 15.63s | Loss: 0.4299
Epoch: [1][350/430]Elapsed 18.28s | Loss: 0.4059
Epoch: [1][400/430]Elapsed 20.93s | Loss: 0.3840


----------------------------------------------------------------------------------------------------
Epoch 1 - Average Train Loss: 0.2612 | Average Valid Loss: 0.3743 | Time: 39.94s
Best model found in epoch 1 | valid loss: 0.3743


Train:   0%|          | 0/265 [00:00<?, ?batch/s]

Epoch: [2][0/265]Elapsed 0.07s | Loss: 0.1854 Grad: 132728.3281 LR: 9.3727e-05
Epoch: [2][50/265]Elapsed 3.31s | Loss: 0.2188 Grad: 57732.6445 LR: 9.9991e-05
Epoch: [2][100/265]Elapsed 6.48s | Loss: 0.2240 Grad: 63620.5000 LR: 9.9931e-05
Epoch: [2][150/265]Elapsed 9.66s | Loss: 0.2266 Grad: 65386.0156 LR: 9.9711e-05
Epoch: [2][200/265]Elapsed 12.82s | Loss: 0.2270 Grad: 72679.7656 LR: 9.9342e-05
Epoch: [2][250/265]Elapsed 15.99s | Loss: 0.2240 Grad: 93785.8906 LR: 9.8824e-05
Epoch: [2][264/265]Elapsed 16.88s | Loss: 0.2234 Grad: 82180.1719 LR: 9.8653e-05


Valid:   0%|          | 0/430 [00:00<?, ?batch/s]

Epoch: [2][0/430]Elapsed 0.05s | Loss: 0.4248
Epoch: [2][50/430]Elapsed 2.49s | Loss: 0.4245
Epoch: [2][100/430]Elapsed 5.04s | Loss: 0.4278
Epoch: [2][150/430]Elapsed 7.66s | Loss: 0.4298
Epoch: [2][200/430]Elapsed 10.31s | Loss: 0.4225
Epoch: [2][250/430]Elapsed 12.97s | Loss: 0.4226
Epoch: [2][300/430]Elapsed 15.63s | Loss: 0.4178
Epoch: [2][350/430]Elapsed 18.29s | Loss: 0.3943
Epoch: [2][400/430]Elapsed 20.95s | Loss: 0.3729


----------------------------------------------------------------------------------------------------
Epoch 2 - Average Train Loss: 0.2234 | Average Valid Loss: 0.3640 | Time: 39.45s
Best model found in epoch 2 | valid loss: 0.3640


Train:   0%|          | 0/265 [00:00<?, ?batch/s]

Epoch: [3][0/265]Elapsed 0.08s | Loss: 0.2071 Grad: 145569.1406 LR: 9.8653e-05
Epoch: [3][50/265]Elapsed 3.29s | Loss: 0.2062 Grad: 52813.0273 LR: 9.7947e-05
Epoch: [3][100/265]Elapsed 6.46s | Loss: 0.2098 Grad: 66153.0312 LR: 9.7097e-05
Epoch: [3][150/265]Elapsed 9.65s | Loss: 0.2136 Grad: 66519.6562 LR: 9.6106e-05
Epoch: [3][200/265]Elapsed 12.82s | Loss: 0.2136 Grad: 49361.2773 LR: 9.4975e-05
Epoch: [3][250/265]Elapsed 15.99s | Loss: 0.2111 Grad: 79389.2891 LR: 9.3710e-05
Epoch: [3][264/265]Elapsed 16.89s | Loss: 0.2100 Grad: 74643.2891 LR: 9.3331e-05


Valid:   0%|          | 0/430 [00:00<?, ?batch/s]

Epoch: [3][0/430]Elapsed 0.05s | Loss: 0.3485
Epoch: [3][50/430]Elapsed 2.50s | Loss: 0.4115
Epoch: [3][100/430]Elapsed 5.04s | Loss: 0.4126
Epoch: [3][150/430]Elapsed 7.64s | Loss: 0.4164
Epoch: [3][200/430]Elapsed 10.29s | Loss: 0.4111
Epoch: [3][250/430]Elapsed 12.93s | Loss: 0.4110
Epoch: [3][300/430]Elapsed 15.59s | Loss: 0.4080
Epoch: [3][350/430]Elapsed 18.23s | Loss: 0.3845
Epoch: [3][400/430]Elapsed 20.89s | Loss: 0.3641


----------------------------------------------------------------------------------------------------
Epoch 3 - Average Train Loss: 0.2100 | Average Valid Loss: 0.3547 | Time: 39.41s
Best model found in epoch 3 | valid loss: 0.3547


Train:   0%|          | 0/265 [00:00<?, ?batch/s]

Epoch: [4][0/265]Elapsed 0.07s | Loss: 0.1646 Grad: 120928.0000 LR: 9.3331e-05
Epoch: [4][50/265]Elapsed 3.35s | Loss: 0.2036 Grad: 61516.8359 LR: 9.1898e-05
Epoch: [4][100/265]Elapsed 6.57s | Loss: 0.2007 Grad: 74241.0625 LR: 9.0338e-05
Epoch: [4][150/265]Elapsed 9.91s | Loss: 0.2006 Grad: 91637.5000 LR: 8.8657e-05
Epoch: [4][200/265]Elapsed 13.15s | Loss: 0.2007 Grad: 66348.2734 LR: 8.6860e-05
Epoch: [4][250/265]Elapsed 16.33s | Loss: 0.1979 Grad: 94463.9141 LR: 8.4952e-05
Epoch: [4][264/265]Elapsed 17.22s | Loss: 0.1970 Grad: 69351.9922 LR: 8.4398e-05


Valid:   0%|          | 0/430 [00:00<?, ?batch/s]

Epoch: [4][0/430]Elapsed 0.05s | Loss: 0.4084
Epoch: [4][50/430]Elapsed 2.49s | Loss: 0.4108
Epoch: [4][100/430]Elapsed 5.05s | Loss: 0.4129
Epoch: [4][150/430]Elapsed 7.63s | Loss: 0.4145
Epoch: [4][200/430]Elapsed 10.27s | Loss: 0.4061
Epoch: [4][250/430]Elapsed 12.91s | Loss: 0.4040
Epoch: [4][300/430]Elapsed 15.55s | Loss: 0.4001
Epoch: [4][350/430]Elapsed 18.19s | Loss: 0.3775
Epoch: [4][400/430]Elapsed 20.83s | Loss: 0.3566


----------------------------------------------------------------------------------------------------
Epoch 4 - Average Train Loss: 0.1970 | Average Valid Loss: 0.3473 | Time: 39.65s
Best model found in epoch 4 | valid loss: 0.3473


Train:   0%|          | 0/265 [00:00<?, ?batch/s]

Epoch: [5][0/265]Elapsed 0.07s | Loss: 0.1788 Grad: 124866.6641 LR: 8.4398e-05
Epoch: [5][50/265]Elapsed 3.24s | Loss: 0.1806 Grad: 54460.0742 LR: 8.2356e-05
Epoch: [5][100/265]Elapsed 6.40s | Loss: 0.1841 Grad: 79020.1484 LR: 8.0217e-05
Epoch: [5][150/265]Elapsed 9.56s | Loss: 0.1889 Grad: 84836.5781 LR: 7.7987e-05
Epoch: [5][200/265]Elapsed 12.74s | Loss: 0.1903 Grad: 72079.5234 LR: 7.5672e-05
Epoch: [5][250/265]Elapsed 16.05s | Loss: 0.1866 Grad: 81545.5469 LR: 7.3280e-05
Epoch: [5][264/265]Elapsed 17.02s | Loss: 0.1859 Grad: 99821.3359 LR: 7.2598e-05


Valid:   0%|          | 0/430 [00:00<?, ?batch/s]

Epoch: [5][0/430]Elapsed 0.06s | Loss: 0.3703
Epoch: [5][50/430]Elapsed 2.50s | Loss: 0.4548
Epoch: [5][100/430]Elapsed 5.06s | Loss: 0.4486
Epoch: [5][150/430]Elapsed 7.69s | Loss: 0.4481
Epoch: [5][200/430]Elapsed 10.34s | Loss: 0.4384
Epoch: [5][250/430]Elapsed 13.01s | Loss: 0.4356
Epoch: [5][300/430]Elapsed 15.67s | Loss: 0.4317
Epoch: [5][350/430]Elapsed 18.32s | Loss: 0.4068
Epoch: [5][400/430]Elapsed 20.97s | Loss: 0.3846


----------------------------------------------------------------------------------------------------
Epoch 5 - Average Train Loss: 0.1859 | Average Valid Loss: 0.3741 | Time: 39.61s


Train:   0%|          | 0/265 [00:00<?, ?batch/s]

Epoch: [6][0/265]Elapsed 0.07s | Loss: 0.1743 Grad: 170901.3125 LR: 7.2598e-05
Epoch: [6][50/265]Elapsed 3.37s | Loss: 0.1702 Grad: 60517.1719 LR: 7.0118e-05
Epoch: [6][100/265]Elapsed 6.69s | Loss: 0.1737 Grad: 88187.5859 LR: 6.7577e-05
Epoch: [6][150/265]Elapsed 9.88s | Loss: 0.1776 Grad: 47873.1055 LR: 6.4984e-05
Epoch: [6][200/265]Elapsed 13.13s | Loss: 0.1799 Grad: 51901.8906 LR: 6.2345e-05
Epoch: [6][250/265]Elapsed 16.33s | Loss: 0.1773 Grad: 112411.8203 LR: 5.9670e-05
Epoch: [6][264/265]Elapsed 17.23s | Loss: 0.1768 Grad: 93361.8828 LR: 5.8915e-05


Valid:   0%|          | 0/430 [00:00<?, ?batch/s]

Epoch: [6][0/430]Elapsed 0.06s | Loss: 0.3678
Epoch: [6][50/430]Elapsed 2.53s | Loss: 0.4307
Epoch: [6][100/430]Elapsed 5.09s | Loss: 0.4273
Epoch: [6][150/430]Elapsed 7.71s | Loss: 0.4329
Epoch: [6][200/430]Elapsed 10.37s | Loss: 0.4202
Epoch: [6][250/430]Elapsed 13.02s | Loss: 0.4193
Epoch: [6][300/430]Elapsed 15.68s | Loss: 0.4170
Epoch: [6][350/430]Elapsed 18.31s | Loss: 0.3941
Epoch: [6][400/430]Elapsed 20.98s | Loss: 0.3726


----------------------------------------------------------------------------------------------------
Epoch 6 - Average Train Loss: 0.1768 | Average Valid Loss: 0.3636 | Time: 39.83s
Fold 0 Valid Loss: (Easy) 0.8017 | (Hard) 0.4214
Elapse: 3.98 min 
Fold: 1 || Valid size 4196 
- First Stage 


Train:   0%|          | 0/688 [00:00<?, ?batch/s]

Epoch: [1][0/688]Elapsed 0.07s | Loss: 0.8448 Grad: 100444.4844 LR: 4.0000e-06
Epoch: [1][50/688]Elapsed 3.40s | Loss: 0.8101 Grad: 50855.1250 LR: 4.8704e-06
Epoch: [1][100/688]Elapsed 6.58s | Loss: 0.7766 Grad: 40843.9414 LR: 7.4499e-06
Epoch: [1][150/688]Elapsed 9.76s | Loss: 0.7587 Grad: 46079.0938 LR: 1.1645e-05
Epoch: [1][200/688]Elapsed 12.94s | Loss: 0.7525 Grad: 70407.9766 LR: 1.7304e-05
Epoch: [1][250/688]Elapsed 16.14s | Loss: 0.7423 Grad: 68390.1641 LR: 2.4221e-05
Epoch: [1][300/688]Elapsed 19.32s | Loss: 0.7327 Grad: 82965.2734 LR: 3.2145e-05
Epoch: [1][350/688]Elapsed 22.50s | Loss: 0.7179 Grad: 150508.6719 LR: 4.0789e-05
Epoch: [1][400/688]Elapsed 25.68s | Loss: 0.7090 Grad: 65116.9141 LR: 4.9840e-05
Epoch: [1][450/688]Elapsed 28.87s | Loss: 0.6940 Grad: 106031.1172 LR: 5.8969e-05
Epoch: [1][500/688]Elapsed 32.09s | Loss: 0.6848 Grad: 110750.8203 LR: 6.7846e-05
Epoch: [1][550/688]Elapsed 35.28s | Loss: 0.6734 Grad: 81568.2812 LR: 7.6148e-05
Epoch: [1][600/688]Elapsed 38.4

Valid:   0%|          | 0/263 [00:00<?, ?batch/s]

Epoch: [1][0/263]Elapsed 0.06s | Loss: 0.5572
Epoch: [1][50/263]Elapsed 2.52s | Loss: 0.5577
Epoch: [1][100/263]Elapsed 5.10s | Loss: 0.5566
Epoch: [1][150/263]Elapsed 7.74s | Loss: 0.5492
Epoch: [1][200/263]Elapsed 10.40s | Loss: 0.5233
Epoch: [1][250/263]Elapsed 13.06s | Loss: 0.4866


----------------------------------------------------------------------------------------------------
Epoch 1 - Average Train Loss: 0.6474 | Average Valid Loss: 0.4819 | Time: 57.94s
Best model found in epoch 1 | valid loss: 0.4819


Train:   0%|          | 0/688 [00:00<?, ?batch/s]

Epoch: [2][0/688]Elapsed 0.07s | Loss: 0.5414 Grad: 110725.1172 LR: 9.3721e-05
Epoch: [2][50/688]Elapsed 3.28s | Loss: 0.5072 Grad: 99780.3359 LR: 9.7464e-05
Epoch: [2][100/688]Elapsed 6.46s | Loss: 0.5212 Grad: 90240.5859 LR: 9.9558e-05
Epoch: [2][150/688]Elapsed 9.67s | Loss: 0.5126 Grad: 69465.4297 LR: 9.9999e-05
Epoch: [2][200/688]Elapsed 12.92s | Loss: 0.4966 Grad: 66580.8281 LR: 9.9981e-05
Epoch: [2][250/688]Elapsed 16.10s | Loss: 0.4940 Grad: 77513.1797 LR: 9.9942e-05
Epoch: [2][300/688]Elapsed 19.38s | Loss: 0.4918 Grad: 86792.0703 LR: 9.9879e-05
Epoch: [2][350/688]Elapsed 22.68s | Loss: 0.4913 Grad: 71422.5703 LR: 9.9795e-05
Epoch: [2][400/688]Elapsed 26.00s | Loss: 0.4913 Grad: 80256.3438 LR: 9.9688e-05
Epoch: [2][450/688]Elapsed 29.18s | Loss: 0.4846 Grad: 76726.8203 LR: 9.9559e-05
Epoch: [2][500/688]Elapsed 32.37s | Loss: 0.4837 Grad: 73887.5312 LR: 9.9408e-05
Epoch: [2][550/688]Elapsed 35.55s | Loss: 0.4784 Grad: 75305.3672 LR: 9.9235e-05
Epoch: [2][600/688]Elapsed 38.73s 

Valid:   0%|          | 0/263 [00:00<?, ?batch/s]

Epoch: [2][0/263]Elapsed 0.05s | Loss: 0.4759
Epoch: [2][50/263]Elapsed 2.50s | Loss: 0.4485
Epoch: [2][100/263]Elapsed 5.06s | Loss: 0.4417
Epoch: [2][150/263]Elapsed 7.65s | Loss: 0.4277
Epoch: [2][200/263]Elapsed 10.28s | Loss: 0.4108
Epoch: [2][250/263]Elapsed 12.91s | Loss: 0.3991


----------------------------------------------------------------------------------------------------
Epoch 2 - Average Train Loss: 0.4682 | Average Valid Loss: 0.3995 | Time: 57.88s
Best model found in epoch 2 | valid loss: 0.3995


Train:   0%|          | 0/688 [00:00<?, ?batch/s]

Epoch: [3][0/688]Elapsed 0.07s | Loss: 0.5766 Grad: 183314.0312 LR: 9.8643e-05
Epoch: [3][50/688]Elapsed 3.25s | Loss: 0.4255 Grad: 65832.0078 LR: 9.8388e-05
Epoch: [3][100/688]Elapsed 6.42s | Loss: 0.4160 Grad: 75203.4531 LR: 9.8111e-05
Epoch: [3][150/688]Elapsed 9.60s | Loss: 0.4077 Grad: 70178.7109 LR: 9.7812e-05
Epoch: [3][200/688]Elapsed 12.78s | Loss: 0.3975 Grad: 51422.8672 LR: 9.7492e-05
Epoch: [3][250/688]Elapsed 15.95s | Loss: 0.3967 Grad: 73799.5234 LR: 9.7151e-05
Epoch: [3][300/688]Elapsed 19.12s | Loss: 0.4008 Grad: 100249.6484 LR: 9.6789e-05
Epoch: [3][350/688]Elapsed 22.29s | Loss: 0.4033 Grad: 73765.2188 LR: 9.6406e-05
Epoch: [3][400/688]Elapsed 25.48s | Loss: 0.4034 Grad: 83138.6875 LR: 9.6002e-05
Epoch: [3][450/688]Elapsed 28.64s | Loss: 0.4015 Grad: 66100.2109 LR: 9.5578e-05
Epoch: [3][500/688]Elapsed 31.82s | Loss: 0.4027 Grad: 76006.3828 LR: 9.5133e-05
Epoch: [3][550/688]Elapsed 35.00s | Loss: 0.4031 Grad: 74461.5625 LR: 9.4669e-05
Epoch: [3][600/688]Elapsed 38.17s

Valid:   0%|          | 0/263 [00:00<?, ?batch/s]

Epoch: [3][0/263]Elapsed 0.06s | Loss: 0.4026
Epoch: [3][50/263]Elapsed 2.52s | Loss: 0.4181
Epoch: [3][100/263]Elapsed 5.12s | Loss: 0.4047
Epoch: [3][150/263]Elapsed 7.78s | Loss: 0.3948
Epoch: [3][200/263]Elapsed 10.45s | Loss: 0.3758
Epoch: [3][250/263]Elapsed 13.09s | Loss: 0.3644


----------------------------------------------------------------------------------------------------
Epoch 3 - Average Train Loss: 0.3992 | Average Valid Loss: 0.3627 | Time: 57.52s
Best model found in epoch 3 | valid loss: 0.3627


Train:   0%|          | 0/688 [00:00<?, ?batch/s]

Epoch: [4][0/688]Elapsed 0.08s | Loss: 0.3200 Grad: 116085.8281 LR: 9.3283e-05
Epoch: [4][50/688]Elapsed 3.40s | Loss: 0.3788 Grad: 157410.2344 LR: 9.2744e-05
Epoch: [4][100/688]Elapsed 6.57s | Loss: 0.3608 Grad: 239467.4531 LR: 9.2186e-05
Epoch: [4][150/688]Elapsed 9.83s | Loss: 0.3553 Grad: 120621.6328 LR: 9.1610e-05
Epoch: [4][200/688]Elapsed 13.14s | Loss: 0.3479 Grad: 95925.8516 LR: 9.1014e-05
Epoch: [4][250/688]Elapsed 16.44s | Loss: 0.3524 Grad: 63775.4609 LR: 9.0401e-05
Epoch: [4][300/688]Elapsed 19.62s | Loss: 0.3566 Grad: 74392.8750 LR: 8.9769e-05
Epoch: [4][350/688]Elapsed 22.80s | Loss: 0.3614 Grad: 70376.1953 LR: 8.9120e-05
Epoch: [4][400/688]Elapsed 26.02s | Loss: 0.3641 Grad: 74598.0703 LR: 8.8453e-05
Epoch: [4][450/688]Elapsed 29.22s | Loss: 0.3615 Grad: 62086.7383 LR: 8.7769e-05
Epoch: [4][500/688]Elapsed 32.44s | Loss: 0.3633 Grad: 59912.1602 LR: 8.7068e-05
Epoch: [4][550/688]Elapsed 35.68s | Loss: 0.3654 Grad: 58511.3906 LR: 8.6351e-05
Epoch: [4][600/688]Elapsed 38.9

Valid:   0%|          | 0/263 [00:00<?, ?batch/s]

Epoch: [4][0/263]Elapsed 0.05s | Loss: 0.4707
Epoch: [4][50/263]Elapsed 2.55s | Loss: 0.4251
Epoch: [4][100/263]Elapsed 5.19s | Loss: 0.4161
Epoch: [4][150/263]Elapsed 7.88s | Loss: 0.3964
Epoch: [4][200/263]Elapsed 10.58s | Loss: 0.3828
Epoch: [4][250/263]Elapsed 13.27s | Loss: 0.3760


----------------------------------------------------------------------------------------------------
Epoch 4 - Average Train Loss: 0.3621 | Average Valid Loss: 0.3761 | Time: 58.53s


Train:   0%|          | 0/688 [00:00<?, ?batch/s]

Epoch: [5][0/688]Elapsed 0.09s | Loss: 0.2641 Grad: 115937.6641 LR: 8.4288e-05
Epoch: [5][50/688]Elapsed 3.40s | Loss: 0.3388 Grad: 58689.8281 LR: 8.3511e-05
Epoch: [5][100/688]Elapsed 6.71s | Loss: 0.3302 Grad: 117482.8672 LR: 8.2719e-05
Epoch: [5][150/688]Elapsed 10.10s | Loss: 0.3322 Grad: 66110.0391 LR: 8.1913e-05
Epoch: [5][200/688]Elapsed 13.37s | Loss: 0.3228 Grad: 32968.7305 LR: 8.1092e-05
Epoch: [5][250/688]Elapsed 16.58s | Loss: 0.3280 Grad: 89560.6953 LR: 8.0258e-05
Epoch: [5][300/688]Elapsed 19.77s | Loss: 0.3330 Grad: 60946.8789 LR: 7.9410e-05
Epoch: [5][350/688]Elapsed 22.97s | Loss: 0.3336 Grad: 72699.4219 LR: 7.8549e-05
Epoch: [5][400/688]Elapsed 26.16s | Loss: 0.3348 Grad: 75051.0078 LR: 7.7675e-05
Epoch: [5][450/688]Elapsed 29.50s | Loss: 0.3306 Grad: 34111.4219 LR: 7.6789e-05
Epoch: [5][500/688]Elapsed 32.72s | Loss: 0.3321 Grad: 71710.3906 LR: 7.5890e-05
Epoch: [5][550/688]Elapsed 35.95s | Loss: 0.3350 Grad: 62397.6914 LR: 7.4981e-05
Epoch: [5][600/688]Elapsed 39.16

Valid:   0%|          | 0/263 [00:00<?, ?batch/s]

Epoch: [5][0/263]Elapsed 0.05s | Loss: 0.3611
Epoch: [5][50/263]Elapsed 2.56s | Loss: 0.3827
Epoch: [5][100/263]Elapsed 5.20s | Loss: 0.3832
Epoch: [5][150/263]Elapsed 7.90s | Loss: 0.3649
Epoch: [5][200/263]Elapsed 10.60s | Loss: 0.3544
Epoch: [5][250/263]Elapsed 13.28s | Loss: 0.3512


----------------------------------------------------------------------------------------------------
Epoch 5 - Average Train Loss: 0.3330 | Average Valid Loss: 0.3516 | Time: 58.81s
Best model found in epoch 5 | valid loss: 0.3516


Train:   0%|          | 0/688 [00:00<?, ?batch/s]

Epoch: [6][0/688]Elapsed 0.08s | Loss: 0.2779 Grad: 131744.1719 LR: 7.2413e-05
Epoch: [6][50/688]Elapsed 3.44s | Loss: 0.3069 Grad: 51005.0430 LR: 7.1464e-05
Epoch: [6][100/688]Elapsed 6.78s | Loss: 0.3011 Grad: 113976.4375 LR: 7.0505e-05
Epoch: [6][150/688]Elapsed 10.14s | Loss: 0.3034 Grad: 80900.9219 LR: 6.9536e-05
Epoch: [6][200/688]Elapsed 13.40s | Loss: 0.2948 Grad: 57642.9961 LR: 6.8559e-05
Epoch: [6][250/688]Elapsed 16.62s | Loss: 0.2975 Grad: 66051.7031 LR: 6.7574e-05
Epoch: [6][300/688]Elapsed 19.84s | Loss: 0.3015 Grad: 95077.5703 LR: 6.6581e-05
Epoch: [6][350/688]Elapsed 23.05s | Loss: 0.3035 Grad: 64820.8789 LR: 6.5580e-05
Epoch: [6][400/688]Elapsed 26.26s | Loss: 0.3058 Grad: 83273.7188 LR: 6.4573e-05
Epoch: [6][450/688]Elapsed 29.48s | Loss: 0.3045 Grad: 42404.4531 LR: 6.3559e-05
Epoch: [6][500/688]Elapsed 32.71s | Loss: 0.3057 Grad: 92223.2812 LR: 6.2539e-05
Epoch: [6][550/688]Elapsed 35.93s | Loss: 0.3079 Grad: 72750.7578 LR: 6.1513e-05
Epoch: [6][600/688]Elapsed 39.15

Valid:   0%|          | 0/263 [00:00<?, ?batch/s]

Epoch: [6][0/263]Elapsed 0.05s | Loss: 0.3291
Epoch: [6][50/263]Elapsed 2.56s | Loss: 0.3676
Epoch: [6][100/263]Elapsed 5.16s | Loss: 0.3638
Epoch: [6][150/263]Elapsed 7.86s | Loss: 0.3556
Epoch: [6][200/263]Elapsed 10.55s | Loss: 0.3512
Epoch: [6][250/263]Elapsed 13.23s | Loss: 0.3627


----------------------------------------------------------------------------------------------------
Epoch 6 - Average Train Loss: 0.3069 | Average Valid Loss: 0.3657 | Time: 58.82s
Fold 1 Valid Loss: (Easy) 0.7089 | (Hard) 0.6892
Elapse: 5.84 min 
- Second Stage 
Use Checkpoint: ViTMAE_base_fold_1_stage_1.pth


Train:   0%|          | 0/310 [00:00<?, ?batch/s]

Epoch: [1][0/310]Elapsed 0.07s | Loss: 0.4979 Grad: 225450.2344 LR: 4.0000e-06
Epoch: [1][50/310]Elapsed 3.37s | Loss: 0.3376 Grad: 115073.7109 LR: 8.2610e-06
Epoch: [1][100/310]Elapsed 6.58s | Loss: 0.2962 Grad: 80301.5547 LR: 2.0288e-05
Epoch: [1][150/310]Elapsed 9.79s | Loss: 0.2786 Grad: 55540.2031 LR: 3.7945e-05
Epoch: [1][200/310]Elapsed 12.99s | Loss: 0.2667 Grad: 67248.7266 LR: 5.8097e-05
Epoch: [1][250/310]Elapsed 16.18s | Loss: 0.2612 Grad: 82177.4453 LR: 7.7167e-05
Epoch: [1][300/310]Elapsed 19.39s | Loss: 0.2537 Grad: 61013.2148 LR: 9.1768e-05
Epoch: [1][309/310]Elapsed 19.97s | Loss: 0.2529 Grad: 147799.9688 LR: 9.3905e-05


Valid:   0%|          | 0/263 [00:00<?, ?batch/s]

Epoch: [1][0/263]Elapsed 0.05s | Loss: 0.4397
Epoch: [1][50/263]Elapsed 2.57s | Loss: 0.4789
Epoch: [1][100/263]Elapsed 5.20s | Loss: 0.4702
Epoch: [1][150/263]Elapsed 7.90s | Loss: 0.4520
Epoch: [1][200/263]Elapsed 10.59s | Loss: 0.4176
Epoch: [1][250/263]Elapsed 13.28s | Loss: 0.3782


----------------------------------------------------------------------------------------------------
Epoch 1 - Average Train Loss: 0.2529 | Average Valid Loss: 0.3712 | Time: 33.99s
Best model found in epoch 1 | valid loss: 0.3712


Train:   0%|          | 0/310 [00:00<?, ?batch/s]

Epoch: [2][0/310]Elapsed 0.07s | Loss: 0.2278 Grad: 71299.1016 LR: 9.3905e-05
Epoch: [2][50/310]Elapsed 3.28s | Loss: 0.2289 Grad: 66296.0938 LR: 9.9827e-05
Epoch: [2][100/310]Elapsed 6.49s | Loss: 0.2217 Grad: 57176.6992 LR: 9.9965e-05
Epoch: [2][150/310]Elapsed 9.71s | Loss: 0.2207 Grad: 56318.3984 LR: 9.9822e-05
Epoch: [2][200/310]Elapsed 12.91s | Loss: 0.2213 Grad: 55996.1836 LR: 9.9569e-05
Epoch: [2][250/310]Elapsed 16.11s | Loss: 0.2218 Grad: 91462.8438 LR: 9.9208e-05
Epoch: [2][300/310]Elapsed 19.37s | Loss: 0.2200 Grad: 67472.1641 LR: 9.8738e-05
Epoch: [2][309/310]Elapsed 19.98s | Loss: 0.2199 Grad: 132452.3906 LR: 9.8631e-05


Valid:   0%|          | 0/263 [00:00<?, ?batch/s]

Epoch: [2][0/263]Elapsed 0.05s | Loss: 0.5574
Epoch: [2][50/263]Elapsed 2.54s | Loss: 0.5015
Epoch: [2][100/263]Elapsed 5.14s | Loss: 0.4959
Epoch: [2][150/263]Elapsed 7.82s | Loss: 0.4737
Epoch: [2][200/263]Elapsed 10.51s | Loss: 0.4377
Epoch: [2][250/263]Elapsed 13.21s | Loss: 0.3930


----------------------------------------------------------------------------------------------------
Epoch 2 - Average Train Loss: 0.2199 | Average Valid Loss: 0.3854 | Time: 33.94s


Train:   0%|          | 0/310 [00:00<?, ?batch/s]

Epoch: [3][0/310]Elapsed 0.07s | Loss: 0.2078 Grad: 71437.3125 LR: 9.8631e-05
Epoch: [3][50/310]Elapsed 3.31s | Loss: 0.2151 Grad: 42390.0625 LR: 9.8033e-05
Epoch: [3][100/310]Elapsed 6.54s | Loss: 0.2062 Grad: 46328.0664 LR: 9.7329e-05
Epoch: [3][150/310]Elapsed 9.75s | Loss: 0.2051 Grad: 74149.6328 LR: 9.6521e-05
Epoch: [3][200/310]Elapsed 12.95s | Loss: 0.2064 Grad: 80213.6094 LR: 9.5610e-05
Epoch: [3][250/310]Elapsed 16.16s | Loss: 0.2077 Grad: 74812.5234 LR: 9.4599e-05
Epoch: [3][300/310]Elapsed 19.36s | Loss: 0.2053 Grad: 66209.5781 LR: 9.3490e-05
Epoch: [3][309/310]Elapsed 19.95s | Loss: 0.2056 Grad: 138378.1719 LR: 9.3257e-05


Valid:   0%|          | 0/263 [00:00<?, ?batch/s]

Epoch: [3][0/263]Elapsed 0.05s | Loss: 0.5098
Epoch: [3][50/263]Elapsed 2.53s | Loss: 0.4944
Epoch: [3][100/263]Elapsed 5.11s | Loss: 0.4931
Epoch: [3][150/263]Elapsed 7.78s | Loss: 0.4707
Epoch: [3][200/263]Elapsed 10.47s | Loss: 0.4341
Epoch: [3][250/263]Elapsed 13.15s | Loss: 0.3901


----------------------------------------------------------------------------------------------------
Epoch 3 - Average Train Loss: 0.2056 | Average Valid Loss: 0.3824 | Time: 33.84s


Train:   0%|          | 0/310 [00:00<?, ?batch/s]

Epoch: [4][0/310]Elapsed 0.07s | Loss: 0.1872 Grad: 64458.1016 LR: 9.3257e-05
Epoch: [4][50/310]Elapsed 3.42s | Loss: 0.1943 Grad: 73127.9688 LR: 9.2034e-05
Epoch: [4][100/310]Elapsed 6.76s | Loss: 0.1902 Grad: 68855.8984 LR: 9.0718e-05
Epoch: [4][150/310]Elapsed 10.11s | Loss: 0.1919 Grad: 53297.6211 LR: 8.9312e-05
Epoch: [4][200/310]Elapsed 13.47s | Loss: 0.1922 Grad: 77607.1562 LR: 8.7820e-05
Epoch: [4][250/310]Elapsed 16.84s | Loss: 0.1932 Grad: 67898.8828 LR: 8.6245e-05
Epoch: [4][300/310]Elapsed 20.04s | Loss: 0.1912 Grad: 65302.0898 LR: 8.4590e-05
Epoch: [4][309/310]Elapsed 20.62s | Loss: 0.1910 Grad: 119600.5234 LR: 8.4250e-05


Valid:   0%|          | 0/263 [00:00<?, ?batch/s]

Epoch: [4][0/263]Elapsed 0.05s | Loss: 0.5196
Epoch: [4][50/263]Elapsed 2.55s | Loss: 0.5122
Epoch: [4][100/263]Elapsed 5.21s | Loss: 0.5065
Epoch: [4][150/263]Elapsed 7.90s | Loss: 0.4801
Epoch: [4][200/263]Elapsed 10.59s | Loss: 0.4430
Epoch: [4][250/263]Elapsed 13.28s | Loss: 0.3978


----------------------------------------------------------------------------------------------------
Epoch 4 - Average Train Loss: 0.1910 | Average Valid Loss: 0.3910 | Time: 34.64s


Train:   0%|          | 0/310 [00:00<?, ?batch/s]

Epoch: [5][0/310]Elapsed 0.07s | Loss: 0.2289 Grad: 100719.3359 LR: 8.4250e-05
Epoch: [5][50/310]Elapsed 3.46s | Loss: 0.1897 Grad: 74599.9453 LR: 8.2504e-05
Epoch: [5][100/310]Elapsed 6.82s | Loss: 0.1831 Grad: 44348.6953 LR: 8.0687e-05
Epoch: [5][150/310]Elapsed 10.08s | Loss: 0.1859 Grad: 56699.8008 LR: 7.8802e-05
Epoch: [5][200/310]Elapsed 13.39s | Loss: 0.1853 Grad: 78025.8828 LR: 7.6854e-05
Epoch: [5][250/310]Elapsed 16.60s | Loss: 0.1857 Grad: 49435.4531 LR: 7.4847e-05
Epoch: [5][300/310]Elapsed 19.82s | Loss: 0.1828 Grad: 49491.8008 LR: 7.2786e-05
Epoch: [5][309/310]Elapsed 20.40s | Loss: 0.1823 Grad: 91015.4297 LR: 7.2367e-05


Valid:   0%|          | 0/263 [00:00<?, ?batch/s]

Epoch: [5][0/263]Elapsed 0.05s | Loss: 0.4275
Epoch: [5][50/263]Elapsed 2.55s | Loss: 0.4993
Epoch: [5][100/263]Elapsed 5.18s | Loss: 0.4891
Epoch: [5][150/263]Elapsed 7.87s | Loss: 0.4648
Epoch: [5][200/263]Elapsed 10.58s | Loss: 0.4299
Epoch: [5][250/263]Elapsed 13.26s | Loss: 0.3869


----------------------------------------------------------------------------------------------------
Epoch 5 - Average Train Loss: 0.1823 | Average Valid Loss: 0.3799 | Time: 34.41s


Train:   0%|          | 0/310 [00:00<?, ?batch/s]

Epoch: [6][0/310]Elapsed 0.09s | Loss: 0.2297 Grad: 86213.1641 LR: 7.2367e-05
Epoch: [6][50/310]Elapsed 3.33s | Loss: 0.1772 Grad: 51848.6406 LR: 7.0246e-05
Epoch: [6][100/310]Elapsed 6.61s | Loss: 0.1702 Grad: 53034.8516 LR: 6.8080e-05
Epoch: [6][150/310]Elapsed 9.84s | Loss: 0.1732 Grad: 65503.3242 LR: 6.5874e-05
Epoch: [6][200/310]Elapsed 13.06s | Loss: 0.1742 Grad: 61631.5703 LR: 6.3634e-05
Epoch: [6][250/310]Elapsed 16.42s | Loss: 0.1759 Grad: 67630.6641 LR: 6.1364e-05
Epoch: [6][300/310]Elapsed 19.64s | Loss: 0.1734 Grad: 84886.6250 LR: 5.9068e-05
Epoch: [6][309/310]Elapsed 20.23s | Loss: 0.1733 Grad: 134524.5469 LR: 5.8607e-05


Valid:   0%|          | 0/263 [00:00<?, ?batch/s]

Epoch: [6][0/263]Elapsed 0.06s | Loss: 0.4464
Epoch: [6][50/263]Elapsed 2.55s | Loss: 0.4979
Epoch: [6][100/263]Elapsed 5.18s | Loss: 0.4935
Epoch: [6][150/263]Elapsed 7.87s | Loss: 0.4680
Epoch: [6][200/263]Elapsed 10.56s | Loss: 0.4318
Epoch: [6][250/263]Elapsed 13.26s | Loss: 0.3879


----------------------------------------------------------------------------------------------------
Epoch 6 - Average Train Loss: 0.1733 | Average Valid Loss: 0.3810 | Time: 34.23s
Fold 1 Valid Loss: (Easy) 0.8666 | (Hard) 0.4378
Elapse: 3.43 min 
Fold: 2 || Valid size 3358 
- First Stage 


Train:   0%|          | 0/733 [00:00<?, ?batch/s]

Epoch: [1][0/733]Elapsed 0.06s | Loss: 0.8474 Grad: 110023.9922 LR: 4.0000e-06
Epoch: [1][50/733]Elapsed 3.27s | Loss: 0.7850 Grad: 54207.1094 LR: 4.7668e-06
Epoch: [1][100/733]Elapsed 6.46s | Loss: 0.7611 Grad: 43369.1875 LR: 7.0428e-06
Epoch: [1][150/733]Elapsed 9.82s | Loss: 0.7469 Grad: 92422.9062 LR: 1.0755e-05
Epoch: [1][200/733]Elapsed 13.17s | Loss: 0.7379 Grad: 54772.4609 LR: 1.5785e-05
Epoch: [1][250/733]Elapsed 16.44s | Loss: 0.7359 Grad: 57654.6211 LR: 2.1973e-05
Epoch: [1][300/733]Elapsed 19.78s | Loss: 0.7301 Grad: 62945.9023 LR: 2.9120e-05
Epoch: [1][350/733]Elapsed 23.01s | Loss: 0.7158 Grad: 63493.7617 LR: 3.6997e-05
Epoch: [1][400/733]Elapsed 26.20s | Loss: 0.7049 Grad: 72179.7656 LR: 4.5354e-05
Epoch: [1][450/733]Elapsed 29.44s | Loss: 0.6941 Grad: 111966.7188 LR: 5.3924e-05
Epoch: [1][500/733]Elapsed 32.63s | Loss: 0.6821 Grad: 100885.9688 LR: 6.2432e-05
Epoch: [1][550/733]Elapsed 35.82s | Loss: 0.6759 Grad: 76118.3047 LR: 7.0607e-05
Epoch: [1][600/733]Elapsed 39.02

Valid:   0%|          | 0/210 [00:00<?, ?batch/s]

Epoch: [1][0/210]Elapsed 0.05s | Loss: 0.3399
Epoch: [1][50/210]Elapsed 2.54s | Loss: 0.4824
Epoch: [1][100/210]Elapsed 5.17s | Loss: 0.4912
Epoch: [1][150/210]Elapsed 7.86s | Loss: 0.4855
Epoch: [1][200/210]Elapsed 10.53s | Loss: 0.4574


----------------------------------------------------------------------------------------------------
Epoch 1 - Average Train Loss: 0.6406 | Average Valid Loss: 0.4556 | Time: 58.56s
Best model found in epoch 1 | valid loss: 0.4556


Train:   0%|          | 0/733 [00:00<?, ?batch/s]

Epoch: [2][0/733]Elapsed 0.07s | Loss: 0.6367 Grad: 138131.6250 LR: 9.3626e-05
Epoch: [2][50/733]Elapsed 3.35s | Loss: 0.4865 Grad: 92758.3594 LR: 9.7217e-05
Epoch: [2][100/733]Elapsed 6.61s | Loss: 0.4975 Grad: 59636.9219 LR: 9.9362e-05
Epoch: [2][150/733]Elapsed 9.83s | Loss: 0.4837 Grad: 77994.0078 LR: 1.0000e-04
Epoch: [2][200/733]Elapsed 13.12s | Loss: 0.4932 Grad: 77626.0781 LR: 9.9988e-05
Epoch: [2][250/733]Elapsed 16.38s | Loss: 0.4855 Grad: 58132.4141 LR: 9.9957e-05
Epoch: [2][300/733]Elapsed 19.66s | Loss: 0.4848 Grad: 65531.7031 LR: 9.9906e-05
Epoch: [2][350/733]Elapsed 22.92s | Loss: 0.4769 Grad: 55612.3359 LR: 9.9836e-05
Epoch: [2][400/733]Elapsed 26.33s | Loss: 0.4758 Grad: 56520.7852 LR: 9.9746e-05
Epoch: [2][450/733]Elapsed 29.59s | Loss: 0.4729 Grad: 94232.3203 LR: 9.9636e-05
Epoch: [2][500/733]Elapsed 32.82s | Loss: 0.4693 Grad: 84381.9141 LR: 9.9507e-05
Epoch: [2][550/733]Elapsed 36.08s | Loss: 0.4703 Grad: 66714.0625 LR: 9.9358e-05
Epoch: [2][600/733]Elapsed 39.34s 

Valid:   0%|          | 0/210 [00:00<?, ?batch/s]

Epoch: [2][0/210]Elapsed 0.06s | Loss: 0.2195
Epoch: [2][50/210]Elapsed 2.55s | Loss: 0.4203
Epoch: [2][100/210]Elapsed 5.17s | Loss: 0.4415
Epoch: [2][150/210]Elapsed 7.84s | Loss: 0.4294
Epoch: [2][200/210]Elapsed 10.53s | Loss: 0.4091


----------------------------------------------------------------------------------------------------
Epoch 2 - Average Train Loss: 0.4584 | Average Valid Loss: 0.4088 | Time: 59.18s
Best model found in epoch 2 | valid loss: 0.4088


Train:   0%|          | 0/733 [00:00<?, ?batch/s]

Epoch: [3][0/733]Elapsed 0.09s | Loss: 0.5515 Grad: 230262.6250 LR: 9.8653e-05
Epoch: [3][50/733]Elapsed 3.47s | Loss: 0.4106 Grad: 146552.2812 LR: 9.8415e-05
Epoch: [3][100/733]Elapsed 6.69s | Loss: 0.4046 Grad: 191979.0312 LR: 9.8157e-05
Epoch: [3][150/733]Elapsed 9.94s | Loss: 0.4074 Grad: 193847.2188 LR: 9.7881e-05
Epoch: [3][200/733]Elapsed 13.21s | Loss: 0.4182 Grad: 116852.8438 LR: 9.7586e-05
Epoch: [3][250/733]Elapsed 16.53s | Loss: 0.4122 Grad: 73736.6484 LR: 9.7272e-05
Epoch: [3][300/733]Elapsed 19.73s | Loss: 0.4111 Grad: 52675.0273 LR: 9.6940e-05
Epoch: [3][350/733]Elapsed 23.05s | Loss: 0.4039 Grad: 56050.5703 LR: 9.6589e-05
Epoch: [3][400/733]Elapsed 26.35s | Loss: 0.4027 Grad: 103404.5547 LR: 9.6219e-05
Epoch: [3][450/733]Elapsed 29.60s | Loss: 0.4032 Grad: 100287.5078 LR: 9.5832e-05
Epoch: [3][500/733]Elapsed 32.84s | Loss: 0.3986 Grad: 88420.1562 LR: 9.5426e-05
Epoch: [3][550/733]Elapsed 36.06s | Loss: 0.4013 Grad: 64033.3750 LR: 9.5003e-05
Epoch: [3][600/733]Elapsed 3

Valid:   0%|          | 0/210 [00:00<?, ?batch/s]

Epoch: [3][0/210]Elapsed 0.05s | Loss: 0.2633
Epoch: [3][50/210]Elapsed 2.54s | Loss: 0.4059
Epoch: [3][100/210]Elapsed 5.17s | Loss: 0.4300
Epoch: [3][150/210]Elapsed 7.84s | Loss: 0.4120
Epoch: [3][200/210]Elapsed 10.52s | Loss: 0.3965


----------------------------------------------------------------------------------------------------
Epoch 3 - Average Train Loss: 0.3954 | Average Valid Loss: 0.3965 | Time: 58.92s
Best model found in epoch 3 | valid loss: 0.3965


Train:   0%|          | 0/733 [00:00<?, ?batch/s]

Epoch: [4][0/733]Elapsed 0.07s | Loss: 0.2792 Grad: 162908.1562 LR: 9.3314e-05
Epoch: [4][50/733]Elapsed 3.30s | Loss: 0.3548 Grad: 77184.0625 LR: 9.2810e-05
Epoch: [4][100/733]Elapsed 6.51s | Loss: 0.3546 Grad: 99262.3984 LR: 9.2289e-05
Epoch: [4][150/733]Elapsed 9.85s | Loss: 0.3525 Grad: 117689.6484 LR: 9.1752e-05
Epoch: [4][200/733]Elapsed 13.08s | Loss: 0.3603 Grad: 64379.1992 LR: 9.1198e-05
Epoch: [4][250/733]Elapsed 16.33s | Loss: 0.3562 Grad: 77791.5703 LR: 9.0627e-05
Epoch: [4][300/733]Elapsed 19.53s | Loss: 0.3585 Grad: 52699.6680 LR: 9.0041e-05
Epoch: [4][350/733]Elapsed 22.82s | Loss: 0.3531 Grad: 43461.0938 LR: 8.9439e-05
Epoch: [4][400/733]Elapsed 26.13s | Loss: 0.3568 Grad: 84726.4375 LR: 8.8822e-05
Epoch: [4][450/733]Elapsed 29.34s | Loss: 0.3579 Grad: 76209.9297 LR: 8.8189e-05
Epoch: [4][500/733]Elapsed 32.57s | Loss: 0.3554 Grad: 101454.4375 LR: 8.7542e-05
Epoch: [4][550/733]Elapsed 35.78s | Loss: 0.3590 Grad: 72798.6406 LR: 8.6879e-05
Epoch: [4][600/733]Elapsed 39.01

Valid:   0%|          | 0/210 [00:00<?, ?batch/s]

Epoch: [4][0/210]Elapsed 0.06s | Loss: 0.1934
Epoch: [4][50/210]Elapsed 2.54s | Loss: 0.4125
Epoch: [4][100/210]Elapsed 5.15s | Loss: 0.4276
Epoch: [4][150/210]Elapsed 7.84s | Loss: 0.4147
Epoch: [4][200/210]Elapsed 10.51s | Loss: 0.4084


----------------------------------------------------------------------------------------------------
Epoch 4 - Average Train Loss: 0.3566 | Average Valid Loss: 0.4099 | Time: 58.90s


Train:   0%|          | 0/733 [00:00<?, ?batch/s]

Epoch: [5][0/733]Elapsed 0.07s | Loss: 0.2435 Grad: 163639.3281 LR: 8.4347e-05
Epoch: [5][50/733]Elapsed 3.31s | Loss: 0.3152 Grad: 160513.4219 LR: 8.3620e-05
Epoch: [5][100/733]Elapsed 6.53s | Loss: 0.3131 Grad: 202108.5156 LR: 8.2879e-05
Epoch: [5][150/733]Elapsed 9.78s | Loss: 0.3127 Grad: 180029.3594 LR: 8.2125e-05
Epoch: [5][200/733]Elapsed 13.03s | Loss: 0.3262 Grad: 135542.8125 LR: 8.1359e-05
Epoch: [5][250/733]Elapsed 16.25s | Loss: 0.3262 Grad: 188057.9375 LR: 8.0580e-05
Epoch: [5][300/733]Elapsed 19.47s | Loss: 0.3261 Grad: 52256.0117 LR: 7.9790e-05
Epoch: [5][350/733]Elapsed 22.70s | Loss: 0.3202 Grad: 46011.6992 LR: 7.8987e-05
Epoch: [5][400/733]Elapsed 25.91s | Loss: 0.3217 Grad: 133174.6875 LR: 7.8173e-05
Epoch: [5][450/733]Elapsed 29.23s | Loss: 0.3214 Grad: 72496.8750 LR: 7.7348e-05
Epoch: [5][500/733]Elapsed 32.44s | Loss: 0.3210 Grad: 75058.9688 LR: 7.6513e-05
Epoch: [5][550/733]Elapsed 35.63s | Loss: 0.3260 Grad: 61315.7109 LR: 7.5667e-05
Epoch: [5][600/733]Elapsed 3

Valid:   0%|          | 0/210 [00:00<?, ?batch/s]

Epoch: [5][0/210]Elapsed 0.06s | Loss: 0.2460
Epoch: [5][50/210]Elapsed 2.54s | Loss: 0.3675
Epoch: [5][100/210]Elapsed 5.15s | Loss: 0.3858
Epoch: [5][150/210]Elapsed 7.82s | Loss: 0.3810
Epoch: [5][200/210]Elapsed 10.50s | Loss: 0.3870


----------------------------------------------------------------------------------------------------
Epoch 5 - Average Train Loss: 0.3252 | Average Valid Loss: 0.3905 | Time: 58.37s
Best model found in epoch 5 | valid loss: 0.3905


Train:   0%|          | 0/733 [00:00<?, ?batch/s]

Epoch: [6][0/733]Elapsed 0.09s | Loss: 0.2441 Grad: 152680.1250 LR: 7.2504e-05
Epoch: [6][50/733]Elapsed 3.38s | Loss: 0.3022 Grad: 56063.1289 LR: 7.1614e-05
Epoch: [6][100/733]Elapsed 6.57s | Loss: 0.2909 Grad: 78554.1406 LR: 7.0716e-05
Epoch: [6][150/733]Elapsed 9.77s | Loss: 0.2941 Grad: 84347.8672 LR: 6.9809e-05
Epoch: [6][200/733]Elapsed 13.08s | Loss: 0.3003 Grad: 70693.7578 LR: 6.8894e-05
Epoch: [6][250/733]Elapsed 16.30s | Loss: 0.3002 Grad: 93264.0234 LR: 6.7972e-05
Epoch: [6][300/733]Elapsed 19.55s | Loss: 0.3001 Grad: 77059.5156 LR: 6.7043e-05
Epoch: [6][350/733]Elapsed 22.79s | Loss: 0.2928 Grad: 32510.8730 LR: 6.6108e-05
Epoch: [6][400/733]Elapsed 25.99s | Loss: 0.2940 Grad: 76513.1719 LR: 6.5166e-05
Epoch: [6][450/733]Elapsed 29.21s | Loss: 0.2961 Grad: 73868.2344 LR: 6.4218e-05
Epoch: [6][500/733]Elapsed 32.41s | Loss: 0.2959 Grad: 73045.5078 LR: 6.3264e-05
Epoch: [6][550/733]Elapsed 35.71s | Loss: 0.2991 Grad: 69051.9219 LR: 6.2305e-05
Epoch: [6][600/733]Elapsed 38.92s 

Valid:   0%|          | 0/210 [00:00<?, ?batch/s]

Epoch: [6][0/210]Elapsed 0.05s | Loss: 0.2237
Epoch: [6][50/210]Elapsed 2.53s | Loss: 0.3631
Epoch: [6][100/210]Elapsed 5.12s | Loss: 0.3907
Epoch: [6][150/210]Elapsed 7.79s | Loss: 0.3807
Epoch: [6][200/210]Elapsed 10.46s | Loss: 0.3877


----------------------------------------------------------------------------------------------------
Epoch 6 - Average Train Loss: 0.2986 | Average Valid Loss: 0.3922 | Time: 58.46s
Fold 2 Valid Loss: (Easy) 0.7626 | (Hard) 0.8187
Elapse: 5.89 min 
- Second Stage 
Use Checkpoint: ViTMAE_base_fold_2_stage_1.pth


Train:   0%|          | 0/318 [00:00<?, ?batch/s]

Epoch: [1][0/318]Elapsed 0.07s | Loss: 0.4384 Grad: 191232.1875 LR: 4.0000e-06
Epoch: [1][50/318]Elapsed 3.35s | Loss: 0.3287 Grad: 97425.8047 LR: 8.0513e-06
Epoch: [1][100/318]Elapsed 6.70s | Loss: 0.2975 Grad: 68689.5703 LR: 1.9521e-05
Epoch: [1][150/318]Elapsed 9.93s | Loss: 0.2735 Grad: 73113.4453 LR: 3.6474e-05
Epoch: [1][200/318]Elapsed 13.18s | Loss: 0.2634 Grad: 48699.7461 LR: 5.6047e-05
Epoch: [1][250/318]Elapsed 16.41s | Loss: 0.2582 Grad: 58923.5898 LR: 7.4937e-05
Epoch: [1][300/318]Elapsed 19.63s | Loss: 0.2522 Grad: 83881.3125 LR: 8.9956e-05
Epoch: [1][317/318]Elapsed 20.75s | Loss: 0.2507 Grad: 108990.2188 LR: 9.3896e-05


Valid:   0%|          | 0/210 [00:00<?, ?batch/s]

Epoch: [1][0/210]Elapsed 0.06s | Loss: 0.3440
Epoch: [1][50/210]Elapsed 2.55s | Loss: 0.4534
Epoch: [1][100/210]Elapsed 5.14s | Loss: 0.4667
Epoch: [1][150/210]Elapsed 7.79s | Loss: 0.4339
Epoch: [1][200/210]Elapsed 10.49s | Loss: 0.3808


----------------------------------------------------------------------------------------------------
Epoch 1 - Average Train Loss: 0.2507 | Average Valid Loss: 0.3740 | Time: 31.85s
Best model found in epoch 1 | valid loss: 0.3740


Train:   0%|          | 0/318 [00:00<?, ?batch/s]

Epoch: [2][0/318]Elapsed 0.08s | Loss: 0.2583 Grad: 77261.0859 LR: 9.3896e-05
Epoch: [2][50/318]Elapsed 3.46s | Loss: 0.2237 Grad: 66741.7188 LR: 9.9779e-05
Epoch: [2][100/318]Elapsed 6.74s | Loss: 0.2223 Grad: 53984.0977 LR: 9.9969e-05
Epoch: [2][150/318]Elapsed 9.96s | Loss: 0.2164 Grad: 72071.1562 LR: 9.9837e-05
Epoch: [2][200/318]Elapsed 13.19s | Loss: 0.2171 Grad: 67927.2031 LR: 9.9600e-05
Epoch: [2][250/318]Elapsed 16.43s | Loss: 0.2189 Grad: 59583.6250 LR: 9.9260e-05
Epoch: [2][300/318]Elapsed 19.66s | Loss: 0.2156 Grad: 69455.6094 LR: 9.8816e-05
Epoch: [2][317/318]Elapsed 20.76s | Loss: 0.2146 Grad: 121428.6094 LR: 9.8632e-05


Valid:   0%|          | 0/210 [00:00<?, ?batch/s]

Epoch: [2][0/210]Elapsed 0.06s | Loss: 0.3877
Epoch: [2][50/210]Elapsed 2.56s | Loss: 0.4778
Epoch: [2][100/210]Elapsed 5.17s | Loss: 0.4854
Epoch: [2][150/210]Elapsed 7.85s | Loss: 0.4480
Epoch: [2][200/210]Elapsed 10.53s | Loss: 0.3899


----------------------------------------------------------------------------------------------------
Epoch 2 - Average Train Loss: 0.2146 | Average Valid Loss: 0.3828 | Time: 31.90s


Train:   0%|          | 0/318 [00:00<?, ?batch/s]

Epoch: [3][0/318]Elapsed 0.07s | Loss: 0.2367 Grad: 81175.8750 LR: 9.8632e-05
Epoch: [3][50/318]Elapsed 3.45s | Loss: 0.2079 Grad: 76580.5938 LR: 9.8050e-05
Epoch: [3][100/318]Elapsed 6.80s | Loss: 0.2092 Grad: 46910.2578 LR: 9.7367e-05
Epoch: [3][150/318]Elapsed 10.08s | Loss: 0.2045 Grad: 69346.1094 LR: 9.6586e-05
Epoch: [3][200/318]Elapsed 13.30s | Loss: 0.2040 Grad: 54838.0742 LR: 9.5707e-05
Epoch: [3][250/318]Elapsed 16.52s | Loss: 0.2060 Grad: 62108.0000 LR: 9.4733e-05
Epoch: [3][300/318]Elapsed 19.77s | Loss: 0.2037 Grad: 85393.3984 LR: 9.3665e-05
Epoch: [3][317/318]Elapsed 20.92s | Loss: 0.2030 Grad: 115079.0156 LR: 9.3258e-05


Valid:   0%|          | 0/210 [00:00<?, ?batch/s]

Epoch: [3][0/210]Elapsed 0.05s | Loss: 0.3691
Epoch: [3][50/210]Elapsed 2.54s | Loss: 0.4819
Epoch: [3][100/210]Elapsed 5.16s | Loss: 0.4925
Epoch: [3][150/210]Elapsed 7.84s | Loss: 0.4566
Epoch: [3][200/210]Elapsed 10.52s | Loss: 0.3952


----------------------------------------------------------------------------------------------------
Epoch 3 - Average Train Loss: 0.2030 | Average Valid Loss: 0.3869 | Time: 32.06s


Train:   0%|          | 0/318 [00:00<?, ?batch/s]

Epoch: [4][0/318]Elapsed 0.07s | Loss: 0.2303 Grad: 89538.4219 LR: 9.3258e-05
Epoch: [4][50/318]Elapsed 3.55s | Loss: 0.1926 Grad: 60088.4219 LR: 9.2067e-05
Epoch: [4][100/318]Elapsed 6.93s | Loss: 0.1917 Grad: 59682.5469 LR: 9.0787e-05
Epoch: [4][150/318]Elapsed 10.42s | Loss: 0.1892 Grad: 77308.5625 LR: 8.9423e-05
Epoch: [4][200/318]Elapsed 13.66s | Loss: 0.1891 Grad: 55237.4453 LR: 8.7976e-05
Epoch: [4][250/318]Elapsed 16.92s | Loss: 0.1918 Grad: 53151.6602 LR: 8.6449e-05
Epoch: [4][300/318]Elapsed 20.15s | Loss: 0.1887 Grad: 59511.9375 LR: 8.4847e-05
Epoch: [4][317/318]Elapsed 21.25s | Loss: 0.1884 Grad: 130677.8438 LR: 8.4252e-05


Valid:   0%|          | 0/210 [00:00<?, ?batch/s]

Epoch: [4][0/210]Elapsed 0.05s | Loss: 0.3623
Epoch: [4][50/210]Elapsed 2.54s | Loss: 0.4914
Epoch: [4][100/210]Elapsed 5.12s | Loss: 0.5053
Epoch: [4][150/210]Elapsed 7.79s | Loss: 0.4651
Epoch: [4][200/210]Elapsed 10.47s | Loss: 0.4017


----------------------------------------------------------------------------------------------------
Epoch 4 - Average Train Loss: 0.1884 | Average Valid Loss: 0.3930 | Time: 32.33s


Train:   0%|          | 0/318 [00:00<?, ?batch/s]

Epoch: [5][0/318]Elapsed 0.07s | Loss: 0.1760 Grad: 79973.5625 LR: 8.4252e-05
Epoch: [5][50/318]Elapsed 3.42s | Loss: 0.1854 Grad: 76961.5234 LR: 8.2551e-05
Epoch: [5][100/318]Elapsed 6.73s | Loss: 0.1827 Grad: 47572.3594 LR: 8.0782e-05
Epoch: [5][150/318]Elapsed 10.04s | Loss: 0.1801 Grad: 82632.6719 LR: 7.8949e-05
Epoch: [5][200/318]Elapsed 13.45s | Loss: 0.1802 Grad: 62068.8789 LR: 7.7055e-05
Epoch: [5][250/318]Elapsed 16.75s | Loss: 0.1817 Grad: 53371.6172 LR: 7.5105e-05
Epoch: [5][300/318]Elapsed 20.00s | Loss: 0.1789 Grad: 64810.3555 LR: 7.3102e-05
Epoch: [5][317/318]Elapsed 21.11s | Loss: 0.1790 Grad: 110660.2734 LR: 7.2369e-05


Valid:   0%|          | 0/210 [00:00<?, ?batch/s]

Epoch: [5][0/210]Elapsed 0.06s | Loss: 0.4498
Epoch: [5][50/210]Elapsed 2.54s | Loss: 0.5083
Epoch: [5][100/210]Elapsed 5.15s | Loss: 0.5247
Epoch: [5][150/210]Elapsed 7.83s | Loss: 0.4798
Epoch: [5][200/210]Elapsed 10.51s | Loss: 0.4118


----------------------------------------------------------------------------------------------------
Epoch 5 - Average Train Loss: 0.1790 | Average Valid Loss: 0.4031 | Time: 32.23s


Train:   0%|          | 0/318 [00:00<?, ?batch/s]

Epoch: [6][0/318]Elapsed 0.09s | Loss: 0.1809 Grad: 94466.9844 LR: 7.2369e-05
Epoch: [6][50/318]Elapsed 3.35s | Loss: 0.1693 Grad: 57349.2539 LR: 7.0302e-05
Epoch: [6][100/318]Elapsed 6.69s | Loss: 0.1693 Grad: 42213.7852 LR: 6.8192e-05
Epoch: [6][150/318]Elapsed 9.92s | Loss: 0.1678 Grad: 105314.1562 LR: 6.6044e-05
Epoch: [6][200/318]Elapsed 13.21s | Loss: 0.1682 Grad: 79356.9375 LR: 6.3863e-05
Epoch: [6][250/318]Elapsed 16.59s | Loss: 0.1700 Grad: 57108.7305 LR: 6.1653e-05
Epoch: [6][300/318]Elapsed 19.97s | Loss: 0.1668 Grad: 80992.7578 LR: 5.9418e-05
Epoch: [6][317/318]Elapsed 21.08s | Loss: 0.1664 Grad: 145391.2344 LR: 5.8609e-05


Valid:   0%|          | 0/210 [00:00<?, ?batch/s]

Epoch: [6][0/210]Elapsed 0.06s | Loss: 0.4607
Epoch: [6][50/210]Elapsed 2.54s | Loss: 0.4890
Epoch: [6][100/210]Elapsed 5.18s | Loss: 0.4998
Epoch: [6][150/210]Elapsed 7.86s | Loss: 0.4616
Epoch: [6][200/210]Elapsed 10.56s | Loss: 0.4010


----------------------------------------------------------------------------------------------------
Epoch 6 - Average Train Loss: 0.1664 | Average Valid Loss: 0.3926 | Time: 32.25s
Fold 2 Valid Loss: (Easy) 0.8977 | (Hard) 0.4404
Elapse: 3.23 min 
Fold: 3 || Valid size 3010 
- First Stage 


Train:   0%|          | 0/749 [00:00<?, ?batch/s]

Epoch: [1][0/749]Elapsed 0.07s | Loss: 0.8421 Grad: 104765.6172 LR: 4.0000e-06
Epoch: [1][50/749]Elapsed 3.38s | Loss: 0.8028 Grad: 52627.5977 LR: 4.7344e-06
Epoch: [1][100/749]Elapsed 6.68s | Loss: 0.7733 Grad: 39935.0898 LR: 6.9152e-06
Epoch: [1][150/749]Elapsed 9.88s | Loss: 0.7563 Grad: 77116.6250 LR: 1.0476e-05
Epoch: [1][200/749]Elapsed 13.10s | Loss: 0.7458 Grad: 48565.5742 LR: 1.5307e-05
Epoch: [1][250/749]Elapsed 16.29s | Loss: 0.7431 Grad: 44270.1797 LR: 2.1261e-05
Epoch: [1][300/749]Elapsed 19.49s | Loss: 0.7357 Grad: 101196.0312 LR: 2.8155e-05
Epoch: [1][350/749]Elapsed 22.69s | Loss: 0.7232 Grad: 111517.2031 LR: 3.5780e-05
Epoch: [1][400/749]Elapsed 25.94s | Loss: 0.7091 Grad: 94469.5078 LR: 4.3901e-05
Epoch: [1][450/749]Elapsed 29.20s | Loss: 0.6980 Grad: 91164.0625 LR: 5.2269e-05
Epoch: [1][500/749]Elapsed 32.41s | Loss: 0.6854 Grad: 88565.2578 LR: 6.0629e-05
Epoch: [1][550/749]Elapsed 35.78s | Loss: 0.6715 Grad: 41208.0664 LR: 6.8726e-05
Epoch: [1][600/749]Elapsed 39.19

Valid:   0%|          | 0/189 [00:00<?, ?batch/s]

Epoch: [1][0/189]Elapsed 0.06s | Loss: 0.6078
Epoch: [1][50/189]Elapsed 2.56s | Loss: 0.5669
Epoch: [1][100/189]Elapsed 5.16s | Loss: 0.5612
Epoch: [1][150/189]Elapsed 7.83s | Loss: 0.5240


----------------------------------------------------------------------------------------------------
Epoch 1 - Average Train Loss: 0.6357 | Average Valid Loss: 0.4897 | Time: 58.74s
Best model found in epoch 1 | valid loss: 0.4897


Train:   0%|          | 0/749 [00:00<?, ?batch/s]

Epoch: [2][0/749]Elapsed 0.08s | Loss: 0.5285 Grad: 153236.0469 LR: 9.3625e-05
Epoch: [2][50/749]Elapsed 3.55s | Loss: 0.5196 Grad: 91221.7031 LR: 9.7154e-05
Epoch: [2][100/749]Elapsed 6.88s | Loss: 0.5146 Grad: 126457.0703 LR: 9.9300e-05
Epoch: [2][150/749]Elapsed 10.11s | Loss: 0.4995 Grad: 83084.7578 LR: 1.0000e-04
Epoch: [2][200/749]Elapsed 13.31s | Loss: 0.5021 Grad: 61740.6016 LR: 9.9990e-05
Epoch: [2][250/749]Elapsed 16.61s | Loss: 0.4901 Grad: 61411.1406 LR: 9.9961e-05
Epoch: [2][300/749]Elapsed 19.88s | Loss: 0.4880 Grad: 94236.4609 LR: 9.9914e-05
Epoch: [2][350/749]Elapsed 23.10s | Loss: 0.4834 Grad: 76017.9062 LR: 9.9847e-05
Epoch: [2][400/749]Elapsed 26.29s | Loss: 0.4827 Grad: 59148.8203 LR: 9.9762e-05
Epoch: [2][450/749]Elapsed 29.49s | Loss: 0.4775 Grad: 61303.7852 LR: 9.9658e-05
Epoch: [2][500/749]Elapsed 32.75s | Loss: 0.4695 Grad: 83883.7109 LR: 9.9536e-05
Epoch: [2][550/749]Elapsed 35.99s | Loss: 0.4629 Grad: 77640.1250 LR: 9.9395e-05
Epoch: [2][600/749]Elapsed 39.24

Valid:   0%|          | 0/189 [00:00<?, ?batch/s]

Epoch: [2][0/189]Elapsed 0.06s | Loss: 0.5622
Epoch: [2][50/189]Elapsed 2.57s | Loss: 0.4476
Epoch: [2][100/189]Elapsed 5.22s | Loss: 0.4484
Epoch: [2][150/189]Elapsed 7.91s | Loss: 0.4250


----------------------------------------------------------------------------------------------------
Epoch 2 - Average Train Loss: 0.4537 | Average Valid Loss: 0.4055 | Time: 59.03s
Best model found in epoch 2 | valid loss: 0.4055


Train:   0%|          | 0/749 [00:00<?, ?batch/s]

Epoch: [3][0/749]Elapsed 0.08s | Loss: 0.4041 Grad: 148023.4688 LR: 9.8653e-05
Epoch: [3][50/749]Elapsed 3.34s | Loss: 0.4009 Grad: 71507.1484 LR: 9.8420e-05
Epoch: [3][100/749]Elapsed 6.60s | Loss: 0.4096 Grad: 87882.4141 LR: 9.8169e-05
Epoch: [3][150/749]Elapsed 9.98s | Loss: 0.4012 Grad: 70796.6016 LR: 9.7899e-05
Epoch: [3][200/749]Elapsed 13.21s | Loss: 0.4109 Grad: 66732.2344 LR: 9.7612e-05
Epoch: [3][250/749]Elapsed 16.46s | Loss: 0.4041 Grad: 63298.1328 LR: 9.7306e-05
Epoch: [3][300/749]Elapsed 19.68s | Loss: 0.4043 Grad: 106246.8203 LR: 9.6983e-05
Epoch: [3][350/749]Elapsed 22.89s | Loss: 0.4041 Grad: 66397.0469 LR: 9.6642e-05
Epoch: [3][400/749]Elapsed 26.11s | Loss: 0.4037 Grad: 65949.6016 LR: 9.6284e-05
Epoch: [3][450/749]Elapsed 29.34s | Loss: 0.4023 Grad: 81103.7812 LR: 9.5908e-05
Epoch: [3][500/749]Elapsed 32.56s | Loss: 0.3992 Grad: 74921.8203 LR: 9.5515e-05
Epoch: [3][550/749]Elapsed 35.80s | Loss: 0.3938 Grad: 67589.0938 LR: 9.5104e-05
Epoch: [3][600/749]Elapsed 39.00s

Valid:   0%|          | 0/189 [00:00<?, ?batch/s]

Epoch: [3][0/189]Elapsed 0.07s | Loss: 0.4545
Epoch: [3][50/189]Elapsed 2.56s | Loss: 0.4429
Epoch: [3][100/189]Elapsed 5.17s | Loss: 0.4424
Epoch: [3][150/189]Elapsed 7.85s | Loss: 0.4199


----------------------------------------------------------------------------------------------------
Epoch 3 - Average Train Loss: 0.3901 | Average Valid Loss: 0.4035 | Time: 58.57s
Best model found in epoch 3 | valid loss: 0.4035


Train:   0%|          | 0/749 [00:00<?, ?batch/s]

Epoch: [4][0/749]Elapsed 0.09s | Loss: 0.3030 Grad: 148245.7812 LR: 9.3314e-05
Epoch: [4][50/749]Elapsed 3.51s | Loss: 0.3587 Grad: 191026.7969 LR: 9.2821e-05
Epoch: [4][100/749]Elapsed 6.73s | Loss: 0.3616 Grad: 272210.1250 LR: 9.2311e-05
Epoch: [4][150/749]Elapsed 9.94s | Loss: 0.3603 Grad: 201999.4375 LR: 9.1786e-05
Epoch: [4][200/749]Elapsed 13.20s | Loss: 0.3647 Grad: 51915.2852 LR: 9.1245e-05
Epoch: [4][250/749]Elapsed 16.41s | Loss: 0.3612 Grad: 63895.5547 LR: 9.0689e-05
Epoch: [4][300/749]Elapsed 19.61s | Loss: 0.3612 Grad: 115386.2656 LR: 9.0117e-05
Epoch: [4][350/749]Elapsed 22.86s | Loss: 0.3604 Grad: 55962.8242 LR: 8.9530e-05
Epoch: [4][400/749]Elapsed 26.09s | Loss: 0.3599 Grad: 75362.1328 LR: 8.8928e-05
Epoch: [4][450/749]Elapsed 29.36s | Loss: 0.3589 Grad: 102783.7891 LR: 8.8312e-05
Epoch: [4][500/749]Elapsed 32.60s | Loss: 0.3553 Grad: 96928.6484 LR: 8.7681e-05
Epoch: [4][550/749]Elapsed 35.92s | Loss: 0.3514 Grad: 85940.8203 LR: 8.7036e-05
Epoch: [4][600/749]Elapsed 39

Valid:   0%|          | 0/189 [00:00<?, ?batch/s]

Epoch: [4][0/189]Elapsed 0.06s | Loss: 0.3817
Epoch: [4][50/189]Elapsed 2.54s | Loss: 0.4072
Epoch: [4][100/189]Elapsed 5.14s | Loss: 0.4004
Epoch: [4][150/189]Elapsed 7.82s | Loss: 0.3874


----------------------------------------------------------------------------------------------------
Epoch 4 - Average Train Loss: 0.3534 | Average Valid Loss: 0.3801 | Time: 58.89s
Best model found in epoch 4 | valid loss: 0.3801


Train:   0%|          | 0/749 [00:00<?, ?batch/s]

Epoch: [5][0/749]Elapsed 0.08s | Loss: 0.2405 Grad: 125856.9062 LR: 8.4347e-05
Epoch: [5][50/749]Elapsed 3.41s | Loss: 0.3290 Grad: 113237.7734 LR: 8.3635e-05
Epoch: [5][100/749]Elapsed 6.66s | Loss: 0.3208 Grad: 164149.6250 LR: 8.2910e-05
Epoch: [5][150/749]Elapsed 9.96s | Loss: 0.3226 Grad: 162168.6719 LR: 8.2173e-05
Epoch: [5][200/749]Elapsed 13.24s | Loss: 0.3297 Grad: 129574.3516 LR: 8.1424e-05
Epoch: [5][250/749]Elapsed 16.46s | Loss: 0.3279 Grad: 183313.1875 LR: 8.0663e-05
Epoch: [5][300/749]Elapsed 19.67s | Loss: 0.3289 Grad: 191649.2500 LR: 7.9891e-05
Epoch: [5][350/749]Elapsed 22.89s | Loss: 0.3296 Grad: 141597.7969 LR: 7.9107e-05
Epoch: [5][400/749]Elapsed 26.11s | Loss: 0.3300 Grad: 145608.8594 LR: 7.8313e-05
Epoch: [5][450/749]Elapsed 29.45s | Loss: 0.3287 Grad: 101221.3750 LR: 7.7507e-05
Epoch: [5][500/749]Elapsed 32.86s | Loss: 0.3263 Grad: 98276.1250 LR: 7.6691e-05
Epoch: [5][550/749]Elapsed 36.18s | Loss: 0.3230 Grad: 94228.0859 LR: 7.5866e-05
Epoch: [5][600/749]Elapse

Valid:   0%|          | 0/189 [00:00<?, ?batch/s]

Epoch: [5][0/189]Elapsed 0.05s | Loss: 0.2791
Epoch: [5][50/189]Elapsed 2.54s | Loss: 0.4189
Epoch: [5][100/189]Elapsed 5.15s | Loss: 0.4028
Epoch: [5][150/189]Elapsed 7.79s | Loss: 0.3939


----------------------------------------------------------------------------------------------------
Epoch 5 - Average Train Loss: 0.3245 | Average Valid Loss: 0.3926 | Time: 59.12s


Train:   0%|          | 0/749 [00:00<?, ?batch/s]

Epoch: [6][0/749]Elapsed 0.07s | Loss: 0.2859 Grad: 160763.9844 LR: 7.2503e-05
Epoch: [6][50/749]Elapsed 3.45s | Loss: 0.2980 Grad: 128739.8359 LR: 7.1632e-05
Epoch: [6][100/749]Elapsed 6.71s | Loss: 0.2940 Grad: 239556.4531 LR: 7.0753e-05
Epoch: [6][150/749]Elapsed 9.94s | Loss: 0.2993 Grad: 248925.5781 LR: 6.9866e-05
Epoch: [6][200/749]Elapsed 13.39s | Loss: 0.3077 Grad: 151124.6250 LR: 6.8972e-05
Epoch: [6][250/749]Elapsed 16.69s | Loss: 0.3051 Grad: 216015.7656 LR: 6.8070e-05
Epoch: [6][300/749]Elapsed 20.06s | Loss: 0.3045 Grad: inf LR: 6.7162e-05
Epoch: [6][350/749]Elapsed 23.32s | Loss: 0.3035 Grad: 107227.2266 LR: 6.6247e-05
Epoch: [6][400/749]Elapsed 26.61s | Loss: 0.3052 Grad: 81212.9453 LR: 6.5326e-05
Epoch: [6][450/749]Elapsed 29.94s | Loss: 0.3053 Grad: 86492.3750 LR: 6.4399e-05
Epoch: [6][500/749]Elapsed 33.32s | Loss: 0.3015 Grad: 56841.1016 LR: 6.3467e-05
Epoch: [6][550/749]Elapsed 36.66s | Loss: 0.2970 Grad: 71224.7969 LR: 6.2530e-05
Epoch: [6][600/749]Elapsed 39.86s |

Valid:   0%|          | 0/189 [00:00<?, ?batch/s]

Epoch: [6][0/189]Elapsed 0.07s | Loss: 0.3533
Epoch: [6][50/189]Elapsed 2.55s | Loss: 0.3891
Epoch: [6][100/189]Elapsed 5.17s | Loss: 0.3730
Epoch: [6][150/189]Elapsed 7.87s | Loss: 0.3748


----------------------------------------------------------------------------------------------------
Epoch 6 - Average Train Loss: 0.2980 | Average Valid Loss: 0.3815 | Time: 59.50s
Fold 3 Valid Loss: (Easy) 0.7886 | (Hard) 0.7029
Elapse: 5.91 min 
- Second Stage 
Use Checkpoint: ViTMAE_base_fold_3_stage_1.pth


Train:   0%|          | 0/324 [00:00<?, ?batch/s]

Epoch: [1][0/324]Elapsed 0.07s | Loss: 0.3891 Grad: 187635.3750 LR: 4.0000e-06
Epoch: [1][50/324]Elapsed 3.44s | Loss: 0.3319 Grad: 109213.6328 LR: 7.9039e-06
Epoch: [1][100/324]Elapsed 6.68s | Loss: 0.2955 Grad: 76658.5000 LR: 1.8981e-05
Epoch: [1][150/324]Elapsed 9.92s | Loss: 0.2723 Grad: 78727.0469 LR: 3.5428e-05
Epoch: [1][200/324]Elapsed 13.17s | Loss: 0.2608 Grad: 94325.0234 LR: 5.4572e-05
Epoch: [1][250/324]Elapsed 16.40s | Loss: 0.2543 Grad: 61312.9219 LR: 7.3297e-05
Epoch: [1][300/324]Elapsed 19.63s | Loss: 0.2491 Grad: 64324.5625 LR: 8.8558e-05
Epoch: [1][323/324]Elapsed 21.12s | Loss: 0.2474 Grad: 128752.9688 LR: 9.3890e-05


Valid:   0%|          | 0/189 [00:00<?, ?batch/s]

Epoch: [1][0/189]Elapsed 0.06s | Loss: 0.5301
Epoch: [1][50/189]Elapsed 2.57s | Loss: 0.4989
Epoch: [1][100/189]Elapsed 5.21s | Loss: 0.4942
Epoch: [1][150/189]Elapsed 7.91s | Loss: 0.4463


----------------------------------------------------------------------------------------------------
Epoch 1 - Average Train Loss: 0.2474 | Average Valid Loss: 0.4035 | Time: 31.19s
Best model found in epoch 1 | valid loss: 0.4035


Train:   0%|          | 0/324 [00:00<?, ?batch/s]

Epoch: [2][0/324]Elapsed 0.07s | Loss: 0.3009 Grad: 81137.0625 LR: 9.3890e-05
Epoch: [2][50/324]Elapsed 3.32s | Loss: 0.2233 Grad: 86412.7266 LR: 9.9741e-05
Epoch: [2][100/324]Elapsed 6.56s | Loss: 0.2204 Grad: 56485.0430 LR: 9.9972e-05
Epoch: [2][150/324]Elapsed 9.88s | Loss: 0.2191 Grad: 54152.0742 LR: 9.9847e-05
Epoch: [2][200/324]Elapsed 13.22s | Loss: 0.2180 Grad: 84265.9297 LR: 9.9621e-05
Epoch: [2][250/324]Elapsed 16.44s | Loss: 0.2159 Grad: 57353.1406 LR: 9.9296e-05
Epoch: [2][300/324]Elapsed 19.66s | Loss: 0.2146 Grad: 66870.1328 LR: 9.8871e-05
Epoch: [2][323/324]Elapsed 21.14s | Loss: 0.2137 Grad: 127969.8594 LR: 9.8632e-05


Valid:   0%|          | 0/189 [00:00<?, ?batch/s]

Epoch: [2][0/189]Elapsed 0.06s | Loss: 0.4720
Epoch: [2][50/189]Elapsed 2.55s | Loss: 0.4913
Epoch: [2][100/189]Elapsed 5.20s | Loss: 0.4968
Epoch: [2][150/189]Elapsed 7.88s | Loss: 0.4494


----------------------------------------------------------------------------------------------------
Epoch 2 - Average Train Loss: 0.2137 | Average Valid Loss: 0.4048 | Time: 31.16s


Train:   0%|          | 0/324 [00:00<?, ?batch/s]

Epoch: [3][0/324]Elapsed 0.08s | Loss: 0.2362 Grad: 84401.1953 LR: 9.8632e-05
Epoch: [3][50/324]Elapsed 3.32s | Loss: 0.2079 Grad: 65978.9141 LR: 9.8062e-05
Epoch: [3][100/324]Elapsed 6.58s | Loss: 0.2050 Grad: 65368.2734 LR: 9.7395e-05
Epoch: [3][150/324]Elapsed 9.95s | Loss: 0.2020 Grad: 60936.2695 LR: 9.6633e-05
Epoch: [3][200/324]Elapsed 13.36s | Loss: 0.2019 Grad: 94834.5391 LR: 9.5776e-05
Epoch: [3][250/324]Elapsed 16.65s | Loss: 0.1996 Grad: 64659.1797 LR: 9.4828e-05
Epoch: [3][300/324]Elapsed 19.90s | Loss: 0.1986 Grad: 74858.5547 LR: 9.3789e-05
Epoch: [3][323/324]Elapsed 21.39s | Loss: 0.1979 Grad: 104496.4531 LR: 9.3259e-05


Valid:   0%|          | 0/189 [00:00<?, ?batch/s]

Epoch: [3][0/189]Elapsed 0.06s | Loss: 0.5966
Epoch: [3][50/189]Elapsed 2.55s | Loss: 0.5164
Epoch: [3][100/189]Elapsed 5.17s | Loss: 0.5148
Epoch: [3][150/189]Elapsed 7.86s | Loss: 0.4636


----------------------------------------------------------------------------------------------------
Epoch 3 - Average Train Loss: 0.1979 | Average Valid Loss: 0.4152 | Time: 31.41s


Train:   0%|          | 0/324 [00:00<?, ?batch/s]

Epoch: [4][0/324]Elapsed 0.07s | Loss: 0.2327 Grad: 71647.3594 LR: 9.3259e-05
Epoch: [4][50/324]Elapsed 3.42s | Loss: 0.1957 Grad: 53399.0586 LR: 9.2091e-05
Epoch: [4][100/324]Elapsed 6.67s | Loss: 0.1915 Grad: 43552.8398 LR: 9.0837e-05
Epoch: [4][150/324]Elapsed 9.92s | Loss: 0.1890 Grad: 65068.1367 LR: 8.9502e-05
Epoch: [4][200/324]Elapsed 13.19s | Loss: 0.1888 Grad: 69815.6406 LR: 8.8087e-05
Epoch: [4][250/324]Elapsed 16.43s | Loss: 0.1874 Grad: 60595.6836 LR: 8.6595e-05
Epoch: [4][300/324]Elapsed 19.64s | Loss: 0.1868 Grad: 68991.2188 LR: 8.5030e-05
Epoch: [4][323/324]Elapsed 21.13s | Loss: 0.1858 Grad: 101822.6172 LR: 8.4253e-05


Valid:   0%|          | 0/189 [00:00<?, ?batch/s]

Epoch: [4][0/189]Elapsed 0.05s | Loss: 0.5007
Epoch: [4][50/189]Elapsed 2.54s | Loss: 0.4993
Epoch: [4][100/189]Elapsed 5.15s | Loss: 0.4974
Epoch: [4][150/189]Elapsed 7.85s | Loss: 0.4467


----------------------------------------------------------------------------------------------------
Epoch 4 - Average Train Loss: 0.1858 | Average Valid Loss: 0.4027 | Time: 31.12s
Best model found in epoch 4 | valid loss: 0.4027


Train:   0%|          | 0/324 [00:00<?, ?batch/s]

Epoch: [5][0/324]Elapsed 0.07s | Loss: 0.2294 Grad: 92634.2656 LR: 8.4253e-05
Epoch: [5][50/324]Elapsed 3.39s | Loss: 0.1811 Grad: 55781.4141 LR: 8.2584e-05
Epoch: [5][100/324]Elapsed 6.68s | Loss: 0.1812 Grad: 52013.6016 LR: 8.0850e-05
Epoch: [5][150/324]Elapsed 9.89s | Loss: 0.1782 Grad: 59231.0898 LR: 7.9054e-05
Epoch: [5][200/324]Elapsed 13.10s | Loss: 0.1777 Grad: 87168.0312 LR: 7.7199e-05
Epoch: [5][250/324]Elapsed 16.30s | Loss: 0.1755 Grad: 63029.3711 LR: 7.5289e-05
Epoch: [5][300/324]Elapsed 19.52s | Loss: 0.1750 Grad: 90728.0312 LR: 7.3329e-05
Epoch: [5][323/324]Elapsed 21.01s | Loss: 0.1745 Grad: 102067.9219 LR: 7.2371e-05


Valid:   0%|          | 0/189 [00:00<?, ?batch/s]

Epoch: [5][0/189]Elapsed 0.05s | Loss: 0.4925
Epoch: [5][50/189]Elapsed 2.53s | Loss: 0.5223
Epoch: [5][100/189]Elapsed 5.14s | Loss: 0.5194
Epoch: [5][150/189]Elapsed 7.80s | Loss: 0.4628


----------------------------------------------------------------------------------------------------
Epoch 5 - Average Train Loss: 0.1745 | Average Valid Loss: 0.4157 | Time: 30.93s


Train:   0%|          | 0/324 [00:00<?, ?batch/s]

Epoch: [6][0/324]Elapsed 0.08s | Loss: 0.2410 Grad: 100338.7422 LR: 7.2371e-05
Epoch: [6][50/324]Elapsed 3.45s | Loss: 0.1775 Grad: 55617.9375 LR: 7.0342e-05
Epoch: [6][100/324]Elapsed 6.68s | Loss: 0.1723 Grad: 38166.6680 LR: 6.8273e-05
Epoch: [6][150/324]Elapsed 9.98s | Loss: 0.1674 Grad: 66522.7969 LR: 6.6166e-05
Epoch: [6][200/324]Elapsed 13.29s | Loss: 0.1654 Grad: 89059.4844 LR: 6.4027e-05
Epoch: [6][250/324]Elapsed 16.66s | Loss: 0.1641 Grad: 61324.0156 LR: 6.1860e-05
Epoch: [6][300/324]Elapsed 20.00s | Loss: 0.1629 Grad: 84364.8750 LR: 5.9669e-05
Epoch: [6][323/324]Elapsed 21.50s | Loss: 0.1623 Grad: 119128.1797 LR: 5.8611e-05


Valid:   0%|          | 0/189 [00:00<?, ?batch/s]

Epoch: [6][0/189]Elapsed 0.05s | Loss: 0.4748
Epoch: [6][50/189]Elapsed 2.54s | Loss: 0.4961
Epoch: [6][100/189]Elapsed 5.16s | Loss: 0.4987
Epoch: [6][150/189]Elapsed 7.84s | Loss: 0.4488


----------------------------------------------------------------------------------------------------
Epoch 6 - Average Train Loss: 0.1623 | Average Valid Loss: 0.4045 | Time: 31.47s
Fold 3 Valid Loss: (Easy) 0.9826 | (Hard) 0.4484
Elapse: 3.14 min 
Fold: 4 || Valid size 2752 
- First Stage 


Train:   0%|          | 0/761 [00:00<?, ?batch/s]

Epoch: [1][0/761]Elapsed 0.07s | Loss: 0.8595 Grad: 107881.7500 LR: 4.0000e-06
Epoch: [1][50/761]Elapsed 3.40s | Loss: 0.8012 Grad: 51815.8281 LR: 4.7115e-06
Epoch: [1][100/761]Elapsed 6.81s | Loss: 0.7722 Grad: 48772.9648 LR: 6.8247e-06
Epoch: [1][150/761]Elapsed 9.99s | Loss: 0.7545 Grad: 82810.5781 LR: 1.0277e-05
Epoch: [1][200/761]Elapsed 13.19s | Loss: 0.7435 Grad: 51057.8945 LR: 1.4966e-05
Epoch: [1][250/761]Elapsed 16.39s | Loss: 0.7386 Grad: 60126.5508 LR: 2.0754e-05
Epoch: [1][300/761]Elapsed 19.57s | Loss: 0.7309 Grad: 99437.0859 LR: 2.7467e-05
Epoch: [1][350/761]Elapsed 22.74s | Loss: 0.7177 Grad: 113154.1797 LR: 3.4908e-05
Epoch: [1][400/761]Elapsed 25.92s | Loss: 0.7076 Grad: 89720.9609 LR: 4.2855e-05
Epoch: [1][450/761]Elapsed 29.09s | Loss: 0.6953 Grad: 117681.0156 LR: 5.1073e-05
Epoch: [1][500/761]Elapsed 32.28s | Loss: 0.6843 Grad: 148144.0781 LR: 5.9319e-05
Epoch: [1][550/761]Elapsed 35.47s | Loss: 0.6778 Grad: 88001.8672 LR: 6.7348e-05
Epoch: [1][600/761]Elapsed 38.6

Valid:   0%|          | 0/172 [00:00<?, ?batch/s]

Epoch: [1][0/172]Elapsed 0.05s | Loss: 0.5376
Epoch: [1][50/172]Elapsed 2.49s | Loss: 0.5619
Epoch: [1][100/172]Elapsed 5.03s | Loss: 0.5551
Epoch: [1][150/172]Elapsed 7.60s | Loss: 0.5146


----------------------------------------------------------------------------------------------------
Epoch 1 - Average Train Loss: 0.6301 | Average Valid Loss: 0.4996 | Time: 57.82s
Best model found in epoch 1 | valid loss: 0.4996


Train:   0%|          | 0/761 [00:00<?, ?batch/s]

Epoch: [2][0/761]Elapsed 0.08s | Loss: 0.5570 Grad: 222074.4062 LR: 9.3624e-05
Epoch: [2][50/761]Elapsed 3.41s | Loss: 0.4842 Grad: 93851.8047 LR: 9.7108e-05
Epoch: [2][100/761]Elapsed 6.74s | Loss: 0.4954 Grad: 101462.0312 LR: 9.9254e-05
Epoch: [2][150/761]Elapsed 10.03s | Loss: 0.4832 Grad: 85526.0938 LR: 1.0000e-04
Epoch: [2][200/761]Elapsed 13.37s | Loss: 0.4871 Grad: 74313.1328 LR: 9.9991e-05
Epoch: [2][250/761]Elapsed 16.61s | Loss: 0.4813 Grad: 75358.5469 LR: 9.9964e-05
Epoch: [2][300/761]Elapsed 19.84s | Loss: 0.4793 Grad: 111507.7500 LR: 9.9919e-05
Epoch: [2][350/761]Elapsed 23.06s | Loss: 0.4767 Grad: 76300.6484 LR: 9.9856e-05
Epoch: [2][400/761]Elapsed 26.24s | Loss: 0.4786 Grad: 50917.9844 LR: 9.9774e-05
Epoch: [2][450/761]Elapsed 29.49s | Loss: 0.4739 Grad: 72314.3984 LR: 9.9674e-05
Epoch: [2][500/761]Elapsed 32.72s | Loss: 0.4730 Grad: 124219.5312 LR: 9.9556e-05
Epoch: [2][550/761]Elapsed 36.04s | Loss: 0.4704 Grad: 90936.4141 LR: 9.9420e-05
Epoch: [2][600/761]Elapsed 39.

Valid:   0%|          | 0/172 [00:00<?, ?batch/s]

Epoch: [2][0/172]Elapsed 0.06s | Loss: 0.6764
Epoch: [2][50/172]Elapsed 2.50s | Loss: 0.5649
Epoch: [2][100/172]Elapsed 5.02s | Loss: 0.5542
Epoch: [2][150/172]Elapsed 7.60s | Loss: 0.5027


----------------------------------------------------------------------------------------------------
Epoch 2 - Average Train Loss: 0.4485 | Average Valid Loss: 0.4833 | Time: 58.55s
Best model found in epoch 2 | valid loss: 0.4833


Train:   0%|          | 0/761 [00:00<?, ?batch/s]

Epoch: [3][0/761]Elapsed 0.07s | Loss: 0.3469 Grad: 211937.3750 LR: 9.8653e-05
Epoch: [3][50/761]Elapsed 3.28s | Loss: 0.4116 Grad: 78513.9766 LR: 9.8424e-05
Epoch: [3][100/761]Elapsed 6.51s | Loss: 0.4000 Grad: 72022.4766 LR: 9.8177e-05
Epoch: [3][150/761]Elapsed 9.74s | Loss: 0.3906 Grad: 85586.7109 LR: 9.7912e-05
Epoch: [3][200/761]Elapsed 13.01s | Loss: 0.4018 Grad: 69596.8203 LR: 9.7630e-05
Epoch: [3][250/761]Elapsed 16.35s | Loss: 0.3990 Grad: 85659.3438 LR: 9.7331e-05
Epoch: [3][300/761]Elapsed 19.66s | Loss: 0.3983 Grad: 144807.1562 LR: 9.7015e-05
Epoch: [3][350/761]Elapsed 22.84s | Loss: 0.3968 Grad: 72181.1562 LR: 9.6681e-05
Epoch: [3][400/761]Elapsed 26.05s | Loss: 0.3979 Grad: 84021.3750 LR: 9.6330e-05
Epoch: [3][450/761]Elapsed 29.37s | Loss: 0.3961 Grad: 94168.4297 LR: 9.5962e-05
Epoch: [3][500/761]Elapsed 32.79s | Loss: 0.3962 Grad: 92100.8828 LR: 9.5578e-05
Epoch: [3][550/761]Elapsed 36.04s | Loss: 0.3951 Grad: 70159.4219 LR: 9.5177e-05
Epoch: [3][600/761]Elapsed 39.31s

Valid:   0%|          | 0/172 [00:00<?, ?batch/s]

Epoch: [3][0/172]Elapsed 0.05s | Loss: 0.7824
Epoch: [3][50/172]Elapsed 2.50s | Loss: 0.5475
Epoch: [3][100/172]Elapsed 5.05s | Loss: 0.5305
Epoch: [3][150/172]Elapsed 7.62s | Loss: 0.4990


----------------------------------------------------------------------------------------------------
Epoch 3 - Average Train Loss: 0.3840 | Average Valid Loss: 0.4867 | Time: 58.49s


Train:   0%|          | 0/761 [00:00<?, ?batch/s]

Epoch: [4][0/761]Elapsed 0.08s | Loss: 0.3315 Grad: 191555.6250 LR: 9.3313e-05
Epoch: [4][50/761]Elapsed 3.31s | Loss: 0.3575 Grad: 141065.2969 LR: 9.2828e-05
Epoch: [4][100/761]Elapsed 6.49s | Loss: 0.3413 Grad: 153526.1094 LR: 9.2328e-05
Epoch: [4][150/761]Elapsed 9.68s | Loss: 0.3471 Grad: 191745.9062 LR: 9.1811e-05
Epoch: [4][200/761]Elapsed 12.87s | Loss: 0.3526 Grad: 147829.9844 LR: 9.1280e-05
Epoch: [4][250/761]Elapsed 16.16s | Loss: 0.3501 Grad: 174040.0938 LR: 9.0733e-05
Epoch: [4][300/761]Elapsed 19.48s | Loss: 0.3504 Grad: inf LR: 9.0172e-05
Epoch: [4][350/761]Elapsed 22.81s | Loss: 0.3506 Grad: 92524.1641 LR: 8.9595e-05
Epoch: [4][400/761]Elapsed 26.07s | Loss: 0.3536 Grad: 71005.3594 LR: 8.9005e-05
Epoch: [4][450/761]Elapsed 29.26s | Loss: 0.3541 Grad: 68672.2969 LR: 8.8400e-05
Epoch: [4][500/761]Elapsed 32.45s | Loss: 0.3530 Grad: 88800.5156 LR: 8.7781e-05
Epoch: [4][550/761]Elapsed 35.68s | Loss: 0.3531 Grad: 69449.5547 LR: 8.7149e-05
Epoch: [4][600/761]Elapsed 38.88s | 

Valid:   0%|          | 0/172 [00:00<?, ?batch/s]

Epoch: [4][0/172]Elapsed 0.06s | Loss: 0.6268
Epoch: [4][50/172]Elapsed 2.50s | Loss: 0.4998
Epoch: [4][100/172]Elapsed 5.04s | Loss: 0.4822
Epoch: [4][150/172]Elapsed 7.65s | Loss: 0.4582


----------------------------------------------------------------------------------------------------
Epoch 4 - Average Train Loss: 0.3425 | Average Valid Loss: 0.4514 | Time: 58.22s
Best model found in epoch 4 | valid loss: 0.4514


Train:   0%|          | 0/761 [00:00<?, ?batch/s]

Epoch: [5][0/761]Elapsed 0.08s | Loss: 0.3433 Grad: 212585.0625 LR: 8.4346e-05
Epoch: [5][50/761]Elapsed 3.28s | Loss: 0.3224 Grad: 129292.8281 LR: 8.3646e-05
Epoch: [5][100/761]Elapsed 6.47s | Loss: 0.3099 Grad: 113905.1016 LR: 8.2933e-05
Epoch: [5][150/761]Elapsed 9.66s | Loss: 0.3133 Grad: 192636.1719 LR: 8.2208e-05
Epoch: [5][200/761]Elapsed 12.86s | Loss: 0.3264 Grad: 154705.9375 LR: 8.1471e-05
Epoch: [5][250/761]Elapsed 16.07s | Loss: 0.3241 Grad: 229672.5625 LR: 8.0723e-05
Epoch: [5][300/761]Elapsed 19.26s | Loss: 0.3241 Grad: inf LR: 7.9964e-05
Epoch: [5][350/761]Elapsed 22.45s | Loss: 0.3241 Grad: 77764.9844 LR: 7.9194e-05
Epoch: [5][400/761]Elapsed 25.64s | Loss: 0.3243 Grad: 59822.0117 LR: 7.8413e-05
Epoch: [5][450/761]Elapsed 28.86s | Loss: 0.3259 Grad: 70149.5703 LR: 7.7622e-05
Epoch: [5][500/761]Elapsed 32.21s | Loss: 0.3272 Grad: 80015.0156 LR: 7.6820e-05
Epoch: [5][550/761]Elapsed 35.57s | Loss: 0.3275 Grad: 48528.3516 LR: 7.6009e-05
Epoch: [5][600/761]Elapsed 38.89s | 

Valid:   0%|          | 0/172 [00:00<?, ?batch/s]

Epoch: [5][0/172]Elapsed 0.07s | Loss: 0.2812
Epoch: [5][50/172]Elapsed 2.51s | Loss: 0.4750
Epoch: [5][100/172]Elapsed 5.06s | Loss: 0.4480
Epoch: [5][150/172]Elapsed 7.66s | Loss: 0.4293


----------------------------------------------------------------------------------------------------
Epoch 5 - Average Train Loss: 0.3157 | Average Valid Loss: 0.4203 | Time: 58.45s
Best model found in epoch 5 | valid loss: 0.4203


Train:   0%|          | 0/761 [00:00<?, ?batch/s]

Epoch: [6][0/761]Elapsed 0.07s | Loss: 0.2775 Grad: 178747.5938 LR: 7.2502e-05
Epoch: [6][50/761]Elapsed 3.29s | Loss: 0.3046 Grad: 148635.2812 LR: 7.1645e-05
Epoch: [6][100/761]Elapsed 6.49s | Loss: 0.2954 Grad: 327823.9062 LR: 7.0780e-05
Epoch: [6][150/761]Elapsed 9.74s | Loss: 0.2975 Grad: 122372.8047 LR: 6.9907e-05
Epoch: [6][200/761]Elapsed 13.05s | Loss: 0.3016 Grad: 166688.2812 LR: 6.9027e-05
Epoch: [6][250/761]Elapsed 16.38s | Loss: 0.2994 Grad: 182653.8125 LR: 6.8140e-05
Epoch: [6][300/761]Elapsed 19.70s | Loss: 0.2977 Grad: 237998.2812 LR: 6.7247e-05
Epoch: [6][350/761]Elapsed 23.01s | Loss: 0.2992 Grad: 62148.5078 LR: 6.6347e-05
Epoch: [6][400/761]Elapsed 26.31s | Loss: 0.3028 Grad: 91525.6797 LR: 6.5441e-05
Epoch: [6][450/761]Elapsed 29.48s | Loss: 0.3029 Grad: 79361.4922 LR: 6.4530e-05
Epoch: [6][500/761]Elapsed 32.72s | Loss: 0.3011 Grad: 73027.1406 LR: 6.3613e-05
Epoch: [6][550/761]Elapsed 36.08s | Loss: 0.3001 Grad: 78984.9375 LR: 6.2692e-05
Epoch: [6][600/761]Elapsed 3

Valid:   0%|          | 0/172 [00:00<?, ?batch/s]

Epoch: [6][0/172]Elapsed 0.05s | Loss: 0.5082
Epoch: [6][50/172]Elapsed 2.49s | Loss: 0.4386
Epoch: [6][100/172]Elapsed 5.02s | Loss: 0.4200
Epoch: [6][150/172]Elapsed 7.60s | Loss: 0.4072


----------------------------------------------------------------------------------------------------
Epoch 6 - Average Train Loss: 0.2891 | Average Valid Loss: 0.4022 | Time: 58.38s
Best model found in epoch 6 | valid loss: 0.4022
Fold 4 Valid Loss: (Easy) 0.8474 | (Hard) 0.7215
Elapse: 5.85 min 
- Second Stage 
Use Checkpoint: ViTMAE_base_fold_4_stage_1.pth


Train:   0%|          | 0/327 [00:00<?, ?batch/s]

Epoch: [1][0/327]Elapsed 0.07s | Loss: 0.4214 Grad: 186275.7812 LR: 4.0000e-06
Epoch: [1][50/327]Elapsed 3.42s | Loss: 0.3035 Grad: 131767.1094 LR: 7.8332e-06
Epoch: [1][100/327]Elapsed 6.66s | Loss: 0.2880 Grad: 86533.0469 LR: 1.8721e-05
Epoch: [1][150/327]Elapsed 9.84s | Loss: 0.2674 Grad: 71290.5391 LR: 3.4923e-05
Epoch: [1][200/327]Elapsed 13.05s | Loss: 0.2559 Grad: 94192.3828 LR: 5.3854e-05
Epoch: [1][250/327]Elapsed 16.24s | Loss: 0.2517 Grad: 60277.0469 LR: 7.2488e-05
Epoch: [1][300/327]Elapsed 19.51s | Loss: 0.2474 Grad: 50262.6992 LR: 8.7850e-05
Epoch: [1][326/327]Elapsed 21.25s | Loss: 0.2447 Grad: 64948.5195 LR: 9.3697e-05


Valid:   0%|          | 0/172 [00:00<?, ?batch/s]

Epoch: [1][0/172]Elapsed 0.05s | Loss: 0.4291
Epoch: [1][50/172]Elapsed 2.49s | Loss: 0.4837
Epoch: [1][100/172]Elapsed 5.04s | Loss: 0.4901
Epoch: [1][150/172]Elapsed 7.65s | Loss: 0.4291


----------------------------------------------------------------------------------------------------
Epoch 1 - Average Train Loss: 0.2447 | Average Valid Loss: 0.4042 | Time: 30.13s
Best model found in epoch 1 | valid loss: 0.4042


Train:   0%|          | 0/327 [00:00<?, ?batch/s]

Epoch: [2][0/327]Elapsed 0.08s | Loss: 0.2437 Grad: 116576.3750 LR: 9.3697e-05
Epoch: [2][50/327]Elapsed 3.39s | Loss: 0.2263 Grad: 64512.9531 LR: 9.9678e-05
Epoch: [2][100/327]Elapsed 6.57s | Loss: 0.2273 Grad: 63614.0430 LR: 9.9975e-05
Epoch: [2][150/327]Elapsed 9.76s | Loss: 0.2209 Grad: 50156.8398 LR: 9.9855e-05
Epoch: [2][200/327]Elapsed 12.96s | Loss: 0.2166 Grad: 87329.3594 LR: 9.9637e-05
Epoch: [2][250/327]Elapsed 16.14s | Loss: 0.2179 Grad: 70332.1641 LR: 9.9320e-05
Epoch: [2][300/327]Elapsed 19.36s | Loss: 0.2169 Grad: 60130.5469 LR: 9.8906e-05
Epoch: [2][326/327]Elapsed 21.04s | Loss: 0.2151 Grad: 72047.5703 LR: 9.8653e-05


Valid:   0%|          | 0/172 [00:00<?, ?batch/s]

Epoch: [2][0/172]Elapsed 0.05s | Loss: 0.5320
Epoch: [2][50/172]Elapsed 2.52s | Loss: 0.5215
Epoch: [2][100/172]Elapsed 5.07s | Loss: 0.5182
Epoch: [2][150/172]Elapsed 7.67s | Loss: 0.4503


----------------------------------------------------------------------------------------------------
Epoch 2 - Average Train Loss: 0.2151 | Average Valid Loss: 0.4217 | Time: 29.93s


Train:   0%|          | 0/327 [00:00<?, ?batch/s]

Epoch: [3][0/327]Elapsed 0.07s | Loss: 0.2062 Grad: 111273.7344 LR: 9.8653e-05
Epoch: [3][50/327]Elapsed 3.33s | Loss: 0.2058 Grad: 60762.8594 LR: 9.8092e-05
Epoch: [3][100/327]Elapsed 6.52s | Loss: 0.2073 Grad: 55237.8398 LR: 9.7437e-05
Epoch: [3][150/327]Elapsed 9.72s | Loss: 0.2015 Grad: 52846.9805 LR: 9.6687e-05
Epoch: [3][200/327]Elapsed 12.91s | Loss: 0.2005 Grad: 87877.6953 LR: 9.5845e-05
Epoch: [3][250/327]Elapsed 16.11s | Loss: 0.2029 Grad: 89821.9375 LR: 9.4913e-05
Epoch: [3][300/327]Elapsed 19.35s | Loss: 0.2036 Grad: 70297.1016 LR: 9.3892e-05
Epoch: [3][326/327]Elapsed 21.02s | Loss: 0.2019 Grad: 70102.8438 LR: 9.3326e-05


Valid:   0%|          | 0/172 [00:00<?, ?batch/s]

Epoch: [3][0/172]Elapsed 0.05s | Loss: 0.4715
Epoch: [3][50/172]Elapsed 2.52s | Loss: 0.5196
Epoch: [3][100/172]Elapsed 5.09s | Loss: 0.5160
Epoch: [3][150/172]Elapsed 7.74s | Loss: 0.4470


----------------------------------------------------------------------------------------------------
Epoch 3 - Average Train Loss: 0.2019 | Average Valid Loss: 0.4196 | Time: 29.99s


Train:   0%|          | 0/327 [00:00<?, ?batch/s]

Epoch: [4][0/327]Elapsed 0.07s | Loss: 0.1900 Grad: 122531.5469 LR: 9.3326e-05
Epoch: [4][50/327]Elapsed 3.28s | Loss: 0.1894 Grad: 70308.4531 LR: 9.2174e-05
Epoch: [4][100/327]Elapsed 6.46s | Loss: 0.1903 Grad: 55454.9492 LR: 9.0938e-05
Epoch: [4][150/327]Elapsed 9.67s | Loss: 0.1845 Grad: 70864.0469 LR: 8.9622e-05
Epoch: [4][200/327]Elapsed 12.86s | Loss: 0.1845 Grad: 81246.2422 LR: 8.8227e-05
Epoch: [4][250/327]Elapsed 16.05s | Loss: 0.1864 Grad: 61120.3516 LR: 8.6756e-05
Epoch: [4][300/327]Elapsed 19.27s | Loss: 0.1871 Grad: 53148.1641 LR: 8.5213e-05
Epoch: [4][326/327]Elapsed 20.95s | Loss: 0.1866 Grad: 54513.2461 LR: 8.4383e-05


Valid:   0%|          | 0/172 [00:00<?, ?batch/s]

Epoch: [4][0/172]Elapsed 0.05s | Loss: 0.5085
Epoch: [4][50/172]Elapsed 2.51s | Loss: 0.5107
Epoch: [4][100/172]Elapsed 5.07s | Loss: 0.5111
Epoch: [4][150/172]Elapsed 7.68s | Loss: 0.4409


----------------------------------------------------------------------------------------------------
Epoch 4 - Average Train Loss: 0.1866 | Average Valid Loss: 0.4123 | Time: 29.88s


Train:   0%|          | 0/327 [00:00<?, ?batch/s]

Epoch: [5][0/327]Elapsed 0.07s | Loss: 0.2047 Grad: 97721.8203 LR: 8.4383e-05
Epoch: [5][50/327]Elapsed 3.40s | Loss: 0.1800 Grad: 66729.7422 LR: 8.2735e-05
Epoch: [5][100/327]Elapsed 6.60s | Loss: 0.1802 Grad: 54372.9961 LR: 8.1023e-05
Epoch: [5][150/327]Elapsed 9.81s | Loss: 0.1768 Grad: 61354.1445 LR: 7.9249e-05
Epoch: [5][200/327]Elapsed 13.02s | Loss: 0.1768 Grad: 81741.8672 LR: 7.7418e-05
Epoch: [5][250/327]Elapsed 16.22s | Loss: 0.1777 Grad: 61220.8555 LR: 7.5532e-05
Epoch: [5][300/327]Elapsed 19.44s | Loss: 0.1790 Grad: 82333.4219 LR: 7.3596e-05
Epoch: [5][326/327]Elapsed 21.14s | Loss: 0.1771 Grad: 67302.5469 LR: 7.2570e-05


Valid:   0%|          | 0/172 [00:00<?, ?batch/s]

Epoch: [5][0/172]Elapsed 0.05s | Loss: 0.5761
Epoch: [5][50/172]Elapsed 2.49s | Loss: 0.5021
Epoch: [5][100/172]Elapsed 5.04s | Loss: 0.4971
Epoch: [5][150/172]Elapsed 7.63s | Loss: 0.4321


----------------------------------------------------------------------------------------------------
Epoch 5 - Average Train Loss: 0.1771 | Average Valid Loss: 0.4054 | Time: 30.01s


Train:   0%|          | 0/327 [00:00<?, ?batch/s]

Epoch: [6][0/327]Elapsed 0.07s | Loss: 0.1783 Grad: 113428.1484 LR: 7.2570e-05
Epoch: [6][50/327]Elapsed 3.51s | Loss: 0.1704 Grad: 77080.4375 LR: 7.0565e-05
Epoch: [6][100/327]Elapsed 6.83s | Loss: 0.1686 Grad: 58343.8945 LR: 6.8518e-05
Epoch: [6][150/327]Elapsed 10.16s | Loss: 0.1642 Grad: 89405.5078 LR: 6.6436e-05
Epoch: [6][200/327]Elapsed 13.39s | Loss: 0.1643 Grad: 68858.6875 LR: 6.4320e-05
Epoch: [6][250/327]Elapsed 16.57s | Loss: 0.1661 Grad: 83269.4219 LR: 6.2177e-05
Epoch: [6][300/327]Elapsed 19.86s | Loss: 0.1666 Grad: 47871.1562 LR: 6.0009e-05
Epoch: [6][326/327]Elapsed 21.55s | Loss: 0.1659 Grad: 77176.4609 LR: 5.8874e-05


Valid:   0%|          | 0/172 [00:00<?, ?batch/s]

Epoch: [6][0/172]Elapsed 0.05s | Loss: 0.4316
Epoch: [6][50/172]Elapsed 2.51s | Loss: 0.5050
Epoch: [6][100/172]Elapsed 5.07s | Loss: 0.4964
Epoch: [6][150/172]Elapsed 7.73s | Loss: 0.4309


----------------------------------------------------------------------------------------------------
Epoch 6 - Average Train Loss: 0.1659 | Average Valid Loss: 0.4039 | Time: 30.52s
Best model found in epoch 6 | valid loss: 0.4039
Fold 4 Valid Loss: (Easy) 1.0062 | (Hard) 0.4251
Elapse: 3.02 min 
Training Complete!
CV Result (Stage=1): 0.7553410530090332
CV Result (Stage=2): 0.7453713417053223
Elapse: 46.20 min 


In [None]:
# Render every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Shared torch modules reused by the analysis cells below.
SOFTMAX = nn.Softmax(dim=1)
KL_CRITERION = nn.KLDivLoss(reduction='batchmean')

# Class name -> integer id, numbered in listing order.
TARGET2ID = {
    class_name: class_id
    for class_id, class_name in enumerate(
        ['Seizure', 'LPD', 'GPD', 'LRDA', 'GRDA', 'Other']
    )
}

from kl_divergence import score as kaggle_score 

def calc_kaggle_score(solution, submission):
    """Score a single prediction row against its label row with ``kaggle_score``.

    Args:
        solution: row object exposing ``to_frame()`` (e.g. a pandas Series) that
            holds ``eeg_id`` and the ``TARGETS`` columns.
        submission: row object exposing ``to_frame()`` with seven entries; its
            labels are overwritten with ``['eeg_id'] + TARGETS``.

    Returns:
        Whatever ``kaggle_score`` returns for the two one-row frames, using
        ``'eeg_id'`` as the row-id column.
    """
    # Each row becomes a one-row DataFrame; the target columns are cast to float32.
    solution_frame = solution.to_frame().T
    solution_frame[TARGETS] = solution_frame[TARGETS].astype(np.float32)

    submission_frame = submission.to_frame().T
    # Rename the prediction columns so both frames share the same schema.
    submission_frame.columns = ['eeg_id'] + TARGETS
    submission_frame[TARGETS] = submission_frame[TARGETS].astype(np.float32)

    return kaggle_score(solution_frame, submission_frame, 'eeg_id')


def calc_kl_div(p, q):
    """Return ``KL_CRITERION(log_softmax(p), q)`` for one sample as a Python float.

    Args:
        p: array-like exposing ``astype``; a log-softmax is applied to it, so it
            is used as the prediction side of the criterion.
        q: array-like exposing ``astype``; passed to the criterion as the target
            without further transformation.
    """
    def _as_single_batch(values):
        # float32 tensor with a leading batch dimension of size 1
        return torch.tensor(values.astype(np.float32)).unsqueeze(0)

    log_pred = F.log_softmax(_as_single_batch(p), dim=1)
    target = _as_single_batch(q)
    return KL_CRITERION(log_pred, target).item()

In [None]:
# csv_path = './outputs/ENet_b2_softmax/ENet_b2_softmax_oof_2.csv'

oof_df = pd.read_csv('./outputs/ViTMAE_base/ViTMAE_base_oof_2.csv')

# The prediction columns are handled as raw logits here: they are only turned
# into probabilities (softmax) at the end of this cell.
pred_logits = torch.tensor(oof_df[TARGETS_PRED].values.astype(np.float32))
true_probs = torch.tensor(oof_df[TARGETS].values.astype(np.float32))

# Hard labels: arg-max over the prediction columns and the mapped ground-truth class.
oof_df['target_pred'] = oof_df[TARGETS_PRED].values.argmax(axis=1)
oof_df['target_id'] = oof_df['target'].map(TARGET2ID)

# Per-sample KL divergence. log_softmax is applied exactly once to the logits;
# applying it on top of an already soft-maxed tensor would flatten the
# distribution and distort the loss. reduction='none' followed by a sum over
# the class axis gives one value per row in a single vectorised call.
oof_df["kl_loss"] = (
    F.kl_div(F.log_softmax(pred_logits, dim=1), true_probs, reduction='none')
    .sum(dim=1)
    .numpy()
    .astype(np.float32)
)

# Store probabilities (as a NumPy array, not a torch tensor) for the plots below.
oof_df[TARGETS_PRED] = F.softmax(pred_logits, dim=1).numpy()

oof_df.head()

In [None]:
# Histogram of the per-sample KL loss; the mean is printed next to the figure.
oof_df["kl_loss"].plot.hist(bins=100, figsize=(8, 5), title='KL Loss Distribution')
print(oof_df["kl_loss"].mean())
plt.show()

In [None]:
plot_oof = oof_df.copy()

# plot confusion matrix
from sklearn.metrics import confusion_matrix
import seaborn as sns

class_names = list(TARGET2ID.keys())
class_ids = list(TARGET2ID.values())

# Pin the label set so the matrix always has one row/column per entry of
# TARGET2ID and lines up with the tick labels, even if a class never occurs.
cm = confusion_matrix(plot_oof['target_id'], plot_oof['target_pred'], labels=class_ids) # (y_true, y_pred)

# Row-normalise (each true class sums to 1); rows without samples stay 0 instead of NaN.
row_totals = cm.sum(axis=1, keepdims=True)
cm = np.divide(cm, row_totals, out=np.zeros(cm.shape, dtype=float), where=row_totals > 0)

fig, ax = plt.subplots(figsize=(6, 6))
sns.heatmap(cm, annot=True, cmap='Blues', xticklabels=class_names, yticklabels=class_names, ax=ax)
ax.set_xlabel('Predicted', fontsize=12)
ax.set_ylabel('True', fontsize=12)
plt.show()

In [None]:
# new figure
# One row of panels per class id, five randomly drawn samples per class.
fig, axes = plt.subplots(6, 5, figsize=(18, 16), sharex=True, sharey=True)

# Only samples whose KL loss exceeds 0.2 are candidates for plotting.
plot_oof = oof_df[oof_df['kl_loss'] > 0.2]

for row in range(axes.shape[0]):
    row_selects = plot_oof[plot_oof['target_id']==row]
    target_label = BRAIN_ACTIVITY[row]
    for col in range(axes.shape[1]):
        ax = axes[row, col]
        if col == 0:
            ax.set_ylabel(target_label, fontsize=12)
        if row_selects.empty:
            # np.random.choice raises ValueError on an empty index; leave a
            # placeholder panel instead of aborting the whole figure.
            ax.text(0.5, 0.5, 'no samples', ha='center', va='center', transform=ax.transAxes)
            continue
        idx = np.random.choice(row_selects.index)
        df_rows = plot_oof.loc[idx]
        ax.plot(df_rows[TARGETS].values , label='True')
        ax.plot(df_rows[TARGETS_PRED].values, label='Pred')
        ax.set_title(f"{idx} | KL: {df_rows['kl_loss']:.4f} ") #
        ax.set_xticks(range(6))
        ax.set_xticklabels(BRAIN_ACTIVITY)
        ax.grid(True)
        ax.legend()

fig.tight_layout()
plt.show()

In [None]:
csv_path = './outputs/ENet_b2_xymasking_remove_less/ENet_b2_xymasking_remove_less_oof_2.csv'

oof_df = pd.read_csv(csv_path)

# NOTE(review): the prediction columns are presumably raw logits (the
# commented-out lines below would soft-max them) — confirm against the code
# that writes this CSV.
pred_logits = torch.tensor(oof_df[TARGETS_PRED].values.astype(np.float32))
true_probs = torch.tensor(oof_df[TARGETS].values.astype(np.float32))

# Per-sample KL divergence. log_softmax is applied exactly once; stacking it on
# top of a softmax would flatten the distribution and distort the loss.
# reduction='none' + sum over the class axis yields one value per row.
oof_df["kl_loss"] = (
    F.kl_div(F.log_softmax(pred_logits, dim=1), true_probs, reduction='none')
    .sum(dim=1)
    .numpy()
    .astype(np.float32)
)

# y_pred = oof_df[TARGETS_PRED].values.astype(np.float32)
# y_pred_smax = SOFTMAX(torch.tensor(y_pred)).numpy()
# oof_df[TARGETS_PRED] = y_pred_smax

# Hard labels: arg-max over the prediction columns and the mapped ground-truth class.
oof_df['target_pred'] = oof_df[TARGETS_PRED].values.argmax(axis=1)
oof_df['target_id'] = oof_df['target'].map(TARGET2ID)

oof_df.head()

In [None]:
# Histogram of the per-sample KL loss; the mean is printed next to the figure.
oof_df["kl_loss"].plot.hist(bins=100, figsize=(8, 5), title='KL Loss Distribution')
print(oof_df["kl_loss"].mean())
plt.show()

In [None]:
# KL criterion over the whole OOF frame in one batch, shown as a plain float.
# F.log_softmax replaces torch.log(softmax(x)): it computes the same quantity
# without underflowing to log(0) for very small probabilities.
KL_CRITERION(
    F.log_softmax(
        torch.tensor(oof_df[TARGETS_PRED].values.astype(np.float32)),
        dim=1,
    ),
    torch.tensor(oof_df[TARGETS].values.astype(np.float32)),
).item()

In [None]:
# Ground-truth frame: id column plus the label columns.
solution_df = oof_df[['eeg_id'] + TARGETS].copy()

# Prediction frame, relabelled so its columns match the solution frame.
submission_df = oof_df[['eeg_id'] + TARGETS_PRED].copy()
submission_df.columns = ['eeg_id'] + TARGETS

score_value = kaggle_score(solution_df, submission_df, 'eeg_id')
score_value

In [None]:
plot_oof = oof_df.copy()

# plot confusion matrix
from sklearn.metrics import confusion_matrix
import seaborn as sns

class_names = list(TARGET2ID.keys())
class_ids = list(TARGET2ID.values())

# Pin the label set so the matrix always has one row/column per entry of
# TARGET2ID and lines up with the tick labels, even if a class never occurs.
cm = confusion_matrix(plot_oof['target_id'], plot_oof['target_pred'], labels=class_ids) # (y_true, y_pred)

# Row-normalise (each true class sums to 1); rows without samples stay 0 instead of NaN.
row_totals = cm.sum(axis=1, keepdims=True)
cm = np.divide(cm, row_totals, out=np.zeros(cm.shape, dtype=float), where=row_totals > 0)

fig, ax = plt.subplots(figsize=(6, 6))
sns.heatmap(cm, annot=True, cmap='Blues', xticklabels=class_names, yticklabels=class_names, ax=ax)
ax.set_xlabel('Predicted', fontsize=12)
ax.set_ylabel('True', fontsize=12)
plt.show()

In [None]:
# new figure
# One row of panels per class id, five randomly drawn samples per class.
fig, axes = plt.subplots(6, 5, figsize=(18, 16), sharex=True, sharey=True)

# Only samples whose KL loss exceeds 0.2 are candidates for plotting.
plot_oof = oof_df[oof_df['kl_loss'] > 0.2]

for row in range(axes.shape[0]):
    row_selects = plot_oof[plot_oof['target_id']==row]
    target_label = BRAIN_ACTIVITY[row]
    for col in range(axes.shape[1]):
        ax = axes[row, col]
        if col == 0:
            ax.set_ylabel(target_label, fontsize=12)
        if row_selects.empty:
            # np.random.choice raises ValueError on an empty index; leave a
            # placeholder panel instead of aborting the whole figure.
            ax.text(0.5, 0.5, 'no samples', ha='center', va='center', transform=ax.transAxes)
            continue
        idx = np.random.choice(row_selects.index)
        df_rows = plot_oof.loc[idx]
        ax.plot(df_rows[TARGETS].values , label='True')
        ax.plot(df_rows[TARGETS_PRED].values, label='Pred')
        ax.set_title(f"{idx} | KL: {df_rows['kl_loss']:.4f} ") #
        ax.set_xticks(range(6))
        ax.set_xticklabels(BRAIN_ACTIVITY)
        ax.grid(True)
        ax.legend()

fig.tight_layout()
plt.show()




In [None]:
# Show every OOF row recorded for this one eeg_id.
oof_df.loc[oof_df['eeg_id'].eq(11127485)]

In [None]:
#oof_df.shape #.groupby('eeg_id')['patient_id'].agg(['nunique', 'count']).sort_values(by='count', ascending=False).head(10)
# Rows whose eeg_id already appeared earlier in the frame (first occurrence excluded).
oof_df.loc[oof_df['eeg_id'].duplicated(keep='first')]

In [None]:
# NOTE(review): `oof_df2` is not assigned in any visible cell of this notebook —
# confirm it exists on a fresh kernel before relying on this cell.
# `.loc[:10]` slices by label and includes label 10.
score_kaggle = oof_df2.loc[:10].apply(
    lambda sample: calc_kaggle_score(
        sample[['eeg_id'] + TARGETS],
        sample[['eeg_id'] + TARGETS_PRED],
    ),
    axis=1,
)
score_kaggle

In [None]:
# NOTE(review): `oof_df2` is not assigned in any visible cell of this notebook —
# confirm it exists on a fresh kernel before relying on this cell.

# Ground-truth frame: id column plus the label columns.
solution_df = oof_df2.loc[:, ['eeg_id'] + TARGETS].copy()

# Prediction frame, relabelled so its columns match the solution frame.
submission_df = oof_df2.loc[:, ['eeg_id'] + TARGETS_PRED].copy()
submission_df.columns = ['eeg_id'] + TARGETS

score_value = kaggle_score(solution_df, submission_df, 'eeg_id')
score_value

In [None]:
# oof_df1, cv_1 = analyze_oof("./outputs/ENet_b2_xymasking_remove_less/ENet_b2_xymasking_remove_less_oof_1.csv")
# print(cv_1)
# oof_df1.head()