In [1]:
import pandas as pd 
import numpy as np 
from scipy.stats import entropy
from sklearn.model_selection import GroupKFold
import matplotlib.pyplot as plt

from engine_hms_trainer import *
from engine_hms_model import CustomModel, JobConfig, ModelConfig

import torch
from torch import nn
import torch.nn.functional as F

  _torch_pytree._register_pytree_node(


In [2]:
# Seed the run before anything else is constructed.
seed_everything(JobConfig.SEED)

# --- Experiment overrides, applied to ModelConfig before the predictor is built ---

# Training schedule and architecture.
ModelConfig.EPOCHS = 6
ModelConfig.MODEL_BACKBONE = 'tf_efficientnet_b2'
ModelConfig.MODEL_NAME = "ENet_b2_xymask_cutmix"

# Input sources: both spectrogram flags are switched on. Each flag is set exactly
# once; this cell previously also assigned USE_EEG_SPECTROGRAMS = False a few lines
# before setting it to True, which had no effect on the final configuration.
ModelConfig.USE_KAGGLE_SPECTROGRAMS = True
ModelConfig.USE_EEG_SPECTROGRAMS = True

# Augmentation.
ModelConfig.AUGMENT = True
ModelConfig.AUGMENTATIONS = ['xy_masking', 'cut_mix']

# NOTE(review): the output saved with this cell reports a different model name and
# augmentation list than the values set above. Presumably the output is stale from
# an earlier run; re-run after a kernel restart and confirm the banner matches.
hms_predictor = HMSPredictor(JobConfig, ModelConfig)

****************************************************************************************************
Script Start: Sat Mar  9 16:49:25 2024
Initializing HMS Predictor...
Model Name: ENet_b2_two_stages_xymask
Drop Rate: 0.15
Drop Path Rate: 0.25
Augment: True
Augmentations: ['xy_masking']
Enropy Split: 5.5
Device: cuda
Output Dir: ./outputs/
****************************************************************************************************


In [3]:
# Load both training splits plus the spectrogram / EEG collections from the predictor.
train_easy, train_hard, all_specs, all_eegs = hms_predictor.load_train_data()

# Shape of each split, one per line (easy first, then hard).
print(train_easy.shape, train_hard.shape, sep="\n")

# Total number of null cells in each split, one per line (easy first, then hard).
print(
    train_easy.isnull().sum().sum(),
    train_hard.isnull().sum().sum(),
    sep="\n",
)

# Preview the first rows of each split, separated by a blank-looking line.
display(train_easy.head())
print(" ")
display(train_hard.head())

(11999, 14)
(5090, 14)
0
0


Unnamed: 0,eeg_id,spectrogram_id,min,max,patient_id,target,total_votes,entropy,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote
0,642382,14960202,1008.0,1032.0,5955,Other,2,7.802343,0.0,0.0,0.0,0.0,0.0,1.0
1,751790,618728447,908.0,908.0,38549,GPD,1,7.802343,0.0,0.0,1.0,0.0,0.0,0.0
2,778705,52296320,0.0,0.0,40955,Other,2,7.68682,0.0,0.0,0.0,0.0,0.0,1.0
3,1629671,2036345030,0.0,160.0,37481,Seizure,51,7.619243,1.0,0.0,0.0,0.0,0.0,0.0
4,2061593,320962633,1450.0,1450.0,23828,Other,1,7.802343,0.0,0.0,0.0,0.0,0.0,1.0


 


Unnamed: 0,eeg_id,spectrogram_id,min,max,patient_id,target,total_votes,entropy,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote
0,568657,789577333,0.0,16.0,20654,Other,48,3.341757,0.0,0.0,0.25,0.0,0.166667,0.583333
1,582999,1552638400,0.0,38.0,20230,LPD,154,3.550549,0.0,0.857143,0.0,0.071429,0.0,0.071429
2,1895581,128369999,1138.0,1138.0,47999,Other,13,3.565051,0.076923,0.0,0.0,0.0,0.076923,0.846154
3,2482631,978166025,1902.0,1944.0,20606,Other,105,1.431066,0.0,0.0,0.133333,0.066667,0.133333,0.666667
4,2521897,673742515,0.0,4.0,62117,Other,24,1.516203,0.0,0.0,0.083333,0.083333,0.333333,0.5


In [4]:
# Set to True to train on only the first half of each split for a quick debugging run.
DEBUG_HALF_DATA = False

# Derive the training inputs under new names so that re-running this cell never
# shrinks the frames loaded earlier a second time.
if DEBUG_HALF_DATA:
    fit_easy = train_easy[:len(train_easy) // 2]
    fit_hard = train_hard[:len(train_hard) // 2]
else:
    fit_easy, fit_hard = train_easy, train_hard

# Launch training across the folds; progress and per-epoch losses are logged below.
hms_predictor.train_folds(fit_easy, fit_hard, all_specs, all_eegs)

Fold: 0 First Training


Train:   0%|          | 0/599 [00:00<?, ?batch/s]

Epoch: [1][0/599]Elapsed 1.09s | Loss: 0.8345 Grad: 69042.6250 LR: 4.0000e-06
Epoch: [1][50/599]Elapsed 8.27s | Loss: 0.8308 Grad: 75636.1328 LR: 5.1479e-06
Epoch: [1][100/599]Elapsed 15.35s | Loss: 0.8226 Grad: 61634.6484 LR: 8.5368e-06
Epoch: [1][150/599]Elapsed 22.45s | Loss: 0.8171 Grad: 57516.7031 LR: 1.4005e-05
Epoch: [1][200/599]Elapsed 29.56s | Loss: 0.8091 Grad: 70790.9766 LR: 2.1290e-05
Epoch: [1][250/599]Elapsed 36.68s | Loss: 0.7969 Grad: 69634.0078 LR: 3.0044e-05
Epoch: [1][300/599]Elapsed 43.83s | Loss: 0.7804 Grad: 79607.8828 LR: 3.9848e-05
Epoch: [1][350/599]Elapsed 50.99s | Loss: 0.7615 Grad: 129872.3438 LR: 5.0233e-05
Epoch: [1][400/599]Elapsed 58.16s | Loss: 0.7422 Grad: 53024.8203 LR: 6.0703e-05
Epoch: [1][450/599]Elapsed 65.34s | Loss: 0.7222 Grad: 71614.6484 LR: 7.0757e-05
Epoch: [1][500/599]Elapsed 72.54s | Loss: 0.7026 Grad: 67804.5078 LR: 7.9913e-05
Epoch: [1][550/599]Elapsed 79.74s | Loss: 0.6828 Grad: 75003.4922 LR: 8.7735e-05
Epoch: [1][598/599]Elapsed 86.67

Valid:   0%|          | 0/150 [00:00<?, ?batch/s]

Epoch: [1][0/150]Elapsed 0.11s | Loss: 0.7545
Epoch: [1][50/150]Elapsed 4.91s | Loss: 0.4970
Epoch: [1][100/150]Elapsed 9.70s | Loss: 0.5004


----------------------------------------------------------------------------------------------------
Epoch 1 - Average Train Loss: 0.6654 | Average Valid Loss: 0.4962 | Time: 101.30s
Best model found in epoch 1 | valid loss: 0.4962


Train:   0%|          | 0/599 [00:00<?, ?batch/s]

Epoch: [2][0/599]Elapsed 0.11s | Loss: 0.3842 Grad: 142093.0469 LR: 9.3639e-05
Epoch: [2][50/599]Elapsed 7.32s | Loss: 0.4178 Grad: 46330.9102 LR: 9.7834e-05
Epoch: [2][100/599]Elapsed 14.59s | Loss: 0.4063 Grad: 33928.4258 LR: 9.9837e-05
Epoch: [2][150/599]Elapsed 21.89s | Loss: 0.4143 Grad: 89391.2266 LR: 9.9994e-05
Epoch: [2][200/599]Elapsed 29.21s | Loss: 0.4045 Grad: 31731.7090 LR: 9.9961e-05
Epoch: [2][250/599]Elapsed 36.52s | Loss: 0.3976 Grad: 41026.5625 LR: 9.9899e-05
Epoch: [2][300/599]Elapsed 43.84s | Loss: 0.3959 Grad: 60293.5391 LR: 9.9807e-05
Epoch: [2][350/599]Elapsed 51.17s | Loss: 0.3928 Grad: 25876.3184 LR: 9.9685e-05
Epoch: [2][400/599]Elapsed 58.53s | Loss: 0.3892 Grad: 30222.4316 LR: 9.9535e-05
Epoch: [2][450/599]Elapsed 65.89s | Loss: 0.3836 Grad: 47859.9648 LR: 9.9355e-05
Epoch: [2][500/599]Elapsed 73.24s | Loss: 0.3812 Grad: 43956.9141 LR: 9.9146e-05
Epoch: [2][550/599]Elapsed 80.60s | Loss: 0.3775 Grad: 68269.3828 LR: 9.8908e-05
Epoch: [2][598/599]Elapsed 87.67

Valid:   0%|          | 0/150 [00:00<?, ?batch/s]

Epoch: [2][0/150]Elapsed 0.10s | Loss: 0.6648
Epoch: [2][50/150]Elapsed 4.90s | Loss: 0.4353
Epoch: [2][100/150]Elapsed 9.69s | Loss: 0.4526


----------------------------------------------------------------------------------------------------
Epoch 2 - Average Train Loss: 0.3741 | Average Valid Loss: 0.4538 | Time: 102.32s
Best model found in epoch 2 | valid loss: 0.4538


Train:   0%|          | 0/599 [00:00<?, ?batch/s]

Epoch: [3][0/599]Elapsed 0.10s | Loss: 0.2464 Grad: 109735.1641 LR: 9.8653e-05
Epoch: [3][50/599]Elapsed 7.46s | Loss: 0.2902 Grad: 126770.3516 LR: 9.8359e-05
Epoch: [3][100/599]Elapsed 14.91s | Loss: 0.2907 Grad: 28631.8047 LR: 9.8036e-05
Epoch: [3][150/599]Elapsed 22.34s | Loss: 0.3047 Grad: 37765.5352 LR: 9.7685e-05
Epoch: [3][200/599]Elapsed 29.76s | Loss: 0.3014 Grad: 37428.9531 LR: 9.7306e-05
Epoch: [3][250/599]Elapsed 37.17s | Loss: 0.2984 Grad: 36949.7891 LR: 9.6899e-05
Epoch: [3][300/599]Elapsed 44.56s | Loss: 0.3000 Grad: 36849.8672 LR: 9.6464e-05
Epoch: [3][350/599]Elapsed 51.94s | Loss: 0.2989 Grad: 49472.4336 LR: 9.6002e-05
Epoch: [3][400/599]Elapsed 59.32s | Loss: 0.2971 Grad: 30841.2441 LR: 9.5513e-05
Epoch: [3][450/599]Elapsed 66.69s | Loss: 0.2947 Grad: 46519.9570 LR: 9.4997e-05
Epoch: [3][500/599]Elapsed 74.13s | Loss: 0.2944 Grad: 26111.7148 LR: 9.4455e-05
Epoch: [3][550/599]Elapsed 81.53s | Loss: 0.2919 Grad: 55629.1445 LR: 9.3886e-05
Epoch: [3][598/599]Elapsed 88.6

Valid:   0%|          | 0/150 [00:00<?, ?batch/s]

Epoch: [3][0/150]Elapsed 0.11s | Loss: 0.6345
Epoch: [3][50/150]Elapsed 4.95s | Loss: 0.4356
Epoch: [3][100/150]Elapsed 9.75s | Loss: 0.4567


----------------------------------------------------------------------------------------------------
Epoch 3 - Average Train Loss: 0.2912 | Average Valid Loss: 0.4587 | Time: 103.34s


Train:   0%|          | 0/599 [00:00<?, ?batch/s]

Epoch: [4][0/599]Elapsed 0.10s | Loss: 0.2141 Grad: 123974.9453 LR: 9.3316e-05
Epoch: [4][50/599]Elapsed 7.46s | Loss: 0.2411 Grad: 79364.4219 LR: 9.2697e-05
Epoch: [4][100/599]Elapsed 14.88s | Loss: 0.2421 Grad: 30797.0547 LR: 9.2053e-05
Epoch: [4][150/599]Elapsed 22.30s | Loss: 0.2558 Grad: 39875.2695 LR: 9.1384e-05
Epoch: [4][200/599]Elapsed 29.73s | Loss: 0.2524 Grad: 27349.4492 LR: 9.0691e-05
Epoch: [4][250/599]Elapsed 37.17s | Loss: 0.2509 Grad: 24700.2266 LR: 8.9973e-05
Epoch: [4][300/599]Elapsed 44.62s | Loss: 0.2504 Grad: 33373.4922 LR: 8.9233e-05
Epoch: [4][350/599]Elapsed 52.09s | Loss: 0.2494 Grad: 30650.9355 LR: 8.8469e-05
Epoch: [4][400/599]Elapsed 59.52s | Loss: 0.2485 Grad: 62647.5078 LR: 8.7682e-05
Epoch: [4][450/599]Elapsed 66.95s | Loss: 0.2457 Grad: 58746.7539 LR: 8.6873e-05
Epoch: [4][500/599]Elapsed 74.39s | Loss: 0.2468 Grad: 35471.3125 LR: 8.6043e-05
Epoch: [4][550/599]Elapsed 81.82s | Loss: 0.2455 Grad: 63155.4219 LR: 8.5191e-05
Epoch: [4][598/599]Elapsed 88.94

Valid:   0%|          | 0/150 [00:00<?, ?batch/s]

Epoch: [4][0/150]Elapsed 0.10s | Loss: 0.5775
Epoch: [4][50/150]Elapsed 4.92s | Loss: 0.4239
Epoch: [4][100/150]Elapsed 9.73s | Loss: 0.4504


----------------------------------------------------------------------------------------------------
Epoch 4 - Average Train Loss: 0.2450 | Average Valid Loss: 0.4561 | Time: 103.63s


Train:   0%|          | 0/599 [00:00<?, ?batch/s]

Epoch: [5][0/599]Elapsed 0.12s | Loss: 0.1996 Grad: 141908.5156 LR: 8.4354e-05
Epoch: [5][50/599]Elapsed 7.48s | Loss: 0.2061 Grad: 95460.1484 LR: 8.3462e-05
Epoch: [5][100/599]Elapsed 14.93s | Loss: 0.2032 Grad: 50888.0391 LR: 8.2550e-05
Epoch: [5][150/599]Elapsed 22.38s | Loss: 0.2116 Grad: 30870.4570 LR: 8.1619e-05
Epoch: [5][200/599]Elapsed 29.84s | Loss: 0.2079 Grad: 34410.8750 LR: 8.0670e-05
Epoch: [5][250/599]Elapsed 37.27s | Loss: 0.2070 Grad: 26766.7988 LR: 7.9702e-05
Epoch: [5][300/599]Elapsed 44.71s | Loss: 0.2075 Grad: 60561.5820 LR: 7.8717e-05
Epoch: [5][350/599]Elapsed 52.16s | Loss: 0.2089 Grad: 37366.2695 LR: 7.7715e-05
Epoch: [5][400/599]Elapsed 59.61s | Loss: 0.2076 Grad: 22367.2188 LR: 7.6697e-05
Epoch: [5][450/599]Elapsed 67.07s | Loss: 0.2054 Grad: 45877.0000 LR: 7.5663e-05
Epoch: [5][500/599]Elapsed 74.49s | Loss: 0.2057 Grad: 56285.5156 LR: 7.4614e-05
Epoch: [5][550/599]Elapsed 81.91s | Loss: 0.2037 Grad: 75790.4531 LR: 7.3550e-05
Epoch: [5][598/599]Elapsed 89.02

Valid:   0%|          | 0/150 [00:00<?, ?batch/s]

Epoch: [5][0/150]Elapsed 0.10s | Loss: 0.6307
Epoch: [5][50/150]Elapsed 4.90s | Loss: 0.4379
Epoch: [5][100/150]Elapsed 9.71s | Loss: 0.4559


----------------------------------------------------------------------------------------------------
Epoch 5 - Average Train Loss: 0.2033 | Average Valid Loss: 0.4626 | Time: 103.67s


Train:   0%|          | 0/599 [00:00<?, ?batch/s]

Epoch: [6][0/599]Elapsed 0.10s | Loss: 0.2046 Grad: inf LR: 7.2516e-05
Epoch: [6][50/599]Elapsed 7.50s | Loss: 0.1709 Grad: 65081.3008 LR: 7.1426e-05
Epoch: [6][100/599]Elapsed 14.96s | Loss: 0.1697 Grad: 56867.7539 LR: 7.0323e-05
Epoch: [6][150/599]Elapsed 22.42s | Loss: 0.1770 Grad: 46316.9141 LR: 6.9208e-05
Epoch: [6][200/599]Elapsed 29.85s | Loss: 0.1752 Grad: 31888.0469 LR: 6.8082e-05
Epoch: [6][250/599]Elapsed 37.28s | Loss: 0.1748 Grad: 27573.3027 LR: 6.6945e-05
Epoch: [6][300/599]Elapsed 44.73s | Loss: 0.1743 Grad: 29712.3477 LR: 6.5799e-05
Epoch: [6][350/599]Elapsed 52.14s | Loss: 0.1733 Grad: 54139.2969 LR: 6.4642e-05
Epoch: [6][400/599]Elapsed 59.58s | Loss: 0.1729 Grad: 20577.5410 LR: 6.3478e-05
Epoch: [6][450/599]Elapsed 66.98s | Loss: 0.1702 Grad: 42005.5234 LR: 6.2305e-05
Epoch: [6][500/599]Elapsed 74.41s | Loss: 0.1710 Grad: 29141.4336 LR: 6.1125e-05
Epoch: [6][550/599]Elapsed 81.84s | Loss: 0.1693 Grad: 43241.3789 LR: 5.9939e-05
Epoch: [6][598/599]Elapsed 88.98s | Loss

Valid:   0%|          | 0/150 [00:00<?, ?batch/s]

Epoch: [6][0/150]Elapsed 0.11s | Loss: 0.5734
Epoch: [6][50/150]Elapsed 4.94s | Loss: 0.4446
Epoch: [6][100/150]Elapsed 9.74s | Loss: 0.4778


----------------------------------------------------------------------------------------------------
Epoch 6 - Average Train Loss: 0.1696 | Average Valid Loss: 0.4815 | Time: 103.66s
Fold: 1 First Training


Train:   0%|          | 0/599 [00:00<?, ?batch/s]

Epoch: [1][0/599]Elapsed 0.10s | Loss: 0.8187 Grad: 90367.3984 LR: 4.0000e-06
Epoch: [1][50/599]Elapsed 7.48s | Loss: 0.8193 Grad: 61856.2148 LR: 5.1479e-06
Epoch: [1][100/599]Elapsed 14.96s | Loss: 0.8062 Grad: 69708.6875 LR: 8.5368e-06
Epoch: [1][150/599]Elapsed 22.43s | Loss: 0.8008 Grad: 65168.4062 LR: 1.4005e-05
Epoch: [1][200/599]Elapsed 29.93s | Loss: 0.7945 Grad: 60428.7070 LR: 2.1290e-05
Epoch: [1][250/599]Elapsed 37.41s | Loss: 0.7847 Grad: 65722.2031 LR: 3.0044e-05
Epoch: [1][300/599]Elapsed 44.88s | Loss: 0.7691 Grad: 88584.0000 LR: 3.9848e-05
Epoch: [1][350/599]Elapsed 52.36s | Loss: 0.7533 Grad: 84640.8047 LR: 5.0233e-05
Epoch: [1][400/599]Elapsed 59.80s | Loss: 0.7345 Grad: 102233.5078 LR: 6.0703e-05
Epoch: [1][450/599]Elapsed 67.22s | Loss: 0.7145 Grad: 79600.0234 LR: 7.0757e-05
Epoch: [1][500/599]Elapsed 74.66s | Loss: 0.6955 Grad: 49560.7461 LR: 7.9913e-05
Epoch: [1][550/599]Elapsed 82.08s | Loss: 0.6765 Grad: 57476.7422 LR: 8.7735e-05
Epoch: [1][598/599]Elapsed 89.20

Valid:   0%|          | 0/150 [00:00<?, ?batch/s]

Epoch: [1][0/150]Elapsed 0.11s | Loss: 0.3597
Epoch: [1][50/150]Elapsed 4.91s | Loss: 0.4746
Epoch: [1][100/150]Elapsed 9.71s | Loss: 0.4953


----------------------------------------------------------------------------------------------------
Epoch 1 - Average Train Loss: 0.6594 | Average Valid Loss: 0.5030 | Time: 103.86s
Best model found in epoch 1 | valid loss: 0.5030


Train:   0%|          | 0/599 [00:00<?, ?batch/s]

Epoch: [2][0/599]Elapsed 0.10s | Loss: 0.5007 Grad: 150057.3750 LR: 9.3639e-05
Epoch: [2][50/599]Elapsed 7.48s | Loss: 0.4185 Grad: 75971.0703 LR: 9.7834e-05
Epoch: [2][100/599]Elapsed 14.92s | Loss: 0.4069 Grad: 71578.7891 LR: 9.9837e-05
Epoch: [2][150/599]Elapsed 22.37s | Loss: 0.4100 Grad: 59593.5156 LR: 9.9994e-05
Epoch: [2][200/599]Elapsed 29.80s | Loss: 0.4034 Grad: 28914.7852 LR: 9.9961e-05
Epoch: [2][250/599]Elapsed 37.24s | Loss: 0.3993 Grad: 39601.1992 LR: 9.9899e-05
Epoch: [2][300/599]Elapsed 44.69s | Loss: 0.3955 Grad: 39854.2031 LR: 9.9807e-05
Epoch: [2][350/599]Elapsed 52.12s | Loss: 0.3918 Grad: 46848.2617 LR: 9.9685e-05
Epoch: [2][400/599]Elapsed 59.54s | Loss: 0.3854 Grad: 28335.2754 LR: 9.9535e-05
Epoch: [2][450/599]Elapsed 66.96s | Loss: 0.3800 Grad: 52756.7930 LR: 9.9355e-05
Epoch: [2][500/599]Elapsed 74.38s | Loss: 0.3772 Grad: 37780.6367 LR: 9.9146e-05
Epoch: [2][550/599]Elapsed 81.81s | Loss: 0.3733 Grad: 28562.9863 LR: 9.8908e-05
Epoch: [2][598/599]Elapsed 88.96

Valid:   0%|          | 0/150 [00:00<?, ?batch/s]

Epoch: [2][0/150]Elapsed 0.10s | Loss: 0.3118
Epoch: [2][50/150]Elapsed 4.91s | Loss: 0.3850
Epoch: [2][100/150]Elapsed 9.77s | Loss: 0.3962


----------------------------------------------------------------------------------------------------
Epoch 2 - Average Train Loss: 0.3705 | Average Valid Loss: 0.3977 | Time: 103.68s
Best model found in epoch 2 | valid loss: 0.3977


Train:   0%|          | 0/599 [00:00<?, ?batch/s]

Epoch: [3][0/599]Elapsed 0.12s | Loss: 0.3298 Grad: nan LR: 9.8653e-05
Epoch: [3][50/599]Elapsed 7.54s | Loss: 0.3010 Grad: 21338.7129 LR: 9.8359e-05
Epoch: [3][100/599]Elapsed 15.02s | Loss: 0.3003 Grad: 39418.8438 LR: 9.8036e-05
Epoch: [3][150/599]Elapsed 22.48s | Loss: 0.3095 Grad: 40957.5391 LR: 9.7685e-05
Epoch: [3][200/599]Elapsed 29.94s | Loss: 0.3049 Grad: 21981.3145 LR: 9.7306e-05
Epoch: [3][250/599]Elapsed 37.40s | Loss: 0.3028 Grad: 26094.0352 LR: 9.6899e-05
Epoch: [3][300/599]Elapsed 44.85s | Loss: 0.3040 Grad: 45831.8789 LR: 9.6464e-05
Epoch: [3][350/599]Elapsed 52.31s | Loss: 0.3011 Grad: 42579.3984 LR: 9.6002e-05
Epoch: [3][400/599]Elapsed 59.76s | Loss: 0.2969 Grad: 27818.4473 LR: 9.5513e-05
Epoch: [3][450/599]Elapsed 67.26s | Loss: 0.2935 Grad: 55390.7617 LR: 9.4997e-05
Epoch: [3][500/599]Elapsed 74.73s | Loss: 0.2934 Grad: 36681.0156 LR: 9.4455e-05
Epoch: [3][550/599]Elapsed 82.18s | Loss: 0.2925 Grad: 29872.9043 LR: 9.3886e-05
Epoch: [3][598/599]Elapsed 89.37s | Loss

Valid:   0%|          | 0/150 [00:00<?, ?batch/s]

Epoch: [3][0/150]Elapsed 0.10s | Loss: 0.2976
Epoch: [3][50/150]Elapsed 4.94s | Loss: 0.3757
Epoch: [3][100/150]Elapsed 9.75s | Loss: 0.3833


----------------------------------------------------------------------------------------------------
Epoch 3 - Average Train Loss: 0.2922 | Average Valid Loss: 0.3843 | Time: 104.06s
Best model found in epoch 3 | valid loss: 0.3843


Train:   0%|          | 0/599 [00:00<?, ?batch/s]

Epoch: [4][0/599]Elapsed 0.10s | Loss: 0.3630 Grad: inf LR: 9.3316e-05
Epoch: [4][50/599]Elapsed 7.51s | Loss: 0.2468 Grad: 29989.4531 LR: 9.2697e-05
Epoch: [4][100/599]Elapsed 15.03s | Loss: 0.2463 Grad: 47235.0469 LR: 9.2053e-05
Epoch: [4][150/599]Elapsed 22.50s | Loss: 0.2568 Grad: 39901.6680 LR: 9.1384e-05
Epoch: [4][200/599]Elapsed 29.97s | Loss: 0.2511 Grad: 22997.1211 LR: 9.0691e-05
Epoch: [4][250/599]Elapsed 37.44s | Loss: 0.2494 Grad: 22708.4727 LR: 8.9973e-05
Epoch: [4][300/599]Elapsed 44.91s | Loss: 0.2499 Grad: 59539.6602 LR: 8.9233e-05
Epoch: [4][350/599]Elapsed 52.35s | Loss: 0.2494 Grad: 40909.0781 LR: 8.8469e-05
Epoch: [4][400/599]Elapsed 59.79s | Loss: 0.2449 Grad: 37714.2500 LR: 8.7682e-05
Epoch: [4][450/599]Elapsed 67.24s | Loss: 0.2419 Grad: 78410.5000 LR: 8.6873e-05
Epoch: [4][500/599]Elapsed 74.69s | Loss: 0.2416 Grad: 40052.4180 LR: 8.6043e-05
Epoch: [4][550/599]Elapsed 82.12s | Loss: 0.2405 Grad: 32881.4570 LR: 8.5191e-05
Epoch: [4][598/599]Elapsed 89.26s | Loss

Valid:   0%|          | 0/150 [00:00<?, ?batch/s]

Epoch: [4][0/150]Elapsed 0.10s | Loss: 0.2862
Epoch: [4][50/150]Elapsed 4.91s | Loss: 0.3762
Epoch: [4][100/150]Elapsed 9.71s | Loss: 0.3900


----------------------------------------------------------------------------------------------------
Epoch 4 - Average Train Loss: 0.2397 | Average Valid Loss: 0.3935 | Time: 103.91s


Train:   0%|          | 0/599 [00:00<?, ?batch/s]

Epoch: [5][0/599]Elapsed 0.10s | Loss: 0.2920 Grad: 204411.2812 LR: 8.4354e-05
Epoch: [5][50/599]Elapsed 7.50s | Loss: 0.2099 Grad: 36181.5898 LR: 8.3462e-05
Epoch: [5][100/599]Elapsed 14.96s | Loss: 0.2079 Grad: 45585.6836 LR: 8.2550e-05
Epoch: [5][150/599]Elapsed 22.42s | Loss: 0.2181 Grad: 37846.0625 LR: 8.1619e-05
Epoch: [5][200/599]Elapsed 29.87s | Loss: 0.2138 Grad: 30355.0137 LR: 8.0670e-05
Epoch: [5][250/599]Elapsed 37.32s | Loss: 0.2133 Grad: 26774.6094 LR: 7.9702e-05
Epoch: [5][300/599]Elapsed 44.77s | Loss: 0.2151 Grad: 28958.6367 LR: 7.8717e-05
Epoch: [5][350/599]Elapsed 52.23s | Loss: 0.2129 Grad: 34448.8867 LR: 7.7715e-05
Epoch: [5][400/599]Elapsed 59.69s | Loss: 0.2092 Grad: 31104.0605 LR: 7.6697e-05
Epoch: [5][450/599]Elapsed 67.14s | Loss: 0.2063 Grad: 63028.3516 LR: 7.5663e-05
Epoch: [5][500/599]Elapsed 74.59s | Loss: 0.2051 Grad: 70360.7344 LR: 7.4614e-05
Epoch: [5][550/599]Elapsed 82.03s | Loss: 0.2032 Grad: 20539.6523 LR: 7.3550e-05
Epoch: [5][598/599]Elapsed 89.17

Valid:   0%|          | 0/150 [00:00<?, ?batch/s]

Epoch: [5][0/150]Elapsed 0.10s | Loss: 0.2875
Epoch: [5][50/150]Elapsed 4.90s | Loss: 0.3639
Epoch: [5][100/150]Elapsed 9.69s | Loss: 0.3694


----------------------------------------------------------------------------------------------------
Epoch 5 - Average Train Loss: 0.2026 | Average Valid Loss: 0.3742 | Time: 103.79s
Best model found in epoch 5 | valid loss: 0.3742


Train:   0%|          | 0/599 [00:00<?, ?batch/s]

Epoch: [6][0/599]Elapsed 0.10s | Loss: 0.3045 Grad: nan LR: 7.2516e-05
Epoch: [6][50/599]Elapsed 7.45s | Loss: 0.1850 Grad: 27147.7188 LR: 7.1426e-05
Epoch: [6][100/599]Elapsed 14.91s | Loss: 0.1811 Grad: 44022.1016 LR: 7.0323e-05
Epoch: [6][150/599]Elapsed 22.36s | Loss: 0.1869 Grad: 38053.1172 LR: 6.9208e-05
Epoch: [6][200/599]Elapsed 29.80s | Loss: 0.1832 Grad: 31175.2031 LR: 6.8082e-05
Epoch: [6][250/599]Elapsed 37.24s | Loss: 0.1810 Grad: 19660.8320 LR: 6.6945e-05
Epoch: [6][300/599]Elapsed 44.69s | Loss: 0.1823 Grad: 40185.8789 LR: 6.5799e-05
Epoch: [6][350/599]Elapsed 52.13s | Loss: 0.1792 Grad: 45287.7383 LR: 6.4642e-05
Epoch: [6][400/599]Elapsed 59.57s | Loss: 0.1758 Grad: 25235.6094 LR: 6.3478e-05
Epoch: [6][450/599]Elapsed 67.01s | Loss: 0.1737 Grad: 54232.3906 LR: 6.2305e-05
Epoch: [6][500/599]Elapsed 74.46s | Loss: 0.1741 Grad: 74183.1797 LR: 6.1125e-05
Epoch: [6][550/599]Elapsed 81.90s | Loss: 0.1728 Grad: 26178.2246 LR: 5.9939e-05
Epoch: [6][598/599]Elapsed 89.06s | Loss

Valid:   0%|          | 0/150 [00:00<?, ?batch/s]

Epoch: [6][0/150]Elapsed 0.10s | Loss: 0.2618
Epoch: [6][50/150]Elapsed 4.90s | Loss: 0.3879
Epoch: [6][100/150]Elapsed 9.70s | Loss: 0.4003


----------------------------------------------------------------------------------------------------
Epoch 6 - Average Train Loss: 0.1719 | Average Valid Loss: 0.4031 | Time: 103.70s
Fold: 2 First Training


Train:   0%|          | 0/599 [00:00<?, ?batch/s]

Epoch: [1][0/599]Elapsed 0.11s | Loss: 0.8003 Grad: 68377.9453 LR: 4.0000e-06
Epoch: [1][50/599]Elapsed 7.50s | Loss: 0.8133 Grad: 59741.7500 LR: 5.1479e-06
Epoch: [1][100/599]Elapsed 14.99s | Loss: 0.8081 Grad: 62179.2812 LR: 8.5368e-06
Epoch: [1][150/599]Elapsed 22.50s | Loss: 0.8026 Grad: 93055.9922 LR: 1.4005e-05
Epoch: [1][200/599]Elapsed 30.00s | Loss: 0.7958 Grad: 93327.9531 LR: 2.1290e-05
Epoch: [1][250/599]Elapsed 37.50s | Loss: 0.7849 Grad: 115605.7891 LR: 3.0044e-05
Epoch: [1][300/599]Elapsed 44.98s | Loss: 0.7689 Grad: 44815.0312 LR: 3.9848e-05
Epoch: [1][350/599]Elapsed 52.47s | Loss: 0.7525 Grad: 51016.1133 LR: 5.0233e-05
Epoch: [1][400/599]Elapsed 59.95s | Loss: 0.7363 Grad: 51482.1016 LR: 6.0703e-05
Epoch: [1][450/599]Elapsed 67.42s | Loss: 0.7169 Grad: 54093.9258 LR: 7.0757e-05
Epoch: [1][500/599]Elapsed 74.89s | Loss: 0.6994 Grad: 59739.3750 LR: 7.9913e-05
Epoch: [1][550/599]Elapsed 82.36s | Loss: 0.6817 Grad: 67814.3125 LR: 8.7735e-05
Epoch: [1][598/599]Elapsed 89.52

Valid:   0%|          | 0/150 [00:00<?, ?batch/s]

Epoch: [1][0/150]Elapsed 0.10s | Loss: 0.3481
Epoch: [1][50/150]Elapsed 4.90s | Loss: 0.4895
Epoch: [1][100/150]Elapsed 9.70s | Loss: 0.4805


----------------------------------------------------------------------------------------------------
Epoch 1 - Average Train Loss: 0.6662 | Average Valid Loss: 0.4810 | Time: 104.15s
Best model found in epoch 1 | valid loss: 0.4810


Train:   0%|          | 0/599 [00:00<?, ?batch/s]

Epoch: [2][0/599]Elapsed 0.10s | Loss: 0.3993 Grad: 158628.1094 LR: 9.3639e-05
Epoch: [2][50/599]Elapsed 7.56s | Loss: 0.4273 Grad: 122793.6875 LR: 9.7834e-05
Epoch: [2][100/599]Elapsed 15.09s | Loss: 0.4090 Grad: 97506.4609 LR: 9.9837e-05
Epoch: [2][150/599]Elapsed 22.59s | Loss: 0.4170 Grad: 44530.9102 LR: 9.9994e-05
Epoch: [2][200/599]Elapsed 30.11s | Loss: 0.4108 Grad: 54149.6016 LR: 9.9961e-05
Epoch: [2][250/599]Elapsed 37.63s | Loss: 0.4012 Grad: 34391.4023 LR: 9.9899e-05
Epoch: [2][300/599]Elapsed 45.17s | Loss: 0.3996 Grad: 42744.9531 LR: 9.9807e-05
Epoch: [2][350/599]Elapsed 52.67s | Loss: 0.3943 Grad: 30271.6113 LR: 9.9685e-05
Epoch: [2][400/599]Elapsed 60.16s | Loss: 0.3911 Grad: 51743.9883 LR: 9.9535e-05
Epoch: [2][450/599]Elapsed 67.66s | Loss: 0.3868 Grad: 47971.2852 LR: 9.9355e-05
Epoch: [2][500/599]Elapsed 75.16s | Loss: 0.3837 Grad: 31625.2012 LR: 9.9146e-05
Epoch: [2][550/599]Elapsed 82.61s | Loss: 0.3804 Grad: 37215.1953 LR: 9.8908e-05
Epoch: [2][598/599]Elapsed 89.7

Valid:   0%|          | 0/150 [00:00<?, ?batch/s]

Epoch: [2][0/150]Elapsed 0.10s | Loss: 0.2730
Epoch: [2][50/150]Elapsed 4.90s | Loss: 0.4346
Epoch: [2][100/150]Elapsed 9.70s | Loss: 0.4301


----------------------------------------------------------------------------------------------------
Epoch 2 - Average Train Loss: 0.3771 | Average Valid Loss: 0.4353 | Time: 104.41s
Best model found in epoch 2 | valid loss: 0.4353


Train:   0%|          | 0/599 [00:00<?, ?batch/s]

Epoch: [3][0/599]Elapsed 0.10s | Loss: 0.3076 Grad: 136920.3594 LR: 9.8653e-05
Epoch: [3][50/599]Elapsed 7.53s | Loss: 0.3020 Grad: 37668.9766 LR: 9.8359e-05
Epoch: [3][100/599]Elapsed 15.02s | Loss: 0.2938 Grad: 33333.1719 LR: 9.8036e-05
Epoch: [3][150/599]Elapsed 22.50s | Loss: 0.3068 Grad: 76015.5547 LR: 9.7685e-05
Epoch: [3][200/599]Elapsed 29.98s | Loss: 0.3025 Grad: 31309.6660 LR: 9.7306e-05
Epoch: [3][250/599]Elapsed 37.46s | Loss: 0.2968 Grad: 37845.9453 LR: 9.6899e-05
Epoch: [3][300/599]Elapsed 44.92s | Loss: 0.2991 Grad: 43311.5820 LR: 9.6464e-05
Epoch: [3][350/599]Elapsed 52.38s | Loss: 0.2960 Grad: 24142.7539 LR: 9.6002e-05
Epoch: [3][400/599]Elapsed 59.82s | Loss: 0.2946 Grad: 36584.7266 LR: 9.5513e-05
Epoch: [3][450/599]Elapsed 67.28s | Loss: 0.2936 Grad: 46863.9844 LR: 9.4997e-05
Epoch: [3][500/599]Elapsed 74.73s | Loss: 0.2930 Grad: 28120.1406 LR: 9.4455e-05
Epoch: [3][550/599]Elapsed 82.20s | Loss: 0.2910 Grad: 41510.5234 LR: 9.3886e-05
Epoch: [3][598/599]Elapsed 89.35

Valid:   0%|          | 0/150 [00:00<?, ?batch/s]

Epoch: [3][0/150]Elapsed 0.10s | Loss: 0.2387
Epoch: [3][50/150]Elapsed 4.92s | Loss: 0.4006
Epoch: [3][100/150]Elapsed 9.73s | Loss: 0.3961


----------------------------------------------------------------------------------------------------
Epoch 3 - Average Train Loss: 0.2900 | Average Valid Loss: 0.4051 | Time: 104.02s
Best model found in epoch 3 | valid loss: 0.4051


Train:   0%|          | 0/599 [00:00<?, ?batch/s]

Epoch: [4][0/599]Elapsed 0.10s | Loss: 0.2528 Grad: 176778.8438 LR: 9.3316e-05
Epoch: [4][50/599]Elapsed 7.48s | Loss: 0.2449 Grad: 45472.7578 LR: 9.2697e-05
Epoch: [4][100/599]Elapsed 14.94s | Loss: 0.2429 Grad: 35932.4492 LR: 9.2053e-05
Epoch: [4][150/599]Elapsed 22.40s | Loss: 0.2538 Grad: 51461.5508 LR: 9.1384e-05
Epoch: [4][200/599]Elapsed 29.84s | Loss: 0.2457 Grad: 41022.2617 LR: 9.0691e-05
Epoch: [4][250/599]Elapsed 37.29s | Loss: 0.2424 Grad: 46454.7773 LR: 8.9973e-05
Epoch: [4][300/599]Elapsed 44.73s | Loss: 0.2441 Grad: 47555.8906 LR: 8.9233e-05
Epoch: [4][350/599]Elapsed 52.16s | Loss: 0.2457 Grad: 24433.6934 LR: 8.8469e-05
Epoch: [4][400/599]Elapsed 59.59s | Loss: 0.2443 Grad: 34682.4297 LR: 8.7682e-05
Epoch: [4][450/599]Elapsed 67.02s | Loss: 0.2421 Grad: 64096.6680 LR: 8.6873e-05
Epoch: [4][500/599]Elapsed 74.45s | Loss: 0.2422 Grad: 27044.6074 LR: 8.6043e-05
Epoch: [4][550/599]Elapsed 81.86s | Loss: 0.2408 Grad: 51179.5898 LR: 8.5191e-05
Epoch: [4][598/599]Elapsed 88.98

Valid:   0%|          | 0/150 [00:00<?, ?batch/s]

Epoch: [4][0/150]Elapsed 0.10s | Loss: 0.2153
Epoch: [4][50/150]Elapsed 4.90s | Loss: 0.4048
Epoch: [4][100/150]Elapsed 9.70s | Loss: 0.3973


----------------------------------------------------------------------------------------------------
Epoch 4 - Average Train Loss: 0.2401 | Average Valid Loss: 0.4048 | Time: 103.62s
Best model found in epoch 4 | valid loss: 0.4048


Train:   0%|          | 0/599 [00:00<?, ?batch/s]

Epoch: [5][0/599]Elapsed 0.10s | Loss: 0.2391 Grad: inf LR: 8.4354e-05
Epoch: [5][50/599]Elapsed 7.48s | Loss: 0.2076 Grad: 33032.9883 LR: 8.3462e-05
Epoch: [5][100/599]Elapsed 14.91s | Loss: 0.2023 Grad: 45631.1250 LR: 8.2550e-05
Epoch: [5][150/599]Elapsed 22.35s | Loss: 0.2091 Grad: 52869.7773 LR: 8.1619e-05
Epoch: [5][200/599]Elapsed 29.80s | Loss: 0.2064 Grad: 58297.9297 LR: 8.0670e-05
Epoch: [5][250/599]Elapsed 37.23s | Loss: 0.2013 Grad: 39694.0703 LR: 7.9702e-05
Epoch: [5][300/599]Elapsed 44.65s | Loss: 0.2046 Grad: 99279.4531 LR: 7.8717e-05
Epoch: [5][350/599]Elapsed 52.07s | Loss: 0.2045 Grad: 22583.1484 LR: 7.7715e-05
Epoch: [5][400/599]Elapsed 59.50s | Loss: 0.2030 Grad: 24573.7871 LR: 7.6697e-05
Epoch: [5][450/599]Elapsed 66.91s | Loss: 0.2015 Grad: 39375.6406 LR: 7.5663e-05
Epoch: [5][500/599]Elapsed 74.34s | Loss: 0.2005 Grad: 39181.7461 LR: 7.4614e-05
Epoch: [5][550/599]Elapsed 81.77s | Loss: 0.1994 Grad: 53333.0273 LR: 7.3550e-05
Epoch: [5][598/599]Elapsed 88.92s | Loss

Valid:   0%|          | 0/150 [00:00<?, ?batch/s]

Epoch: [5][0/150]Elapsed 0.10s | Loss: 0.2581
Epoch: [5][50/150]Elapsed 4.90s | Loss: 0.4076
Epoch: [5][100/150]Elapsed 9.70s | Loss: 0.4055


----------------------------------------------------------------------------------------------------
Epoch 5 - Average Train Loss: 0.1988 | Average Valid Loss: 0.4153 | Time: 103.55s


Train:   0%|          | 0/599 [00:00<?, ?batch/s]

Epoch: [6][0/599]Elapsed 0.10s | Loss: 0.2281 Grad: nan LR: 7.2516e-05
Epoch: [6][50/599]Elapsed 7.54s | Loss: 0.1846 Grad: 34706.3438 LR: 7.1426e-05
Epoch: [6][100/599]Elapsed 15.07s | Loss: 0.1742 Grad: 57179.8086 LR: 7.0323e-05
Epoch: [6][150/599]Elapsed 22.59s | Loss: 0.1801 Grad: 64806.7852 LR: 6.9208e-05
Epoch: [6][200/599]Elapsed 30.09s | Loss: 0.1760 Grad: 54682.3906 LR: 6.8082e-05
Epoch: [6][250/599]Elapsed 37.58s | Loss: 0.1713 Grad: 36326.0508 LR: 6.6945e-05
Epoch: [6][300/599]Elapsed 45.06s | Loss: 0.1712 Grad: 59267.0156 LR: 6.5799e-05
Epoch: [6][350/599]Elapsed 52.55s | Loss: 0.1708 Grad: 27459.0703 LR: 6.4642e-05
Epoch: [6][400/599]Elapsed 60.00s | Loss: 0.1688 Grad: 34191.8984 LR: 6.3478e-05
Epoch: [6][450/599]Elapsed 67.43s | Loss: 0.1681 Grad: 31459.0000 LR: 6.2305e-05
Epoch: [6][500/599]Elapsed 74.87s | Loss: 0.1680 Grad: 45260.2461 LR: 6.1125e-05
Epoch: [6][550/599]Elapsed 82.30s | Loss: 0.1674 Grad: 46867.4961 LR: 5.9939e-05
Epoch: [6][598/599]Elapsed 89.43s | Loss

Valid:   0%|          | 0/150 [00:00<?, ?batch/s]

Epoch: [6][0/150]Elapsed 0.10s | Loss: 0.2649
Epoch: [6][50/150]Elapsed 4.91s | Loss: 0.4299
Epoch: [6][100/150]Elapsed 9.71s | Loss: 0.4286


----------------------------------------------------------------------------------------------------
Epoch 6 - Average Train Loss: 0.1665 | Average Valid Loss: 0.4390 | Time: 104.08s
Fold: 3 First Training


Train:   0%|          | 0/599 [00:00<?, ?batch/s]

Epoch: [1][0/599]Elapsed 0.10s | Loss: 0.8273 Grad: 74139.0234 LR: 4.0000e-06
Epoch: [1][50/599]Elapsed 7.46s | Loss: 0.8262 Grad: 63661.5742 LR: 5.1479e-06
Epoch: [1][100/599]Elapsed 14.92s | Loss: 0.8208 Grad: 68845.8594 LR: 8.5368e-06
Epoch: [1][150/599]Elapsed 22.37s | Loss: 0.8151 Grad: 74867.7344 LR: 1.4005e-05
Epoch: [1][200/599]Elapsed 29.85s | Loss: 0.8073 Grad: 98248.8906 LR: 2.1290e-05
Epoch: [1][250/599]Elapsed 37.33s | Loss: 0.7973 Grad: 83854.9844 LR: 3.0044e-05
Epoch: [1][300/599]Elapsed 44.80s | Loss: 0.7815 Grad: 92509.2891 LR: 3.9848e-05
Epoch: [1][350/599]Elapsed 52.29s | Loss: 0.7646 Grad: 73865.8594 LR: 5.0233e-05
Epoch: [1][400/599]Elapsed 59.72s | Loss: 0.7471 Grad: 45443.0508 LR: 6.0703e-05
Epoch: [1][450/599]Elapsed 67.15s | Loss: 0.7272 Grad: 106684.6875 LR: 7.0757e-05
Epoch: [1][500/599]Elapsed 74.58s | Loss: 0.7092 Grad: 75979.3203 LR: 7.9913e-05
Epoch: [1][550/599]Elapsed 82.03s | Loss: 0.6898 Grad: 55420.1250 LR: 8.7735e-05
Epoch: [1][598/599]Elapsed 89.15

Valid:   0%|          | 0/150 [00:00<?, ?batch/s]

Epoch: [1][0/150]Elapsed 0.10s | Loss: 0.3692
Epoch: [1][50/150]Elapsed 4.91s | Loss: 0.4581
Epoch: [1][100/150]Elapsed 9.71s | Loss: 0.4744


----------------------------------------------------------------------------------------------------
Epoch 1 - Average Train Loss: 0.6734 | Average Valid Loss: 0.4686 | Time: 103.82s
Best model found in epoch 1 | valid loss: 0.4686


Train:   0%|          | 0/599 [00:00<?, ?batch/s]

Epoch: [2][0/599]Elapsed 0.10s | Loss: 0.4489 Grad: 124573.5781 LR: 9.3639e-05
Epoch: [2][50/599]Elapsed 7.46s | Loss: 0.4341 Grad: 92630.8906 LR: 9.7834e-05
Epoch: [2][100/599]Elapsed 14.90s | Loss: 0.4234 Grad: 100738.1797 LR: 9.9837e-05
Epoch: [2][150/599]Elapsed 22.38s | Loss: 0.4298 Grad: 91063.8828 LR: 9.9994e-05
Epoch: [2][200/599]Elapsed 29.83s | Loss: 0.4215 Grad: 55412.8164 LR: 9.9961e-05
Epoch: [2][250/599]Elapsed 37.27s | Loss: 0.4133 Grad: 78166.1953 LR: 9.9899e-05
Epoch: [2][300/599]Elapsed 44.72s | Loss: 0.4092 Grad: 76705.4375 LR: 9.9807e-05
Epoch: [2][350/599]Elapsed 52.14s | Loss: 0.4040 Grad: 80506.2109 LR: 9.9685e-05
Epoch: [2][400/599]Elapsed 59.58s | Loss: 0.3986 Grad: 32667.5410 LR: 9.9535e-05
Epoch: [2][450/599]Elapsed 67.01s | Loss: 0.3921 Grad: 59831.5859 LR: 9.9355e-05
Epoch: [2][500/599]Elapsed 74.46s | Loss: 0.3900 Grad: 42693.1680 LR: 9.9146e-05
Epoch: [2][550/599]Elapsed 81.89s | Loss: 0.3861 Grad: 41280.7422 LR: 9.8908e-05
Epoch: [2][598/599]Elapsed 89.0

Valid:   0%|          | 0/150 [00:00<?, ?batch/s]

Epoch: [2][0/150]Elapsed 0.10s | Loss: 0.3091
Epoch: [2][50/150]Elapsed 4.91s | Loss: 0.3732
Epoch: [2][100/150]Elapsed 9.71s | Loss: 0.3841


----------------------------------------------------------------------------------------------------
Epoch 2 - Average Train Loss: 0.3824 | Average Valid Loss: 0.3766 | Time: 103.68s
Best model found in epoch 2 | valid loss: 0.3766


Train:   0%|          | 0/599 [00:00<?, ?batch/s]

Epoch: [3][0/599]Elapsed 0.11s | Loss: 0.3555 Grad: 153966.7031 LR: 9.8653e-05
Epoch: [3][50/599]Elapsed 7.50s | Loss: 0.3132 Grad: 102386.6328 LR: 9.8359e-05
Epoch: [3][100/599]Elapsed 14.98s | Loss: 0.3071 Grad: 79695.2109 LR: 9.8036e-05
Epoch: [3][150/599]Elapsed 22.45s | Loss: 0.3205 Grad: 51851.0352 LR: 9.7685e-05
Epoch: [3][200/599]Elapsed 29.90s | Loss: 0.3159 Grad: 23971.7910 LR: 9.7306e-05
Epoch: [3][250/599]Elapsed 37.35s | Loss: 0.3097 Grad: 44047.1836 LR: 9.6899e-05
Epoch: [3][300/599]Elapsed 44.78s | Loss: 0.3096 Grad: 45992.7422 LR: 9.6464e-05
Epoch: [3][350/599]Elapsed 52.21s | Loss: 0.3074 Grad: 34557.1641 LR: 9.6002e-05
Epoch: [3][400/599]Elapsed 59.63s | Loss: 0.3046 Grad: 51766.1992 LR: 9.5513e-05
Epoch: [3][450/599]Elapsed 67.03s | Loss: 0.3010 Grad: 46042.9922 LR: 9.4997e-05
Epoch: [3][500/599]Elapsed 74.44s | Loss: 0.3016 Grad: 40000.1914 LR: 9.4455e-05
Epoch: [3][550/599]Elapsed 81.84s | Loss: 0.2992 Grad: 18017.2148 LR: 9.3886e-05
Epoch: [3][598/599]Elapsed 88.9

Valid:   0%|          | 0/150 [00:00<?, ?batch/s]

Epoch: [3][0/150]Elapsed 0.10s | Loss: 0.3740
Epoch: [3][50/150]Elapsed 4.90s | Loss: 0.3827
Epoch: [3][100/150]Elapsed 9.71s | Loss: 0.3938


----------------------------------------------------------------------------------------------------
Epoch 3 - Average Train Loss: 0.2973 | Average Valid Loss: 0.3883 | Time: 103.58s


Train:   0%|          | 0/599 [00:00<?, ?batch/s]

Epoch: [4][0/599]Elapsed 0.11s | Loss: 0.3309 Grad: inf LR: 9.3316e-05
Epoch: [4][50/599]Elapsed 7.46s | Loss: 0.2578 Grad: 136723.5000 LR: 9.2697e-05
Epoch: [4][100/599]Elapsed 14.89s | Loss: 0.2560 Grad: 84996.6484 LR: 9.2053e-05
Epoch: [4][150/599]Elapsed 22.32s | Loss: 0.2669 Grad: 40628.9062 LR: 9.1384e-05
Epoch: [4][200/599]Elapsed 29.73s | Loss: 0.2612 Grad: 26916.2656 LR: 9.0691e-05
Epoch: [4][250/599]Elapsed 37.11s | Loss: 0.2554 Grad: 38021.3438 LR: 8.9973e-05
Epoch: [4][300/599]Elapsed 44.51s | Loss: 0.2561 Grad: 57571.6328 LR: 8.9233e-05
Epoch: [4][350/599]Elapsed 51.86s | Loss: 0.2547 Grad: 34215.8438 LR: 8.8469e-05
Epoch: [4][400/599]Elapsed 59.21s | Loss: 0.2507 Grad: 44059.8672 LR: 8.7682e-05
Epoch: [4][450/599]Elapsed 66.56s | Loss: 0.2493 Grad: 50520.3711 LR: 8.6873e-05
Epoch: [4][500/599]Elapsed 73.91s | Loss: 0.2498 Grad: 52084.0898 LR: 8.6043e-05
Epoch: [4][550/599]Elapsed 81.24s | Loss: 0.2472 Grad: 26276.8066 LR: 8.5191e-05
Epoch: [4][598/599]Elapsed 88.28s | Los

Valid:   0%|          | 0/150 [00:00<?, ?batch/s]

Epoch: [4][0/150]Elapsed 0.10s | Loss: 0.3262
Epoch: [4][50/150]Elapsed 4.91s | Loss: 0.3705
Epoch: [4][100/150]Elapsed 9.72s | Loss: 0.3828


----------------------------------------------------------------------------------------------------
Epoch 4 - Average Train Loss: 0.2465 | Average Valid Loss: 0.3773 | Time: 102.97s


Train:   0%|          | 0/599 [00:00<?, ?batch/s]

Epoch: [5][0/599]Elapsed 0.10s | Loss: 0.3328 Grad: 269655.9375 LR: 8.4354e-05
Epoch: [5][50/599]Elapsed 7.42s | Loss: 0.2265 Grad: inf LR: 8.3462e-05
Epoch: [5][100/599]Elapsed 14.82s | Loss: 0.2188 Grad: 45888.8828 LR: 8.2550e-05
Epoch: [5][150/599]Elapsed 22.22s | Loss: 0.2292 Grad: 61627.4062 LR: 8.1619e-05
Epoch: [5][200/599]Elapsed 29.63s | Loss: 0.2231 Grad: 30160.1016 LR: 8.0670e-05
Epoch: [5][250/599]Elapsed 37.04s | Loss: 0.2190 Grad: 48515.2656 LR: 7.9702e-05
Epoch: [5][300/599]Elapsed 44.39s | Loss: 0.2205 Grad: 41070.1133 LR: 7.8717e-05
Epoch: [5][350/599]Elapsed 51.73s | Loss: 0.2184 Grad: 52738.1172 LR: 7.7715e-05
Epoch: [5][400/599]Elapsed 59.08s | Loss: 0.2145 Grad: 55156.1406 LR: 7.6697e-05
Epoch: [5][450/599]Elapsed 66.48s | Loss: 0.2119 Grad: 68620.8984 LR: 7.5663e-05
Epoch: [5][500/599]Elapsed 73.86s | Loss: 0.2132 Grad: 27360.5156 LR: 7.4614e-05
Epoch: [5][550/599]Elapsed 81.21s | Loss: 0.2106 Grad: 62294.9219 LR: 7.3550e-05
Epoch: [5][598/599]Elapsed 88.30s | Los

Valid:   0%|          | 0/150 [00:00<?, ?batch/s]

Epoch: [5][0/150]Elapsed 0.11s | Loss: 0.2922
Epoch: [5][50/150]Elapsed 4.92s | Loss: 0.3564
Epoch: [5][100/150]Elapsed 9.74s | Loss: 0.3712


----------------------------------------------------------------------------------------------------
Epoch 5 - Average Train Loss: 0.2091 | Average Valid Loss: 0.3667 | Time: 102.99s
Best model found in epoch 5 | valid loss: 0.3667


Train:   0%|          | 0/599 [00:00<?, ?batch/s]

Epoch: [6][0/599]Elapsed 0.11s | Loss: 0.3163 Grad: nan LR: 7.2516e-05
Epoch: [6][50/599]Elapsed 7.39s | Loss: 0.1834 Grad: 172242.2656 LR: 7.1426e-05
Epoch: [6][100/599]Elapsed 14.80s | Loss: 0.1824 Grad: 38766.4961 LR: 7.0323e-05
Epoch: [6][150/599]Elapsed 22.19s | Loss: 0.1884 Grad: 42425.7734 LR: 6.9208e-05
Epoch: [6][200/599]Elapsed 29.55s | Loss: 0.1838 Grad: 27190.8613 LR: 6.8082e-05
Epoch: [6][250/599]Elapsed 36.91s | Loss: 0.1802 Grad: 41719.3164 LR: 6.6945e-05
Epoch: [6][300/599]Elapsed 44.26s | Loss: 0.1824 Grad: 44560.8125 LR: 6.5799e-05
Epoch: [6][350/599]Elapsed 51.63s | Loss: 0.1791 Grad: 45821.7109 LR: 6.4642e-05
Epoch: [6][400/599]Elapsed 58.99s | Loss: 0.1779 Grad: 43277.3164 LR: 6.3478e-05
Epoch: [6][450/599]Elapsed 66.35s | Loss: 0.1768 Grad: 64677.0273 LR: 6.2305e-05
Epoch: [6][500/599]Elapsed 73.69s | Loss: 0.1780 Grad: 58330.9883 LR: 6.1125e-05
Epoch: [6][550/599]Elapsed 81.07s | Loss: 0.1758 Grad: 22247.4980 LR: 5.9939e-05
Epoch: [6][598/599]Elapsed 88.12s | Los

Valid:   0%|          | 0/150 [00:00<?, ?batch/s]

Epoch: [6][0/150]Elapsed 0.10s | Loss: 0.3369
Epoch: [6][50/150]Elapsed 4.93s | Loss: 0.3649
Epoch: [6][100/150]Elapsed 9.76s | Loss: 0.3879


----------------------------------------------------------------------------------------------------
Epoch 6 - Average Train Loss: 0.1758 | Average Valid Loss: 0.3792 | Time: 102.84s
Fold: 4 First Training


Train:   0%|          | 0/600 [00:00<?, ?batch/s]

Epoch: [1][0/600]Elapsed 0.10s | Loss: 0.8104 Grad: 77296.3828 LR: 4.0000e-06
Epoch: [1][50/600]Elapsed 7.35s | Loss: 0.8118 Grad: 89289.0078 LR: 5.1441e-06
Epoch: [1][100/600]Elapsed 14.67s | Loss: 0.8057 Grad: 82057.2734 LR: 8.5219e-06
Epoch: [1][150/600]Elapsed 22.00s | Loss: 0.7992 Grad: 83809.7500 LR: 1.3972e-05
Epoch: [1][200/600]Elapsed 29.35s | Loss: 0.7927 Grad: 64031.8906 LR: 2.1236e-05
Epoch: [1][250/600]Elapsed 36.68s | Loss: 0.7816 Grad: 148438.6875 LR: 2.9966e-05
Epoch: [1][300/600]Elapsed 44.02s | Loss: 0.7661 Grad: 84197.5703 LR: 3.9746e-05
Epoch: [1][350/600]Elapsed 51.36s | Loss: 0.7503 Grad: 56332.8398 LR: 5.0110e-05
Epoch: [1][400/600]Elapsed 58.71s | Loss: 0.7329 Grad: 62849.7891 LR: 6.0565e-05
Epoch: [1][450/600]Elapsed 66.03s | Loss: 0.7137 Grad: 65987.9766 LR: 7.0611e-05
Epoch: [1][500/600]Elapsed 73.37s | Loss: 0.6992 Grad: 85965.5234 LR: 7.9770e-05
Epoch: [1][550/600]Elapsed 80.69s | Loss: 0.6800 Grad: 45446.4414 LR: 8.7605e-05
Epoch: [1][599/600]Elapsed 87.89

Valid:   0%|          | 0/150 [00:00<?, ?batch/s]

Epoch: [1][0/150]Elapsed 0.10s | Loss: 0.5313
Epoch: [1][50/150]Elapsed 4.91s | Loss: 0.4166
Epoch: [1][100/150]Elapsed 9.72s | Loss: 0.4280


----------------------------------------------------------------------------------------------------
Epoch 1 - Average Train Loss: 0.6640 | Average Valid Loss: 0.4346 | Time: 102.50s
Best model found in epoch 1 | valid loss: 0.4346


Train:   0%|          | 0/600 [00:00<?, ?batch/s]

Epoch: [2][0/600]Elapsed 0.10s | Loss: 0.4216 Grad: 107151.9141 LR: 9.3743e-05
Epoch: [2][50/600]Elapsed 7.33s | Loss: 0.4258 Grad: 40717.3516 LR: 9.7891e-05
Epoch: [2][100/600]Elapsed 14.63s | Loss: 0.4181 Grad: 30210.1250 LR: 9.9851e-05
Epoch: [2][150/600]Elapsed 21.93s | Loss: 0.4234 Grad: 39897.7578 LR: 9.9994e-05
Epoch: [2][200/600]Elapsed 29.22s | Loss: 0.4155 Grad: 68133.2422 LR: 9.9961e-05
Epoch: [2][250/600]Elapsed 36.50s | Loss: 0.4082 Grad: 42026.0898 LR: 9.9898e-05
Epoch: [2][300/600]Elapsed 43.78s | Loss: 0.4046 Grad: 36317.4531 LR: 9.9806e-05
Epoch: [2][350/600]Elapsed 51.07s | Loss: 0.4000 Grad: 72099.4766 LR: 9.9684e-05
Epoch: [2][400/600]Elapsed 58.36s | Loss: 0.3964 Grad: 54664.1055 LR: 9.9534e-05
Epoch: [2][450/600]Elapsed 65.65s | Loss: 0.3902 Grad: 42462.8047 LR: 9.9354e-05
Epoch: [2][500/600]Elapsed 72.94s | Loss: 0.3873 Grad: 37698.4727 LR: 9.9145e-05
Epoch: [2][550/600]Elapsed 80.22s | Loss: 0.3826 Grad: 32689.3359 LR: 9.8908e-05
Epoch: [2][599/600]Elapsed 87.42

Valid:   0%|          | 0/150 [00:00<?, ?batch/s]

Epoch: [2][0/150]Elapsed 0.10s | Loss: 0.3788
Epoch: [2][50/150]Elapsed 4.93s | Loss: 0.3346
Epoch: [2][100/150]Elapsed 9.75s | Loss: 0.3453


----------------------------------------------------------------------------------------------------
Epoch 2 - Average Train Loss: 0.3787 | Average Valid Loss: 0.3539 | Time: 102.07s
Best model found in epoch 2 | valid loss: 0.3539


Train:   0%|          | 0/600 [00:00<?, ?batch/s]

Epoch: [3][0/600]Elapsed 0.10s | Loss: 0.2974 Grad: 197952.1719 LR: 9.8642e-05
Epoch: [3][50/600]Elapsed 7.35s | Loss: 0.3056 Grad: 75907.4609 LR: 9.8347e-05
Epoch: [3][100/600]Elapsed 14.68s | Loss: 0.3007 Grad: 73180.0859 LR: 9.8024e-05
Epoch: [3][150/600]Elapsed 22.04s | Loss: 0.3112 Grad: 39016.2969 LR: 9.7672e-05
Epoch: [3][200/600]Elapsed 29.41s | Loss: 0.3046 Grad: 57186.2422 LR: 9.7293e-05
Epoch: [3][250/600]Elapsed 36.75s | Loss: 0.3018 Grad: 32109.8750 LR: 9.6886e-05
Epoch: [3][300/600]Elapsed 44.09s | Loss: 0.3033 Grad: 29138.8359 LR: 9.6451e-05
Epoch: [3][350/600]Elapsed 51.42s | Loss: 0.3025 Grad: 33576.0977 LR: 9.5989e-05
Epoch: [3][400/600]Elapsed 58.72s | Loss: 0.3008 Grad: 46018.4492 LR: 9.5500e-05
Epoch: [3][450/600]Elapsed 66.03s | Loss: 0.2979 Grad: 34781.6406 LR: 9.4984e-05
Epoch: [3][500/600]Elapsed 73.32s | Loss: 0.2985 Grad: 44090.0508 LR: 9.4442e-05
Epoch: [3][550/600]Elapsed 80.62s | Loss: 0.2960 Grad: 28345.2402 LR: 9.3874e-05
Epoch: [3][599/600]Elapsed 87.84

Valid:   0%|          | 0/150 [00:00<?, ?batch/s]

Epoch: [3][0/150]Elapsed 0.10s | Loss: 0.3262
Epoch: [3][50/150]Elapsed 4.93s | Loss: 0.3213
Epoch: [3][100/150]Elapsed 9.75s | Loss: 0.3269


----------------------------------------------------------------------------------------------------
Epoch 3 - Average Train Loss: 0.2954 | Average Valid Loss: 0.3362 | Time: 102.49s
Best model found in epoch 3 | valid loss: 0.3362


Train:   0%|          | 0/600 [00:00<?, ?batch/s]

Epoch: [4][0/600]Elapsed 0.10s | Loss: 0.3589 Grad: 178806.8594 LR: 9.3280e-05
Epoch: [4][50/600]Elapsed 7.35s | Loss: 0.2587 Grad: 95814.1719 LR: 9.2660e-05
Epoch: [4][100/600]Elapsed 14.67s | Loss: 0.2573 Grad: 30016.9238 LR: 9.2016e-05
Epoch: [4][150/600]Elapsed 22.00s | Loss: 0.2647 Grad: 35494.4453 LR: 9.1347e-05
Epoch: [4][200/600]Elapsed 29.32s | Loss: 0.2591 Grad: 43863.7617 LR: 9.0653e-05
Epoch: [4][250/600]Elapsed 36.62s | Loss: 0.2563 Grad: 39507.2773 LR: 8.9936e-05
Epoch: [4][300/600]Elapsed 43.92s | Loss: 0.2565 Grad: 42624.8438 LR: 8.9195e-05
Epoch: [4][350/600]Elapsed 51.21s | Loss: 0.2549 Grad: 42990.7734 LR: 8.8431e-05
Epoch: [4][400/600]Elapsed 58.51s | Loss: 0.2522 Grad: 61675.2500 LR: 8.7645e-05
Epoch: [4][450/600]Elapsed 65.78s | Loss: 0.2498 Grad: 45832.8320 LR: 8.6836e-05
Epoch: [4][500/600]Elapsed 73.05s | Loss: 0.2500 Grad: 46312.4258 LR: 8.6006e-05
Epoch: [4][550/600]Elapsed 80.30s | Loss: 0.2484 Grad: 25525.4551 LR: 8.5155e-05
Epoch: [4][599/600]Elapsed 87.47

Valid:   0%|          | 0/150 [00:00<?, ?batch/s]

Epoch: [4][0/150]Elapsed 0.10s | Loss: 0.2470
Epoch: [4][50/150]Elapsed 4.91s | Loss: 0.3171
Epoch: [4][100/150]Elapsed 9.72s | Loss: 0.3188


----------------------------------------------------------------------------------------------------
Epoch 4 - Average Train Loss: 0.2483 | Average Valid Loss: 0.3272 | Time: 102.09s
Best model found in epoch 4 | valid loss: 0.3272


Train:   0%|          | 0/600 [00:00<?, ?batch/s]

Epoch: [5][0/600]Elapsed 0.10s | Loss: 0.3079 Grad: inf LR: 8.4283e-05
Epoch: [5][50/600]Elapsed 7.36s | Loss: 0.2202 Grad: 65916.3672 LR: 8.3391e-05
Epoch: [5][100/600]Elapsed 14.67s | Loss: 0.2154 Grad: 30941.9199 LR: 8.2479e-05
Epoch: [5][150/600]Elapsed 21.99s | Loss: 0.2269 Grad: 41342.4609 LR: 8.1549e-05
Epoch: [5][200/600]Elapsed 29.30s | Loss: 0.2196 Grad: 44537.3516 LR: 8.0599e-05
Epoch: [5][250/600]Elapsed 36.62s | Loss: 0.2171 Grad: 44996.5312 LR: 7.9632e-05
Epoch: [5][300/600]Elapsed 43.94s | Loss: 0.2171 Grad: 58481.2148 LR: 7.8648e-05
Epoch: [5][350/600]Elapsed 51.25s | Loss: 0.2159 Grad: 29488.3789 LR: 7.7646e-05
Epoch: [5][400/600]Elapsed 58.57s | Loss: 0.2139 Grad: 46536.0703 LR: 7.6629e-05
Epoch: [5][450/600]Elapsed 65.88s | Loss: 0.2116 Grad: 49855.4414 LR: 7.5595e-05
Epoch: [5][500/600]Elapsed 73.18s | Loss: 0.2117 Grad: 55910.3867 LR: 7.4547e-05
Epoch: [5][550/600]Elapsed 80.47s | Loss: 0.2102 Grad: 33077.9023 LR: 7.3484e-05
Epoch: [5][599/600]Elapsed 87.67s | Loss

Valid:   0%|          | 0/150 [00:00<?, ?batch/s]

Epoch: [5][0/150]Elapsed 0.10s | Loss: 0.2219
Epoch: [5][50/150]Elapsed 4.92s | Loss: 0.3477
Epoch: [5][100/150]Elapsed 9.74s | Loss: 0.3424


----------------------------------------------------------------------------------------------------
Epoch 5 - Average Train Loss: 0.2092 | Average Valid Loss: 0.3558 | Time: 102.31s


Train:   0%|          | 0/600 [00:00<?, ?batch/s]

Epoch: [6][0/600]Elapsed 0.10s | Loss: 0.3993 Grad: nan LR: 7.2408e-05
Epoch: [6][50/600]Elapsed 7.37s | Loss: 0.1873 Grad: 126611.4844 LR: 7.1318e-05
Epoch: [6][100/600]Elapsed 14.68s | Loss: 0.1843 Grad: 60061.8125 LR: 7.0216e-05
Epoch: [6][150/600]Elapsed 22.01s | Loss: 0.1894 Grad: 103571.8828 LR: 6.9102e-05
Epoch: [6][200/600]Elapsed 29.30s | Loss: 0.1820 Grad: 51432.1172 LR: 6.7976e-05
Epoch: [6][250/600]Elapsed 36.61s | Loss: 0.1797 Grad: 26347.6465 LR: 6.6841e-05
Epoch: [6][300/600]Elapsed 43.90s | Loss: 0.1794 Grad: 51269.6680 LR: 6.5695e-05
Epoch: [6][350/600]Elapsed 51.19s | Loss: 0.1803 Grad: 30600.9316 LR: 6.4540e-05
Epoch: [6][400/600]Elapsed 58.49s | Loss: 0.1789 Grad: 47675.8477 LR: 6.3377e-05
Epoch: [6][450/600]Elapsed 65.80s | Loss: 0.1774 Grad: 79564.5391 LR: 6.2205e-05
Epoch: [6][500/600]Elapsed 73.13s | Loss: 0.1761 Grad: 45022.0430 LR: 6.1027e-05
Epoch: [6][550/600]Elapsed 80.47s | Loss: 0.1740 Grad: 31700.4824 LR: 5.9842e-05
Epoch: [6][599/600]Elapsed 87.67s | Lo

Valid:   0%|          | 0/150 [00:00<?, ?batch/s]

Epoch: [6][0/150]Elapsed 0.10s | Loss: 0.2023
Epoch: [6][50/150]Elapsed 4.93s | Loss: 0.3414
Epoch: [6][100/150]Elapsed 9.76s | Loss: 0.3411


----------------------------------------------------------------------------------------------------
Epoch 6 - Average Train Loss: 0.1736 | Average Valid Loss: 0.3524 | Time: 102.32s
CV Result (Stage=1): 0.8220928504087618 (torch) | 0.8220928501711877 (kaggle)
Elapse: 51.70 min 
Fold: 0 Second training


Train:   0%|          | 0/254 [00:00<?, ?batch/s]

Epoch: [1][0/254]Elapsed 0.10s | Loss: 0.4260 Grad: inf LR: 4.0000e-06




Epoch: [1][50/254]Elapsed 7.33s | Loss: 0.4311 Grad: 53782.8555 LR: 1.0315e-05
Epoch: [1][100/254]Elapsed 14.62s | Loss: 0.3955 Grad: 61463.7344 LR: 2.7599e-05
Epoch: [1][150/254]Elapsed 21.93s | Loss: 0.3704 Grad: 40065.9570 LR: 5.1303e-05
Epoch: [1][200/254]Elapsed 29.22s | Loss: 0.3418 Grad: 42977.9375 LR: 7.5190e-05
Epoch: [1][250/254]Elapsed 36.51s | Loss: 0.3197 Grad: 30192.0293 LR: 9.2976e-05
Epoch: [1][253/254]Elapsed 37.02s | Loss: 0.3180 Grad: 33653.7812 LR: 9.3978e-05


Valid:   0%|          | 0/64 [00:00<?, ?batch/s]

Epoch: [1][0/64]Elapsed 0.10s | Loss: 0.2577
Epoch: [1][50/64]Elapsed 4.93s | Loss: 0.2290


----------------------------------------------------------------------------------------------------
Epoch 1 - Average Train Loss: 0.3180 | Average Valid Loss: 0.2232 | Time: 43.37s
Best model found in epoch 1 | valid loss: 0.2232


Train:   0%|          | 0/254 [00:00<?, ?batch/s]

Epoch: [2][0/254]Elapsed 0.10s | Loss: 0.2132 Grad: 109356.7344 LR: 9.3978e-05
Epoch: [2][50/254]Elapsed 7.37s | Loss: 0.2171 Grad: 47822.6797 LR: 1.0000e-04
Epoch: [2][100/254]Elapsed 14.69s | Loss: 0.2043 Grad: 46837.4961 LR: 9.9914e-05
Epoch: [2][150/254]Elapsed 22.02s | Loss: 0.2041 Grad: 42336.0273 LR: 9.9665e-05
Epoch: [2][200/254]Elapsed 29.32s | Loss: 0.2018 Grad: 30491.4941 LR: 9.9253e-05
Epoch: [2][250/254]Elapsed 36.62s | Loss: 0.1985 Grad: 39904.0234 LR: 9.8679e-05
Epoch: [2][253/254]Elapsed 37.14s | Loss: 0.1982 Grad: 53006.1758 LR: 9.8626e-05


Valid:   0%|          | 0/64 [00:00<?, ?batch/s]

Epoch: [2][0/64]Elapsed 0.10s | Loss: 0.2142
Epoch: [2][50/64]Elapsed 4.95s | Loss: 0.2041


----------------------------------------------------------------------------------------------------
Epoch 2 - Average Train Loss: 0.1982 | Average Valid Loss: 0.2006 | Time: 43.50s
Best model found in epoch 2 | valid loss: 0.2006


Train:   0%|          | 0/254 [00:00<?, ?batch/s]

Epoch: [3][0/254]Elapsed 0.10s | Loss: 0.1256 Grad: 56706.9102 LR: 9.8626e-05
Epoch: [3][50/254]Elapsed 7.35s | Loss: 0.1806 Grad: 68520.1484 LR: 9.7881e-05
Epoch: [3][100/254]Elapsed 14.62s | Loss: 0.1749 Grad: 68433.0234 LR: 9.6978e-05
Epoch: [3][150/254]Elapsed 21.90s | Loss: 0.1772 Grad: 80484.8828 LR: 9.5922e-05
Epoch: [3][200/254]Elapsed 29.18s | Loss: 0.1751 Grad: 99480.6875 LR: 9.4715e-05
Epoch: [3][250/254]Elapsed 36.44s | Loss: 0.1732 Grad: 37062.1914 LR: 9.3361e-05
Epoch: [3][253/254]Elapsed 36.96s | Loss: 0.1731 Grad: 60957.0195 LR: 9.3247e-05


Valid:   0%|          | 0/64 [00:00<?, ?batch/s]

Epoch: [3][0/64]Elapsed 0.10s | Loss: 0.2147
Epoch: [3][50/64]Elapsed 4.93s | Loss: 0.1982


----------------------------------------------------------------------------------------------------
Epoch 3 - Average Train Loss: 0.1731 | Average Valid Loss: 0.1950 | Time: 43.30s
Best model found in epoch 3 | valid loss: 0.1950


Train:   0%|          | 0/254 [00:00<?, ?batch/s]

Epoch: [4][0/254]Elapsed 0.10s | Loss: 0.1168 Grad: 59330.8047 LR: 9.3247e-05
Epoch: [4][50/254]Elapsed 7.33s | Loss: 0.1628 Grad: 33653.8086 LR: 9.1740e-05
Epoch: [4][100/254]Elapsed 14.61s | Loss: 0.1564 Grad: 36603.9180 LR: 9.0096e-05
Epoch: [4][150/254]Elapsed 21.93s | Loss: 0.1573 Grad: 48003.0430 LR: 8.8322e-05
Epoch: [4][200/254]Elapsed 29.24s | Loss: 0.1564 Grad: 33171.6875 LR: 8.6421e-05
Epoch: [4][250/254]Elapsed 36.55s | Loss: 0.1546 Grad: 39802.9922 LR: 8.4402e-05
Epoch: [4][253/254]Elapsed 37.08s | Loss: 0.1545 Grad: 80632.8281 LR: 8.4235e-05


Valid:   0%|          | 0/64 [00:00<?, ?batch/s]

Epoch: [4][0/64]Elapsed 0.10s | Loss: 0.2169
Epoch: [4][50/64]Elapsed 4.93s | Loss: 0.1965


----------------------------------------------------------------------------------------------------
Epoch 4 - Average Train Loss: 0.1545 | Average Valid Loss: 0.1920 | Time: 43.43s
Best model found in epoch 4 | valid loss: 0.1920


Train:   0%|          | 0/254 [00:00<?, ?batch/s]

Epoch: [5][0/254]Elapsed 0.11s | Loss: 0.1172 Grad: 72463.8750 LR: 8.4235e-05
Epoch: [5][50/254]Elapsed 7.41s | Loss: 0.1490 Grad: 103783.6719 LR: 8.2094e-05
Epoch: [5][100/254]Elapsed 14.69s | Loss: 0.1406 Grad: 72172.0703 LR: 7.9848e-05
Epoch: [5][150/254]Elapsed 21.98s | Loss: 0.1435 Grad: 72284.4766 LR: 7.7504e-05
Epoch: [5][200/254]Elapsed 29.27s | Loss: 0.1428 Grad: 55809.1523 LR: 7.5069e-05
Epoch: [5][250/254]Elapsed 36.56s | Loss: 0.1411 Grad: 80139.0312 LR: 7.2553e-05
Epoch: [5][253/254]Elapsed 37.08s | Loss: 0.1410 Grad: 122534.1797 LR: 7.2349e-05


Valid:   0%|          | 0/64 [00:00<?, ?batch/s]

Epoch: [5][0/64]Elapsed 0.10s | Loss: 0.2267
Epoch: [5][50/64]Elapsed 4.93s | Loss: 0.1989


----------------------------------------------------------------------------------------------------
Epoch 5 - Average Train Loss: 0.1410 | Average Valid Loss: 0.1943 | Time: 43.42s


Train:   0%|          | 0/254 [00:00<?, ?batch/s]

Epoch: [6][0/254]Elapsed 0.10s | Loss: 0.1116 Grad: 93758.1562 LR: 7.2349e-05
Epoch: [6][50/254]Elapsed 7.35s | Loss: 0.1374 Grad: 89348.2109 LR: 6.9753e-05
Epoch: [6][100/254]Elapsed 14.68s | Loss: 0.1319 Grad: 70856.4297 LR: 6.7093e-05
Epoch: [6][150/254]Elapsed 22.03s | Loss: 0.1312 Grad: 81686.4766 LR: 6.4376e-05
Epoch: [6][200/254]Elapsed 29.38s | Loss: 0.1303 Grad: 72483.0625 LR: 6.1613e-05
Epoch: [6][250/254]Elapsed 36.73s | Loss: 0.1291 Grad: 71663.3594 LR: 5.8812e-05
Epoch: [6][253/254]Elapsed 37.25s | Loss: 0.1290 Grad: 97012.4922 LR: 5.8586e-05


Valid:   0%|          | 0/64 [00:00<?, ?batch/s]

Epoch: [6][0/64]Elapsed 0.10s | Loss: 0.2373
Epoch: [6][50/64]Elapsed 4.93s | Loss: 0.2046


----------------------------------------------------------------------------------------------------
Epoch 6 - Average Train Loss: 0.1290 | Average Valid Loss: 0.1995 | Time: 43.59s
Fold: 1 Second training


Train:   0%|          | 0/254 [00:00<?, ?batch/s]

Epoch: [1][0/254]Elapsed 0.11s | Loss: 0.5852 Grad: nan LR: 4.0000e-06




Epoch: [1][50/254]Elapsed 7.38s | Loss: 0.4523 Grad: 66118.3516 LR: 1.0315e-05
Epoch: [1][100/254]Elapsed 14.68s | Loss: 0.4040 Grad: 46231.1641 LR: 2.7599e-05
Epoch: [1][150/254]Elapsed 22.02s | Loss: 0.3704 Grad: 36143.0039 LR: 5.1303e-05
Epoch: [1][200/254]Elapsed 29.35s | Loss: 0.3414 Grad: 41557.5391 LR: 7.5190e-05
Epoch: [1][250/254]Elapsed 36.70s | Loss: 0.3179 Grad: 30041.0664 LR: 9.2976e-05
Epoch: [1][253/254]Elapsed 37.22s | Loss: 0.3165 Grad: 33281.6641 LR: 9.3978e-05


Valid:   0%|          | 0/64 [00:00<?, ?batch/s]

Epoch: [1][0/64]Elapsed 0.10s | Loss: 0.3404
Epoch: [1][50/64]Elapsed 4.94s | Loss: 0.2220


----------------------------------------------------------------------------------------------------
Epoch 1 - Average Train Loss: 0.3165 | Average Valid Loss: 0.2223 | Time: 43.58s
Best model found in epoch 1 | valid loss: 0.2223


Train:   0%|          | 0/254 [00:00<?, ?batch/s]

Epoch: [2][0/254]Elapsed 0.10s | Loss: 0.2341 Grad: 123563.6328 LR: 9.3978e-05
Epoch: [2][50/254]Elapsed 7.38s | Loss: 0.2112 Grad: 74523.4141 LR: 1.0000e-04
Epoch: [2][100/254]Elapsed 14.71s | Loss: 0.2015 Grad: 44682.9570 LR: 9.9914e-05
Epoch: [2][150/254]Elapsed 22.03s | Loss: 0.1995 Grad: 46132.9297 LR: 9.9665e-05
Epoch: [2][200/254]Elapsed 29.35s | Loss: 0.1952 Grad: 41957.8398 LR: 9.9253e-05
Epoch: [2][250/254]Elapsed 36.66s | Loss: 0.1921 Grad: 38059.7188 LR: 9.8679e-05
Epoch: [2][253/254]Elapsed 37.17s | Loss: 0.1918 Grad: 49950.9922 LR: 9.8626e-05


Valid:   0%|          | 0/64 [00:00<?, ?batch/s]

Epoch: [2][0/64]Elapsed 0.10s | Loss: 0.2953
Epoch: [2][50/64]Elapsed 4.94s | Loss: 0.2047


----------------------------------------------------------------------------------------------------
Epoch 2 - Average Train Loss: 0.1918 | Average Valid Loss: 0.2051 | Time: 43.53s
Best model found in epoch 2 | valid loss: 0.2051


Train:   0%|          | 0/254 [00:00<?, ?batch/s]

Epoch: [3][0/254]Elapsed 0.10s | Loss: 0.2103 Grad: 114310.0625 LR: 9.8626e-05
Epoch: [3][50/254]Elapsed 7.37s | Loss: 0.1807 Grad: 83486.0938 LR: 9.7881e-05
Epoch: [3][100/254]Elapsed 14.65s | Loss: 0.1751 Grad: 90905.8594 LR: 9.6978e-05
Epoch: [3][150/254]Elapsed 21.94s | Loss: 0.1736 Grad: 74957.3047 LR: 9.5922e-05
Epoch: [3][200/254]Elapsed 29.24s | Loss: 0.1713 Grad: 99093.4219 LR: 9.4715e-05
Epoch: [3][250/254]Elapsed 36.54s | Loss: 0.1693 Grad: 88052.4531 LR: 9.3361e-05
Epoch: [3][253/254]Elapsed 37.05s | Loss: 0.1691 Grad: 98246.1172 LR: 9.3247e-05


Valid:   0%|          | 0/64 [00:00<?, ?batch/s]

Epoch: [3][0/64]Elapsed 0.10s | Loss: 0.2867
Epoch: [3][50/64]Elapsed 4.94s | Loss: 0.2008


----------------------------------------------------------------------------------------------------
Epoch 3 - Average Train Loss: 0.1691 | Average Valid Loss: 0.2012 | Time: 43.40s
Best model found in epoch 3 | valid loss: 0.2012


Train:   0%|          | 0/254 [00:00<?, ?batch/s]

Epoch: [4][0/254]Elapsed 0.10s | Loss: 0.1738 Grad: 97686.0781 LR: 9.3247e-05
Epoch: [4][50/254]Elapsed 7.34s | Loss: 0.1594 Grad: 65340.0664 LR: 9.1740e-05
Epoch: [4][100/254]Elapsed 14.60s | Loss: 0.1554 Grad: 94977.9844 LR: 9.0096e-05
Epoch: [4][150/254]Elapsed 21.91s | Loss: 0.1535 Grad: 86301.1641 LR: 8.8322e-05
Epoch: [4][200/254]Elapsed 29.18s | Loss: 0.1518 Grad: 87319.4141 LR: 8.6421e-05
Epoch: [4][250/254]Elapsed 36.45s | Loss: 0.1501 Grad: 78591.8906 LR: 8.4402e-05
Epoch: [4][253/254]Elapsed 36.96s | Loss: 0.1499 Grad: 106400.5078 LR: 8.4235e-05


Valid:   0%|          | 0/64 [00:00<?, ?batch/s]

Epoch: [4][0/64]Elapsed 0.10s | Loss: 0.2969
Epoch: [4][50/64]Elapsed 4.94s | Loss: 0.1981


----------------------------------------------------------------------------------------------------
Epoch 4 - Average Train Loss: 0.1499 | Average Valid Loss: 0.1998 | Time: 43.31s
Best model found in epoch 4 | valid loss: 0.1998


Train:   0%|          | 0/254 [00:00<?, ?batch/s]

Epoch: [5][0/254]Elapsed 0.10s | Loss: 0.1460 Grad: 98147.9922 LR: 8.4235e-05
Epoch: [5][50/254]Elapsed 7.34s | Loss: 0.1455 Grad: 101339.1250 LR: 8.2094e-05
Epoch: [5][100/254]Elapsed 14.61s | Loss: 0.1389 Grad: 83883.4375 LR: 7.9848e-05
Epoch: [5][150/254]Elapsed 21.92s | Loss: 0.1394 Grad: 83196.0703 LR: 7.7504e-05
Epoch: [5][200/254]Elapsed 29.27s | Loss: 0.1383 Grad: 84331.9141 LR: 7.5069e-05
Epoch: [5][250/254]Elapsed 36.59s | Loss: 0.1373 Grad: 101549.3906 LR: 7.2553e-05
Epoch: [5][253/254]Elapsed 37.10s | Loss: 0.1372 Grad: 115683.5859 LR: 7.2349e-05


Valid:   0%|          | 0/64 [00:00<?, ?batch/s]

Epoch: [5][0/64]Elapsed 0.10s | Loss: 0.3033
Epoch: [5][50/64]Elapsed 4.91s | Loss: 0.2015


----------------------------------------------------------------------------------------------------
Epoch 5 - Average Train Loss: 0.1372 | Average Valid Loss: 0.2031 | Time: 43.41s


Train:   0%|          | 0/254 [00:00<?, ?batch/s]

Epoch: [6][0/254]Elapsed 0.10s | Loss: 0.1584 Grad: 102523.9531 LR: 7.2349e-05
Epoch: [6][50/254]Elapsed 7.34s | Loss: 0.1309 Grad: 110219.3594 LR: 6.9753e-05
Epoch: [6][100/254]Elapsed 14.64s | Loss: 0.1285 Grad: 92134.7891 LR: 6.7093e-05
Epoch: [6][150/254]Elapsed 21.93s | Loss: 0.1271 Grad: 76032.3438 LR: 6.4376e-05
Epoch: [6][200/254]Elapsed 29.22s | Loss: 0.1264 Grad: 86765.5234 LR: 6.1613e-05
Epoch: [6][250/254]Elapsed 36.52s | Loss: 0.1247 Grad: 63939.5664 LR: 5.8812e-05
Epoch: [6][253/254]Elapsed 37.03s | Loss: 0.1247 Grad: 120457.8438 LR: 5.8586e-05


Valid:   0%|          | 0/64 [00:00<?, ?batch/s]

Epoch: [6][0/64]Elapsed 0.10s | Loss: 0.3067
Epoch: [6][50/64]Elapsed 4.92s | Loss: 0.2028


----------------------------------------------------------------------------------------------------
Epoch 6 - Average Train Loss: 0.1247 | Average Valid Loss: 0.2046 | Time: 43.36s
Fold: 2 Second training


Train:   0%|          | 0/254 [00:00<?, ?batch/s]

Epoch: [1][0/254]Elapsed 0.10s | Loss: 0.5106 Grad: nan LR: 4.0000e-06




Epoch: [1][50/254]Elapsed 7.32s | Loss: 0.4347 Grad: 29929.9688 LR: 1.0315e-05
Epoch: [1][100/254]Elapsed 14.64s | Loss: 0.3946 Grad: 20855.5215 LR: 2.7599e-05
Epoch: [1][150/254]Elapsed 21.98s | Loss: 0.3670 Grad: 24915.6953 LR: 5.1303e-05
Epoch: [1][200/254]Elapsed 29.31s | Loss: 0.3444 Grad: 16015.0713 LR: 7.5190e-05
Epoch: [1][250/254]Elapsed 36.63s | Loss: 0.3212 Grad: 16536.2363 LR: 9.2976e-05
Epoch: [1][253/254]Elapsed 37.15s | Loss: 0.3199 Grad: 19200.6074 LR: 9.3978e-05


Valid:   0%|          | 0/64 [00:00<?, ?batch/s]

Epoch: [1][0/64]Elapsed 0.10s | Loss: 0.2008
Epoch: [1][50/64]Elapsed 4.94s | Loss: 0.2107


----------------------------------------------------------------------------------------------------
Epoch 1 - Average Train Loss: 0.3199 | Average Valid Loss: 0.2097 | Time: 43.50s
Best model found in epoch 1 | valid loss: 0.2097


Train:   0%|          | 0/254 [00:00<?, ?batch/s]

Epoch: [2][0/254]Elapsed 0.10s | Loss: 0.2756 Grad: inf LR: 9.3978e-05
Epoch: [2][50/254]Elapsed 7.36s | Loss: 0.2127 Grad: 75568.4766 LR: 1.0000e-04
Epoch: [2][100/254]Elapsed 14.72s | Loss: 0.2015 Grad: 45644.6484 LR: 9.9914e-05
Epoch: [2][150/254]Elapsed 22.06s | Loss: 0.2013 Grad: 39351.0781 LR: 9.9665e-05
Epoch: [2][200/254]Elapsed 29.40s | Loss: 0.1997 Grad: 35251.6445 LR: 9.9253e-05
Epoch: [2][250/254]Elapsed 36.74s | Loss: 0.1960 Grad: 37373.5977 LR: 9.8679e-05
Epoch: [2][253/254]Elapsed 37.26s | Loss: 0.1957 Grad: 54838.2891 LR: 9.8626e-05


Valid:   0%|          | 0/64 [00:00<?, ?batch/s]

Epoch: [2][0/64]Elapsed 0.10s | Loss: 0.1558
Epoch: [2][50/64]Elapsed 4.94s | Loss: 0.1915


----------------------------------------------------------------------------------------------------
Epoch 2 - Average Train Loss: 0.1957 | Average Valid Loss: 0.1903 | Time: 43.61s
Best model found in epoch 2 | valid loss: 0.1903


Train:   0%|          | 0/254 [00:00<?, ?batch/s]

Epoch: [3][0/254]Elapsed 0.10s | Loss: 0.2525 Grad: 125721.5469 LR: 9.8626e-05
Epoch: [3][50/254]Elapsed 7.37s | Loss: 0.1809 Grad: 49354.4805 LR: 9.7881e-05
Epoch: [3][100/254]Elapsed 14.69s | Loss: 0.1725 Grad: 41724.5664 LR: 9.6978e-05
Epoch: [3][150/254]Elapsed 22.04s | Loss: 0.1722 Grad: 32985.7227 LR: 9.5922e-05
Epoch: [3][200/254]Elapsed 29.37s | Loss: 0.1730 Grad: 35544.2109 LR: 9.4715e-05
Epoch: [3][250/254]Elapsed 36.70s | Loss: 0.1716 Grad: 36170.8477 LR: 9.3361e-05
Epoch: [3][253/254]Elapsed 37.22s | Loss: 0.1712 Grad: 59469.9180 LR: 9.3247e-05


Valid:   0%|          | 0/64 [00:00<?, ?batch/s]

Epoch: [3][0/64]Elapsed 0.10s | Loss: 0.1635
Epoch: [3][50/64]Elapsed 4.94s | Loss: 0.1889


----------------------------------------------------------------------------------------------------
Epoch 3 - Average Train Loss: 0.1712 | Average Valid Loss: 0.1875 | Time: 43.57s
Best model found in epoch 3 | valid loss: 0.1875


Train:   0%|          | 0/254 [00:00<?, ?batch/s]

Epoch: [4][0/254]Elapsed 0.10s | Loss: 0.2627 Grad: 130913.5547 LR: 9.3247e-05
Epoch: [4][50/254]Elapsed 7.36s | Loss: 0.1669 Grad: 60948.2695 LR: 9.1740e-05
Epoch: [4][100/254]Elapsed 14.67s | Loss: 0.1559 Grad: 50351.5820 LR: 9.0096e-05
Epoch: [4][150/254]Elapsed 22.00s | Loss: 0.1558 Grad: 36269.7812 LR: 8.8322e-05
Epoch: [4][200/254]Elapsed 29.33s | Loss: 0.1567 Grad: 35751.5234 LR: 8.6421e-05
Epoch: [4][250/254]Elapsed 36.66s | Loss: 0.1550 Grad: 56153.2266 LR: 8.4402e-05
Epoch: [4][253/254]Elapsed 37.18s | Loss: 0.1548 Grad: 53976.3125 LR: 8.4235e-05


Valid:   0%|          | 0/64 [00:00<?, ?batch/s]

Epoch: [4][0/64]Elapsed 0.10s | Loss: 0.1735
Epoch: [4][50/64]Elapsed 4.93s | Loss: 0.1883


----------------------------------------------------------------------------------------------------
Epoch 4 - Average Train Loss: 0.1548 | Average Valid Loss: 0.1866 | Time: 43.53s
Best model found in epoch 4 | valid loss: 0.1866


Train:   0%|          | 0/254 [00:00<?, ?batch/s]

Epoch: [5][0/254]Elapsed 0.11s | Loss: 0.1653 Grad: 83956.9297 LR: 8.4235e-05
Epoch: [5][50/254]Elapsed 7.37s | Loss: 0.1501 Grad: 89277.8828 LR: 8.2094e-05
Epoch: [5][100/254]Elapsed 14.70s | Loss: 0.1434 Grad: 84779.1719 LR: 7.9848e-05
Epoch: [5][150/254]Elapsed 22.03s | Loss: 0.1420 Grad: 82026.3438 LR: 7.7504e-05
Epoch: [5][200/254]Elapsed 29.36s | Loss: 0.1423 Grad: 55826.6055 LR: 7.5069e-05
Epoch: [5][250/254]Elapsed 36.69s | Loss: 0.1421 Grad: 83541.8125 LR: 7.2553e-05
Epoch: [5][253/254]Elapsed 37.21s | Loss: 0.1418 Grad: 117815.7266 LR: 7.2349e-05


Valid:   0%|          | 0/64 [00:00<?, ?batch/s]

Epoch: [5][0/64]Elapsed 0.10s | Loss: 0.1782
Epoch: [5][50/64]Elapsed 4.95s | Loss: 0.1915


----------------------------------------------------------------------------------------------------
Epoch 5 - Average Train Loss: 0.1418 | Average Valid Loss: 0.1889 | Time: 43.57s


Train:   0%|          | 0/254 [00:00<?, ?batch/s]

Epoch: [6][0/254]Elapsed 0.10s | Loss: 0.1913 Grad: 99886.6953 LR: 7.2349e-05
Epoch: [6][50/254]Elapsed 7.36s | Loss: 0.1388 Grad: 85468.6641 LR: 6.9753e-05
Epoch: [6][100/254]Elapsed 14.68s | Loss: 0.1314 Grad: 101586.6094 LR: 6.7093e-05
Epoch: [6][150/254]Elapsed 22.01s | Loss: 0.1311 Grad: 78105.8594 LR: 6.4376e-05
Epoch: [6][200/254]Elapsed 29.35s | Loss: 0.1321 Grad: 32168.9258 LR: 6.1613e-05
Epoch: [6][250/254]Elapsed 36.68s | Loss: 0.1305 Grad: 35648.2891 LR: 5.8812e-05
Epoch: [6][253/254]Elapsed 37.20s | Loss: 0.1302 Grad: 48388.8984 LR: 5.8586e-05


Valid:   0%|          | 0/64 [00:00<?, ?batch/s]

Epoch: [6][0/64]Elapsed 0.10s | Loss: 0.1735
Epoch: [6][50/64]Elapsed 4.93s | Loss: 0.1916


----------------------------------------------------------------------------------------------------
Epoch 6 - Average Train Loss: 0.1302 | Average Valid Loss: 0.1888 | Time: 43.55s
Fold: 3 Second training


Train:   0%|          | 0/254 [00:00<?, ?batch/s]

Epoch: [1][0/254]Elapsed 0.10s | Loss: 0.5112 Grad: nan LR: 4.0000e-06




Epoch: [1][50/254]Elapsed 7.34s | Loss: 0.4241 Grad: 58816.8125 LR: 1.0315e-05
Epoch: [1][100/254]Elapsed 14.67s | Loss: 0.3786 Grad: 66286.3125 LR: 2.7599e-05
Epoch: [1][150/254]Elapsed 22.01s | Loss: 0.3563 Grad: 41847.3555 LR: 5.1303e-05
Epoch: [1][200/254]Elapsed 29.34s | Loss: 0.3298 Grad: 32710.6445 LR: 7.5190e-05
Epoch: [1][250/254]Elapsed 36.67s | Loss: 0.3072 Grad: 26346.9219 LR: 9.2976e-05
Epoch: [1][253/254]Elapsed 37.19s | Loss: 0.3058 Grad: 37752.0078 LR: 9.3978e-05


Valid:   0%|          | 0/64 [00:00<?, ?batch/s]

Epoch: [1][0/64]Elapsed 0.10s | Loss: 0.2134
Epoch: [1][50/64]Elapsed 4.92s | Loss: 0.2062


----------------------------------------------------------------------------------------------------
Epoch 1 - Average Train Loss: 0.3058 | Average Valid Loss: 0.2095 | Time: 43.51s
Best model found in epoch 1 | valid loss: 0.2095


Train:   0%|          | 0/254 [00:00<?, ?batch/s]

Epoch: [2][0/254]Elapsed 0.11s | Loss: 0.2995 Grad: 118751.7812 LR: 9.3978e-05
Epoch: [2][50/254]Elapsed 7.36s | Loss: 0.2222 Grad: 40063.3047 LR: 1.0000e-04
Epoch: [2][100/254]Elapsed 14.70s | Loss: 0.2084 Grad: 59376.6055 LR: 9.9914e-05
Epoch: [2][150/254]Elapsed 22.02s | Loss: 0.2049 Grad: 61835.6211 LR: 9.9665e-05
Epoch: [2][200/254]Elapsed 29.37s | Loss: 0.2000 Grad: 27845.7734 LR: 9.9253e-05
Epoch: [2][250/254]Elapsed 36.70s | Loss: 0.1957 Grad: 39465.1094 LR: 9.8679e-05
Epoch: [2][253/254]Elapsed 37.22s | Loss: 0.1952 Grad: 72218.8594 LR: 9.8626e-05


Valid:   0%|          | 0/64 [00:00<?, ?batch/s]

Epoch: [2][0/64]Elapsed 0.10s | Loss: 0.1937
Epoch: [2][50/64]Elapsed 4.92s | Loss: 0.1892


----------------------------------------------------------------------------------------------------
Epoch 2 - Average Train Loss: 0.1952 | Average Valid Loss: 0.1929 | Time: 43.56s
Best model found in epoch 2 | valid loss: 0.1929


Train:   0%|          | 0/254 [00:00<?, ?batch/s]

Epoch: [3][0/254]Elapsed 0.10s | Loss: 0.2399 Grad: 104227.5000 LR: 9.8626e-05
Epoch: [3][50/254]Elapsed 7.36s | Loss: 0.1902 Grad: 43651.5000 LR: 9.7881e-05
Epoch: [3][100/254]Elapsed 14.66s | Loss: 0.1805 Grad: 55913.3086 LR: 9.6978e-05
Epoch: [3][150/254]Elapsed 21.98s | Loss: 0.1786 Grad: 43512.3828 LR: 9.5922e-05
Epoch: [3][200/254]Elapsed 29.31s | Loss: 0.1765 Grad: 34755.1992 LR: 9.4715e-05
Epoch: [3][250/254]Elapsed 36.61s | Loss: 0.1725 Grad: 32931.9453 LR: 9.3361e-05
Epoch: [3][253/254]Elapsed 37.13s | Loss: 0.1721 Grad: 47866.7656 LR: 9.3247e-05


Valid:   0%|          | 0/64 [00:00<?, ?batch/s]

Epoch: [3][0/64]Elapsed 0.10s | Loss: 0.1997
Epoch: [3][50/64]Elapsed 4.91s | Loss: 0.1944


----------------------------------------------------------------------------------------------------
Epoch 3 - Average Train Loss: 0.1721 | Average Valid Loss: 0.1984 | Time: 43.44s


Train:   0%|          | 0/254 [00:00<?, ?batch/s]

Epoch: [4][0/254]Elapsed 0.10s | Loss: 0.2134 Grad: 112752.1172 LR: 9.3247e-05
Epoch: [4][50/254]Elapsed 7.33s | Loss: 0.1739 Grad: 65576.3438 LR: 9.1740e-05
Epoch: [4][100/254]Elapsed 14.64s | Loss: 0.1630 Grad: 112715.4062 LR: 9.0096e-05
Epoch: [4][150/254]Elapsed 21.96s | Loss: 0.1618 Grad: 96007.9766 LR: 8.8322e-05
Epoch: [4][200/254]Elapsed 29.27s | Loss: 0.1591 Grad: 59188.8438 LR: 8.6421e-05
Epoch: [4][250/254]Elapsed 36.61s | Loss: 0.1560 Grad: 79525.5938 LR: 8.4402e-05
Epoch: [4][253/254]Elapsed 37.13s | Loss: 0.1557 Grad: 86304.1562 LR: 8.4235e-05


Valid:   0%|          | 0/64 [00:00<?, ?batch/s]

Epoch: [4][0/64]Elapsed 0.10s | Loss: 0.1928
Epoch: [4][50/64]Elapsed 4.91s | Loss: 0.1934


----------------------------------------------------------------------------------------------------
Epoch 4 - Average Train Loss: 0.1557 | Average Valid Loss: 0.1971 | Time: 43.45s


Train:   0%|          | 0/254 [00:00<?, ?batch/s]

Epoch: [5][0/254]Elapsed 0.10s | Loss: 0.1748 Grad: 119989.6328 LR: 8.4235e-05
Epoch: [5][50/254]Elapsed 7.37s | Loss: 0.1583 Grad: 60586.6875 LR: 8.2094e-05
Epoch: [5][100/254]Elapsed 14.72s | Loss: 0.1476 Grad: 88457.0078 LR: 7.9848e-05
Epoch: [5][150/254]Elapsed 22.04s | Loss: 0.1468 Grad: 39765.1914 LR: 7.7504e-05
Epoch: [5][200/254]Elapsed 29.34s | Loss: 0.1445 Grad: 24737.5645 LR: 7.5069e-05
Epoch: [5][250/254]Elapsed 36.63s | Loss: 0.1413 Grad: 35256.5859 LR: 7.2553e-05
Epoch: [5][253/254]Elapsed 37.15s | Loss: 0.1411 Grad: 61890.3047 LR: 7.2349e-05


Valid:   0%|          | 0/64 [00:00<?, ?batch/s]

Epoch: [5][0/64]Elapsed 0.11s | Loss: 0.1975
Epoch: [5][50/64]Elapsed 4.96s | Loss: 0.1917


----------------------------------------------------------------------------------------------------
Epoch 5 - Average Train Loss: 0.1411 | Average Valid Loss: 0.1954 | Time: 43.52s


Train:   0%|          | 0/254 [00:00<?, ?batch/s]

Epoch: [6][0/254]Elapsed 0.10s | Loss: 0.2124 Grad: 132984.3750 LR: 7.2349e-05
Epoch: [6][50/254]Elapsed 7.36s | Loss: 0.1428 Grad: 36367.6953 LR: 6.9753e-05
Epoch: [6][100/254]Elapsed 14.64s | Loss: 0.1339 Grad: 46531.6328 LR: 6.7093e-05
Epoch: [6][150/254]Elapsed 21.96s | Loss: 0.1336 Grad: 33856.1172 LR: 6.4376e-05
Epoch: [6][200/254]Elapsed 29.30s | Loss: 0.1315 Grad: 25729.3027 LR: 6.1613e-05
Epoch: [6][250/254]Elapsed 36.63s | Loss: 0.1290 Grad: 32318.6738 LR: 5.8812e-05
Epoch: [6][253/254]Elapsed 37.15s | Loss: 0.1287 Grad: 53953.7734 LR: 5.8586e-05


Valid:   0%|          | 0/64 [00:00<?, ?batch/s]

Epoch: [6][0/64]Elapsed 0.10s | Loss: 0.1935
Epoch: [6][50/64]Elapsed 4.92s | Loss: 0.1910


----------------------------------------------------------------------------------------------------
Epoch 6 - Average Train Loss: 0.1287 | Average Valid Loss: 0.1957 | Time: 43.48s
Fold: 4 Second training


Train:   0%|          | 0/254 [00:00<?, ?batch/s]

Epoch: [1][0/254]Elapsed 0.11s | Loss: 0.5538 Grad: nan LR: 4.0000e-06




Epoch: [1][50/254]Elapsed 7.34s | Loss: 0.4372 Grad: 45230.6641 LR: 1.0315e-05
Epoch: [1][100/254]Elapsed 14.65s | Loss: 0.3958 Grad: 55778.6562 LR: 2.7599e-05
Epoch: [1][150/254]Elapsed 22.01s | Loss: 0.3678 Grad: 60513.8906 LR: 5.1303e-05
Epoch: [1][200/254]Elapsed 29.36s | Loss: 0.3401 Grad: 38824.5820 LR: 7.5190e-05
Epoch: [1][250/254]Elapsed 36.69s | Loss: 0.3170 Grad: 33280.3398 LR: 9.2976e-05
Epoch: [1][253/254]Elapsed 37.21s | Loss: 0.3158 Grad: 50638.8789 LR: 9.3978e-05


Valid:   0%|          | 0/64 [00:00<?, ?batch/s]

Epoch: [1][0/64]Elapsed 0.11s | Loss: 0.3047
Epoch: [1][50/64]Elapsed 4.94s | Loss: 0.2438


----------------------------------------------------------------------------------------------------
Epoch 1 - Average Train Loss: 0.3158 | Average Valid Loss: 0.2469 | Time: 43.57s
Best model found in epoch 1 | valid loss: 0.2469


Train:   0%|          | 0/254 [00:00<?, ?batch/s]

Epoch: [2][0/254]Elapsed 0.10s | Loss: 0.2793 Grad: 136404.1250 LR: 9.3978e-05
Epoch: [2][50/254]Elapsed 7.37s | Loss: 0.2148 Grad: 39663.6953 LR: 1.0000e-04
Epoch: [2][100/254]Elapsed 14.69s | Loss: 0.2050 Grad: 42862.3828 LR: 9.9914e-05
Epoch: [2][150/254]Elapsed 22.02s | Loss: 0.2027 Grad: 57446.2617 LR: 9.9665e-05
Epoch: [2][200/254]Elapsed 29.37s | Loss: 0.1998 Grad: 29325.2031 LR: 9.9253e-05
Epoch: [2][250/254]Elapsed 36.68s | Loss: 0.1944 Grad: 41506.7422 LR: 9.8679e-05
Epoch: [2][253/254]Elapsed 37.20s | Loss: 0.1944 Grad: 81042.2188 LR: 9.8626e-05


Valid:   0%|          | 0/64 [00:00<?, ?batch/s]

Epoch: [2][0/64]Elapsed 0.10s | Loss: 0.2580
Epoch: [2][50/64]Elapsed 4.93s | Loss: 0.2159


----------------------------------------------------------------------------------------------------
Epoch 2 - Average Train Loss: 0.1944 | Average Valid Loss: 0.2168 | Time: 43.54s
Best model found in epoch 2 | valid loss: 0.2168


Train:   0%|          | 0/254 [00:00<?, ?batch/s]

Epoch: [3][0/254]Elapsed 0.10s | Loss: 0.2638 Grad: inf LR: 9.8626e-05
Epoch: [3][50/254]Elapsed 7.37s | Loss: 0.1826 Grad: 42227.5977 LR: 9.7881e-05
Epoch: [3][100/254]Elapsed 14.72s | Loss: 0.1751 Grad: 41308.7188 LR: 9.6978e-05
Epoch: [3][150/254]Elapsed 22.09s | Loss: 0.1758 Grad: 46724.4766 LR: 9.5922e-05
Epoch: [3][200/254]Elapsed 29.47s | Loss: 0.1750 Grad: 27721.8398 LR: 9.4715e-05
Epoch: [3][250/254]Elapsed 36.80s | Loss: 0.1710 Grad: 42542.1797 LR: 9.3361e-05
Epoch: [3][253/254]Elapsed 37.32s | Loss: 0.1712 Grad: 71443.6641 LR: 9.3247e-05


Valid:   0%|          | 0/64 [00:00<?, ?batch/s]

Epoch: [3][0/64]Elapsed 0.11s | Loss: 0.2346
Epoch: [3][50/64]Elapsed 4.95s | Loss: 0.2159


----------------------------------------------------------------------------------------------------
Epoch 3 - Average Train Loss: 0.1712 | Average Valid Loss: 0.2150 | Time: 43.69s
Best model found in epoch 3 | valid loss: 0.2150


Train:   0%|          | 0/254 [00:00<?, ?batch/s]

Epoch: [4][0/254]Elapsed 0.10s | Loss: 0.2635 Grad: 120073.6406 LR: 9.3247e-05
Epoch: [4][50/254]Elapsed 7.35s | Loss: 0.1669 Grad: 23849.1738 LR: 9.1740e-05
Epoch: [4][100/254]Elapsed 14.66s | Loss: 0.1581 Grad: 44651.0586 LR: 9.0096e-05
Epoch: [4][150/254]Elapsed 21.98s | Loss: 0.1577 Grad: 49922.8984 LR: 8.8322e-05
Epoch: [4][200/254]Elapsed 29.28s | Loss: 0.1573 Grad: 28206.8613 LR: 8.6421e-05
Epoch: [4][250/254]Elapsed 36.59s | Loss: 0.1530 Grad: 50388.7969 LR: 8.4402e-05
Epoch: [4][253/254]Elapsed 37.11s | Loss: 0.1529 Grad: 67646.7734 LR: 8.4235e-05


Valid:   0%|          | 0/64 [00:00<?, ?batch/s]

Epoch: [4][0/64]Elapsed 0.10s | Loss: 0.2266
Epoch: [4][50/64]Elapsed 4.93s | Loss: 0.2159


----------------------------------------------------------------------------------------------------
Epoch 4 - Average Train Loss: 0.1529 | Average Valid Loss: 0.2151 | Time: 43.45s


Train:   0%|          | 0/254 [00:00<?, ?batch/s]

Epoch: [5][0/254]Elapsed 0.10s | Loss: 0.2206 Grad: 131372.7812 LR: 8.4235e-05
Epoch: [5][50/254]Elapsed 7.33s | Loss: 0.1442 Grad: 27570.7188 LR: 8.2094e-05
Epoch: [5][100/254]Elapsed 14.61s | Loss: 0.1404 Grad: 30644.4609 LR: 7.9848e-05
Epoch: [5][150/254]Elapsed 21.92s | Loss: 0.1420 Grad: 41539.5000 LR: 7.7504e-05
Epoch: [5][200/254]Elapsed 29.25s | Loss: 0.1415 Grad: 35526.4102 LR: 7.5069e-05
Epoch: [5][250/254]Elapsed 36.58s | Loss: 0.1385 Grad: 38238.4453 LR: 7.2553e-05
Epoch: [5][253/254]Elapsed 37.10s | Loss: 0.1385 Grad: 74740.6406 LR: 7.2349e-05


Valid:   0%|          | 0/64 [00:00<?, ?batch/s]

Epoch: [5][0/64]Elapsed 0.11s | Loss: 0.2294
Epoch: [5][50/64]Elapsed 4.95s | Loss: 0.2194


----------------------------------------------------------------------------------------------------
Epoch 5 - Average Train Loss: 0.1385 | Average Valid Loss: 0.2185 | Time: 43.46s


Train:   0%|          | 0/254 [00:00<?, ?batch/s]

Epoch: [6][0/254]Elapsed 0.10s | Loss: 0.2169 Grad: 181035.7656 LR: 7.2349e-05
Epoch: [6][50/254]Elapsed 7.36s | Loss: 0.1354 Grad: 23853.5156 LR: 6.9753e-05
Epoch: [6][100/254]Elapsed 14.67s | Loss: 0.1311 Grad: 31862.6777 LR: 6.7093e-05
Epoch: [6][150/254]Elapsed 22.00s | Loss: 0.1309 Grad: 54163.2031 LR: 6.4376e-05
Epoch: [6][200/254]Elapsed 29.35s | Loss: 0.1300 Grad: 24321.5469 LR: 6.1613e-05
Epoch: [6][250/254]Elapsed 36.66s | Loss: 0.1273 Grad: 41695.8906 LR: 5.8812e-05
Epoch: [6][253/254]Elapsed 37.19s | Loss: 0.1273 Grad: 78853.5000 LR: 5.8586e-05


Valid:   0%|          | 0/64 [00:00<?, ?batch/s]

Epoch: [6][0/64]Elapsed 0.11s | Loss: 0.2278
Epoch: [6][50/64]Elapsed 4.94s | Loss: 0.2193


----------------------------------------------------------------------------------------------------
Epoch 6 - Average Train Loss: 0.1273 | Average Valid Loss: 0.2184 | Time: 43.55s
CV Result (Stage=2): 0.402791597945404 (torch) | 0.4027915972923205 (kaggle)
Elapse: 73.49 min 


In [None]:
# Smoke test: pull one sample from the dataset and push it through a freshly
# built model to confirm that input and output shapes line up.
dataset = CustomDataset(train_easy, TARGETS, ModelConfig, all_specs, all_eegs, mode='test')

X, y = dataset[0]
print(X.shape, y.shape)

model = CustomModel(ModelConfig, num_classes=6, pretrained=True)

# This is only a shape check, so run it as inference: eval() turns off the
# stochastic layers (dropout / drop-path) and stops normalisation statistics
# from being updated by a batch of one, and no_grad() skips building an
# autograd graph that would never be used.
model.eval()
with torch.no_grad():
    y_pred = model(X.unsqueeze(0))  # add the batch dimension the model expects

print(y_pred.shape)

In [None]:
from kl_divergence import score as kl_score


def calc_kl_div(p, q, criterion):
    """Return the loss of a single prediction/target pair as a Python float.

    Args:
        p: array of raw prediction scores; log-softmax is applied to it here.
        q: array with the target distribution, handed to `criterion` as-is.
        criterion: callable taking (log_probabilities, target) and returning
            a scalar tensor, e.g. `nn.KLDivLoss(reduction="batchmean")`.

    Returns:
        The scalar produced by `criterion`, extracted with `.item()`.
    """
    # Both inputs become float32 tensors with a leading batch axis of size 1.
    scores, target = (
        torch.tensor(values.astype(np.float32))[None]
        for values in (p, q)
    )
    log_probs = F.log_softmax(scores, dim=1)
    return criterion(log_probs, target).item()

def calc_kaggle_score(solution, submission):
    """Score one row with the Kaggle KL-divergence metric (`kl_score`).

    Args:
        solution: pandas Series holding 'eeg_id' and the TARGETS entries.
        submission: pandas Series holding 'eeg_id' followed by the predicted
            values; its labels are replaced by ['eeg_id'] + TARGETS so both
            frames share column names.

    Returns:
        Whatever `kl_score` returns for the two single-row frames.
    """
    # A Series becomes a one-row DataFrame via to_frame().T; the target
    # columns are then cast to float32 before scoring.
    truth_frame = solution.to_frame().T
    truth_frame[TARGETS] = truth_frame[TARGETS].astype(np.float32)

    pred_frame = submission.to_frame().T
    pred_frame.columns = ['eeg_id'] + TARGETS
    pred_frame[TARGETS] = pred_frame[TARGETS].astype(np.float32)

    return kl_score(truth_frame, pred_frame, 'eeg_id')

In [None]:
def evaluate_oof(oof_csv_path):
    """Score an out-of-fold prediction CSV with the torch and Kaggle KL metrics.

    The prediction columns (TARGETS_PRED) are treated as raw logits:
    log-softmax is applied for the torch KL loss, and softmax for the Kaggle
    metric, which is fed probabilities.

    Args:
        oof_csv_path: path of a CSV that contains the 'eeg_id', TARGETS and
            TARGETS_PRED columns.

    Returns:
        A tuple `(oof_df, kl_loss_all, kaggle_score_all)`:
        - oof_df: the loaded frame with added per-row 'kl_loss' and
          'kaggle_score' columns; its TARGETS_PRED columns are replaced by
          the softmax probabilities of the stored logits.
        - kl_loss_all: batch-mean KL loss over all rows (0-dim torch tensor).
        - kaggle_score_all: result of `kl_score` over all rows.
    """
    oof_df = pd.read_csv(oof_csv_path)
    softmax = nn.Softmax(dim=1)
    criterion = nn.KLDivLoss(reduction="batchmean")

    # Per-row torch KL loss, computed while TARGETS_PRED still holds logits.
    oof_df["kl_loss"] = oof_df.apply(lambda row: 
        calc_kl_div(row[TARGETS_PRED].values, row[TARGETS].values, criterion), axis=1
        )

    kl_loss_all = criterion(
        F.log_softmax(torch.tensor(oof_df[TARGETS_PRED].values.astype(np.float32)), dim=1),
        torch.tensor(oof_df[TARGETS].values.astype(np.float32)),
        )

    print(f"KL Loss All: {kl_loss_all}")
    print(f"KL Loss Mean: {oof_df['kl_loss'].mean()}")

    # Convert the prediction logits to probabilities for the Kaggle metric.
    # This must read TARGETS_PRED: reading TARGETS here would overwrite the
    # predictions with softmax(ground truth) and make every later score and
    # plot compare the labels against themselves.
    pred_logits = oof_df[TARGETS_PRED].values.astype(np.float32)
    oof_df[TARGETS_PRED] = softmax(torch.tensor(pred_logits)).numpy()

    solution = oof_df[['eeg_id'] + TARGETS].copy()
    submission = oof_df[['eeg_id'] + TARGETS_PRED].copy()
    submission.columns = ['eeg_id'] + TARGETS

    kaggle_score_all = kl_score(solution, submission, 'eeg_id')
    
    oof_df['kaggle_score'] = oof_df.apply(lambda row:
        calc_kaggle_score(row[['eeg_id'] + TARGETS], row[['eeg_id'] + TARGETS_PRED]), axis=1
        )

    print(f"Kaggle Score All: {kaggle_score_all}")
    print(f"Kaggle Score Mean: {oof_df['kaggle_score'].mean()}")

    return oof_df, kl_loss_all, kaggle_score_all


In [None]:
# Evaluate the two saved out-of-fold prediction files.
# Note: kl_loss_all / kaggle_score_all are reused for both calls, so after
# this cell they hold the values of the second file only (both sets of
# scores are still printed by evaluate_oof).
oof_1_path = f"{JobConfig.OUTPUT_DIR}/oof_1.csv"
oof_2_path = f"{JobConfig.OUTPUT_DIR}/oof_2.csv"

oof_1, kl_loss_all, kaggle_score_all = evaluate_oof(oof_1_path)
oof_2, kl_loss_all, kaggle_score_all = evaluate_oof(oof_2_path)

In [None]:
# Grid of randomly chosen rows from oof_1: true vs. predicted distribution.
fig, axes = plt.subplots(4, 4, figsize=(10, 10), sharex=True, sharey=True)
panels = axes.ravel()

# Pin the sampling seed so that re-running this cell shows the same rows;
# otherwise the row indices read off the figure (and inspected in later
# cells) would no longer match what is displayed.
rows = oof_1.sample(len(panels), random_state=JobConfig.SEED)

for i, (idx, row) in enumerate(rows.iterrows()):

    ax = panels[i]
    ax.plot(row[TARGETS].values, label='True')
    ax.plot(row[TARGETS_PRED].values, label='Pred')
    ax.set_title(f"{idx} | {row['target']} | KL: {row['kl_loss']:.4f}")
    ax.set_xticks(range(6))
    ax.set_xticklabels(BRAIN_ACTIVITY)
    ax.grid(True)
    ax.legend()

fig.tight_layout()
plt.show()

In [None]:
# Inspect one row of oof_1 (index label 2619) in detail.
row = oof_1.loc[2619]

# Range of the predicted values (computed but not printed in this cell).
pred_values = row[TARGETS_PRED]
min_pred, max_pred = pred_values.min(), pred_values.max()

# Min-max scale the true targets of this row to the [0, 1] range.
true_values = row[TARGETS]
true_min = true_values.min()
targets_norm = (true_values - true_min) / (true_values.max() - true_min)

print(targets_norm)

In [None]:
# Grid of consecutive rows from oof_2: true distribution, prediction, and the
# min-max normalised true distribution.
fig, axes = plt.subplots(4, 4, figsize=(10, 10), sharex=True, sharey=True)
panels = axes.ravel()

# One row per panel, starting at positional offset 5.
rows = oof_2.iloc[5:5+len(panels), :]

for i, (idx, row) in enumerate(rows.iterrows()):

    ax = panels[i]
    # A row of a mixed-type frame is object-dtype; cast to float so the
    # arithmetic below is ordinary numeric numpy.
    y_true = row[TARGETS].values.astype(np.float64)
    y_pred = row[TARGETS_PRED].values.astype(np.float64)

    # Min-max normalisation divides by (max - min), which is zero when all
    # targets of a row are equal; fall back to a flat zero curve in that case
    # instead of dividing by zero.
    y_min = y_true.min()
    y_span = y_true.max() - y_min
    if y_span > 0:
        y_norm = (y_true - y_min) / y_span
    else:
        y_norm = np.zeros_like(y_true)

    ax.plot(y_true, label='True')
    ax.plot(y_pred, label='Pred')
    ax.plot(y_norm, "b:", label='True Norm')

    ax.set_title(f"{idx} | {row['target']} | KL: {row['kl_loss']:.4f}")
    ax.set_xticks(range(6))
    ax.set_xticklabels(BRAIN_ACTIVITY)
    ax.grid(True)
    ax.legend()

fig.tight_layout()
plt.show()

In [None]:
# Inspect one row of oof_2 (index label 6) in detail.
row = oof_2.loc[6]

# Range of the predicted values, followed by the values themselves.
pred_values = row[TARGETS_PRED]
min_pred, max_pred = pred_values.min(), pred_values.max()
print(min_pred, max_pred)

print(pred_values)

# Min-max scale the true targets to [0, 1], then rescale them so that they
# sum to 1 again.
true_values = row[TARGETS]
true_min = true_values.min()
targets_norm = (true_values - true_min) / (true_values.max() - true_min)
targets_norm = targets_norm / targets_norm.sum()

print(targets_norm)