In [None]:
!pip download transformers==4.41.0
!pip download tokenizers==0.11.0
!pip download torch==2.4.0

In [2]:
import tokenizers
import transformers
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")

tokenizers.__version__: 0.21.1
transformers.__version__: 4.51.3


In [3]:
# ====================================================
# Directory settings
# ====================================================
import os

OUTPUT_DIR = './'
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

os.environ["WANDB_DISABLED"] = "true"


In [None]:
# ====================================================
# CFG
# ====================================================
class CFG:
    competition='NBME'
    debug=False
    apex=True
    print_freq=100
    num_workers=4
    model="/kaggle/input/nbme-tapt/taptdb" ###
    scheduler='cosine' # 
    batch_scheduler=True
    num_cycles=0.5
    num_warmup_steps=0
    epochs=5
    encoder_lr=2e-5
    decoder_lr=2e-5
    min_lr=1e-6
    eps=1e-6
    betas=(0.9, 0.999)
    batch_size=12
    fc_dropout=0.2
    max_len=325
    weight_decay=0.01
    gradient_accumulation_steps=1
    max_grad_norm=1000
    seed=42
    n_fold=5
    trn_fold=[0, 1, 2, 3, 4]
    train=True
    
if CFG.debug:
    CFG.epochs = 2
    CFG.trn_fold = [0]

In [5]:
# ====================================================
# Library
# ====================================================
import os
import gc
import re
import ast
import sys
import copy
import json
import time
import math
import string
import pickle
import random
import joblib
import itertools
import warnings
warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset

#os.system('pip uninstall -y transformers')
#s.system('python -m pip install --no-index --find-links=../input/nbme-pip-wheels transformers')
import tokenizers
import transformers
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
from transformers import AutoTokenizer, AutoModel, AutoConfig, AutoModelForMaskedLM, DataCollatorForLanguageModeling
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
%env TOKENIZERS_PARALLELISM=true

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizers.__version__: 0.21.1
transformers.__version__: 4.51.3


2025-06-01 12:50:57.070519: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748782257.290039      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748782257.351679      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


env: TOKENIZERS_PARALLELISM=true


In [6]:
# From https://www.kaggle.com/theoviel/evaluation-metric-folds-baseline

def micro_f1(preds, truths):
    """
    Micro f1 on binary arrays.

    Args:
        preds (list of lists of ints): Predictions.
        truths (list of lists of ints): Ground truths.

    Returns:
        float: f1 score.
    """
    # Micro : aggregating over all instances
    preds = np.concatenate(preds)
    truths = np.concatenate(truths)
    return f1_score(truths, preds)


def spans_to_binary(spans, length=None):
    """
    Converts spans to a binary array indicating whether each character is in the span.

    Args:
        spans (list of lists of two ints): Spans.

    Returns:
        np array [length]: Binarized spans.
    """
    length = np.max(spans) if length is None else length
    binary = np.zeros(length)
    for start, end in spans:
        binary[start:end] = 1
    return binary


def span_micro_f1(preds, truths):
    """
    Micro f1 on spans.

    Args:
        preds (list of lists of two ints): Prediction spans.
        truths (list of lists of two ints): Ground truth spans.

    Returns:
        float: f1 score.
    """
    bin_preds = []
    bin_truths = []
    for pred, truth in zip(preds, truths):
        if not len(pred) and not len(truth):
            continue
        length = max(np.max(pred) if len(pred) else 0, np.max(truth) if len(truth) else 0)
        bin_preds.append(spans_to_binary(pred, length))
        bin_truths.append(spans_to_binary(truth, length))
    return micro_f1(bin_preds, bin_truths)

In [7]:
def create_labels_for_scoring(df):
    # example: ['0 1', '3 4'] -> ['0 1; 3 4']
    df['location_for_create_labels'] = [ast.literal_eval(f'[]')] * len(df)
    for i in range(len(df)):
        lst = df.loc[i, 'location']
        if lst:
            new_lst = ';'.join(lst)
            df.loc[i, 'location_for_create_labels'] = ast.literal_eval(f'[["{new_lst}"]]')
    # create labels
    truths = []
    for location_list in df['location_for_create_labels'].values:
        truth = []
        if len(location_list) > 0:
            location = location_list[0]
            for loc in [s.split() for s in location.split(';')]:
                start, end = int(loc[0]), int(loc[1])
                truth.append([start, end])
        truths.append(truth)
    return truths


def get_char_probs(texts, predictions, tokenizer):
    results = [np.zeros(len(t)) for t in texts]
    for i, (text, prediction) in enumerate(zip(texts, predictions)):
        encoded = tokenizer(text, 
                            add_special_tokens=True,
                            return_offsets_mapping=True)
        for idx, (offset_mapping, pred) in enumerate(zip(encoded['offset_mapping'], prediction)):
            start = offset_mapping[0]
            end = offset_mapping[1]
            results[i][start:end] = pred
    return results


def get_results(char_probs, th=0.5):
    results = []
    for char_prob in char_probs:
        result = np.where(char_prob >= th)[0] + 1
        result = [list(g) for _, g in itertools.groupby(result, key=lambda n, c=itertools.count(): n - next(c))]
        result = [f"{min(r)} {max(r)}" for r in result]
        result = ";".join(result)
        results.append(result)
    return results


def get_predictions(results):
    predictions = []
    for result in results:
        prediction = []
        if result != "":
            for loc in [s.split() for s in result.split(';')]:
                start, end = int(loc[0]), int(loc[1])
                prediction.append([start, end])
        predictions.append(prediction)
    return predictions

In [8]:
# ====================================================
# Utils
# ====================================================
def get_score(y_true, y_pred):
    score = span_micro_f1(y_true, y_pred)
    return score


def get_logger(filename=OUTPUT_DIR+'train'):
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()

def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(seed=42)

In [9]:
# ====================================================
# Data Loading
# ====================================================
train = pd.read_csv('../input/nbme-score-clinical-patient-notes/train.csv')
train['annotation'] = train['annotation'].apply(ast.literal_eval)
train['location'] = train['location'].apply(ast.literal_eval)
features = pd.read_csv('../input/nbme-score-clinical-patient-notes/features.csv')
def preprocess_features(features):
    features.loc[27, 'feature_text'] = "Last-Pap-smear-1-year-ago"
    return features
features = preprocess_features(features)
patient_notes = pd.read_csv('../input/nbme-score-clinical-patient-notes/patient_notes.csv')

print(f"train.shape: {train.shape}")
display(train.head())
print(f"features.shape: {features.shape}")
display(features.head())
print(f"patient_notes.shape: {patient_notes.shape}")
display(patient_notes.head())

train.shape: (14300, 6)


Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location
0,00016_000,0,16,0,[dad with recent heart attcak],[696 724]
1,00016_001,0,16,1,"[mom with ""thyroid disease]",[668 693]
2,00016_002,0,16,2,[chest pressure],[203 217]
3,00016_003,0,16,3,"[intermittent episodes, episode]","[70 91, 176 183]"
4,00016_004,0,16,4,[felt as if he were going to pass out],[222 258]


features.shape: (143, 3)


Unnamed: 0,feature_num,case_num,feature_text
0,0,0,Family-history-of-MI-OR-Family-history-of-myoc...
1,1,0,Family-history-of-thyroid-disorder
2,2,0,Chest-pressure
3,3,0,Intermittent-symptoms
4,4,0,Lightheaded


patient_notes.shape: (42146, 3)


Unnamed: 0,pn_num,case_num,pn_history
0,0,0,"17-year-old male, has come to the student heal..."
1,1,0,17 yo male with recurrent palpitations for the...
2,2,0,Dillon Cleveland is a 17 y.o. male patient wit...
3,3,0,a 17 yo m c/o palpitation started 3 mos ago; \...
4,4,0,17yo male with no pmh here for evaluation of p...


In [10]:
train = train.merge(features, on=['feature_num', 'case_num'], how='left')
train = train.merge(patient_notes, on=['pn_num', 'case_num'], how='left')
display(train.head())

Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location,feature_text,pn_history
0,00016_000,0,16,0,[dad with recent heart attcak],[696 724],Family-history-of-MI-OR-Family-history-of-myoc...,HPI: 17yo M presents with palpitations. Patien...
1,00016_001,0,16,1,"[mom with ""thyroid disease]",[668 693],Family-history-of-thyroid-disorder,HPI: 17yo M presents with palpitations. Patien...
2,00016_002,0,16,2,[chest pressure],[203 217],Chest-pressure,HPI: 17yo M presents with palpitations. Patien...
3,00016_003,0,16,3,"[intermittent episodes, episode]","[70 91, 176 183]",Intermittent-symptoms,HPI: 17yo M presents with palpitations. Patien...
4,00016_004,0,16,4,[felt as if he were going to pass out],[222 258],Lightheaded,HPI: 17yo M presents with palpitations. Patien...


In [11]:
# incorrect annotation
train.loc[338, 'annotation'] = ast.literal_eval('[["father heart attack"]]')
train.loc[338, 'location'] = ast.literal_eval('[["764 783"]]')

train.loc[621, 'annotation'] = ast.literal_eval('[["for the last 2-3 months"]]')
train.loc[621, 'location'] = ast.literal_eval('[["77 100"]]')

train.loc[655, 'annotation'] = ast.literal_eval('[["no heat intolerance"], ["no cold intolerance"]]')
train.loc[655, 'location'] = ast.literal_eval('[["285 292;301 312"], ["285 287;296 312"]]')

train.loc[1262, 'annotation'] = ast.literal_eval('[["mother thyroid problem"]]')
train.loc[1262, 'location'] = ast.literal_eval('[["551 557;565 580"]]')

train.loc[1265, 'annotation'] = ast.literal_eval('[[\'felt like he was going to "pass out"\']]')
train.loc[1265, 'location'] = ast.literal_eval('[["131 135;181 212"]]')

train.loc[1396, 'annotation'] = ast.literal_eval('[["stool , with no blood"]]')
train.loc[1396, 'location'] = ast.literal_eval('[["259 280"]]')

train.loc[1591, 'annotation'] = ast.literal_eval('[["diarrhoe non blooody"]]')
train.loc[1591, 'location'] = ast.literal_eval('[["176 184;201 212"]]')

train.loc[1615, 'annotation'] = ast.literal_eval('[["diarrhea for last 2-3 days"]]')
train.loc[1615, 'location'] = ast.literal_eval('[["249 257;271 288"]]')

train.loc[1664, 'annotation'] = ast.literal_eval('[["no vaginal discharge"]]')
train.loc[1664, 'location'] = ast.literal_eval('[["822 824;907 924"]]')

train.loc[1714, 'annotation'] = ast.literal_eval('[["started about 8-10 hours ago"]]')
train.loc[1714, 'location'] = ast.literal_eval('[["101 129"]]')

train.loc[1929, 'annotation'] = ast.literal_eval('[["no blood in the stool"]]')
train.loc[1929, 'location'] = ast.literal_eval('[["531 539;549 561"]]')

train.loc[2134, 'annotation'] = ast.literal_eval('[["last sexually active 9 months ago"]]')
train.loc[2134, 'location'] = ast.literal_eval('[["540 560;581 593"]]')

train.loc[2191, 'annotation'] = ast.literal_eval('[["right lower quadrant pain"]]')
train.loc[2191, 'location'] = ast.literal_eval('[["32 57"]]')

train.loc[2553, 'annotation'] = ast.literal_eval('[["diarrhoea no blood"]]')
train.loc[2553, 'location'] = ast.literal_eval('[["308 317;376 384"]]')

train.loc[3124, 'annotation'] = ast.literal_eval('[["sweating"]]')
train.loc[3124, 'location'] = ast.literal_eval('[["549 557"]]')

train.loc[3858, 'annotation'] = ast.literal_eval('[["previously as regular"], ["previously eveyr 28-29 days"], ["previously lasting 5 days"], ["previously regular flow"]]')
train.loc[3858, 'location'] = ast.literal_eval('[["102 123"], ["102 112;125 141"], ["102 112;143 157"], ["102 112;159 171"]]')

train.loc[4373, 'annotation'] = ast.literal_eval('[["for 2 months"]]')
train.loc[4373, 'location'] = ast.literal_eval('[["33 45"]]')

train.loc[4763, 'annotation'] = ast.literal_eval('[["35 year old"]]')
train.loc[4763, 'location'] = ast.literal_eval('[["5 16"]]')

train.loc[4782, 'annotation'] = ast.literal_eval('[["darker brown stools"]]')
train.loc[4782, 'location'] = ast.literal_eval('[["175 194"]]')

train.loc[4908, 'annotation'] = ast.literal_eval('[["uncle with peptic ulcer"]]')
train.loc[4908, 'location'] = ast.literal_eval('[["700 723"]]')

train.loc[6016, 'annotation'] = ast.literal_eval('[["difficulty falling asleep"]]')
train.loc[6016, 'location'] = ast.literal_eval('[["225 250"]]')

train.loc[6192, 'annotation'] = ast.literal_eval('[["helps to take care of aging mother and in-laws"]]')
train.loc[6192, 'location'] = ast.literal_eval('[["197 218;236 260"]]')

train.loc[6380, 'annotation'] = ast.literal_eval('[["No hair changes"], ["No skin changes"], ["No GI changes"], ["No palpitations"], ["No excessive sweating"]]')
train.loc[6380, 'location'] = ast.literal_eval('[["480 482;507 519"], ["480 482;499 503;512 519"], ["480 482;521 531"], ["480 482;533 545"], ["480 482;564 582"]]')

train.loc[6562, 'annotation'] = ast.literal_eval('[["stressed due to taking care of her mother"], ["stressed due to taking care of husbands parents"]]')
train.loc[6562, 'location'] = ast.literal_eval('[["290 320;327 337"], ["290 320;342 358"]]')

train.loc[6862, 'annotation'] = ast.literal_eval('[["stressor taking care of many sick family members"]]')
train.loc[6862, 'location'] = ast.literal_eval('[["288 296;324 363"]]')

train.loc[7022, 'annotation'] = ast.literal_eval('[["heart started racing and felt numbness for the 1st time in her finger tips"]]')
train.loc[7022, 'location'] = ast.literal_eval('[["108 182"]]')

train.loc[7422, 'annotation'] = ast.literal_eval('[["first started 5 yrs"]]')
train.loc[7422, 'location'] = ast.literal_eval('[["102 121"]]')

train.loc[8876, 'annotation'] = ast.literal_eval('[["No shortness of breath"]]')
train.loc[8876, 'location'] = ast.literal_eval('[["481 483;533 552"]]')

train.loc[9027, 'annotation'] = ast.literal_eval('[["recent URI"], ["nasal stuffines, rhinorrhea, for 3-4 days"]]')
train.loc[9027, 'location'] = ast.literal_eval('[["92 102"], ["123 164"]]')

train.loc[9938, 'annotation'] = ast.literal_eval('[["irregularity with her cycles"], ["heavier bleeding"], ["changes her pad every couple hours"]]')
train.loc[9938, 'location'] = ast.literal_eval('[["89 117"], ["122 138"], ["368 402"]]')

train.loc[9973, 'annotation'] = ast.literal_eval('[["gaining 10-15 lbs"]]')
train.loc[9973, 'location'] = ast.literal_eval('[["344 361"]]')

train.loc[10513, 'annotation'] = ast.literal_eval('[["weight gain"], ["gain of 10-16lbs"]]')
train.loc[10513, 'location'] = ast.literal_eval('[["600 611"], ["607 623"]]')

train.loc[11551, 'annotation'] = ast.literal_eval('[["seeing her son knows are not real"]]')
train.loc[11551, 'location'] = ast.literal_eval('[["386 400;443 461"]]')

train.loc[11677, 'annotation'] = ast.literal_eval('[["saw him once in the kitchen after he died"]]')
train.loc[11677, 'location'] = ast.literal_eval('[["160 201"]]')

train.loc[12124, 'annotation'] = ast.literal_eval('[["tried Ambien but it didnt work"]]')
train.loc[12124, 'location'] = ast.literal_eval('[["325 337;349 366"]]')

train.loc[12279, 'annotation'] = ast.literal_eval('[["heard what she described as a party later than evening these things did not actually happen"]]')
train.loc[12279, 'location'] = ast.literal_eval('[["405 459;488 524"]]')

train.loc[12289, 'annotation'] = ast.literal_eval('[["experienced seeing her son at the kitchen table these things did not actually happen"]]')
train.loc[12289, 'location'] = ast.literal_eval('[["353 400;488 524"]]')

train.loc[13238, 'annotation'] = ast.literal_eval('[["SCRACHY THROAT"], ["RUNNY NOSE"]]')
train.loc[13238, 'location'] = ast.literal_eval('[["293 307"], ["321 331"]]')

train.loc[13297, 'annotation'] = ast.literal_eval('[["without improvement when taking tylenol"], ["without improvement when taking ibuprofen"]]')
train.loc[13297, 'location'] = ast.literal_eval('[["182 221"], ["182 213;225 234"]]')

train.loc[13299, 'annotation'] = ast.literal_eval('[["yesterday"], ["yesterday"]]')
train.loc[13299, 'location'] = ast.literal_eval('[["79 88"], ["409 418"]]')

train.loc[13845, 'annotation'] = ast.literal_eval('[["headache global"], ["headache throughout her head"]]')
train.loc[13845, 'location'] = ast.literal_eval('[["86 94;230 236"], ["86 94;237 256"]]')

train.loc[14083, 'annotation'] = ast.literal_eval('[["headache generalized in her head"]]')
train.loc[14083, 'location'] = ast.literal_eval('[["56 64;156 179"]]')

In [12]:
train['annotation_length'] = train['annotation'].apply(len)
display(train['annotation_length'].value_counts())

annotation_length
1    8185
0    4399
2    1292
3     287
4      99
5      27
6       9
7       1
8       1
Name: count, dtype: int64

In [13]:
# ====================================================
# CV split
# ====================================================
Fold = GroupKFold(n_splits=CFG.n_fold)
groups = train['pn_num'].values
for n, (train_index, val_index) in enumerate(Fold.split(train, train['location'], groups)):
    train.loc[val_index, 'fold'] = int(n)
train['fold'] = train['fold'].astype(int)
display(train.groupby('fold').size())

fold
0    2860
1    2860
2    2860
3    2860
4    2860
dtype: int64

In [14]:
if CFG.debug:
    display(train.groupby('fold').size())
    train = train.sample(n=1000, random_state=0).reset_index(drop=True)
    display(train.groupby('fold').size())

In [15]:
# ====================================================
# tokenizer
# ====================================================
tokenizer = AutoTokenizer.from_pretrained(CFG.model, use_fast=True)
tokenizer.save_pretrained(OUTPUT_DIR+'tokenizer/')
CFG.tokenizer = tokenizer

In [16]:
# ====================================================
# Define max_len
# ====================================================
for text_col in ['pn_history']:
    pn_history_lengths = []
    tk0 = tqdm(patient_notes[text_col].fillna("").values, total=len(patient_notes))
    for text in tk0:
        length = len(tokenizer(text, add_special_tokens=False)['input_ids'])
        pn_history_lengths.append(length)
    LOGGER.info(f'{text_col} max(lengths): {max(pn_history_lengths)}')

for text_col in ['feature_text']:
    features_lengths = []
    tk0 = tqdm(features[text_col].fillna("").values, total=len(features))
    for text in tk0:
        length = len(tokenizer(text, add_special_tokens=False)['input_ids'])
        features_lengths.append(length)
    LOGGER.info(f'{text_col} max(lengths): {max(features_lengths)}')

CFG.max_len = max(pn_history_lengths) + max(features_lengths) + 3 # cls & sep & sep
LOGGER.info(f"max_len: {CFG.max_len}")

  0%|          | 0/42146 [00:00<?, ?it/s]

pn_history max(lengths): 323


  0%|          | 0/143 [00:00<?, ?it/s]

feature_text max(lengths): 28
max_len: 354


In [17]:
# ====================================================
# Dataset
# ====================================================
def prepare_input(cfg, text, feature_text):
    inputs = cfg.tokenizer(text, feature_text, 
                           add_special_tokens=True,
                           max_length=CFG.max_len,
                           padding="max_length",
                           return_offsets_mapping=False)
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs


def create_label(cfg, text, annotation_length, location_list):
    encoded = cfg.tokenizer(text,
                            add_special_tokens=True,
                            max_length=CFG.max_len,
                            padding="max_length",
                            return_offsets_mapping=True)
    offset_mapping = encoded['offset_mapping']
    ignore_idxes = np.where(np.array(encoded.sequence_ids()) != 0)[0]
    label = np.zeros(len(offset_mapping))
    label[ignore_idxes] = -1
    if annotation_length != 0:
        for location in location_list:
            for loc in [s.split() for s in location.split(';')]:
                start_idx = -1
                end_idx = -1
                start, end = int(loc[0]), int(loc[1])
                for idx in range(len(offset_mapping)):
                    if (start_idx == -1) & (start < offset_mapping[idx][0]):
                        start_idx = idx - 1
                    if (end_idx == -1) & (end <= offset_mapping[idx][1]):
                        end_idx = idx + 1
                if start_idx == -1:
                    start_idx = end_idx
                if (start_idx != -1) & (end_idx != -1):
                    label[start_idx:end_idx] = 1
    return torch.tensor(label, dtype=torch.float)


class TrainDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.feature_texts = df['feature_text'].values
        self.pn_historys = df['pn_history'].values
        self.annotation_lengths = df['annotation_length'].values
        self.locations = df['location'].values

    def __len__(self):
        return len(self.feature_texts)

    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, 
                               self.pn_historys[item], 
                               self.feature_texts[item])
        label = create_label(self.cfg, 
                             self.pn_historys[item], 
                             self.annotation_lengths[item], 
                             self.locations[item])
        return inputs, label

In [18]:
# ====================================================
# Model
# ====================================================
class CustomModel(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
        else:
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel(self.config)
        self.fc_dropout = nn.Dropout(cfg.fc_dropout)
        self.fc = nn.Linear(self.config.hidden_size, 1)
        self._init_weights(self.fc)
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        
    def feature(self, inputs):
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        return last_hidden_states

    def forward(self, inputs):
        feature = self.feature(inputs)
        output = self.fc(self.fc_dropout(feature))
        return output

In [19]:
# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))


def train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device):
    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = AverageMeter()
    start = end = time.time()
    global_step = 0
    for step, (inputs, labels) in enumerate(train_loader):
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.cuda.amp.autocast(enabled=CFG.apex):
            y_preds = model(inputs)
        loss = criterion(y_preds.view(-1, 1), labels.view(-1, 1))
        loss = torch.masked_select(loss, labels.view(-1, 1) != -1).mean()
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            global_step += 1
            if CFG.batch_scheduler:
                scheduler.step()
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, step, len(train_loader), 
                          remain=timeSince(start, float(step+1)/len(train_loader)),
                          loss=losses,
                          grad_norm=grad_norm,
                          lr=scheduler.get_lr()[0]))
    return losses.avg


def valid_fn(valid_loader, model, criterion, device):
    losses = AverageMeter()
    model.eval()
    preds = []
    start = end = time.time()
    for step, (inputs, labels) in enumerate(valid_loader):
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.no_grad():
            y_preds = model(inputs)
        loss = criterion(y_preds.view(-1, 1), labels.view(-1, 1))
        loss = torch.masked_select(loss, labels.view(-1, 1) != -1).mean()
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        preds.append(y_preds.sigmoid().to('cpu').numpy())
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, len(valid_loader),
                          loss=losses,
                          remain=timeSince(start, float(step+1)/len(valid_loader))))
    predictions = np.concatenate(preds)
    return losses.avg, predictions


def inference_fn(test_loader, model, device):
    preds = []
    model.eval()
    model.to(device)
    tk0 = tqdm(test_loader, total=len(test_loader))
    for inputs in tk0:
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        with torch.no_grad():
            y_preds = model(inputs)
        preds.append(y_preds.sigmoid().to('cpu').numpy())
    predictions = np.concatenate(preds)
    return predictions

In [20]:
# ====================================================
# train loop
# ====================================================
def train_loop(folds, fold):
    
    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    train_folds = folds[folds['fold'] != fold].reset_index(drop=True)
    valid_folds = folds[folds['fold'] == fold].reset_index(drop=True)
    valid_texts = valid_folds['pn_history'].values
    valid_labels = create_labels_for_scoring(valid_folds)
    
    train_dataset = TrainDataset(CFG, train_folds)
    valid_dataset = TrainDataset(CFG, valid_folds)

    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomModel(CFG, config_path=None, pretrained=True)
    torch.save(model.config, OUTPUT_DIR+'config.pth')
    model.to(device)
    
    def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': 0.0},
            {'params': [p for n, p in model.named_parameters() if "model" not in n],
             'lr': decoder_lr, 'weight_decay': 0.0}
        ]
        return optimizer_parameters

    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=CFG.encoder_lr, 
                                                decoder_lr=CFG.decoder_lr,
                                                weight_decay=CFG.weight_decay)
    optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)
    
    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        if cfg.scheduler=='linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
            )
        elif cfg.scheduler=='cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        return scheduler
    
    num_train_steps = int(len(train_folds) / CFG.batch_size * CFG.epochs)
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    criterion = nn.BCEWithLogitsLoss(reduction="none")
    
    best_score = 0.

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device)

        # eval
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device)
        predictions = predictions.reshape((len(valid_folds), CFG.max_len))
        
        # scoring
        char_probs = get_char_probs(valid_texts, predictions, CFG.tokenizer)
        results = get_results(char_probs, th=0.5)
        preds = get_predictions(results)
        score = get_score(valid_labels, preds)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}')
        save_path = os.path.join(OUTPUT_DIR, f"model_fold{fold}_best.pth")
        
        if best_score < score:
            best_score = score
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),'predictions': predictions},save_path)
    load_path = os.path.join(OUTPUT_DIR, f"model_fold{fold}_best.pth")
    predictions = torch.load(load_path, map_location=torch.device('cpu'),weights_only=False)['predictions']
    valid_folds[[i for i in range(CFG.max_len)]] = predictions

    torch.cuda.empty_cache()
    gc.collect()
    
    return valid_folds

In [21]:
if __name__ == '__main__':
    
    def get_result(oof_df):
        labels = create_labels_for_scoring(oof_df)
        predictions = oof_df[[i for i in range(CFG.max_len)]].values
        char_probs = get_char_probs(oof_df['pn_history'].values, predictions, CFG.tokenizer)
        results = get_results(char_probs, th=0.5)
        preds = get_predictions(results)
        score = get_score(labels, preds)
        LOGGER.info(f'Score: {score:<.4f}')
    
    if CFG.train:
        oof_df = pd.DataFrame()
        for fold in range(CFG.n_fold):
            if fold in CFG.trn_fold:
                _oof_df = train_loop(train, fold)
                oof_df = pd.concat([oof_df, _oof_df])
                LOGGER.info(f"========== fold: {fold} result ==========")
                get_result(_oof_df)
        oof_df = oof_df.reset_index(drop=True)
        LOGGER.info(f"========== CV ==========")
        get_result(oof_df)
        oof_df.to_pickle(OUTPUT_DIR+'oof_df.pkl')
        



Epoch: [1][0/953] Elapsed 0m 1s (remain 29m 41s) Loss: 0.8067(0.8067) Grad: inf  LR: 0.00002000  
Epoch: [1][100/953] Elapsed 0m 45s (remain 6m 20s) Loss: 0.0773(0.1183) Grad: 3050.7864  LR: 0.00001998  
Epoch: [1][200/953] Elapsed 1m 29s (remain 5m 35s) Loss: 0.0258(0.0756) Grad: 2596.7168  LR: 0.00001991  
Epoch: [1][300/953] Elapsed 2m 15s (remain 4m 54s) Loss: 0.0033(0.0575) Grad: 1192.5962  LR: 0.00001980  
Epoch: [1][400/953] Elapsed 3m 4s (remain 4m 14s) Loss: 0.0103(0.0478) Grad: 2201.5979  LR: 0.00001965  
Epoch: [1][500/953] Elapsed 3m 53s (remain 3m 30s) Loss: 0.0042(0.0419) Grad: 1293.7928  LR: 0.00001946  
Epoch: [1][600/953] Elapsed 4m 41s (remain 2m 45s) Loss: 0.0093(0.0378) Grad: 1905.3389  LR: 0.00001923  
Epoch: [1][700/953] Elapsed 5m 30s (remain 1m 58s) Loss: 0.0068(0.0347) Grad: 1191.5128  LR: 0.00001895  
Epoch: [1][800/953] Elapsed 6m 19s (remain 1m 11s) Loss: 0.0073(0.0325) Grad: 787.9381  LR: 0.00001864  
Epoch: [1][900/953] Elapsed 7m 7s (remain 0m 24s) Loss: 

Epoch 1 - avg_train_loss: 0.0296  avg_val_loss: 0.0168  time: 563s
Epoch 1 - Score: 0.8263
Epoch 1 - Save Best Score: 0.8263 Model


Epoch: [2][0/953] Elapsed 0m 0s (remain 10m 18s) Loss: 0.0148(0.0148) Grad: 18094.4395  LR: 0.00001809  
Epoch: [2][100/953] Elapsed 0m 49s (remain 6m 57s) Loss: 0.0082(0.0102) Grad: 11677.6445  LR: 0.00001768  
Epoch: [2][200/953] Elapsed 1m 38s (remain 6m 7s) Loss: 0.0163(0.0106) Grad: 41299.3242  LR: 0.00001724  
Epoch: [2][300/953] Elapsed 2m 26s (remain 5m 18s) Loss: 0.0266(0.0113) Grad: 20439.8594  LR: 0.00001677  
Epoch: [2][400/953] Elapsed 3m 15s (remain 4m 29s) Loss: 0.0207(0.0109) Grad: 26130.4277  LR: 0.00001627  
Epoch: [2][500/953] Elapsed 4m 4s (remain 3m 40s) Loss: 0.0085(0.0107) Grad: 22995.0195  LR: 0.00001575  
Epoch: [2][600/953] Elapsed 4m 53s (remain 2m 51s) Loss: 0.0030(0.0107) Grad: 6617.8037  LR: 0.00001520  
Epoch: [2][700/953] Elapsed 5m 41s (remain 2m 2s) Loss: 0.0062(0.0108) Grad: 7590.0967  LR: 0.00001462  
Epoch: [2][800/953] Elapsed 6m 30s (remain 1m 14s) Loss: 0.0054(0.0108) Grad: 7674.2407  LR: 0.00001403  
Epoch: [2][900/953] Elapsed 7m 18s (remain 0m

Epoch 2 - avg_train_loss: 0.0106  avg_val_loss: 0.0133  time: 574s
Epoch 2 - Score: 0.8593
Epoch 2 - Save Best Score: 0.8593 Model


Epoch: [3][0/953] Elapsed 0m 0s (remain 10m 31s) Loss: 0.0070(0.0070) Grad: 11598.1211  LR: 0.00001309  
Epoch: [3][100/953] Elapsed 0m 49s (remain 6m 58s) Loss: 0.0072(0.0096) Grad: 11925.8486  LR: 0.00001245  
Epoch: [3][200/953] Elapsed 1m 38s (remain 6m 7s) Loss: 0.0198(0.0090) Grad: 40082.3633  LR: 0.00001181  
Epoch: [3][300/953] Elapsed 2m 26s (remain 5m 17s) Loss: 0.0037(0.0093) Grad: 9266.2236  LR: 0.00001116  
Epoch: [3][400/953] Elapsed 3m 15s (remain 4m 29s) Loss: 0.0032(0.0093) Grad: 12528.9229  LR: 0.00001050  
Epoch: [3][500/953] Elapsed 4m 4s (remain 3m 40s) Loss: 0.0053(0.0094) Grad: 29072.5039  LR: 0.00000984  
Epoch: [3][600/953] Elapsed 4m 52s (remain 2m 51s) Loss: 0.0210(0.0094) Grad: 27439.7930  LR: 0.00000918  
Epoch: [3][700/953] Elapsed 5m 41s (remain 2m 2s) Loss: 0.0092(0.0094) Grad: 16526.9512  LR: 0.00000853  
Epoch: [3][800/953] Elapsed 6m 30s (remain 1m 14s) Loss: 0.0198(0.0095) Grad: 30180.6777  LR: 0.00000788  
Epoch: [3][900/953] Elapsed 7m 18s (remain 

Epoch 3 - avg_train_loss: 0.0094  avg_val_loss: 0.0130  time: 573s
Epoch 3 - Score: 0.8661
Epoch 3 - Save Best Score: 0.8661 Model


Epoch: [4][0/953] Elapsed 0m 0s (remain 10m 54s) Loss: 0.0136(0.0136) Grad: 15265.6836  LR: 0.00000691  
Epoch: [4][100/953] Elapsed 0m 49s (remain 6m 57s) Loss: 0.0016(0.0094) Grad: 6693.9287  LR: 0.00000629  
Epoch: [4][200/953] Elapsed 1m 38s (remain 6m 7s) Loss: 0.0154(0.0087) Grad: 21518.9355  LR: 0.00000568  
Epoch: [4][300/953] Elapsed 2m 26s (remain 5m 17s) Loss: 0.0074(0.0087) Grad: 10435.9980  LR: 0.00000510  
Epoch: [4][400/953] Elapsed 3m 15s (remain 4m 28s) Loss: 0.0067(0.0086) Grad: 6397.7744  LR: 0.00000454  
Epoch: [4][500/953] Elapsed 4m 3s (remain 3m 40s) Loss: 0.0079(0.0088) Grad: 8827.7510  LR: 0.00000400  
Epoch: [4][600/953] Elapsed 4m 52s (remain 2m 51s) Loss: 0.0044(0.0086) Grad: 15095.3760  LR: 0.00000348  
Epoch: [4][700/953] Elapsed 5m 41s (remain 2m 2s) Loss: 0.0040(0.0087) Grad: 6619.1372  LR: 0.00000300  
Epoch: [4][800/953] Elapsed 6m 29s (remain 1m 13s) Loss: 0.0126(0.0087) Grad: 10964.8291  LR: 0.00000254  
Epoch: [4][900/953] Elapsed 7m 18s (remain 0m 

Epoch 4 - avg_train_loss: 0.0086  avg_val_loss: 0.0130  time: 572s
Epoch 4 - Score: 0.8663
Epoch 4 - Save Best Score: 0.8663 Model


Epoch: [5][0/953] Elapsed 0m 0s (remain 10m 52s) Loss: 0.0132(0.0132) Grad: 25350.4453  LR: 0.00000191  
Epoch: [5][100/953] Elapsed 0m 49s (remain 6m 58s) Loss: 0.0026(0.0076) Grad: 3317.8613  LR: 0.00000154  
Epoch: [5][200/953] Elapsed 1m 38s (remain 6m 7s) Loss: 0.0019(0.0079) Grad: 6743.4844  LR: 0.00000121  
Epoch: [5][300/953] Elapsed 2m 26s (remain 5m 18s) Loss: 0.0060(0.0078) Grad: 11803.3340  LR: 0.00000091  
Epoch: [5][400/953] Elapsed 3m 15s (remain 4m 29s) Loss: 0.0023(0.0078) Grad: 3939.5618  LR: 0.00000066  
Epoch: [5][500/953] Elapsed 4m 4s (remain 3m 40s) Loss: 0.0181(0.0078) Grad: 36298.8438  LR: 0.00000044  
Epoch: [5][600/953] Elapsed 4m 52s (remain 2m 51s) Loss: 0.0062(0.0080) Grad: 11593.4561  LR: 0.00000027  
Epoch: [5][700/953] Elapsed 5m 41s (remain 2m 2s) Loss: 0.0014(0.0081) Grad: 6979.9321  LR: 0.00000014  
Epoch: [5][800/953] Elapsed 6m 29s (remain 1m 14s) Loss: 0.0057(0.0081) Grad: 16180.1309  LR: 0.00000005  
Epoch: [5][900/953] Elapsed 7m 18s (remain 0m 

Epoch 5 - avg_train_loss: 0.0081  avg_val_loss: 0.0132  time: 573s
Epoch 5 - Score: 0.8661
Score: 0.8663


Epoch: [1][0/953] Elapsed 0m 0s (remain 10m 33s) Loss: 0.7930(0.7930) Grad: inf  LR: 0.00002000  
Epoch: [1][100/953] Elapsed 0m 50s (remain 7m 2s) Loss: 0.0487(0.1192) Grad: 5178.9404  LR: 0.00001998  
Epoch: [1][200/953] Elapsed 1m 38s (remain 6m 9s) Loss: 0.0189(0.0758) Grad: 1878.3608  LR: 0.00001991  
Epoch: [1][300/953] Elapsed 2m 27s (remain 5m 19s) Loss: 0.0217(0.0578) Grad: 3801.3281  LR: 0.00001980  
Epoch: [1][400/953] Elapsed 3m 16s (remain 4m 30s) Loss: 0.0187(0.0484) Grad: 2645.9028  LR: 0.00001965  
Epoch: [1][500/953] Elapsed 4m 4s (remain 3m 40s) Loss: 0.0095(0.0420) Grad: 2202.4060  LR: 0.00001946  
Epoch: [1][600/953] Elapsed 4m 53s (remain 2m 51s) Loss: 0.0622(0.0380) Grad: 8690.4707  LR: 0.00001923  
Epoch: [1][700/953] Elapsed 5m 41s (remain 2m 2s) Loss: 0.0116(0.0350) Grad: 1268.7106  LR: 0.00001895  
Epoch: [1][800/953] Elapsed 6m 30s (remain 1m 14s) Loss: 0.0160(0.0324) Grad: 2692.2246  LR: 0.00001864  
Epoch: [1][900/953] Elapsed 7m 19s (remain 0m 25s) Loss: 0

Epoch 1 - avg_train_loss: 0.0295  avg_val_loss: 0.0133  time: 574s
Epoch 1 - Score: 0.8380
Epoch 1 - Save Best Score: 0.8380 Model


Epoch: [2][0/953] Elapsed 0m 0s (remain 10m 54s) Loss: 0.0040(0.0040) Grad: 4406.3042  LR: 0.00001809  
Epoch: [2][100/953] Elapsed 0m 49s (remain 6m 57s) Loss: 0.0093(0.0110) Grad: 7046.0811  LR: 0.00001768  
Epoch: [2][200/953] Elapsed 1m 38s (remain 6m 7s) Loss: 0.0088(0.0108) Grad: 7823.5532  LR: 0.00001724  
Epoch: [2][300/953] Elapsed 2m 26s (remain 5m 17s) Loss: 0.0087(0.0107) Grad: 13685.4512  LR: 0.00001677  
Epoch: [2][400/953] Elapsed 3m 15s (remain 4m 28s) Loss: 0.0024(0.0111) Grad: 6963.7446  LR: 0.00001627  
Epoch: [2][500/953] Elapsed 4m 3s (remain 3m 40s) Loss: 0.0230(0.0111) Grad: 30542.5742  LR: 0.00001575  
Epoch: [2][600/953] Elapsed 4m 52s (remain 2m 51s) Loss: 0.0277(0.0111) Grad: 15155.0557  LR: 0.00001520  
Epoch: [2][700/953] Elapsed 5m 41s (remain 2m 2s) Loss: 0.0275(0.0110) Grad: 30713.5332  LR: 0.00001462  
Epoch: [2][800/953] Elapsed 6m 30s (remain 1m 14s) Loss: 0.0109(0.0110) Grad: 11580.8086  LR: 0.00001403  
Epoch: [2][900/953] Elapsed 7m 18s (remain 0m 

Epoch 2 - avg_train_loss: 0.0109  avg_val_loss: 0.0116  time: 574s
Epoch 2 - Score: 0.8623
Epoch 2 - Save Best Score: 0.8623 Model


Epoch: [3][0/953] Elapsed 0m 0s (remain 10m 47s) Loss: 0.0302(0.0302) Grad: 41270.4062  LR: 0.00001309  
Epoch: [3][100/953] Elapsed 0m 49s (remain 6m 59s) Loss: 0.0152(0.0089) Grad: 15648.2500  LR: 0.00001245  
Epoch: [3][200/953] Elapsed 1m 38s (remain 6m 7s) Loss: 0.0208(0.0092) Grad: 50400.5938  LR: 0.00001181  
Epoch: [3][300/953] Elapsed 2m 26s (remain 5m 18s) Loss: 0.0039(0.0096) Grad: 11731.8857  LR: 0.00001116  
Epoch: [3][400/953] Elapsed 3m 15s (remain 4m 29s) Loss: 0.0017(0.0097) Grad: 3092.1433  LR: 0.00001050  
Epoch: [3][500/953] Elapsed 4m 4s (remain 3m 40s) Loss: 0.0089(0.0095) Grad: 13316.4521  LR: 0.00000984  
Epoch: [3][600/953] Elapsed 4m 53s (remain 2m 51s) Loss: 0.0092(0.0095) Grad: 23836.0391  LR: 0.00000918  
Epoch: [3][700/953] Elapsed 5m 41s (remain 2m 2s) Loss: 0.0074(0.0096) Grad: 11690.1201  LR: 0.00000853  
Epoch: [3][800/953] Elapsed 6m 30s (remain 1m 14s) Loss: 0.0091(0.0096) Grad: 22463.5566  LR: 0.00000788  
Epoch: [3][900/953] Elapsed 7m 18s (remain 

Epoch 3 - avg_train_loss: 0.0097  avg_val_loss: 0.0114  time: 573s
Epoch 3 - Score: 0.8628
Epoch 3 - Save Best Score: 0.8628 Model


Epoch: [4][0/953] Elapsed 0m 0s (remain 10m 58s) Loss: 0.0058(0.0058) Grad: 11570.0127  LR: 0.00000691  
Epoch: [4][100/953] Elapsed 0m 49s (remain 6m 59s) Loss: 0.0220(0.0091) Grad: 13669.9912  LR: 0.00000629  
Epoch: [4][200/953] Elapsed 1m 38s (remain 6m 8s) Loss: 0.0096(0.0091) Grad: 13627.3662  LR: 0.00000568  
Epoch: [4][300/953] Elapsed 2m 27s (remain 5m 19s) Loss: 0.0037(0.0088) Grad: 6737.9316  LR: 0.00000510  
Epoch: [4][400/953] Elapsed 3m 15s (remain 4m 29s) Loss: 0.0132(0.0089) Grad: 13286.4023  LR: 0.00000454  
Epoch: [4][500/953] Elapsed 4m 4s (remain 3m 40s) Loss: 0.0038(0.0088) Grad: 20280.3164  LR: 0.00000400  
Epoch: [4][600/953] Elapsed 4m 53s (remain 2m 51s) Loss: 0.0585(0.0087) Grad: 32843.1367  LR: 0.00000348  
Epoch: [4][700/953] Elapsed 5m 41s (remain 2m 2s) Loss: 0.0150(0.0087) Grad: 16745.5156  LR: 0.00000300  
Epoch: [4][800/953] Elapsed 6m 30s (remain 1m 14s) Loss: 0.0132(0.0088) Grad: 24942.2930  LR: 0.00000254  
Epoch: [4][900/953] Elapsed 7m 19s (remain 

Epoch 4 - avg_train_loss: 0.0089  avg_val_loss: 0.0112  time: 574s
Epoch 4 - Score: 0.8659
Epoch 4 - Save Best Score: 0.8659 Model


Epoch: [5][0/953] Elapsed 0m 0s (remain 10m 55s) Loss: 0.0041(0.0041) Grad: 6740.3472  LR: 0.00000191  
Epoch: [5][100/953] Elapsed 0m 49s (remain 6m 58s) Loss: 0.0096(0.0086) Grad: 12511.3281  LR: 0.00000154  
Epoch: [5][200/953] Elapsed 1m 38s (remain 6m 7s) Loss: 0.0049(0.0089) Grad: 9688.3818  LR: 0.00000121  
Epoch: [5][300/953] Elapsed 2m 26s (remain 5m 18s) Loss: 0.0022(0.0086) Grad: 10245.6855  LR: 0.00000091  
Epoch: [5][400/953] Elapsed 3m 15s (remain 4m 29s) Loss: 0.0004(0.0083) Grad: 3222.1006  LR: 0.00000066  
Epoch: [5][500/953] Elapsed 4m 4s (remain 3m 40s) Loss: 0.0076(0.0083) Grad: 13142.6934  LR: 0.00000044  
Epoch: [5][600/953] Elapsed 4m 52s (remain 2m 51s) Loss: 0.0128(0.0084) Grad: 12663.7002  LR: 0.00000027  
Epoch: [5][700/953] Elapsed 5m 41s (remain 2m 2s) Loss: 0.0024(0.0082) Grad: 5523.5200  LR: 0.00000014  
Epoch: [5][800/953] Elapsed 6m 29s (remain 1m 13s) Loss: 0.0049(0.0081) Grad: 8859.2422  LR: 0.00000005  
Epoch: [5][900/953] Elapsed 7m 18s (remain 0m 2

Epoch 5 - avg_train_loss: 0.0083  avg_val_loss: 0.0114  time: 573s
Epoch 5 - Score: 0.8667
Epoch 5 - Save Best Score: 0.8667 Model
Score: 0.8667


Epoch: [1][0/953] Elapsed 0m 0s (remain 10m 45s) Loss: 0.5096(0.5096) Grad: inf  LR: 0.00002000  
Epoch: [1][100/953] Elapsed 0m 50s (remain 7m 4s) Loss: 0.0515(0.0921) Grad: 5135.8521  LR: 0.00001998  
Epoch: [1][200/953] Elapsed 1m 38s (remain 6m 10s) Loss: 0.0199(0.0615) Grad: 5268.4526  LR: 0.00001991  
Epoch: [1][300/953] Elapsed 2m 27s (remain 5m 20s) Loss: 0.0182(0.0488) Grad: 4797.2417  LR: 0.00001980  
Epoch: [1][400/953] Elapsed 3m 16s (remain 4m 30s) Loss: 0.0124(0.0419) Grad: 5802.1426  LR: 0.00001965  
Epoch: [1][500/953] Elapsed 4m 5s (remain 3m 41s) Loss: 0.0148(0.0370) Grad: 4527.4209  LR: 0.00001946  
Epoch: [1][600/953] Elapsed 4m 54s (remain 2m 52s) Loss: 0.0325(0.0338) Grad: 8514.0957  LR: 0.00001923  
Epoch: [1][700/953] Elapsed 5m 42s (remain 2m 3s) Loss: 0.0064(0.0314) Grad: 1840.2559  LR: 0.00001895  
Epoch: [1][800/953] Elapsed 6m 31s (remain 1m 14s) Loss: 0.0084(0.0295) Grad: 1950.3625  LR: 0.00001864  
Epoch: [1][900/953] Elapsed 7m 19s (remain 0m 25s) Loss: 

Epoch 1 - avg_train_loss: 0.0271  avg_val_loss: 0.0141  time: 574s
Epoch 1 - Score: 0.8534
Epoch 1 - Save Best Score: 0.8534 Model


Epoch: [2][0/953] Elapsed 0m 0s (remain 10m 33s) Loss: 0.0095(0.0095) Grad: 10661.6426  LR: 0.00001809  
Epoch: [2][100/953] Elapsed 0m 49s (remain 6m 58s) Loss: 0.0109(0.0114) Grad: 13763.4170  LR: 0.00001768  
Epoch: [2][200/953] Elapsed 1m 38s (remain 6m 7s) Loss: 0.0065(0.0115) Grad: 11251.9961  LR: 0.00001724  
Epoch: [2][300/953] Elapsed 2m 26s (remain 5m 18s) Loss: 0.0062(0.0113) Grad: 6604.7383  LR: 0.00001677  
Epoch: [2][400/953] Elapsed 3m 15s (remain 4m 29s) Loss: 0.0181(0.0114) Grad: 28503.5098  LR: 0.00001627  
Epoch: [2][500/953] Elapsed 4m 4s (remain 3m 40s) Loss: 0.0043(0.0111) Grad: 13274.0762  LR: 0.00001575  
Epoch: [2][600/953] Elapsed 4m 52s (remain 2m 51s) Loss: 0.0047(0.0113) Grad: 6537.4468  LR: 0.00001520  
Epoch: [2][700/953] Elapsed 5m 41s (remain 2m 2s) Loss: 0.0034(0.0111) Grad: 4776.5195  LR: 0.00001462  
Epoch: [2][800/953] Elapsed 6m 30s (remain 1m 14s) Loss: 0.0076(0.0109) Grad: 11088.9463  LR: 0.00001403  
Epoch: [2][900/953] Elapsed 7m 18s (remain 0m

Epoch 2 - avg_train_loss: 0.0110  avg_val_loss: 0.0119  time: 574s
Epoch 2 - Score: 0.8739
Epoch 2 - Save Best Score: 0.8739 Model


Epoch: [3][0/953] Elapsed 0m 0s (remain 10m 40s) Loss: 0.0057(0.0057) Grad: 7190.5732  LR: 0.00001309  
Epoch: [3][100/953] Elapsed 0m 49s (remain 6m 57s) Loss: 0.0143(0.0082) Grad: 25046.2559  LR: 0.00001245  
Epoch: [3][200/953] Elapsed 1m 38s (remain 6m 7s) Loss: 0.0113(0.0086) Grad: 21202.9336  LR: 0.00001181  
Epoch: [3][300/953] Elapsed 2m 26s (remain 5m 17s) Loss: 0.0009(0.0094) Grad: 3532.9373  LR: 0.00001116  
Epoch: [3][400/953] Elapsed 3m 15s (remain 4m 28s) Loss: 0.0107(0.0096) Grad: 8612.6006  LR: 0.00001050  
Epoch: [3][500/953] Elapsed 4m 4s (remain 3m 40s) Loss: 0.0154(0.0096) Grad: 8181.4507  LR: 0.00000984  
Epoch: [3][600/953] Elapsed 4m 52s (remain 2m 51s) Loss: 0.0061(0.0097) Grad: 15649.1680  LR: 0.00000918  
Epoch: [3][700/953] Elapsed 5m 41s (remain 2m 2s) Loss: 0.0037(0.0096) Grad: 10196.4707  LR: 0.00000853  
Epoch: [3][800/953] Elapsed 6m 30s (remain 1m 14s) Loss: 0.0035(0.0095) Grad: 6588.5566  LR: 0.00000788  
Epoch: [3][900/953] Elapsed 7m 19s (remain 0m 2

Epoch 3 - avg_train_loss: 0.0095  avg_val_loss: 0.0123  time: 574s
Epoch 3 - Score: 0.8747
Epoch 3 - Save Best Score: 0.8747 Model


Epoch: [4][0/953] Elapsed 0m 0s (remain 10m 34s) Loss: 0.0063(0.0063) Grad: 7981.7163  LR: 0.00000691  
Epoch: [4][100/953] Elapsed 0m 49s (remain 6m 58s) Loss: 0.0146(0.0082) Grad: 10899.0518  LR: 0.00000629  
Epoch: [4][200/953] Elapsed 1m 38s (remain 6m 7s) Loss: 0.0143(0.0079) Grad: 102454.9141  LR: 0.00000568  
Epoch: [4][300/953] Elapsed 2m 26s (remain 5m 18s) Loss: 0.0123(0.0082) Grad: 11920.1426  LR: 0.00000510  
Epoch: [4][400/953] Elapsed 3m 15s (remain 4m 29s) Loss: 0.0021(0.0080) Grad: 7675.0547  LR: 0.00000454  
Epoch: [4][500/953] Elapsed 4m 4s (remain 3m 40s) Loss: 0.0155(0.0081) Grad: 18044.5449  LR: 0.00000400  
Epoch: [4][600/953] Elapsed 4m 52s (remain 2m 51s) Loss: 0.0084(0.0082) Grad: 9101.9004  LR: 0.00000348  
Epoch: [4][700/953] Elapsed 5m 41s (remain 2m 2s) Loss: 0.0001(0.0083) Grad: 555.4482  LR: 0.00000300  
Epoch: [4][800/953] Elapsed 6m 30s (remain 1m 14s) Loss: 0.0051(0.0083) Grad: 13429.7236  LR: 0.00000254  
Epoch: [4][900/953] Elapsed 7m 18s (remain 0m 

Epoch 4 - avg_train_loss: 0.0084  avg_val_loss: 0.0125  time: 574s
Epoch 4 - Score: 0.8781
Epoch 4 - Save Best Score: 0.8781 Model


Epoch: [5][0/953] Elapsed 0m 0s (remain 10m 42s) Loss: 0.0048(0.0048) Grad: 14445.1104  LR: 0.00000191  
Epoch: [5][100/953] Elapsed 0m 49s (remain 6m 59s) Loss: 0.0067(0.0064) Grad: 6956.5029  LR: 0.00000154  
Epoch: [5][200/953] Elapsed 1m 38s (remain 6m 7s) Loss: 0.0031(0.0073) Grad: 7275.4033  LR: 0.00000121  
Epoch: [5][300/953] Elapsed 2m 27s (remain 5m 18s) Loss: 0.0043(0.0076) Grad: 7623.1147  LR: 0.00000091  
Epoch: [5][400/953] Elapsed 3m 15s (remain 4m 29s) Loss: 0.0004(0.0076) Grad: 1306.4213  LR: 0.00000066  
Epoch: [5][500/953] Elapsed 4m 4s (remain 3m 40s) Loss: 0.0088(0.0075) Grad: 15342.8105  LR: 0.00000044  
Epoch: [5][600/953] Elapsed 4m 52s (remain 2m 51s) Loss: 0.0088(0.0077) Grad: 17358.9746  LR: 0.00000027  
Epoch: [5][700/953] Elapsed 5m 41s (remain 2m 2s) Loss: 0.0059(0.0076) Grad: 8256.9541  LR: 0.00000014  
Epoch: [5][800/953] Elapsed 6m 30s (remain 1m 14s) Loss: 0.0007(0.0076) Grad: 2429.6963  LR: 0.00000005  
Epoch: [5][900/953] Elapsed 7m 18s (remain 0m 25

Epoch 5 - avg_train_loss: 0.0077  avg_val_loss: 0.0128  time: 574s
Epoch 5 - Score: 0.8792
Epoch 5 - Save Best Score: 0.8792 Model
Score: 0.8792


Epoch: [1][0/953] Elapsed 0m 0s (remain 10m 40s) Loss: 0.6396(0.6396) Grad: inf  LR: 0.00002000  
Epoch: [1][100/953] Elapsed 0m 50s (remain 7m 4s) Loss: 0.0228(0.1011) Grad: 1560.5671  LR: 0.00001998  
Epoch: [1][200/953] Elapsed 1m 38s (remain 6m 10s) Loss: 0.0329(0.0658) Grad: 2898.4536  LR: 0.00001991  
Epoch: [1][300/953] Elapsed 2m 27s (remain 5m 20s) Loss: 0.0181(0.0514) Grad: 3341.6335  LR: 0.00001980  
Epoch: [1][400/953] Elapsed 3m 16s (remain 4m 30s) Loss: 0.0365(0.0433) Grad: 4801.5225  LR: 0.00001965  
Epoch: [1][500/953] Elapsed 4m 5s (remain 3m 41s) Loss: 0.0027(0.0379) Grad: 1249.2786  LR: 0.00001946  
Epoch: [1][600/953] Elapsed 4m 53s (remain 2m 51s) Loss: 0.0197(0.0346) Grad: 1881.8121  LR: 0.00001923  
Epoch: [1][700/953] Elapsed 5m 42s (remain 2m 3s) Loss: 0.0190(0.0320) Grad: 3583.4795  LR: 0.00001895  
Epoch: [1][800/953] Elapsed 6m 30s (remain 1m 14s) Loss: 0.0271(0.0299) Grad: 3682.1040  LR: 0.00001864  
Epoch: [1][900/953] Elapsed 7m 19s (remain 0m 25s) Loss: 

Epoch 1 - avg_train_loss: 0.0275  avg_val_loss: 0.0146  time: 575s
Epoch 1 - Score: 0.8395
Epoch 1 - Save Best Score: 0.8395 Model


Epoch: [2][0/953] Elapsed 0m 0s (remain 11m 12s) Loss: 0.0043(0.0043) Grad: 8088.1143  LR: 0.00001809  
Epoch: [2][100/953] Elapsed 0m 49s (remain 6m 58s) Loss: 0.0086(0.0121) Grad: 8420.3330  LR: 0.00001768  
Epoch: [2][200/953] Elapsed 1m 38s (remain 6m 7s) Loss: 0.0087(0.0121) Grad: 9391.4541  LR: 0.00001724  
Epoch: [2][300/953] Elapsed 2m 26s (remain 5m 18s) Loss: 0.0085(0.0121) Grad: 17885.6133  LR: 0.00001677  
Epoch: [2][400/953] Elapsed 3m 15s (remain 4m 29s) Loss: 0.0064(0.0116) Grad: 6043.0176  LR: 0.00001627  
Epoch: [2][500/953] Elapsed 4m 4s (remain 3m 40s) Loss: 0.0169(0.0115) Grad: 40937.4219  LR: 0.00001575  
Epoch: [2][600/953] Elapsed 4m 53s (remain 2m 51s) Loss: 0.0019(0.0113) Grad: 3043.8782  LR: 0.00001520  
Epoch: [2][700/953] Elapsed 5m 41s (remain 2m 2s) Loss: 0.0044(0.0112) Grad: 5373.1382  LR: 0.00001462  
Epoch: [2][800/953] Elapsed 6m 30s (remain 1m 14s) Loss: 0.0457(0.0112) Grad: 32289.6523  LR: 0.00001403  
Epoch: [2][900/953] Elapsed 7m 18s (remain 0m 25

Epoch 2 - avg_train_loss: 0.0110  avg_val_loss: 0.0131  time: 573s
Epoch 2 - Score: 0.8606
Epoch 2 - Save Best Score: 0.8606 Model


Epoch: [3][0/953] Elapsed 0m 0s (remain 10m 45s) Loss: 0.0028(0.0028) Grad: 4897.6348  LR: 0.00001309  
Epoch: [3][100/953] Elapsed 0m 49s (remain 6m 59s) Loss: 0.0032(0.0088) Grad: 9682.3379  LR: 0.00001245  
Epoch: [3][200/953] Elapsed 1m 38s (remain 6m 7s) Loss: 0.0028(0.0088) Grad: 4400.3892  LR: 0.00001181  
Epoch: [3][300/953] Elapsed 2m 26s (remain 5m 18s) Loss: 0.0052(0.0094) Grad: 4559.0425  LR: 0.00001116  
Epoch: [3][400/953] Elapsed 3m 15s (remain 4m 29s) Loss: 0.0019(0.0094) Grad: 6260.1172  LR: 0.00001050  
Epoch: [3][500/953] Elapsed 4m 4s (remain 3m 40s) Loss: 0.0049(0.0093) Grad: 12027.3740  LR: 0.00000984  
Epoch: [3][600/953] Elapsed 4m 53s (remain 2m 51s) Loss: 0.0145(0.0093) Grad: 15569.8057  LR: 0.00000918  
Epoch: [3][700/953] Elapsed 5m 41s (remain 2m 2s) Loss: 0.0013(0.0094) Grad: 5922.8340  LR: 0.00000853  
Epoch: [3][800/953] Elapsed 6m 30s (remain 1m 14s) Loss: 0.0074(0.0095) Grad: 10594.9062  LR: 0.00000788  
Epoch: [3][900/953] Elapsed 7m 19s (remain 0m 25

Epoch 3 - avg_train_loss: 0.0097  avg_val_loss: 0.0128  time: 574s
Epoch 3 - Score: 0.8670
Epoch 3 - Save Best Score: 0.8670 Model


Epoch: [4][0/953] Elapsed 0m 0s (remain 10m 49s) Loss: 0.0015(0.0015) Grad: 3524.9746  LR: 0.00000691  
Epoch: [4][100/953] Elapsed 0m 49s (remain 6m 57s) Loss: 0.0085(0.0084) Grad: 10230.0830  LR: 0.00000629  
Epoch: [4][200/953] Elapsed 1m 38s (remain 6m 7s) Loss: 0.0252(0.0092) Grad: 24848.4805  LR: 0.00000568  
Epoch: [4][300/953] Elapsed 2m 26s (remain 5m 17s) Loss: 0.0053(0.0091) Grad: 8535.3828  LR: 0.00000510  
Epoch: [4][400/953] Elapsed 3m 15s (remain 4m 29s) Loss: 0.0111(0.0089) Grad: 12056.5723  LR: 0.00000454  
Epoch: [4][500/953] Elapsed 4m 4s (remain 3m 40s) Loss: 0.0145(0.0089) Grad: 27253.0469  LR: 0.00000400  
Epoch: [4][600/953] Elapsed 4m 52s (remain 2m 51s) Loss: 0.0034(0.0088) Grad: 10071.0898  LR: 0.00000348  
Epoch: [4][700/953] Elapsed 5m 41s (remain 2m 2s) Loss: 0.0182(0.0088) Grad: 18939.4727  LR: 0.00000300  
Epoch: [4][800/953] Elapsed 6m 30s (remain 1m 14s) Loss: 0.0036(0.0087) Grad: 6929.9868  LR: 0.00000254  
Epoch: [4][900/953] Elapsed 7m 18s (remain 0m

Epoch 4 - avg_train_loss: 0.0087  avg_val_loss: 0.0132  time: 573s
Epoch 4 - Score: 0.8680
Epoch 4 - Save Best Score: 0.8680 Model


Epoch: [5][0/953] Elapsed 0m 0s (remain 10m 53s) Loss: 0.0040(0.0040) Grad: 7470.7778  LR: 0.00000191  
Epoch: [5][100/953] Elapsed 0m 49s (remain 6m 59s) Loss: 0.0056(0.0067) Grad: 8163.5464  LR: 0.00000154  
Epoch: [5][200/953] Elapsed 1m 38s (remain 6m 7s) Loss: 0.0035(0.0075) Grad: 8536.3447  LR: 0.00000121  
Epoch: [5][300/953] Elapsed 2m 26s (remain 5m 18s) Loss: 0.0043(0.0079) Grad: 9888.6084  LR: 0.00000091  
Epoch: [5][400/953] Elapsed 3m 15s (remain 4m 29s) Loss: 0.0103(0.0079) Grad: 13222.7744  LR: 0.00000066  
Epoch: [5][500/953] Elapsed 4m 4s (remain 3m 40s) Loss: 0.0090(0.0082) Grad: 23206.1113  LR: 0.00000044  
Epoch: [5][600/953] Elapsed 4m 53s (remain 2m 51s) Loss: 0.0097(0.0084) Grad: 12811.6074  LR: 0.00000027  
Epoch: [5][700/953] Elapsed 5m 41s (remain 2m 2s) Loss: 0.0105(0.0083) Grad: 40009.8828  LR: 0.00000014  
Epoch: [5][800/953] Elapsed 6m 30s (remain 1m 14s) Loss: 0.0028(0.0081) Grad: 9987.9053  LR: 0.00000005  
Epoch: [5][900/953] Elapsed 7m 19s (remain 0m 2

Epoch 5 - avg_train_loss: 0.0082  avg_val_loss: 0.0134  time: 574s
Epoch 5 - Score: 0.8685
Epoch 5 - Save Best Score: 0.8685 Model
Score: 0.8685


Epoch: [1][0/953] Elapsed 0m 0s (remain 10m 56s) Loss: 0.6688(0.6688) Grad: inf  LR: 0.00002000  
Epoch: [1][100/953] Elapsed 0m 50s (remain 7m 3s) Loss: 0.0348(0.1071) Grad: 1891.1439  LR: 0.00001998  
Epoch: [1][200/953] Elapsed 1m 38s (remain 6m 9s) Loss: 0.0551(0.0693) Grad: 4293.3052  LR: 0.00001991  
Epoch: [1][300/953] Elapsed 2m 27s (remain 5m 20s) Loss: 0.0272(0.0541) Grad: 3665.4856  LR: 0.00001980  
Epoch: [1][400/953] Elapsed 3m 16s (remain 4m 30s) Loss: 0.0137(0.0458) Grad: 1740.7332  LR: 0.00001965  
Epoch: [1][500/953] Elapsed 4m 5s (remain 3m 41s) Loss: 0.0123(0.0403) Grad: 2343.0647  LR: 0.00001946  
Epoch: [1][600/953] Elapsed 4m 53s (remain 2m 51s) Loss: 0.0200(0.0363) Grad: 1963.9048  LR: 0.00001923  
Epoch: [1][700/953] Elapsed 5m 42s (remain 2m 3s) Loss: 0.0077(0.0335) Grad: 3785.6934  LR: 0.00001895  
Epoch: [1][800/953] Elapsed 6m 30s (remain 1m 14s) Loss: 0.0077(0.0311) Grad: 2981.6062  LR: 0.00001864  
Epoch: [1][900/953] Elapsed 7m 19s (remain 0m 25s) Loss: 0

Epoch 1 - avg_train_loss: 0.0285  avg_val_loss: 0.0149  time: 574s
Epoch 1 - Score: 0.8455
Epoch 1 - Save Best Score: 0.8455 Model


Epoch: [2][0/953] Elapsed 0m 0s (remain 11m 8s) Loss: 0.0118(0.0118) Grad: 20779.7090  LR: 0.00001809  
Epoch: [2][100/953] Elapsed 0m 49s (remain 6m 59s) Loss: 0.0080(0.0115) Grad: 9374.5908  LR: 0.00001768  
Epoch: [2][200/953] Elapsed 1m 38s (remain 6m 8s) Loss: 0.0530(0.0123) Grad: 78142.4297  LR: 0.00001724  
Epoch: [2][300/953] Elapsed 2m 27s (remain 5m 18s) Loss: 0.0134(0.0115) Grad: 12270.6426  LR: 0.00001677  
Epoch: [2][400/953] Elapsed 3m 15s (remain 4m 29s) Loss: 0.0087(0.0115) Grad: 6309.0381  LR: 0.00001627  
Epoch: [2][500/953] Elapsed 4m 4s (remain 3m 40s) Loss: 0.0090(0.0114) Grad: 12582.5664  LR: 0.00001575  
Epoch: [2][600/953] Elapsed 4m 53s (remain 2m 51s) Loss: 0.0063(0.0112) Grad: 11119.8467  LR: 0.00001520  
Epoch: [2][700/953] Elapsed 5m 41s (remain 2m 2s) Loss: 0.0199(0.0111) Grad: 39918.3867  LR: 0.00001462  
Epoch: [2][800/953] Elapsed 6m 30s (remain 1m 14s) Loss: 0.0081(0.0109) Grad: 11272.9678  LR: 0.00001403  
Epoch: [2][900/953] Elapsed 7m 19s (remain 0m

Epoch 2 - avg_train_loss: 0.0108  avg_val_loss: 0.0145  time: 574s
Epoch 2 - Score: 0.8599
Epoch 2 - Save Best Score: 0.8599 Model


Epoch: [3][0/953] Elapsed 0m 0s (remain 10m 46s) Loss: 0.0106(0.0106) Grad: 8480.0830  LR: 0.00001309  
Epoch: [3][100/953] Elapsed 0m 49s (remain 6m 57s) Loss: 0.0045(0.0095) Grad: 7556.2925  LR: 0.00001245  
Epoch: [3][200/953] Elapsed 1m 38s (remain 6m 7s) Loss: 0.0086(0.0096) Grad: 11354.2666  LR: 0.00001181  
Epoch: [3][300/953] Elapsed 2m 26s (remain 5m 17s) Loss: 0.0139(0.0094) Grad: 16950.1992  LR: 0.00001116  
Epoch: [3][400/953] Elapsed 3m 15s (remain 4m 28s) Loss: 0.0090(0.0095) Grad: 7878.6855  LR: 0.00001050  
Epoch: [3][500/953] Elapsed 4m 3s (remain 3m 40s) Loss: 0.0059(0.0094) Grad: 8608.7334  LR: 0.00000984  
Epoch: [3][600/953] Elapsed 4m 52s (remain 2m 51s) Loss: 0.0166(0.0096) Grad: 15181.2295  LR: 0.00000918  
Epoch: [3][700/953] Elapsed 5m 41s (remain 2m 2s) Loss: 0.0039(0.0097) Grad: 5924.1816  LR: 0.00000853  
Epoch: [3][800/953] Elapsed 6m 30s (remain 1m 14s) Loss: 0.0185(0.0097) Grad: 36567.2812  LR: 0.00000788  
Epoch: [3][900/953] Elapsed 7m 18s (remain 0m 2

Epoch 3 - avg_train_loss: 0.0096  avg_val_loss: 0.0143  time: 574s
Epoch 3 - Score: 0.8609
Epoch 3 - Save Best Score: 0.8609 Model


Epoch: [4][0/953] Elapsed 0m 0s (remain 11m 4s) Loss: 0.0019(0.0019) Grad: 5544.1445  LR: 0.00000691  
Epoch: [4][100/953] Elapsed 0m 49s (remain 6m 58s) Loss: 0.0025(0.0083) Grad: 5993.1499  LR: 0.00000629  
Epoch: [4][200/953] Elapsed 1m 38s (remain 6m 7s) Loss: 0.0321(0.0088) Grad: 37618.0234  LR: 0.00000568  
Epoch: [4][300/953] Elapsed 2m 26s (remain 5m 18s) Loss: 0.0018(0.0092) Grad: 3413.5413  LR: 0.00000510  
Epoch: [4][400/953] Elapsed 3m 15s (remain 4m 29s) Loss: 0.0028(0.0089) Grad: 7138.8862  LR: 0.00000454  
Epoch: [4][500/953] Elapsed 4m 4s (remain 3m 40s) Loss: 0.0155(0.0088) Grad: 11133.7939  LR: 0.00000400  
Epoch: [4][600/953] Elapsed 4m 53s (remain 2m 51s) Loss: 0.0064(0.0088) Grad: 15641.4883  LR: 0.00000348  
Epoch: [4][700/953] Elapsed 5m 41s (remain 2m 2s) Loss: 0.0046(0.0087) Grad: 9668.0723  LR: 0.00000300  
Epoch: [4][800/953] Elapsed 6m 30s (remain 1m 14s) Loss: 0.0067(0.0087) Grad: 24904.0625  LR: 0.00000254  
Epoch: [4][900/953] Elapsed 7m 19s (remain 0m 25

Epoch 4 - avg_train_loss: 0.0087  avg_val_loss: 0.0141  time: 574s
Epoch 4 - Score: 0.8642
Epoch 4 - Save Best Score: 0.8642 Model


Epoch: [5][0/953] Elapsed 0m 0s (remain 11m 12s) Loss: 0.0038(0.0038) Grad: 6178.9312  LR: 0.00000191  
Epoch: [5][100/953] Elapsed 0m 49s (remain 6m 58s) Loss: 0.0028(0.0073) Grad: 6474.2622  LR: 0.00000154  
Epoch: [5][200/953] Elapsed 1m 38s (remain 6m 7s) Loss: 0.0033(0.0076) Grad: 10317.3984  LR: 0.00000121  
Epoch: [5][300/953] Elapsed 2m 26s (remain 5m 18s) Loss: 0.0064(0.0077) Grad: 9648.2637  LR: 0.00000091  
Epoch: [5][400/953] Elapsed 3m 15s (remain 4m 29s) Loss: 0.0018(0.0076) Grad: 4925.7700  LR: 0.00000066  
Epoch: [5][500/953] Elapsed 4m 4s (remain 3m 40s) Loss: 0.0043(0.0079) Grad: 9027.4326  LR: 0.00000044  
Epoch: [5][600/953] Elapsed 4m 52s (remain 2m 51s) Loss: 0.0053(0.0082) Grad: 8786.6934  LR: 0.00000027  
Epoch: [5][700/953] Elapsed 5m 41s (remain 2m 2s) Loss: 0.0060(0.0080) Grad: 8710.0010  LR: 0.00000014  
Epoch: [5][800/953] Elapsed 6m 30s (remain 1m 14s) Loss: 0.0068(0.0081) Grad: 11386.6465  LR: 0.00000005  
Epoch: [5][900/953] Elapsed 7m 18s (remain 0m 25s

Epoch 5 - avg_train_loss: 0.0082  avg_val_loss: 0.0146  time: 573s
Epoch 5 - Score: 0.8649
Epoch 5 - Save Best Score: 0.8649 Model
Score: 0.8649
Score: 0.8692
