**Download Datasets**

In [None]:
# ====================================================
# Download datasets
# ====================================================
# Rebuild /content/project from scratch and unpack both archives under project/input.
%cd /content
%rm -r project
%mkdir project
%cd project
%mkdir input
%cd input
# Competition CSVs (train.csv, features.csv, patient_notes.csv are read from this folder later on).
!wget https://github.com/Ameer-eng/nbme-score-clinical-patient-notes-files/raw/main/nbme-score-clinical-patient-notes.zip
!unzip nbme-score-clinical-patient-notes.zip -d nbme-score-clinical-patient-notes
# Replacement tokenizer modules that a later cell copies into the installed transformers package.
!wget https://github.com/Ameer-eng/nbme-score-clinical-patient-notes-files/raw/main/deberta%20v2_3%20fast%20tokenizer.zip
!unzip "deberta v2_3 fast tokenizer.zip" -d "deberta-v2-3-fast-tokenizer"
# Back to /content/project; list what was created.
%cd ..
%ls

/content
/content/project
/content/project/input
--2022-05-01 01:12:05--  https://github.com/Ameer-eng/nbme-score-clinical-patient-notes-files/raw/main/nbme-score-clinical-patient-notes.zip
Resolving github.com (github.com)... 140.82.113.4
Connecting to github.com (github.com)|140.82.113.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/Ameer-eng/nbme-score-clinical-patient-notes-files/main/nbme-score-clinical-patient-notes.zip [following]
--2022-05-01 01:12:05--  https://raw.githubusercontent.com/Ameer-eng/nbme-score-clinical-patient-notes-files/main/nbme-score-clinical-patient-notes.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10305097 (9.8M) [application/zip]
Saving to: ‘nbme-score-cl

In [None]:
%mkdir output

In [None]:
%cd output

/content/project/output


**Directory Settings**

In [None]:
# ====================================================
# Directory settings
# ====================================================
import os

# Artifacts written by later cells (train.log, tokenizer/, config.pth) are placed under this
# directory; './' resolves against the current working directory.
OUTPUT_DIR = './'
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

**Config**

In [None]:
# ====================================================
# CFG
# ====================================================
class CFG:
    """Namespace of experiment settings, read as class attributes (e.g. ``CFG.batch_size``)."""
    wandb=False  # when True, train_fn logs loss and lr through wandb.log
    competition='NBME'  # not read in the visible cells -- presumably a wandb/run label, TODO confirm
    _wandb_kernel='nakama'  # not read in the visible cells -- presumably wandb metadata, TODO confirm
    debug=False  # when True: fewer epochs, a single fold and a 1000-row sample of train
    apex=True  # passed as `enabled=` to torch.cuda.amp GradScaler and autocast in train_fn
    print_freq=100  # progress is printed every `print_freq` batches in train_fn / valid_fn
    num_workers=4  # DataLoader worker processes
    model="microsoft/deberta-v3-base"  # checkpoint name used for the tokenizer, config and backbone
    scheduler='cosine' # ['linear', 'cosine']
    batch_scheduler=True  # when True, scheduler.step() runs after every optimizer step in train_fn
    num_cycles=0.5  # presumably a cosine-schedule argument -- consumer not visible here
    num_warmup_steps=0  # presumably a scheduler warmup argument -- consumer not visible here
    epochs=5
    encoder_lr=2e-5  # learning rate for the parameter groups built from model.model in train_loop
    decoder_lr=2e-5  # passed to get_optimizer_params alongside encoder_lr -- presumably for the head
    min_lr=1e-6  # not read in the visible cells
    eps=1e-6  # presumably an optimizer argument -- consumer not visible here
    betas=(0.9, 0.999)  # presumably an optimizer argument -- consumer not visible here
    batch_size=12  # used for both the training and validation DataLoader
    fc_dropout=0.2  # dropout probability in front of the linear head of CustomModel
    max_len=512  # placeholder: overwritten in the "Define max_len" cell from measured token lengths
    weight_decay=0.01  # applied to backbone parameters not matched by the no_decay list in train_loop
    gradient_accumulation_steps=1  # optimizer steps once every this many batches in train_fn
    max_grad_norm=1000  # threshold given to clip_grad_norm_ in train_fn
    seed=42  # NOTE(review): seed_everything is called with a literal 42, not with CFG.seed
    n_fold=5  # number of GroupKFold splits
    trn_fold=[0, 1, 2, 3, 4]  # presumably the folds to train -- consumer not visible here
    train=True  # consumer not visible here
    
# Debug runs are shortened to two epochs on the first fold only.
if CFG.debug:
    CFG.epochs = 2
    CFG.trn_fold = [0]

**Library**

In [None]:
# ====================================================
# Library
# ====================================================
import os
import gc
import re
import ast
import sys
import copy
import json
import time
import math
import string
import pickle
import random
import joblib
import itertools
import warnings
warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset

#os.system('pip uninstall -y transformers')
#os.system('python -m pip install --no-index --find-links=../input/nbme-pip-wheels transformers')
!pip install transformers
!pip install tokenizers
!pip install sentencepiece



In [None]:
# The following is necessary to use the fast tokenizer for deberta v2 or v3, which is needed to get character offsets of tokens.
# The replacement modules unpacked under ../input are copied over the installed transformers package.
import importlib.util
import shutil
from pathlib import Path

# Locate the installed package instead of hard-coding a dist-packages path that only exists
# for one specific Python version. find_spec() does not import the package.
_transformers_spec = importlib.util.find_spec("transformers")
if _transformers_spec is None or not _transformers_spec.submodule_search_locations:
    raise ImportError("transformers package not found; install it before patching the tokenizer")
transformers_path = Path(list(_transformers_spec.submodule_search_locations)[0])

# Relative path: this cell expects to run from the directory entered by the earlier `%cd output` cell.
input_dir = Path("../input/deberta-v2-3-fast-tokenizer")

convert_file = input_dir / "convert_slow_tokenizer.py"
conversion_path = transformers_path / convert_file.name

# Remove the packaged converter first, then drop the replacement in its place.
if conversion_path.exists():
    conversion_path.unlink()

shutil.copy(convert_file, transformers_path)
deberta_v2_path = transformers_path / "models" / "deberta_v2"

for filename in [
    "tokenization_deberta_v2.py",
    "tokenization_deberta_v2_fast.py",
    "deberta__init__.py",
]:
    # "deberta__init__.py" carries a prefix in the archive; stripping it yields the real
    # target name "__init__.py". The other files keep their names.
    if str(filename).startswith("deberta"):
        filepath = deberta_v2_path / str(filename).replace("deberta", "")
    else:
        filepath = deberta_v2_path / filename
    if filepath.exists():
        filepath.unlink()

    shutil.copy(input_dir / filename, filepath)

In [None]:
import tokenizers
import transformers

print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
%env TOKENIZERS_PARALLELISM=true

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizers.__version__: 0.12.1
transformers.__version__: 4.18.0
env: TOKENIZERS_PARALLELISM=true


In [None]:
# Testing the tokenizer
from transformers.models.deberta_v2.tokenization_deberta_v2_fast import DebertaV2TokenizerFast
tokenizer = DebertaV2TokenizerFast.from_pretrained(CFG.model)
# NOTE(review): sequences1 and sequences2 are defined but not used in this cell.
sequences1 = [
    "I've been waiting for a HuggingFace course my whole life.",
    "This course is amazing!",
]
sequences2 = ["good", "bad"]
# Encode one string and inspect the ids, the tokens and their character offsets.
batch = tokenizer("good tokenizer ather", add_special_tokens=True,  padding=True, truncation=True, return_offsets_mapping=True)
display(batch)
tokens = tokenizer.convert_ids_to_tokens(batch["input_ids"])
display(tokens)
display(batch.sequence_ids())
# Positions whose sequence id is not 0 (the None entries displayed above); create_label uses the same expression.
np.where(np.array(batch.sequence_ids()) != 0)[0]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


{'input_ids': [1, 397, 10704, 15299, 288, 4441, 2], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1], 'offset_mapping': [(0, 0), (0, 4), (4, 10), (10, 14), (14, 17), (17, 20), (0, 0)]}

['[CLS]', '▁good', '▁token', 'izer', '▁at', 'her', '[SEP]']

[None, 0, 0, 0, 0, 0, None]

array([0, 6])

**Helper Functions for Scoring**

In [None]:
# From https://www.kaggle.com/theoviel/evaluation-metric-folds-baseline

def micro_f1(preds, truths):
    """
    Binary F1 computed over all instances pooled together (micro averaging).

    Args:
        preds (list of lists of ints): Binary predictions, one sequence per instance.
        truths (list of lists of ints): Binary ground truths, one sequence per instance.

    Returns:
        float: f1 score.
    """
    # Flatten both sides into single vectors so every position counts equally.
    flat_preds = np.concatenate(preds)
    flat_truths = np.concatenate(truths)
    score = f1_score(flat_truths, flat_preds)
    return score


def spans_to_binary(spans, length=None):
    """
    Converts spans to a binary array indicating whether each character is in the span.

    Args:
        spans (list of lists of two ints): Spans as [start, end) pairs; end is exclusive.
        length (int, optional): Size of the returned array. When omitted, the largest
            index found in `spans` is used, or 0 if `spans` is empty.

    Returns:
        np array [length]: Binarized spans (1.0 inside a span, 0.0 elsewhere).
    """
    if length is None:
        # np.max raises on an empty sequence, so an empty span list maps to an empty array.
        length = int(np.max(spans)) if len(spans) else 0
    binary = np.zeros(length)
    for start, end in spans:
        binary[start:end] = 1
    return binary


def span_micro_f1(preds, truths):
    """
    Micro f1 computed character-wise from span lists.

    Args:
        preds (list of lists of two ints): Prediction spans, one list per instance.
        truths (list of lists of two ints): Ground truth spans, one list per instance.

    Returns:
        float: f1 score.
    """
    bin_preds, bin_truths = [], []
    for pred, truth in zip(preds, truths):
        # Instances with neither predicted nor annotated spans contribute nothing.
        if len(pred) == 0 and len(truth) == 0:
            continue
        pred_extent = np.max(pred) if len(pred) else 0
        truth_extent = np.max(truth) if len(truth) else 0
        # Both binary arrays share one length so they stay aligned position by position.
        length = max(pred_extent, truth_extent)
        bin_preds.append(spans_to_binary(pred, length))
        bin_truths.append(spans_to_binary(truth, length))
    return micro_f1(bin_preds, bin_truths)

In [None]:
def create_labels_for_scoring(df):
    """
    Builds ground-truth character spans for scoring from the 'location' column.

    Side effect: adds (or overwrites) the column 'location_for_create_labels' on `df`,
    holding [] for rows without locations and a one-element list with the ';'-joined
    location string otherwise.

    Args:
        df (pd.DataFrame): Frame with a 'location' column of lists of location strings.
            Any index is accepted; rows are processed in order.

    Returns:
        list of lists of [start, end]: One list of integer spans per row of `df`.
    """
    # example: ['0 1', '3 4'] -> ['0 1;3 4']
    joined = [[';'.join(lst)] if lst else [] for lst in df['location']]
    # Assign the whole column at once; this avoids per-cell .loc writes, which depend on a
    # 0..n-1 index and on pandas unwrapping a nested list during scalar assignment.
    df['location_for_create_labels'] = pd.Series(joined, index=df.index, dtype=object)
    # create labels
    truths = []
    for location_list in joined:
        truth = []
        if len(location_list) > 0:
            location = location_list[0]
            # A single location string may itself hold several ';'-separated "start end" pairs.
            for loc in [s.split() for s in location.split(';')]:
                start, end = int(loc[0]), int(loc[1])
                truth.append([start, end])
        truths.append(truth)
    return truths


def get_char_probs(texts, predictions, tokenizer):
    """
    Spreads token-level predictions over the characters each token covers.

    Args:
        texts (sequence of str): Texts to tokenize.
        predictions (sequence of sequences): Per-token values, aligned with the tokens
            the tokenizer produces for the matching text.
        tokenizer (callable): Called as tokenizer(text, add_special_tokens=True,
            return_offsets_mapping=True); its result must expose 'offset_mapping'.

    Returns:
        list of np arrays: One array per text, with one value per character
        (0 where no token offset covers the character).
    """
    char_probs = [np.zeros(len(t)) for t in texts]
    for probs, text, prediction in zip(char_probs, texts, predictions):
        offsets = tokenizer(text,
                            add_special_tokens=True,
                            return_offsets_mapping=True)['offset_mapping']
        for span, pred in zip(offsets, prediction):
            # Empty offsets such as (0, 0) select an empty slice and write nothing.
            probs[span[0]:span[1]] = pred
    return char_probs


def get_results(char_probs, th=0.5):
    """
    Turns per-character probabilities into location strings.

    Characters with probability >= th are selected, their indices are shifted by +1,
    and each run of consecutive indices is written as "first last"; runs are joined
    with ';'. A text without any selected character yields "".

    Args:
        char_probs (iterable of np arrays): Per-character probabilities, one array per text.
        th (float): Inclusive threshold.

    Returns:
        list of str: One location string per input array.
    """
    results = []
    for char_prob in char_probs:
        positions = np.where(char_prob >= th)[0] + 1
        runs = []
        for pos in positions:
            # Extend the current run while indices stay consecutive, otherwise open a new one.
            if runs and pos == runs[-1][1] + 1:
                runs[-1][1] = pos
            else:
                runs.append([pos, pos])
        results.append(";".join(f"{first} {last}" for first, last in runs))
    return results


def get_predictions(results):
    """
    Parses location strings back into integer spans.

    Args:
        results (iterable of str): Strings such as "2 3;5 5"; "" means no span.

    Returns:
        list of lists of [start, end]: Parsed spans, one list per input string.
    """
    predictions = []
    for result in results:
        if result == "":
            predictions.append([])
            continue
        spans = []
        for chunk in result.split(';'):
            bounds = chunk.split()
            spans.append([int(bounds[0]), int(bounds[1])])
        predictions.append(spans)
    return predictions

**Utils**

In [None]:
# ====================================================
# Utils
# ====================================================
def get_score(y_true, y_pred):
    """
    Span-level micro F1 between two lists of span lists.

    y_true is forwarded as the first argument of span_micro_f1 and y_pred as the second.
    """
    return span_micro_f1(y_true, y_pred)


def get_logger(filename=OUTPUT_DIR+'train'):
    """
    Returns the module logger, writing bare messages to the console and to `<filename>.log`.

    Safe to call repeatedly (for example when the cell is re-run): handlers attached by a
    previous call are removed and closed first, so each message is emitted once per target.

    Args:
        filename (str): Log file path without the '.log' suffix.

    Returns:
        logging.Logger: Logger at INFO level with one stream and one file handler.
    """
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    # getLogger returns the same object for the same name, so earlier handlers are still attached.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()

def seed_everything(seed=42):
    """Seeds Python, NumPy and PyTorch (CPU and current CUDA device) and asks cuDNN for deterministic kernels."""
    os.environ['PYTHONHASHSEED'] = str(seed)
    seeders = (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed)
    for apply_seed in seeders:
        apply_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(seed=42)

**Load Data**

In [None]:
# ====================================================
# Data Loading
# ====================================================
# Paths are relative to the working directory (the output folder entered earlier).
train = pd.read_csv('../input/nbme-score-clinical-patient-notes/train.csv')
# 'annotation' and 'location' hold Python list literals as text; parse them into real lists.
train['annotation'] = train['annotation'].apply(ast.literal_eval)
train['location'] = train['location'].apply(ast.literal_eval)
features = pd.read_csv('../input/nbme-score-clinical-patient-notes/features.csv')
patient_notes = pd.read_csv('../input/nbme-score-clinical-patient-notes/patient_notes.csv')

# Quick look at the shape and first rows of each table.
print(f"train.shape: {train.shape}")
display(train.head())
print(f"features.shape: {features.shape}")
display(features.head())
print(f"patient_notes.shape: {patient_notes.shape}")
display(patient_notes.head())

train.shape: (14300, 6)


Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location
0,00016_000,0,16,0,[dad with recent heart attcak],[696 724]
1,00016_001,0,16,1,"[mom with ""thyroid disease]",[668 693]
2,00016_002,0,16,2,[chest pressure],[203 217]
3,00016_003,0,16,3,"[intermittent episodes, episode]","[70 91, 176 183]"
4,00016_004,0,16,4,[felt as if he were going to pass out],[222 258]


features.shape: (143, 3)


Unnamed: 0,feature_num,case_num,feature_text
0,0,0,Family-history-of-MI-OR-Family-history-of-myoc...
1,1,0,Family-history-of-thyroid-disorder
2,2,0,Chest-pressure
3,3,0,Intermittent-symptoms
4,4,0,Lightheaded


patient_notes.shape: (42146, 3)


Unnamed: 0,pn_num,case_num,pn_history
0,0,0,"17-year-old male, has come to the student heal..."
1,1,0,17 yo male with recurrent palpitations for the...
2,2,0,Dillon Cleveland is a 17 y.o. male patient wit...
3,3,0,a 17 yo m c/o palpitation started 3 mos ago; \...
4,4,0,17yo male with no pmh here for evaluation of p...


In [None]:
# Attach feature_text and pn_history to each annotation row; left joins keep every train row.
# NOTE(review): this reassigns `train`, so re-running the cell without reloading merges again.
train = train.merge(features, on=['feature_num', 'case_num'], how='left')
train = train.merge(patient_notes, on=['pn_num', 'case_num'], how='left')
display(train.head())

Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location,feature_text,pn_history
0,00016_000,0,16,0,[dad with recent heart attcak],[696 724],Family-history-of-MI-OR-Family-history-of-myoc...,HPI: 17yo M presents with palpitations. Patien...
1,00016_001,0,16,1,"[mom with ""thyroid disease]",[668 693],Family-history-of-thyroid-disorder,HPI: 17yo M presents with palpitations. Patien...
2,00016_002,0,16,2,[chest pressure],[203 217],Chest-pressure,HPI: 17yo M presents with palpitations. Patien...
3,00016_003,0,16,3,"[intermittent episodes, episode]","[70 91, 176 183]",Intermittent-symptoms,HPI: 17yo M presents with palpitations. Patien...
4,00016_004,0,16,4,[felt as if he were going to pass out],[222 258],Lightheaded,HPI: 17yo M presents with palpitations. Patien...


In [None]:
# Number of annotation strings per row; create_label leaves all note tokens at 0 when this is 0.
train['annotation_length'] = train['annotation'].apply(len)
display(train['annotation_length'].value_counts())

1    8181
0    4399
2    1296
3     287
4      99
5      27
6       9
7       1
8       1
Name: annotation_length, dtype: int64

**CV Split**

In [None]:
# ====================================================
# CV split
# ====================================================
# Split by patient note number so that all rows of one note fall into the same fold.
Fold = GroupKFold(n_splits=CFG.n_fold)
groups = train['pn_num'].values
for n, (train_index, val_index) in enumerate(Fold.split(train, train['location'], groups)):
    # val_index holds row positions; using them as .loc labels assumes train has the
    # default 0..n-1 index -- presumably true after the merges above, TODO confirm.
    train.loc[val_index, 'fold'] = int(n)
# The column is created by the assignments above; cast it to plain integers.
train['fold'] = train['fold'].astype(int)
display(train.groupby('fold').size())

fold
0    2860
1    2860
2    2860
3    2860
4    2860
dtype: int64

In [None]:
# Debug mode: shrink train to a fixed random sample of 1000 rows and show fold sizes before/after.
if CFG.debug:
    display(train.groupby('fold').size())
    train = train.sample(n=1000, random_state=0).reset_index(drop=True)
    display(train.groupby('fold').size())

**Tokenizer**

In [None]:
# ====================================================
# tokenizer
# ====================================================
from transformers.models.deberta_v2.tokenization_deberta_v2_fast import DebertaV2TokenizerFast
tokenizer = DebertaV2TokenizerFast.from_pretrained(CFG.model)
# Keep a copy of the tokenizer files next to the other run artifacts.
tokenizer.save_pretrained(OUTPUT_DIR+'tokenizer/')
# prepare_input and create_label reach the tokenizer through the config object.
CFG.tokenizer = tokenizer

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


**Dataset**

In [None]:
# ====================================================
# Define max_len
# ====================================================
# Token count (without special tokens) of every patient note; the maximum is logged.
for text_col in ['pn_history']:
    pn_history_lengths = []
    tk0 = tqdm(patient_notes[text_col].fillna("").values, total=len(patient_notes))
    for text in tk0:
        length = len(tokenizer(text, add_special_tokens=False)['input_ids'])
        pn_history_lengths.append(length)
    LOGGER.info(f'{text_col} max(lengths): {max(pn_history_lengths)}')

# Same measurement for the feature descriptions.
for text_col in ['feature_text']:
    features_lengths = []
    tk0 = tqdm(features[text_col].fillna("").values, total=len(features))
    for text in tk0:
        length = len(tokenizer(text, add_special_tokens=False)['input_ids'])
        features_lengths.append(length)
    LOGGER.info(f'{text_col} max(lengths): {max(features_lengths)}')

# Longest note + longest feature + 3 special tokens; this replaces the max_len value set in CFG.
CFG.max_len = max(pn_history_lengths) + max(features_lengths) + 3 # cls & sep & sep
LOGGER.info(f"max_len: {CFG.max_len}")

  0%|          | 0/42146 [00:00<?, ?it/s]

pn_history max(lengths): 323


  0%|          | 0/143 [00:00<?, ?it/s]

feature_text max(lengths): 28
max_len: 354


In [None]:
# ====================================================
# Dataset
# ====================================================
def prepare_input(cfg, text, feature_text):
    """
    Takes in patient note and feature text pair, and returns tokenized input with values converted into torch tensors.

    The pair is padded to cfg.max_len. No truncation is requested, so cfg.max_len must
    already be large enough for the longest pair.

    Args:
        cfg (configuration class): configuration; must provide `tokenizer` and `max_len`.
        text (string): patient note.
        feature_text (string): feature text.

    Returns:
       inputs: tokenizer output whose values are torch.long tensors.
    """
    # Read max_len from the cfg argument (not the global CFG) so the function honours
    # whichever configuration the caller passes in.
    inputs = cfg.tokenizer(text, feature_text,
                           add_special_tokens=True,
                           max_length=cfg.max_len,
                           padding="max_length",
                           return_offsets_mapping=False)
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs


def create_label(cfg, text, annotation_length, location_list):
    """
    Takes in patient note and location list, and returns label tensor with the ith entry being the label of the ith token of the patient note.

    The note is tokenized on its own and padded to cfg.max_len, the same length
    prepare_input pads the note/feature pair to.

    Args:
        cfg (configuration class): configuration; must provide `tokenizer` and `max_len`.
        text (string): patient note.
        annotation_length (int): number of annotations in the location list; 0 skips span marking.
        location_list (list of location strings): locations, each one or more ';'-separated
            "start end" character pairs.

    Returns:
       labels: float tensor with one entry per token position: 1 for positions marked by
       a location span, 0 for other note tokens, and -1 for positions whose sequence id
       is not 0 (train_fn / valid_fn mask those out of the loss).
    """
    # Read max_len from the cfg argument (not the global CFG) so labels always match the
    # configuration that was passed in.
    encoded = cfg.tokenizer(text,
                            add_special_tokens=True,
                            max_length=cfg.max_len,
                            padding="max_length",
                            return_offsets_mapping=True)
    offset_mapping = encoded['offset_mapping']
    # Positions not belonging to sequence 0 (sequence id None), e.g. special and padding tokens.
    ignore_idxes = np.where(np.array(encoded.sequence_ids()) != 0)[0]
    label = np.zeros(len(offset_mapping))
    label[ignore_idxes] = -1
    if annotation_length != 0:
        for location in location_list:
            for loc in [s.split() for s in location.split(';')]:
                # loc is a start end pair of a span
                start_idx = -1
                end_idx = -1
                start, end = int(loc[0]), int(loc[1])
                for idx in range(len(offset_mapping)):
                    # First token starting after `start`: the span begins in the token before it.
                    if start_idx == -1 and start < offset_mapping[idx][0]:
                        start_idx = idx - 1
                    # First token ending at or after `end`: the span stops with this token (exclusive bound).
                    if end_idx == -1 and end <= offset_mapping[idx][1]:
                        end_idx = idx + 1
                # No token starts after `start`: fall back to end_idx (kept from the original logic).
                if start_idx == -1:
                    start_idx = end_idx
                if start_idx != -1 and end_idx != -1:
                    label[start_idx:end_idx] = 1
    return torch.tensor(label, dtype=torch.float)


class TrainDataset(Dataset):
    """Dataset yielding (tokenized note/feature pair, per-token label tensor) for each row of a frame."""

    def __init__(self, cfg, df):
        # Keep the configuration and plain arrays of the four columns read per item.
        self.cfg = cfg
        self.feature_texts = df['feature_text'].values
        self.pn_historys = df['pn_history'].values
        self.annotation_lengths = df['annotation_length'].values
        self.locations = df['location'].values

    def __len__(self):
        """Number of rows taken from the frame."""
        return len(self.feature_texts)

    def __getitem__(self, item):
        """Tokenizes row `item` and builds its label tensor."""
        note = self.pn_historys[item]
        feature = self.feature_texts[item]
        inputs = prepare_input(self.cfg, note, feature)
        label = create_label(self.cfg, note, self.annotation_lengths[item], self.locations[item])
        return inputs, label

In [None]:
"k k".split()

['k', 'k']

**Model**

In [None]:
# ====================================================
# Model
# ====================================================
class CustomModel(nn.Module):
    """
    Transformer backbone followed by dropout and a linear layer that emits one logit per token.

    Args:
        cfg (configuration class): must provide `model` (checkpoint name) and `fc_dropout`.
        config_path (str, optional): path to a config object saved with torch.save; when
            None the config is fetched for cfg.model with hidden states enabled.
        pretrained (bool): load pretrained backbone weights when True, otherwise build
            the backbone from the config with freshly initialised weights.
    """
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
        else:
            # NOTE(review): torch.load unpickles arbitrary objects; only load configs you saved yourself.
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            # AutoModel cannot be instantiated directly; from_config builds the architecture
            # described by the config without loading weights.
            self.model = AutoModel.from_config(self.config)
        self.fc_dropout = nn.Dropout(cfg.fc_dropout)
        self.fc = nn.Linear(self.config.hidden_size, 1)
        self._init_weights(self.fc)
        
    def _init_weights(self, module):
        """Initialises Linear / Embedding / LayerNorm modules; only called on the head in this class."""
        if isinstance(module, nn.Linear):
            # Initialize linear head with normally distributed weights, as in deberta
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        
    def feature(self, inputs):
        """Runs the backbone and returns the first element of its output (last hidden states)."""
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        return last_hidden_states

    def forward(self, inputs):
        """Returns the head's output for every token position (logits; callers apply sigmoid)."""
        feature = self.feature(inputs)
        output = self.fc(self.fc_dropout(feature))
        return output

In [None]:
# Scratch check: output of a small nn.Linear before and after its weights are zeroed.
m = nn.Linear(5, 1)
# NOTE(review): `input` shadows the Python builtin of the same name for the rest of the session.
input = torch.tensor([[1.,2.,3.,4.,5.]])
output = m(input)
display(output)
# With all weights at zero, only the bias contributes to the final output.
m.weight.data.fill_(0)
display(m.weight)
m(input)

tensor([[2.0136]], grad_fn=<AddmmBackward0>)

Parameter containing:
tensor([[0., 0., 0., 0., 0.]], requires_grad=True)

tensor([[0.0902]], grad_fn=<AddmmBackward0>)

**Helper Functions**

In [None]:
# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Tracks the latest value together with a weighted running sum, count and mean."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Clears all statistics back to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Records `val` as observed `n` times and refreshes the running mean."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count


def asMinutes(s):
    """Formats a duration in seconds as '<minutes>m <seconds>s' (fractions are dropped)."""
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return '%dm %ds' % (whole_minutes, leftover_seconds)


def timeSince(since, percent):
    """
    Reports elapsed time since `since` and the projected remaining time.

    Args:
        since (float): Start timestamp as returned by time.time().
        percent (float): Fraction of the work completed so far; must be non-zero.

    Returns:
        str: '<elapsed> (remain <remaining>)', both formatted by asMinutes.
    """
    elapsed = time.time() - since
    # Extrapolate the total duration from the completed fraction.
    projected_total = elapsed / (percent)
    remaining = projected_total - elapsed
    return '%s (remain %s)' % (asMinutes(elapsed), asMinutes(remaining))

# Trains an epoch and returns the average loss over the epoch.
def train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device):
    """
    Runs one training epoch with optional mixed precision and gradient accumulation.

    Args:
        fold (int): Fold number, used only in the wandb metric names.
        train_loader: Iterable of (inputs, labels) batches; inputs is a mapping of tensors.
        model: Model called as model(inputs).
        criterion: Loss function; presumably returns per-element losses, since the result
            is masked before averaging -- confirm where it is constructed.
        optimizer: Optimizer stepped through the GradScaler.
        epoch (int): Zero-based epoch index, used for progress output.
        scheduler: LR scheduler; stepped after each optimizer step when CFG.batch_scheduler is set.
        device: Device the batches are moved to.

    Returns:
        float: Batch-size-weighted average of the per-batch losses (after the
        gradient-accumulation scaling, when that is enabled).
    """
    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = AverageMeter()
    start = time.time()
    # Reported in the progress line until the first optimizer step has produced a real norm.
    grad_norm = 0.0
    for step, (inputs, labels) in enumerate(train_loader):
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.cuda.amp.autocast(enabled=CFG.apex):
            y_preds = model(inputs)
        loss = criterion(y_preds.view(-1, 1), labels.view(-1, 1))
        # Positions labelled -1 are excluded from the loss.
        loss = torch.masked_select(loss, labels.view(-1, 1) != -1).mean()
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            # Gradients are still multiplied by the loss scale here; unscale them once per
            # optimizer step so that max_grad_norm applies to their true magnitude.
            scaler.unscale_(optimizer)
            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            if CFG.batch_scheduler:
                scheduler.step()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, step, len(train_loader), 
                          remain=timeSince(start, float(step+1)/len(train_loader)),
                          loss=losses,
                          grad_norm=grad_norm,
                          lr=scheduler.get_last_lr()[0]))
        if CFG.wandb:
            wandb.log({f"[fold{fold}] loss": losses.val,
                       f"[fold{fold}] lr": scheduler.get_last_lr()[0]})
    return losses.avg

# returns average loss and predicted probabilities on the validation data
def valid_fn(valid_loader, model, criterion, device):
    """
    Evaluates the model on the validation loader without gradient tracking.

    Args:
        valid_loader: Iterable of (inputs, labels) batches; inputs is a mapping of tensors.
        model: Model called as model(inputs).
        criterion: Loss function; presumably returns per-element losses, since the result
            is masked before averaging -- confirm where it is constructed.
        device: Device the batches are moved to.

    Returns:
        tuple: (batch-size-weighted average loss, np array of sigmoid outputs for all
        batches concatenated along the first axis).
    """
    model.eval()
    losses = AverageMeter()
    batch_probs = []
    started_at = time.time()
    last_step = len(valid_loader) - 1
    for step, (inputs, labels) in enumerate(valid_loader):
        for key, tensor in inputs.items():
            inputs[key] = tensor.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.no_grad():
            y_preds = model(inputs)
        flat_labels = labels.view(-1, 1)
        loss = criterion(y_preds.view(-1, 1), flat_labels)
        # Positions labelled -1 are excluded from the loss.
        loss = torch.masked_select(loss, flat_labels != -1).mean()
        # Same scaling as in train_fn, so both reported losses share one scale.
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        batch_probs.append(y_preds.sigmoid().to('cpu').numpy())
        if step % CFG.print_freq == 0 or step == last_step:
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, len(valid_loader),
                          loss=losses,
                          remain=timeSince(started_at, float(step+1)/len(valid_loader))))
    predictions = np.concatenate(batch_probs)
    return losses.avg, predictions


def inference_fn(test_loader, model, device):
    """
    Runs the model over a loader of input batches and collects sigmoid outputs.

    Args:
        test_loader: Iterable of input mappings (no labels).
        model: Model called as model(inputs); it is switched to eval mode and moved to `device`.
        device: Device used for the model and the batches.

    Returns:
        np array: Sigmoid outputs of all batches concatenated along the first axis.
    """
    model.eval()
    model.to(device)
    batch_probs = []
    progress = tqdm(test_loader, total=len(test_loader))
    for inputs in progress:
        for key, tensor in inputs.items():
            inputs[key] = tensor.to(device)
        with torch.no_grad():
            y_preds = model(inputs)
        batch_probs.append(y_preds.sigmoid().to('cpu').numpy())
    return np.concatenate(batch_probs)

In [None]:
# ====================================================
# train loop
# ====================================================
def train_loop(folds, fold):
    """Train and validate the model for one cross-validation fold.

    Rows of ``folds`` whose ``'fold'`` column equals ``fold`` form the
    validation split; all other rows are used for training. After every epoch
    the model is scored on the validation split and, whenever the score
    improves, the model weights and the validation predictions are saved to a
    checkpoint under ``OUTPUT_DIR``.

    Args:
        folds: DataFrame with a ``'fold'`` column and a ``'pn_history'`` text
            column, plus whatever ``TrainDataset`` and
            ``create_labels_for_scoring`` read from it.
        fold: id of the fold held out for validation.

    Returns:
        The validation rows with ``CFG.max_len`` additional columns (named
        ``0 .. CFG.max_len - 1``) holding the predictions of the best epoch.

    Raises:
        ValueError: if ``CFG.scheduler`` is neither ``'linear'`` nor
            ``'cosine'``.
    """
    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    train_folds = folds[folds['fold'] != fold].reset_index(drop=True)
    valid_folds = folds[folds['fold'] == fold].reset_index(drop=True)
    valid_texts = valid_folds['pn_history'].values
    valid_labels = create_labels_for_scoring(valid_folds)

    train_dataset = TrainDataset(CFG, train_folds)
    valid_dataset = TrainDataset(CFG, valid_folds)

    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomModel(CFG, config_path=None, pretrained=True)
    torch.save(model.config, OUTPUT_DIR+'config.pth')
    model.to(device)

    def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
        """Build three optimizer parameter groups.

        Parameters of ``model.model`` are split by name: those matching
        ``no_decay`` get no weight decay, the rest get ``weight_decay``; both
        use ``encoder_lr``. Parameters of ``model`` whose name does not
        contain ``"model"`` use ``decoder_lr`` without weight decay.
        """
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': 0.0},
            {'params': [p for n, p in model.named_parameters() if "model" not in n],
             'lr': decoder_lr, 'weight_decay': 0.0}
        ]
        return optimizer_parameters

    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=CFG.encoder_lr,
                                                decoder_lr=CFG.decoder_lr,
                                                weight_decay=CFG.weight_decay)
    optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)

    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        """Create the warmup LR scheduler selected by ``cfg.scheduler``."""
        if cfg.scheduler == 'linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
            )
        elif cfg.scheduler == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        else:
            # Fail loudly instead of hitting an unbound local on return.
            raise ValueError(
                f"Unknown scheduler: {cfg.scheduler!r} (expected 'linear' or 'cosine')"
            )
        return scheduler

    num_train_steps = int(len(train_folds) / CFG.batch_size * CFG.epochs)
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    # Unreduced loss: train_fn / valid_fn mask out ignored positions themselves.
    criterion = nn.BCEWithLogitsLoss(reduction="none")

    # Start below any reachable score so the first epoch always writes a
    # checkpoint; otherwise a fold that never scores above 0 would make the
    # torch.load below fail or pick up a stale file from an earlier run.
    best_score = float('-inf')
    best_path = OUTPUT_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth"

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device)

        # eval
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device)
        predictions = predictions.reshape((len(valid_folds), CFG.max_len))

        # scoring
        char_probs = get_char_probs(valid_texts, predictions, CFG.tokenizer)
        results = get_results(char_probs, th=0.5)
        preds = get_predictions(results)
        score = get_score(valid_labels, preds)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}')
        if CFG.wandb:
            wandb.log({f"[fold{fold}] epoch": epoch+1,
                       f"[fold{fold}] avg_train_loss": avg_loss,
                       f"[fold{fold}] avg_val_loss": avg_val_loss,
                       f"[fold{fold}] score": score})

        if best_score < score:
            best_score = score
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': predictions},
                        best_path)

    # Reload the predictions of the best epoch (not necessarily the last one).
    # NOTE(review): recent PyTorch releases default torch.load to
    # weights_only=True, which may reject the NumPy array stored under
    # 'predictions' - confirm against the installed version.
    predictions = torch.load(best_path,
                             map_location=torch.device('cpu'))['predictions']
    valid_folds[[i for i in range(CFG.max_len)]] = predictions

    torch.cuda.empty_cache()
    gc.collect()

    return valid_folds

In [None]:
if __name__ == '__main__':

    def get_result(oof_df):
        """Score the out-of-fold predictions in ``oof_df`` and log the score.

        The prediction columns are the integer-named columns
        ``0 .. CFG.max_len - 1``; the texts come from ``'pn_history'``.
        """
        labels = create_labels_for_scoring(oof_df)
        predictions = oof_df[[i for i in range(CFG.max_len)]].values
        char_probs = get_char_probs(oof_df['pn_history'].values, predictions, CFG.tokenizer)
        results = get_results(char_probs, th=0.5)
        preds = get_predictions(results)
        score = get_score(labels, preds)
        LOGGER.info(f'Score: {score:<.4f}')

    if CFG.train:
        oof_df = pd.DataFrame()
        # Collect per-fold frames and concatenate once, rather than growing a
        # DataFrame inside the loop starting from an empty frame.
        fold_frames = []
        for fold in range(CFG.n_fold):
            if fold in CFG.trn_fold:
                _oof_df = train_loop(train, fold)
                fold_frames.append(_oof_df)
                LOGGER.info(f"========== fold: {fold} result ==========")
                get_result(_oof_df)
        if fold_frames:
            oof_df = pd.concat(fold_frames).reset_index(drop=True)
            LOGGER.info("========== CV ==========")
            get_result(oof_df)
            oof_df.to_pickle(OUTPUT_DIR+'oof_df.pkl')
        else:
            # Nothing was trained (no id in range(CFG.n_fold) is listed in
            # CFG.trn_fold), so there is nothing to score or save.
            LOGGER.info("No fold selected for training; skipping CV scoring")

    if CFG.wandb:
        wandb.finish()

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.dense.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.dense.bias', 'mask_predictions.classifier.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/953] Elapsed 0m 1s (remain 19m 44s) Loss: 0.6325(0.6325) Grad: inf  LR: 0.00002000  
Epoch: [1][100/953] Elapsed 0m 55s (remain 7m 46s) Loss: 0.0270(0.1007) Grad: 4791.7188  LR: 0.00001998  
Epoch: [1][200/953] Elapsed 1m 51s (remain 6m 56s) Loss: 0.0243(0.0667) Grad: 4872.5552  LR: 0.00001991  
Epoch: [1][300/953] Elapsed 2m 47s (remain 6m 1s) Loss: 0.0231(0.0534) Grad: 2887.8792  LR: 0.00001980  
Epoch: [1][400/953] Elapsed 3m 42s (remain 5m 6s) Loss: 0.0150(0.0459) Grad: 1945.5377  LR: 0.00001965  
Epoch: [1][500/953] Elapsed 4m 38s (remain 4m 11s) Loss: 0.0125(0.0409) Grad: 2050.3523  LR: 0.00001946  
Epoch: [1][600/953] Elapsed 5m 35s (remain 3m 16s) Loss: 0.0215(0.0373) Grad: 3083.3281  LR: 0.00001923  
Epoch: [1][700/953] Elapsed 6m 31s (remain 2m 20s) Loss: 0.0145(0.0345) Grad: 3096.8455  LR: 0.00001895  
Epoch: [1][800/953] Elapsed 7m 26s (remain 1m 24s) Loss: 0.0273(0.0322) Grad: 5382.0942  LR: 0.00001864  
Epoch: [1][900/953] Elapsed 8m 22s (remain 0m 29s) Loss:

Epoch 1 - avg_train_loss: 0.0297  avg_val_loss: 0.0149  time: 631s
Epoch 1 - Score: 0.8369
Epoch 1 - Save Best Score: 0.8369 Model


Epoch: [2][0/953] Elapsed 0m 1s (remain 20m 31s) Loss: 0.0077(0.0077) Grad: 8760.8018  LR: 0.00001809  
Epoch: [2][100/953] Elapsed 0m 58s (remain 8m 17s) Loss: 0.0213(0.0124) Grad: 27182.2617  LR: 0.00001768  
Epoch: [2][200/953] Elapsed 1m 56s (remain 7m 15s) Loss: 0.0022(0.0127) Grad: 6075.8027  LR: 0.00001724  
Epoch: [2][300/953] Elapsed 2m 54s (remain 6m 17s) Loss: 0.0056(0.0121) Grad: 10123.5254  LR: 0.00001677  
Epoch: [2][400/953] Elapsed 3m 51s (remain 5m 18s) Loss: 0.0081(0.0120) Grad: 43455.9531  LR: 0.00001627  
Epoch: [2][500/953] Elapsed 4m 49s (remain 4m 20s) Loss: 0.0056(0.0117) Grad: 11445.3262  LR: 0.00001575  
Epoch: [2][600/953] Elapsed 5m 46s (remain 3m 23s) Loss: 0.0029(0.0115) Grad: 6487.0635  LR: 0.00001520  
Epoch: [2][700/953] Elapsed 6m 44s (remain 2m 25s) Loss: 0.0094(0.0115) Grad: 12834.7314  LR: 0.00001462  
Epoch: [2][800/953] Elapsed 7m 41s (remain 1m 27s) Loss: 0.0066(0.0116) Grad: 12947.7129  LR: 0.00001403  
Epoch: [2][900/953] Elapsed 8m 39s (remain

Epoch 2 - avg_train_loss: 0.0115  avg_val_loss: 0.0137  time: 656s
Epoch 2 - Score: 0.8573
Epoch 2 - Save Best Score: 0.8573 Model


Epoch: [3][0/953] Elapsed 0m 1s (remain 16m 2s) Loss: 0.0121(0.0121) Grad: 23047.6602  LR: 0.00001309  
Epoch: [3][100/953] Elapsed 0m 58s (remain 8m 16s) Loss: 0.0166(0.0105) Grad: 19886.5117  LR: 0.00001245  
Epoch: [3][200/953] Elapsed 1m 56s (remain 7m 15s) Loss: 0.0013(0.0106) Grad: 4658.4321  LR: 0.00001181  
Epoch: [3][300/953] Elapsed 2m 53s (remain 6m 16s) Loss: 0.0140(0.0105) Grad: 24620.1523  LR: 0.00001116  
Epoch: [3][400/953] Elapsed 3m 51s (remain 5m 18s) Loss: 0.0032(0.0102) Grad: 8419.4219  LR: 0.00001050  
Epoch: [3][500/953] Elapsed 4m 48s (remain 4m 20s) Loss: 0.0050(0.0104) Grad: 11215.7051  LR: 0.00000984  
Epoch: [3][600/953] Elapsed 5m 46s (remain 3m 22s) Loss: 0.0074(0.0104) Grad: 10711.1377  LR: 0.00000918  
Epoch: [3][700/953] Elapsed 6m 43s (remain 2m 25s) Loss: 0.0032(0.0103) Grad: 6767.1377  LR: 0.00000853  
Epoch: [3][800/953] Elapsed 7m 41s (remain 1m 27s) Loss: 0.0148(0.0102) Grad: 19685.9844  LR: 0.00000788  
Epoch: [3][900/953] Elapsed 8m 38s (remain 

Epoch 3 - avg_train_loss: 0.0102  avg_val_loss: 0.0130  time: 651s
Epoch 3 - Score: 0.8600
Epoch 3 - Save Best Score: 0.8600 Model


Epoch: [4][0/953] Elapsed 0m 0s (remain 15m 7s) Loss: 0.0039(0.0039) Grad: 6573.1660  LR: 0.00000691  
Epoch: [4][100/953] Elapsed 0m 57s (remain 8m 5s) Loss: 0.0067(0.0089) Grad: 18190.6250  LR: 0.00000629  
Epoch: [4][200/953] Elapsed 1m 53s (remain 7m 6s) Loss: 0.0158(0.0093) Grad: 31213.8242  LR: 0.00000568  
Epoch: [4][300/953] Elapsed 2m 50s (remain 6m 8s) Loss: 0.0053(0.0091) Grad: 10338.5918  LR: 0.00000510  
Epoch: [4][400/953] Elapsed 3m 46s (remain 5m 12s) Loss: 0.0047(0.0089) Grad: 18512.3340  LR: 0.00000454  
Epoch: [4][500/953] Elapsed 4m 43s (remain 4m 15s) Loss: 0.0055(0.0091) Grad: 16640.1973  LR: 0.00000400  
Epoch: [4][600/953] Elapsed 5m 39s (remain 3m 18s) Loss: 0.0053(0.0094) Grad: 14098.4971  LR: 0.00000348  
Epoch: [4][700/953] Elapsed 6m 36s (remain 2m 22s) Loss: 0.0100(0.0094) Grad: 20496.7129  LR: 0.00000300  
Epoch: [4][800/953] Elapsed 7m 32s (remain 1m 25s) Loss: 0.0147(0.0093) Grad: 21183.2422  LR: 0.00000254  
Epoch: [4][900/953] Elapsed 8m 28s (remain 0

Epoch 4 - avg_train_loss: 0.0092  avg_val_loss: 0.0136  time: 639s
Epoch 4 - Score: 0.8615
Epoch 4 - Save Best Score: 0.8615 Model


Epoch: [5][0/953] Elapsed 0m 1s (remain 15m 52s) Loss: 0.0059(0.0059) Grad: 8278.7158  LR: 0.00000191  
Epoch: [5][100/953] Elapsed 0m 56s (remain 7m 59s) Loss: 0.0039(0.0088) Grad: 7482.4946  LR: 0.00000154  
Epoch: [5][200/953] Elapsed 1m 52s (remain 7m 0s) Loss: 0.0075(0.0088) Grad: 12922.8965  LR: 0.00000121  
Epoch: [5][300/953] Elapsed 2m 48s (remain 6m 4s) Loss: 0.0124(0.0087) Grad: 24002.7148  LR: 0.00000091  
Epoch: [5][400/953] Elapsed 3m 43s (remain 5m 8s) Loss: 0.0076(0.0088) Grad: 7889.8472  LR: 0.00000066  
Epoch: [5][500/953] Elapsed 4m 39s (remain 4m 12s) Loss: 0.0072(0.0087) Grad: 9126.3105  LR: 0.00000044  
Epoch: [5][600/953] Elapsed 5m 35s (remain 3m 16s) Loss: 0.0094(0.0087) Grad: 16912.9980  LR: 0.00000027  
Epoch: [5][700/953] Elapsed 6m 30s (remain 2m 20s) Loss: 0.0051(0.0085) Grad: 9331.1396  LR: 0.00000014  
Epoch: [5][800/953] Elapsed 7m 26s (remain 1m 24s) Loss: 0.0231(0.0086) Grad: 31079.0742  LR: 0.00000005  
Epoch: [5][900/953] Elapsed 8m 22s (remain 0m 2

Epoch 5 - avg_train_loss: 0.0086  avg_val_loss: 0.0135  time: 636s
Epoch 5 - Score: 0.8616
Epoch 5 - Save Best Score: 0.8616 Model
Score: 0.8616
Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.dense.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.dense.bias', 'mask_predictions.classifier.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expe

Epoch: [1][0/953] Elapsed 0m 0s (remain 14m 58s) Loss: 0.5841(0.5841) Grad: inf  LR: 0.00002000  
Epoch: [1][100/953] Elapsed 0m 56s (remain 8m 0s) Loss: 0.0375(0.0868) Grad: 6841.1895  LR: 0.00001998  
Epoch: [1][200/953] Elapsed 1m 52s (remain 7m 1s) Loss: 0.0323(0.0590) Grad: 7040.6865  LR: 0.00001991  
Epoch: [1][300/953] Elapsed 2m 48s (remain 6m 4s) Loss: 0.0182(0.0474) Grad: 3285.3870  LR: 0.00001980  
Epoch: [1][400/953] Elapsed 3m 43s (remain 5m 8s) Loss: 0.0119(0.0404) Grad: 1299.3373  LR: 0.00001965  
Epoch: [1][500/953] Elapsed 4m 39s (remain 4m 12s) Loss: 0.0121(0.0363) Grad: 2545.2024  LR: 0.00001946  
Epoch: [1][600/953] Elapsed 5m 35s (remain 3m 16s) Loss: 0.0063(0.0333) Grad: 1030.3280  LR: 0.00001923  
Epoch: [1][700/953] Elapsed 6m 31s (remain 2m 20s) Loss: 0.0323(0.0314) Grad: 3313.3352  LR: 0.00001895  
Epoch: [1][800/953] Elapsed 7m 26s (remain 1m 24s) Loss: 0.0239(0.0295) Grad: 6454.3784  LR: 0.00001864  
Epoch: [1][900/953] Elapsed 8m 22s (remain 0m 28s) Loss: 0

Epoch 1 - avg_train_loss: 0.0277  avg_val_loss: 0.0147  time: 631s
Epoch 1 - Score: 0.8393
Epoch 1 - Save Best Score: 0.8393 Model


Epoch: [2][0/953] Elapsed 0m 0s (remain 15m 46s) Loss: 0.0029(0.0029) Grad: 9095.8730  LR: 0.00001809  
Epoch: [2][100/953] Elapsed 0m 57s (remain 8m 1s) Loss: 0.0107(0.0134) Grad: 12974.9873  LR: 0.00001768  
Epoch: [2][200/953] Elapsed 1m 52s (remain 7m 2s) Loss: 0.0023(0.0124) Grad: 2814.0134  LR: 0.00001724  
Epoch: [2][300/953] Elapsed 2m 48s (remain 6m 5s) Loss: 0.0183(0.0125) Grad: 16201.2900  LR: 0.00001677  
Epoch: [2][400/953] Elapsed 3m 44s (remain 5m 9s) Loss: 0.0143(0.0127) Grad: 17413.9746  LR: 0.00001627  
Epoch: [2][500/953] Elapsed 4m 40s (remain 4m 13s) Loss: 0.0031(0.0123) Grad: 9750.4824  LR: 0.00001575  
Epoch: [2][600/953] Elapsed 5m 36s (remain 3m 16s) Loss: 0.0301(0.0119) Grad: 23988.6602  LR: 0.00001520  
Epoch: [2][700/953] Elapsed 6m 32s (remain 2m 21s) Loss: 0.0281(0.0119) Grad: 22561.9453  LR: 0.00001462  
Epoch: [2][800/953] Elapsed 7m 28s (remain 1m 25s) Loss: 0.0024(0.0119) Grad: 5011.9727  LR: 0.00001403  
Epoch: [2][900/953] Elapsed 8m 23s (remain 0m 2

Epoch 2 - avg_train_loss: 0.0116  avg_val_loss: 0.0135  time: 633s
Epoch 2 - Score: 0.8549
Epoch 2 - Save Best Score: 0.8549 Model


Epoch: [3][0/953] Elapsed 0m 0s (remain 15m 45s) Loss: 0.0077(0.0077) Grad: 15338.0674  LR: 0.00001309  
Epoch: [3][100/953] Elapsed 0m 56s (remain 7m 59s) Loss: 0.0045(0.0108) Grad: 11072.2236  LR: 0.00001245  
Epoch: [3][200/953] Elapsed 1m 52s (remain 7m 1s) Loss: 0.0264(0.0106) Grad: 37687.0977  LR: 0.00001181  
Epoch: [3][300/953] Elapsed 2m 48s (remain 6m 4s) Loss: 0.0286(0.0104) Grad: 33048.0039  LR: 0.00001116  
Epoch: [3][400/953] Elapsed 3m 44s (remain 5m 8s) Loss: 0.0016(0.0101) Grad: 7006.0977  LR: 0.00001050  
Epoch: [3][500/953] Elapsed 4m 40s (remain 4m 12s) Loss: 0.0063(0.0101) Grad: 21358.8770  LR: 0.00000984  
Epoch: [3][600/953] Elapsed 5m 35s (remain 3m 16s) Loss: 0.0170(0.0104) Grad: 27837.7383  LR: 0.00000918  
Epoch: [3][700/953] Elapsed 6m 31s (remain 2m 20s) Loss: 0.0087(0.0102) Grad: 12556.7598  LR: 0.00000853  
Epoch: [3][800/953] Elapsed 7m 27s (remain 1m 24s) Loss: 0.0048(0.0101) Grad: 13335.6045  LR: 0.00000788  
Epoch: [3][900/953] Elapsed 8m 23s (remain 

Epoch 3 - avg_train_loss: 0.0100  avg_val_loss: 0.0138  time: 632s
Epoch 3 - Score: 0.8573
Epoch 3 - Save Best Score: 0.8573 Model


Epoch: [4][0/953] Elapsed 0m 0s (remain 14m 29s) Loss: 0.0089(0.0089) Grad: 29467.1836  LR: 0.00000691  
Epoch: [4][100/953] Elapsed 0m 56s (remain 7m 58s) Loss: 0.0064(0.0082) Grad: 15633.3330  LR: 0.00000629  
Epoch: [4][200/953] Elapsed 1m 52s (remain 7m 0s) Loss: 0.0033(0.0083) Grad: 12213.8506  LR: 0.00000568  
Epoch: [4][300/953] Elapsed 2m 48s (remain 6m 4s) Loss: 0.0074(0.0083) Grad: 11497.4668  LR: 0.00000510  
Epoch: [4][400/953] Elapsed 3m 43s (remain 5m 7s) Loss: 0.0081(0.0082) Grad: 14440.2646  LR: 0.00000454  
Epoch: [4][500/953] Elapsed 4m 39s (remain 4m 12s) Loss: 0.0084(0.0086) Grad: 16385.5371  LR: 0.00000400  
Epoch: [4][600/953] Elapsed 5m 35s (remain 3m 16s) Loss: 0.0103(0.0087) Grad: 12276.4189  LR: 0.00000348  
Epoch: [4][700/953] Elapsed 6m 30s (remain 2m 20s) Loss: 0.0053(0.0088) Grad: 7693.9863  LR: 0.00000300  
Epoch: [4][800/953] Elapsed 7m 26s (remain 1m 24s) Loss: 0.0052(0.0089) Grad: 10424.5020  LR: 0.00000254  
Epoch: [4][900/953] Elapsed 8m 22s (remain 

Epoch 4 - avg_train_loss: 0.0090  avg_val_loss: 0.0134  time: 631s
Epoch 4 - Score: 0.8633
Epoch 4 - Save Best Score: 0.8633 Model


Epoch: [5][0/953] Elapsed 0m 1s (remain 16m 7s) Loss: 0.0026(0.0026) Grad: 6648.4517  LR: 0.00000191  
Epoch: [5][100/953] Elapsed 0m 56s (remain 8m 0s) Loss: 0.0112(0.0085) Grad: 23956.0918  LR: 0.00000154  
Epoch: [5][200/953] Elapsed 1m 52s (remain 7m 1s) Loss: 0.0102(0.0087) Grad: 14938.9307  LR: 0.00000121  
Epoch: [5][300/953] Elapsed 2m 48s (remain 6m 4s) Loss: 0.0018(0.0083) Grad: 3669.2937  LR: 0.00000091  
Epoch: [5][400/953] Elapsed 3m 44s (remain 5m 8s) Loss: 0.0129(0.0086) Grad: 18900.4824  LR: 0.00000066  
Epoch: [5][500/953] Elapsed 4m 40s (remain 4m 12s) Loss: 0.0072(0.0085) Grad: 15878.4443  LR: 0.00000044  
Epoch: [5][600/953] Elapsed 5m 35s (remain 3m 16s) Loss: 0.0031(0.0084) Grad: 12752.5654  LR: 0.00000027  
Epoch: [5][700/953] Elapsed 6m 31s (remain 2m 20s) Loss: 0.0023(0.0084) Grad: 6125.4668  LR: 0.00000014  
Epoch: [5][800/953] Elapsed 7m 27s (remain 1m 24s) Loss: 0.0141(0.0085) Grad: 23793.3945  LR: 0.00000005  
Epoch: [5][900/953] Elapsed 8m 23s (remain 0m 2

Epoch 5 - avg_train_loss: 0.0084  avg_val_loss: 0.0136  time: 632s
Epoch 5 - Score: 0.8643
Epoch 5 - Save Best Score: 0.8643 Model
Score: 0.8643
Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.dense.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.dense.bias', 'mask_predictions.classifier.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expe

Epoch: [1][0/953] Elapsed 0m 0s (remain 15m 11s) Loss: 0.2833(0.2833) Grad: inf  LR: 0.00002000  
Epoch: [1][100/953] Elapsed 0m 57s (remain 8m 0s) Loss: 0.0384(0.0650) Grad: 7271.9932  LR: 0.00001998  
Epoch: [1][200/953] Elapsed 1m 52s (remain 7m 1s) Loss: 0.0429(0.0484) Grad: 17328.7383  LR: 0.00001991  
Epoch: [1][300/953] Elapsed 2m 48s (remain 6m 4s) Loss: 0.0217(0.0407) Grad: 7726.3184  LR: 0.00001980  
Epoch: [1][400/953] Elapsed 3m 44s (remain 5m 8s) Loss: 0.0316(0.0355) Grad: 11681.3701  LR: 0.00001965  
Epoch: [1][500/953] Elapsed 4m 39s (remain 4m 12s) Loss: 0.0088(0.0325) Grad: 2545.7996  LR: 0.00001946  
Epoch: [1][600/953] Elapsed 5m 35s (remain 3m 16s) Loss: 0.0169(0.0301) Grad: 4386.5894  LR: 0.00001923  
Epoch: [1][700/953] Elapsed 6m 31s (remain 2m 20s) Loss: 0.0161(0.0285) Grad: 8347.5371  LR: 0.00001895  
Epoch: [1][800/953] Elapsed 7m 27s (remain 1m 24s) Loss: 0.0102(0.0269) Grad: 4161.8271  LR: 0.00001864  
Epoch: [1][900/953] Elapsed 8m 23s (remain 0m 29s) Loss:

Epoch 1 - avg_train_loss: 0.0251  avg_val_loss: 0.0159  time: 632s
Epoch 1 - Score: 0.8119
Epoch 1 - Save Best Score: 0.8119 Model


Epoch: [2][0/953] Elapsed 0m 1s (remain 17m 22s) Loss: 0.0534(0.0534) Grad: 86812.6328  LR: 0.00001809  
Epoch: [2][100/953] Elapsed 0m 57s (remain 8m 1s) Loss: 0.0025(0.0130) Grad: 5722.5005  LR: 0.00001768  
Epoch: [2][200/953] Elapsed 1m 53s (remain 7m 3s) Loss: 0.0079(0.0128) Grad: 37133.4727  LR: 0.00001724  
Epoch: [2][300/953] Elapsed 2m 49s (remain 6m 6s) Loss: 0.0163(0.0123) Grad: 18501.6465  LR: 0.00001677  
Epoch: [2][400/953] Elapsed 3m 45s (remain 5m 9s) Loss: 0.0057(0.0123) Grad: 20124.3203  LR: 0.00001627  
Epoch: [2][500/953] Elapsed 4m 41s (remain 4m 13s) Loss: 0.0098(0.0122) Grad: 16600.7500  LR: 0.00001575  
Epoch: [2][600/953] Elapsed 5m 37s (remain 3m 17s) Loss: 0.0084(0.0122) Grad: 12996.7139  LR: 0.00001520  
Epoch: [2][700/953] Elapsed 6m 33s (remain 2m 21s) Loss: 0.0159(0.0122) Grad: 24887.1367  LR: 0.00001462  
Epoch: [2][800/953] Elapsed 7m 29s (remain 1m 25s) Loss: 0.0132(0.0120) Grad: 23411.2344  LR: 0.00001403  
Epoch: [2][900/953] Elapsed 8m 25s (remain 0

Epoch 2 - avg_train_loss: 0.0118  avg_val_loss: 0.0131  time: 635s
Epoch 2 - Score: 0.8566
Epoch 2 - Save Best Score: 0.8566 Model


Epoch: [3][0/953] Elapsed 0m 1s (remain 16m 19s) Loss: 0.0049(0.0049) Grad: 11561.6523  LR: 0.00001309  
Epoch: [3][100/953] Elapsed 0m 56s (remain 7m 59s) Loss: 0.0101(0.0093) Grad: 19466.3164  LR: 0.00001245  
Epoch: [3][200/953] Elapsed 1m 52s (remain 7m 1s) Loss: 0.0023(0.0097) Grad: 6262.3706  LR: 0.00001181  
Epoch: [3][300/953] Elapsed 2m 48s (remain 6m 5s) Loss: 0.0085(0.0097) Grad: 21641.7324  LR: 0.00001116  
Epoch: [3][400/953] Elapsed 3m 44s (remain 5m 9s) Loss: 0.0145(0.0102) Grad: 13644.6719  LR: 0.00001050  
Epoch: [3][500/953] Elapsed 4m 40s (remain 4m 12s) Loss: 0.0012(0.0101) Grad: 3582.8213  LR: 0.00000984  
Epoch: [3][600/953] Elapsed 5m 36s (remain 3m 16s) Loss: 0.0079(0.0099) Grad: 20627.0684  LR: 0.00000918  
Epoch: [3][700/953] Elapsed 6m 32s (remain 2m 20s) Loss: 0.0209(0.0100) Grad: 38610.3359  LR: 0.00000853  
Epoch: [3][800/953] Elapsed 7m 28s (remain 1m 25s) Loss: 0.0239(0.0101) Grad: 36272.8672  LR: 0.00000788  
Epoch: [3][900/953] Elapsed 8m 24s (remain 0

Epoch 3 - avg_train_loss: 0.0100  avg_val_loss: 0.0135  time: 633s
Epoch 3 - Score: 0.8613
Epoch 3 - Save Best Score: 0.8613 Model


Epoch: [4][0/953] Elapsed 0m 1s (remain 16m 39s) Loss: 0.0011(0.0011) Grad: 8988.3779  LR: 0.00000691  
Epoch: [4][100/953] Elapsed 0m 56s (remain 7m 59s) Loss: 0.0246(0.0091) Grad: 40938.5703  LR: 0.00000629  
Epoch: [4][200/953] Elapsed 1m 52s (remain 7m 2s) Loss: 0.0045(0.0094) Grad: 42879.4883  LR: 0.00000568  
Epoch: [4][300/953] Elapsed 2m 48s (remain 6m 5s) Loss: 0.0096(0.0090) Grad: 31326.3594  LR: 0.00000510  
Epoch: [4][400/953] Elapsed 3m 44s (remain 5m 9s) Loss: 0.0075(0.0089) Grad: 17543.1270  LR: 0.00000454  
Epoch: [4][500/953] Elapsed 4m 40s (remain 4m 13s) Loss: 0.0241(0.0086) Grad: 18296.5938  LR: 0.00000400  
Epoch: [4][600/953] Elapsed 5m 36s (remain 3m 17s) Loss: 0.0057(0.0085) Grad: 8752.5078  LR: 0.00000348  
Epoch: [4][700/953] Elapsed 6m 32s (remain 2m 21s) Loss: 0.0216(0.0086) Grad: 17018.3281  LR: 0.00000300  
Epoch: [4][800/953] Elapsed 7m 28s (remain 1m 25s) Loss: 0.0013(0.0086) Grad: 3989.7292  LR: 0.00000254  
Epoch: [4][900/953] Elapsed 8m 24s (remain 0m

Epoch 4 - avg_train_loss: 0.0086  avg_val_loss: 0.0131  time: 634s
Epoch 4 - Score: 0.8661
Epoch 4 - Save Best Score: 0.8661 Model


Epoch: [5][0/953] Elapsed 0m 1s (remain 16m 20s) Loss: 0.0083(0.0083) Grad: 9512.6270  LR: 0.00000191  
Epoch: [5][100/953] Elapsed 0m 56s (remain 7m 58s) Loss: 0.0214(0.0086) Grad: 62217.8086  LR: 0.00000154  
Epoch: [5][200/953] Elapsed 1m 52s (remain 7m 1s) Loss: 0.0089(0.0086) Grad: 22191.6621  LR: 0.00000121  
Epoch: [5][300/953] Elapsed 2m 48s (remain 6m 4s) Loss: 0.0078(0.0083) Grad: 16323.6260  LR: 0.00000091  
Epoch: [5][400/953] Elapsed 3m 44s (remain 5m 8s) Loss: 0.0031(0.0085) Grad: 15154.5771  LR: 0.00000066  
Epoch: [5][500/953] Elapsed 4m 40s (remain 4m 12s) Loss: 0.0057(0.0083) Grad: 8248.5479  LR: 0.00000044  
Epoch: [5][600/953] Elapsed 5m 36s (remain 3m 16s) Loss: 0.0033(0.0081) Grad: 10812.9766  LR: 0.00000027  
Epoch: [5][700/953] Elapsed 6m 32s (remain 2m 20s) Loss: 0.0098(0.0079) Grad: 21135.4238  LR: 0.00000014  
Epoch: [5][800/953] Elapsed 7m 27s (remain 1m 24s) Loss: 0.0089(0.0079) Grad: 36131.4453  LR: 0.00000005  
Epoch: [5][900/953] Elapsed 8m 23s (remain 0

Epoch 5 - avg_train_loss: 0.0078  avg_val_loss: 0.0133  time: 633s
Epoch 5 - Score: 0.8661
Score: 0.8661
Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.dense.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.dense.bias', 'mask_predictions.classifier.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing

Epoch: [1][0/953] Elapsed 0m 0s (remain 14m 41s) Loss: 1.2440(1.2440) Grad: inf  LR: 0.00002000  
Epoch: [1][100/953] Elapsed 0m 56s (remain 7m 59s) Loss: 0.0488(0.1514) Grad: 1116.7002  LR: 0.00001998  
Epoch: [1][200/953] Elapsed 1m 52s (remain 7m 1s) Loss: 0.0250(0.0953) Grad: 1316.1116  LR: 0.00001991  
Epoch: [1][300/953] Elapsed 2m 48s (remain 6m 4s) Loss: 0.0203(0.0724) Grad: 2352.5986  LR: 0.00001980  
Epoch: [1][400/953] Elapsed 3m 44s (remain 5m 8s) Loss: 0.0275(0.0595) Grad: 1272.9961  LR: 0.00001965  
Epoch: [1][500/953] Elapsed 4m 40s (remain 4m 12s) Loss: 0.0239(0.0515) Grad: 1240.9419  LR: 0.00001946  
Epoch: [1][600/953] Elapsed 5m 35s (remain 3m 16s) Loss: 0.0303(0.0460) Grad: 687.4151  LR: 0.00001923  
Epoch: [1][700/953] Elapsed 6m 31s (remain 2m 20s) Loss: 0.0136(0.0418) Grad: 1389.7552  LR: 0.00001895  
Epoch: [1][800/953] Elapsed 7m 27s (remain 1m 24s) Loss: 0.0234(0.0388) Grad: 1123.1578  LR: 0.00001864  
Epoch: [1][900/953] Elapsed 8m 22s (remain 0m 29s) Loss: 0

Epoch 1 - avg_train_loss: 0.0353  avg_val_loss: 0.0158  time: 632s
Epoch 1 - Score: 0.8296
Epoch 1 - Save Best Score: 0.8296 Model


Epoch: [2][0/953] Elapsed 0m 1s (remain 16m 41s) Loss: 0.0263(0.0263) Grad: 17257.6738  LR: 0.00001809  
Epoch: [2][100/953] Elapsed 0m 57s (remain 8m 1s) Loss: 0.0013(0.0140) Grad: 3104.8445  LR: 0.00001768  
Epoch: [2][200/953] Elapsed 1m 52s (remain 7m 2s) Loss: 0.0019(0.0127) Grad: 4048.4014  LR: 0.00001724  
Epoch: [2][300/953] Elapsed 2m 48s (remain 6m 5s) Loss: 0.0038(0.0128) Grad: 11712.8760  LR: 0.00001677  
Epoch: [2][400/953] Elapsed 3m 44s (remain 5m 9s) Loss: 0.0364(0.0127) Grad: 16240.8086  LR: 0.00001627  
Epoch: [2][500/953] Elapsed 4m 40s (remain 4m 13s) Loss: 0.0074(0.0122) Grad: 10888.9658  LR: 0.00001575  
Epoch: [2][600/953] Elapsed 5m 36s (remain 3m 17s) Loss: 0.0171(0.0120) Grad: 24338.3691  LR: 0.00001520  
Epoch: [2][700/953] Elapsed 6m 32s (remain 2m 21s) Loss: 0.0056(0.0118) Grad: 13329.0801  LR: 0.00001462  
Epoch: [2][800/953] Elapsed 7m 28s (remain 1m 25s) Loss: 0.0022(0.0119) Grad: 7986.5464  LR: 0.00001403  
Epoch: [2][900/953] Elapsed 8m 24s (remain 0m 

Epoch 2 - avg_train_loss: 0.0116  avg_val_loss: 0.0140  time: 633s
Epoch 2 - Score: 0.8563
Epoch 2 - Save Best Score: 0.8563 Model


Epoch: [3][0/953] Elapsed 0m 0s (remain 15m 41s) Loss: 0.0152(0.0152) Grad: 22719.5605  LR: 0.00001309  
Epoch: [3][100/953] Elapsed 0m 56s (remain 8m 0s) Loss: 0.0115(0.0112) Grad: 17065.5156  LR: 0.00001245  
Epoch: [3][200/953] Elapsed 1m 52s (remain 7m 1s) Loss: 0.0056(0.0114) Grad: 11509.0684  LR: 0.00001181  
Epoch: [3][300/953] Elapsed 2m 48s (remain 6m 5s) Loss: 0.0122(0.0108) Grad: 18811.7109  LR: 0.00001116  
Epoch: [3][400/953] Elapsed 3m 44s (remain 5m 8s) Loss: 0.0388(0.0108) Grad: 30578.5820  LR: 0.00001050  
Epoch: [3][500/953] Elapsed 4m 40s (remain 4m 12s) Loss: 0.0097(0.0105) Grad: 15086.7539  LR: 0.00000984  
Epoch: [3][600/953] Elapsed 5m 36s (remain 3m 16s) Loss: 0.0253(0.0104) Grad: 33156.6523  LR: 0.00000918  
Epoch: [3][700/953] Elapsed 6m 31s (remain 2m 20s) Loss: 0.0021(0.0104) Grad: 3744.9819  LR: 0.00000853  
Epoch: [3][800/953] Elapsed 7m 27s (remain 1m 24s) Loss: 0.0086(0.0104) Grad: 11628.2197  LR: 0.00000788  
Epoch: [3][900/953] Elapsed 8m 23s (remain 0

Epoch 3 - avg_train_loss: 0.0104  avg_val_loss: 0.0135  time: 632s
Epoch 3 - Score: 0.8616
Epoch 3 - Save Best Score: 0.8616 Model


Epoch: [4][0/953] Elapsed 0m 0s (remain 15m 48s) Loss: 0.0035(0.0035) Grad: 9704.5479  LR: 0.00000691  
Epoch: [4][100/953] Elapsed 0m 56s (remain 7m 59s) Loss: 0.0044(0.0098) Grad: 6691.7886  LR: 0.00000629  
Epoch: [4][200/953] Elapsed 1m 52s (remain 7m 1s) Loss: 0.0044(0.0091) Grad: 4136.0239  LR: 0.00000568  
Epoch: [4][300/953] Elapsed 2m 48s (remain 6m 4s) Loss: 0.0114(0.0096) Grad: 17939.2910  LR: 0.00000510  
Epoch: [4][400/953] Elapsed 3m 44s (remain 5m 8s) Loss: 0.0034(0.0094) Grad: 15111.5703  LR: 0.00000454  
Epoch: [4][500/953] Elapsed 4m 39s (remain 4m 12s) Loss: 0.0065(0.0094) Grad: 7679.8135  LR: 0.00000400  
Epoch: [4][600/953] Elapsed 5m 35s (remain 3m 16s) Loss: 0.0023(0.0096) Grad: 5168.5215  LR: 0.00000348  
Epoch: [4][700/953] Elapsed 6m 31s (remain 2m 20s) Loss: 0.0033(0.0100) Grad: 5715.0312  LR: 0.00000300  
Epoch: [4][800/953] Elapsed 7m 27s (remain 1m 24s) Loss: 0.0086(0.0099) Grad: 18823.5820  LR: 0.00000254  
Epoch: [4][900/953] Elapsed 8m 22s (remain 0m 29

Epoch 4 - avg_train_loss: 0.0099  avg_val_loss: 0.0139  time: 631s
Epoch 4 - Score: 0.8604


Epoch: [5][0/953] Elapsed 0m 0s (remain 15m 0s) Loss: 0.0024(0.0024) Grad: 6767.1919  LR: 0.00000191  
Epoch: [5][100/953] Elapsed 0m 56s (remain 7m 59s) Loss: 0.0054(0.0087) Grad: 14920.7412  LR: 0.00000154  
Epoch: [5][200/953] Elapsed 1m 52s (remain 7m 1s) Loss: 0.0135(0.0087) Grad: 24867.8770  LR: 0.00000121  
Epoch: [5][300/953] Elapsed 2m 48s (remain 6m 5s) Loss: 0.0077(0.0087) Grad: 10293.4873  LR: 0.00000091  
Epoch: [5][400/953] Elapsed 3m 44s (remain 5m 9s) Loss: 0.0114(0.0088) Grad: 13030.3506  LR: 0.00000066  
Epoch: [5][500/953] Elapsed 4m 40s (remain 4m 13s) Loss: 0.0027(0.0090) Grad: 5702.5054  LR: 0.00000044  
Epoch: [5][600/953] Elapsed 5m 36s (remain 3m 16s) Loss: 0.0045(0.0093) Grad: 11720.5244  LR: 0.00000027  
Epoch: [5][700/953] Elapsed 6m 32s (remain 2m 20s) Loss: 0.0051(0.0095) Grad: 9323.4893  LR: 0.00000014  
Epoch: [5][800/953] Elapsed 7m 27s (remain 1m 25s) Loss: 0.0027(0.0094) Grad: 7416.7617  LR: 0.00000005  
Epoch: [5][900/953] Elapsed 8m 23s (remain 0m 2

Epoch 5 - avg_train_loss: 0.0093  avg_val_loss: 0.0140  time: 633s
Epoch 5 - Score: 0.8611
Score: 0.8616
Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.dense.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.dense.bias', 'mask_predictions.classifier.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing

Epoch: [1][0/953] Elapsed 0m 0s (remain 14m 54s) Loss: 0.7371(0.7371) Grad: inf  LR: 0.00002000  
Epoch: [1][100/953] Elapsed 0m 56s (remain 8m 0s) Loss: 0.0239(0.1021) Grad: 2168.4497  LR: 0.00001998  
Epoch: [1][200/953] Elapsed 1m 52s (remain 7m 1s) Loss: 0.0164(0.0665) Grad: 4387.1943  LR: 0.00001991  
Epoch: [1][300/953] Elapsed 2m 48s (remain 6m 4s) Loss: 0.0107(0.0526) Grad: 2305.9619  LR: 0.00001980  
Epoch: [1][400/953] Elapsed 3m 44s (remain 5m 8s) Loss: 0.0095(0.0455) Grad: 2289.4265  LR: 0.00001965  
Epoch: [1][500/953] Elapsed 4m 40s (remain 4m 12s) Loss: 0.0116(0.0405) Grad: 2245.0413  LR: 0.00001946  
Epoch: [1][600/953] Elapsed 5m 35s (remain 3m 16s) Loss: 0.0299(0.0371) Grad: 2730.5737  LR: 0.00001923  
Epoch: [1][700/953] Elapsed 6m 31s (remain 2m 20s) Loss: 0.0043(0.0344) Grad: 863.8657  LR: 0.00001895  
Epoch: [1][800/953] Elapsed 7m 27s (remain 1m 24s) Loss: 0.0304(0.0322) Grad: 5839.6255  LR: 0.00001864  
Epoch: [1][900/953] Elapsed 8m 23s (remain 0m 29s) Loss: 0.

Epoch 1 - avg_train_loss: 0.0296  avg_val_loss: 0.0140  time: 632s
Epoch 1 - Score: 0.8441
Epoch 1 - Save Best Score: 0.8441 Model


Epoch: [2][0/953] Elapsed 0m 1s (remain 15m 57s) Loss: 0.0026(0.0026) Grad: 5493.4834  LR: 0.00001809  
Epoch: [2][100/953] Elapsed 0m 57s (remain 8m 1s) Loss: 0.0230(0.0111) Grad: 32209.4023  LR: 0.00001768  
Epoch: [2][200/953] Elapsed 1m 53s (remain 7m 2s) Loss: 0.0072(0.0111) Grad: 16271.6533  LR: 0.00001724  
Epoch: [2][300/953] Elapsed 2m 48s (remain 6m 5s) Loss: 0.0066(0.0114) Grad: 10108.2090  LR: 0.00001677  
Epoch: [2][400/953] Elapsed 3m 44s (remain 5m 9s) Loss: 0.0125(0.0111) Grad: 35264.3984  LR: 0.00001627  
Epoch: [2][500/953] Elapsed 4m 40s (remain 4m 13s) Loss: 0.0052(0.0112) Grad: 5816.7246  LR: 0.00001575  
Epoch: [2][600/953] Elapsed 5m 36s (remain 3m 17s) Loss: 0.0074(0.0113) Grad: 9804.8086  LR: 0.00001520  
Epoch: [2][700/953] Elapsed 6m 33s (remain 2m 21s) Loss: 0.0068(0.0113) Grad: 7752.7227  LR: 0.00001462  
Epoch: [2][800/953] Elapsed 7m 29s (remain 1m 25s) Loss: 0.0049(0.0113) Grad: 10019.1670  LR: 0.00001403  
Epoch: [2][900/953] Elapsed 8m 25s (remain 0m 2

Epoch 2 - avg_train_loss: 0.0115  avg_val_loss: 0.0132  time: 635s
Epoch 2 - Score: 0.8526
Epoch 2 - Save Best Score: 0.8526 Model


Epoch: [3][0/953] Elapsed 0m 1s (remain 16m 58s) Loss: 0.0115(0.0115) Grad: 12356.6875  LR: 0.00001309  
Epoch: [3][100/953] Elapsed 0m 57s (remain 8m 1s) Loss: 0.0070(0.0096) Grad: 11055.3555  LR: 0.00001245  
Epoch: [3][200/953] Elapsed 1m 53s (remain 7m 3s) Loss: 0.0055(0.0101) Grad: 16892.2070  LR: 0.00001181  
Epoch: [3][300/953] Elapsed 2m 49s (remain 6m 6s) Loss: 0.0101(0.0099) Grad: 11008.9072  LR: 0.00001116  
Epoch: [3][400/953] Elapsed 3m 45s (remain 5m 11s) Loss: 0.0064(0.0098) Grad: 18427.7637  LR: 0.00001050  
Epoch: [3][500/953] Elapsed 4m 42s (remain 4m 14s) Loss: 0.0183(0.0100) Grad: 34408.0469  LR: 0.00000984  
Epoch: [3][600/953] Elapsed 5m 38s (remain 3m 18s) Loss: 0.0141(0.0102) Grad: 20380.5625  LR: 0.00000918  
Epoch: [3][700/953] Elapsed 6m 35s (remain 2m 22s) Loss: 0.0069(0.0100) Grad: 11135.4795  LR: 0.00000853  
Epoch: [3][800/953] Elapsed 7m 31s (remain 1m 25s) Loss: 0.0142(0.0102) Grad: 21890.0293  LR: 0.00000788  
Epoch: [3][900/953] Elapsed 8m 27s (remain

Epoch 3 - avg_train_loss: 0.0102  avg_val_loss: 0.0133  time: 638s
Epoch 3 - Score: 0.8602
Epoch 3 - Save Best Score: 0.8602 Model


Epoch: [4][0/953] Elapsed 0m 0s (remain 15m 32s) Loss: 0.0090(0.0090) Grad: 11742.6279  LR: 0.00000691  
Epoch: [4][100/953] Elapsed 0m 56s (remain 7m 59s) Loss: 0.0111(0.0093) Grad: 17451.2637  LR: 0.00000629  
Epoch: [4][200/953] Elapsed 1m 53s (remain 7m 4s) Loss: 0.0037(0.0095) Grad: 15199.1318  LR: 0.00000568  
Epoch: [4][300/953] Elapsed 2m 49s (remain 6m 6s) Loss: 0.0091(0.0093) Grad: 31537.2578  LR: 0.00000510  
Epoch: [4][400/953] Elapsed 3m 45s (remain 5m 10s) Loss: 0.0163(0.0095) Grad: 50349.2188  LR: 0.00000454  
Epoch: [4][500/953] Elapsed 4m 41s (remain 4m 13s) Loss: 0.0092(0.0093) Grad: 21412.4004  LR: 0.00000400  
Epoch: [4][600/953] Elapsed 5m 37s (remain 3m 17s) Loss: 0.0088(0.0091) Grad: 19009.1953  LR: 0.00000348  
Epoch: [4][700/953] Elapsed 6m 33s (remain 2m 21s) Loss: 0.0172(0.0093) Grad: 73268.1250  LR: 0.00000300  
Epoch: [4][800/953] Elapsed 7m 29s (remain 1m 25s) Loss: 0.0295(0.0094) Grad: 31070.7539  LR: 0.00000254  
Epoch: [4][900/953] Elapsed 8m 25s (remai

Epoch 4 - avg_train_loss: 0.0093  avg_val_loss: 0.0133  time: 635s
Epoch 4 - Score: 0.8610
Epoch 4 - Save Best Score: 0.8610 Model


Epoch: [5][0/953] Elapsed 0m 1s (remain 16m 42s) Loss: 0.0058(0.0058) Grad: 8590.7256  LR: 0.00000191  
Epoch: [5][100/953] Elapsed 0m 57s (remain 8m 1s) Loss: 0.0033(0.0082) Grad: 4141.9570  LR: 0.00000154  
Epoch: [5][200/953] Elapsed 1m 52s (remain 7m 2s) Loss: 0.0027(0.0087) Grad: 9442.6299  LR: 0.00000121  
Epoch: [5][300/953] Elapsed 2m 48s (remain 6m 5s) Loss: 0.0134(0.0082) Grad: 19283.7305  LR: 0.00000091  
Epoch: [5][400/953] Elapsed 3m 44s (remain 5m 9s) Loss: 0.0026(0.0082) Grad: 11781.5322  LR: 0.00000066  
Epoch: [5][500/953] Elapsed 4m 40s (remain 4m 13s) Loss: 0.0133(0.0081) Grad: 43817.0938  LR: 0.00000044  
Epoch: [5][600/953] Elapsed 5m 36s (remain 3m 17s) Loss: 0.0030(0.0083) Grad: 8214.8740  LR: 0.00000027  
Epoch: [5][700/953] Elapsed 6m 32s (remain 2m 21s) Loss: 0.0044(0.0084) Grad: 10001.9580  LR: 0.00000014  
Epoch: [5][800/953] Elapsed 7m 28s (remain 1m 25s) Loss: 0.0015(0.0085) Grad: 5671.8335  LR: 0.00000005  
Epoch: [5][900/953] Elapsed 8m 24s (remain 0m 29

Epoch 5 - avg_train_loss: 0.0087  avg_val_loss: 0.0137  time: 634s
Epoch 5 - Score: 0.8608
Score: 0.8610
Score: 0.8629


In [None]:
!zip -r /content/project.zip /content/project

  adding: content/project/ (stored 0%)
  adding: content/project/input/ (stored 0%)
  adding: content/project/input/nbme-score-clinical-patient-notes/ (stored 0%)
  adding: content/project/input/nbme-score-clinical-patient-notes/features.csv (deflated 52%)
  adding: content/project/input/nbme-score-clinical-patient-notes/test.csv (deflated 55%)
  adding: content/project/input/nbme-score-clinical-patient-notes/patient_notes.csv (deflated 72%)
  adding: content/project/input/nbme-score-clinical-patient-notes/sample_submission.csv (deflated 34%)
  adding: content/project/input/nbme-score-clinical-patient-notes/train.csv (deflated 74%)
  adding: content/project/input/deberta v2_3 fast tokenizer.zip (stored 0%)
  adding: content/project/input/nbme-score-clinical-patient-notes.zip (stored 0%)
  adding: content/project/input/deberta-v2-3-fast-tokenizer/ (stored 0%)
  adding: content/project/input/deberta-v2-3-fast-tokenizer/transformers__init__.py (deflated 86%)
  adding: content/project/inpu

In [None]:
# Build a CustomModel from CFG with no explicit config path and pretrained=True.
# NOTE(review): the recorded run of this cell ended in a NameError, so at least one
# of the names used here (CustomModel, CFG) was not defined in the kernel at that
# point — presumably the cells defining them must be run first; confirm before
# relying on this cell in a Restart & Run All.
model = CustomModel(CFG, config_path=None, pretrained=True)

NameError: ignored