**Download Datasets**

In [None]:
# ====================================================
# Download datasets
# ====================================================
%cd /content
%rm -r project
%mkdir project
%cd project
%mkdir input
%cd input
!wget https://github.com/Ameer-eng/nbme-score-clinical-patient-notes-files/raw/main/nbme-score-clinical-patient-notes.zip
!unzip nbme-score-clinical-patient-notes.zip -d nbme-score-clinical-patient-notes
!wget https://github.com/Ameer-eng/nbme-score-clinical-patient-notes-files/raw/main/deberta%20v2_3%20fast%20tokenizer.zip
!unzip "deberta v2_3 fast tokenizer.zip" -d "deberta-v2-3-fast-tokenizer"
%cd ..
%ls

/content
rm: cannot remove 'project': No such file or directory
/content/project
/content/project/input
--2022-05-01 06:53:50--  https://github.com/Ameer-eng/nbme-score-clinical-patient-notes-files/raw/main/nbme-score-clinical-patient-notes.zip
Resolving github.com (github.com)... 192.30.255.113
Connecting to github.com (github.com)|192.30.255.113|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/Ameer-eng/nbme-score-clinical-patient-notes-files/main/nbme-score-clinical-patient-notes.zip [following]
--2022-05-01 06:53:50--  https://raw.githubusercontent.com/Ameer-eng/nbme-score-clinical-patient-notes-files/main/nbme-score-clinical-patient-notes.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 

In [None]:
%mkdir output

In [None]:
# Work from /content/project/output: later cells use './' as OUTPUT_DIR and
# read their inputs through relative '../input/...' paths.
%cd output

/content/project/output


**Directory Settings**

In [None]:
# ====================================================
# Directory settings
# ====================================================
import os

# Directory (relative to the current working directory) that receives logs,
# the saved tokenizer and the model config.
OUTPUT_DIR = './'
# exist_ok=True replaces the separate existence check and is safe on re-runs.
os.makedirs(OUTPUT_DIR, exist_ok=True)

**Config**

In [None]:
# ====================================================
# CFG
# ====================================================
class CFG:
    # Plain namespace of run settings; attributes are read (and some are
    # overwritten) by later cells.
    wandb=False  # when True, train_fn calls wandb.log
    competition='NBME'
    _wandb_kernel='nakama'
    debug=False  # when True: fewer epochs, a single fold, and a 1000-row sample
    apex=True  # enables torch.cuda.amp autocast / GradScaler in train_fn
    print_freq=100  # print progress every `print_freq` steps
    num_workers=4  # DataLoader worker processes
    model="microsoft/deberta-v3-xsmall"  # checkpoint name for tokenizer and backbone
    scheduler='cosine' # ['linear', 'cosine']
    batch_scheduler=True  # step the LR scheduler after every optimizer step
    num_cycles=0.5
    num_warmup_steps=0
    epochs=5
    encoder_lr=2e-5
    decoder_lr=2e-5
    min_lr=1e-6
    eps=1e-6
    betas=(0.9, 0.999)
    batch_size=12
    fc_dropout=0.2  # dropout probability before the linear head
    max_len=512  # placeholder; recomputed from the data in the "Define max_len" cell
    weight_decay=0.01
    gradient_accumulation_steps=1
    max_grad_norm=1000  # threshold passed to clip_grad_norm_
    seed=42
    n_fold=5  # number of GroupKFold splits
    trn_fold=[0, 1, 2, 3, 4]
    train=True
    
# Debug mode shortens the run.
if CFG.debug:
    CFG.epochs = 2
    CFG.trn_fold = [0]

**Library**

In [None]:
# ====================================================
# Library
# ====================================================
import os
import gc
import re
import ast
import sys
import copy
import json
import time
import math
import string
import pickle
import random
import joblib
import itertools
import warnings
warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset

#os.system('pip uninstall -y transformers')
#os.system('python -m pip install --no-index --find-links=../input/nbme-pip-wheels transformers')
!pip install transformers
!pip install tokenizers
!pip install sentencepiece

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 10.9 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 4.0 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 42.7 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 9.9 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 44.7 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found ex

In [None]:
# The following is necessary to use the fast tokenizer for deberta v2 or v3, which is needed to get character offsets of tokens
# It overwrites files inside the installed `transformers` package with the
# copies unzipped into ../input/deberta-v2-3-fast-tokenizer.
import shutil
from pathlib import Path

# NOTE(review): hard-coded for a Python 3.7 dist-packages install — confirm
# this matches the runtime, otherwise the copies land in the wrong place.
transformers_path = Path("/usr/local/lib/python3.7/dist-packages/transformers")

input_dir = Path("../input/deberta-v2-3-fast-tokenizer")

# Replace transformers/convert_slow_tokenizer.py with the downloaded version.
convert_file = input_dir / "convert_slow_tokenizer.py"
conversion_path = transformers_path / convert_file.name

if conversion_path.exists():
    conversion_path.unlink()

shutil.copy(convert_file, transformers_path)
deberta_v2_path = transformers_path / "models" / "deberta_v2"

# Replace the deberta_v2 tokenizer modules and the package __init__.py.
for filename in [
    "tokenization_deberta_v2.py",
    "tokenization_deberta_v2_fast.py",
    "deberta__init__.py",
]:
    # "deberta__init__.py" is installed as "__init__.py": the "deberta" prefix
    # is stripped. The other two names do not start with "deberta" and are
    # copied under their own names.
    if str(filename).startswith("deberta"):
        filepath = deberta_v2_path / str(filename).replace("deberta", "")
    else:
        filepath = deberta_v2_path / filename
    # Remove any existing file first so the copy always replaces it.
    if filepath.exists():
        filepath.unlink()

    shutil.copy(input_dir / filename, filepath)

In [None]:
import tokenizers
import transformers

# Record the library versions used for this run.
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
# Sets the TOKENIZERS_PARALLELISM environment variable for the kernel process.
%env TOKENIZERS_PARALLELISM=true

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizers.__version__: 0.12.1
transformers.__version__: 4.18.0
env: TOKENIZERS_PARALLELISM=true


In [None]:
# Testing the tokenizer
# Scratch cell: checks that the patched fast tokenizer loads and returns
# character offsets and sequence ids.
from transformers.models.deberta_v2.tokenization_deberta_v2_fast import DebertaV2TokenizerFast
tokenizer = DebertaV2TokenizerFast.from_pretrained(CFG.model)
# NOTE: sequences1 / sequences2 are defined but not used in this cell.
sequences1 = [
    "I've been waiting for a HuggingFace course my whole life.",
    "This course is amazing!",
]
sequences2 = ["good", "bad"]
batch = tokenizer("good tokenizer ather", add_special_tokens=True,  padding=True, truncation=True, return_offsets_mapping=True)
display(batch)
tokens = tokenizer.convert_ids_to_tokens(batch["input_ids"])
display(tokens)
display(batch.sequence_ids())
#np.where()[0]
# Positions whose sequence id is not 0 (None for [CLS]/[SEP] in the output
# below); create_label uses the same expression to find tokens to ignore.
np.where(np.array(batch.sequence_ids()) != 0)[0]

Downloading:   0%|          | 0.00/2.35M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/578 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


{'input_ids': [1, 397, 10704, 15299, 288, 4441, 2], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1], 'offset_mapping': [(0, 0), (0, 4), (4, 10), (10, 14), (14, 17), (17, 20), (0, 0)]}

['[CLS]', '▁good', '▁token', 'izer', '▁at', 'her', '[SEP]']

[None, 0, 0, 0, 0, 0, None]

array([0, 6])

**Helper Functions for Scoring**

In [None]:
# From https://www.kaggle.com/theoviel/evaluation-metric-folds-baseline

def micro_f1(preds, truths):
    """
    Micro-averaged F1 over a collection of binary arrays.

    All arrays in `preds` and in `truths` are joined end to end, so the score
    is computed over every element at once rather than per array.

    Args:
        preds (list of lists of ints): Predictions.
        truths (list of lists of ints): Ground truths.

    Returns:
        float: f1 score.
    """
    flat_preds = np.concatenate(preds)
    flat_truths = np.concatenate(truths)
    return f1_score(y_true=flat_truths, y_pred=flat_preds)


def spans_to_binary(spans, length=None):
    """
    Converts spans to a binary array indicating whether each character is in the span.

    Args:
        spans (list of lists of two ints): [start, end) spans; `end` is exclusive.
        length (int, optional): Size of the output array. When omitted, the
            largest value found in `spans` is used (0 if `spans` is empty).

    Returns:
        np array [length]: Binarized spans (1.0 inside a span, 0.0 elsewhere).
    """
    if length is None:
        # np.max raises on an empty sequence; with no spans there is nothing to mark.
        length = int(np.max(spans)) if len(spans) else 0
    binary = np.zeros(length)
    for start, end in spans:
        binary[start:end] = 1
    return binary


def span_micro_f1(preds, truths):
    """
    Micro f1 on character spans.

    Each (prediction, truth) pair is binarized to a common length, the largest
    offset appearing in either side, and the binary arrays are scored
    together. Pairs where both sides are empty are left out.

    Args:
        preds (list of lists of two ints): Prediction spans.
        truths (list of lists of two ints): Ground truth spans.

    Returns:
        float: f1 score.
    """
    def _extent(spans):
        # Largest offset mentioned by the spans, or 0 when there are none.
        return np.max(spans) if len(spans) else 0

    bin_preds, bin_truths = [], []
    for pred, truth in zip(preds, truths):
        if len(pred) == 0 and len(truth) == 0:
            continue
        length = max(_extent(pred), _extent(truth))
        bin_preds.append(spans_to_binary(pred, length))
        bin_truths.append(spans_to_binary(truth, length))
    return micro_f1(bin_preds, bin_truths)

In [None]:
def create_labels_for_scoring(df):
    """
    Build ground-truth character spans for every row of `df`.

    Side effect: adds (or overwrites) the column 'location_for_create_labels',
    holding `[]` for rows without locations and a one-element list with the
    ';'-joined location strings otherwise, e.g. ['0 1', '3 4'] -> ['0 1;3 4'].

    The column is built in one pass and aligned on `df.index`, so the frame
    does not need a 0..n-1 index.

    Args:
        df (pd.DataFrame): frame with a 'location' column of lists of
            "start end" strings (a string may itself contain several
            ';'-separated pairs).

    Returns:
        list of lists of [start, end]: one list of int spans per row.
    """
    joined = [[';'.join(lst)] if lst else [] for lst in df['location']]
    df['location_for_create_labels'] = pd.Series(joined, index=df.index, dtype=object)

    # create labels
    truths = []
    for location_list in joined:
        truth = []
        if len(location_list) > 0:
            location = location_list[0]
            for loc in [s.split() for s in location.split(';')]:
                start, end = int(loc[0]), int(loc[1])
                truth.append([start, end])
        truths.append(truth)
    return truths


def get_char_probs(texts, predictions, tokenizer):
    """
    Spread per-token predictions over the characters each token covers.

    Args:
        texts: sequence of strings.
        predictions: per-text sequences of token-level values, in token order.
        tokenizer: callable returning a mapping with an 'offset_mapping' entry
            of (start, end) character offsets when called with
            `return_offsets_mapping=True`.

    Returns:
        list of np arrays, one per text, of length len(text); characters not
        covered by any token stay 0.
    """
    results = [np.zeros(len(text)) for text in texts]
    for char_probs, text, token_preds in zip(results, texts, predictions):
        encoded = tokenizer(text,
                            add_special_tokens=True,
                            return_offsets_mapping=True)
        # zip stops at the shorter sequence, so predictions beyond the last
        # token of this text are ignored.
        for (start, end), pred in zip(encoded['offset_mapping'], token_preds):
            char_probs[start:end] = pred
    return results


def get_results(char_probs, th=0.5):
    """
    Turn per-character probabilities into ';'-separated "start end" strings.

    Characters with probability >= `th` are selected, their indices are
    shifted by +1, and each run of consecutive indices is reported as
    "first last". A run over characters i..j is therefore written "i+1 j+1".
    NOTE(review): the +1 shift presumably compensates for the leading space
    that the tokenizer's offsets attach to word-initial tokens — confirm.

    Args:
        char_probs: iterable of 1-D np arrays of character probabilities.
        th (float): selection threshold.

    Returns:
        list of str: one string per input array ("" when nothing is selected).
    """
    results = []
    for char_prob in char_probs:
        positions = np.where(char_prob >= th)[0] + 1
        if positions.size == 0:
            results.append("")
            continue
        # Cut wherever two neighbouring positions are not consecutive.
        cut_points = np.where(np.diff(positions) != 1)[0] + 1
        runs = np.split(positions, cut_points)
        results.append(";".join(f"{run[0]} {run[-1]}" for run in runs))
    return results


def get_predictions(results):
    """
    Parse "start end;start end" strings into lists of [start, end] int pairs.

    Args:
        results: iterable of strings as produced by get_results; "" means no
            spans.

    Returns:
        list of lists of [int, int], one list per input string.
    """
    def _parse(result):
        if result == "":
            return []
        pieces = (chunk.split() for chunk in result.split(';'))
        return [[int(piece[0]), int(piece[1])] for piece in pieces]

    return [_parse(result) for result in results]

**Utils**

In [None]:
# ====================================================
# Utils
# ====================================================
def get_score(y_true, y_pred):
    """
    Span-level micro F1 between two lists of span lists.

    Arguments are forwarded positionally: `y_true` is received by
    span_micro_f1 as `preds` and `y_pred` as `truths`.
    """
    return span_micro_f1(y_true, y_pred)


def get_logger(filename=OUTPUT_DIR+'train'):
    """
    Return the module logger, writing bare messages to the console and to
    `<filename>.log`.

    logging.getLogger returns the same object on every call, so handlers left
    by an earlier call (e.g. when the cell is re-run) are removed and closed
    first. Otherwise every message would be emitted once per call made so far.

    Args:
        filename (str): log file path without the ".log" suffix.

    Returns:
        logging.Logger: logger at INFO level with one stream handler and one
        file handler.
    """
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    for stale in list(logger.handlers):
        logger.removeHandler(stale)
        stale.close()  # releases the previous log file handle
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()

def seed_everything(seed=42):
    """
    Seed the Python, NumPy and PyTorch (CPU and CUDA) random generators, set
    PYTHONHASHSEED, and switch cuDNN to deterministic mode.

    Args:
        seed (int): value used for every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(seed=42)

**Load Data**

In [None]:
# ====================================================
# Data Loading
# ====================================================
# Read the three competition tables from the unzipped dataset directory.
train = pd.read_csv('../input/nbme-score-clinical-patient-notes/train.csv')
features = pd.read_csv('../input/nbme-score-clinical-patient-notes/features.csv')
patient_notes = pd.read_csv('../input/nbme-score-clinical-patient-notes/patient_notes.csv')

# `annotation` and `location` are stored as stringified Python lists; parse
# them back into real lists.
train = train.assign(
    annotation=train['annotation'].apply(ast.literal_eval),
    location=train['location'].apply(ast.literal_eval),
)

print(f"train.shape: {train.shape}")
display(train.head())
print(f"features.shape: {features.shape}")
display(features.head())
print(f"patient_notes.shape: {patient_notes.shape}")
display(patient_notes.head())

train.shape: (14300, 6)


Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location
0,00016_000,0,16,0,[dad with recent heart attcak],[696 724]
1,00016_001,0,16,1,"[mom with ""thyroid disease]",[668 693]
2,00016_002,0,16,2,[chest pressure],[203 217]
3,00016_003,0,16,3,"[intermittent episodes, episode]","[70 91, 176 183]"
4,00016_004,0,16,4,[felt as if he were going to pass out],[222 258]


features.shape: (143, 3)


Unnamed: 0,feature_num,case_num,feature_text
0,0,0,Family-history-of-MI-OR-Family-history-of-myoc...
1,1,0,Family-history-of-thyroid-disorder
2,2,0,Chest-pressure
3,3,0,Intermittent-symptoms
4,4,0,Lightheaded


patient_notes.shape: (42146, 3)


Unnamed: 0,pn_num,case_num,pn_history
0,0,0,"17-year-old male, has come to the student heal..."
1,1,0,17 yo male with recurrent palpitations for the...
2,2,0,Dillon Cleveland is a 17 y.o. male patient wit...
3,3,0,a 17 yo m c/o palpitation started 3 mos ago; \...
4,4,0,17yo male with no pmh here for evaluation of p...


In [None]:
# Attach the feature text and the patient note text to every annotation row.
train = (
    train
    .merge(features, on=['feature_num', 'case_num'], how='left')
    .merge(patient_notes, on=['pn_num', 'case_num'], how='left')
)
display(train.head())

Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location,feature_text,pn_history
0,00016_000,0,16,0,[dad with recent heart attcak],[696 724],Family-history-of-MI-OR-Family-history-of-myoc...,HPI: 17yo M presents with palpitations. Patien...
1,00016_001,0,16,1,"[mom with ""thyroid disease]",[668 693],Family-history-of-thyroid-disorder,HPI: 17yo M presents with palpitations. Patien...
2,00016_002,0,16,2,[chest pressure],[203 217],Chest-pressure,HPI: 17yo M presents with palpitations. Patien...
3,00016_003,0,16,3,"[intermittent episodes, episode]","[70 91, 176 183]",Intermittent-symptoms,HPI: 17yo M presents with palpitations. Patien...
4,00016_004,0,16,4,[felt as if he were going to pass out],[222 258],Lightheaded,HPI: 17yo M presents with palpitations. Patien...


In [None]:
# Number of annotated phrases per row (0 means the feature is absent from the note).
train['annotation_length'] = train['annotation'].map(len)
display(train['annotation_length'].value_counts())

1    8181
0    4399
2    1296
3     287
4      99
5      27
6       9
7       1
8       1
Name: annotation_length, dtype: int64

**CV Split**

In [None]:
# ====================================================
# CV split
# ====================================================
# Grouping on pn_num keeps all rows of one patient note in the same fold.
Fold = GroupKFold(n_splits=CFG.n_fold)
groups = train['pn_num'].values
fold_of_row = np.empty(len(train), dtype=int)
for n, (train_index, val_index) in enumerate(Fold.split(train, train['location'], groups)):
    # Every row appears in exactly one validation split; that split is its fold.
    fold_of_row[val_index] = n
train['fold'] = fold_of_row
display(train.groupby('fold').size())

fold
0    2860
1    2860
2    2860
3    2860
4    2860
dtype: int64

In [None]:
# Debug mode: train on a fixed random sample of 1000 rows, showing the fold
# sizes before and after sampling.
if CFG.debug:
    display(train.groupby('fold').size())
    train = train.sample(n=1000, random_state=0)
    train = train.reset_index(drop=True)
    display(train.groupby('fold').size())

**Tokenizer**

In [None]:
# ====================================================
# tokenizer
# ====================================================
from transformers.models.deberta_v2.tokenization_deberta_v2_fast import DebertaV2TokenizerFast
# Load the fast tokenizer for the configured checkpoint.
tokenizer = DebertaV2TokenizerFast.from_pretrained(CFG.model)
# Keep a copy of the tokenizer files next to the other run outputs.
tokenizer.save_pretrained(OUTPUT_DIR+'tokenizer/')
# Expose the tokenizer through the config so prepare_input / create_label can reach it.
CFG.tokenizer = tokenizer

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


**Dataset**

In [None]:
# ====================================================
# Define max_len
# ====================================================
# Token count of every patient note, without special tokens.
for text_col in ['pn_history']:
    tk0 = tqdm(patient_notes[text_col].fillna("").values, total=len(patient_notes))
    pn_history_lengths = [
        len(tokenizer(text, add_special_tokens=False)['input_ids']) for text in tk0
    ]
    LOGGER.info(f'{text_col} max(lengths): {max(pn_history_lengths)}')

# Token count of every feature description, without special tokens.
for text_col in ['feature_text']:
    tk0 = tqdm(features[text_col].fillna("").values, total=len(features))
    features_lengths = [
        len(tokenizer(text, add_special_tokens=False)['input_ids']) for text in tk0
    ]
    LOGGER.info(f'{text_col} max(lengths): {max(features_lengths)}')

# Longest possible (note, feature) pair plus the three special tokens.
CFG.max_len = max(pn_history_lengths) + max(features_lengths) + 3 # cls & sep & sep
LOGGER.info(f"max_len: {CFG.max_len}")

  0%|          | 0/42146 [00:00<?, ?it/s]

pn_history max(lengths): 323


  0%|          | 0/143 [00:00<?, ?it/s]

feature_text max(lengths): 28
max_len: 354


In [None]:
# ====================================================
# Dataset
# ====================================================
def prepare_input(cfg, text, feature_text):
    """
    Takes in patient note and feature text pair, and returns tokenized input with values converted into torch tensors.

    The pair is padded to `cfg.max_len`. The length is read from the `cfg`
    argument rather than from the global CFG, so the function honours whatever
    configuration it is given.

    Args:
        cfg (configuration class): configuration; must provide `tokenizer` and `max_len`.
        text (string): patient note.
        feature_text (string): feature text.

    Returns:
       inputs: tokenizer output, with every value replaced by a torch.long tensor.
    """
    inputs = cfg.tokenizer(text, feature_text, 
                           add_special_tokens=True,
                           max_length=cfg.max_len,
                           padding="max_length",
                           return_offsets_mapping=False)
    # Only existing keys are reassigned, so mutating while iterating is safe.
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs


def create_label(cfg, text, annotation_length, location_list):
    """
    Takes in patient note and location list, and returns label tensor with the ith entry being the label of the ith token

    Labels are 1 for tokens that intersect an annotated span, 0 for other
    tokens of the note, and -1 for every position whose sequence id is not 0
    (special and padding tokens).

    Args:
        cfg (configuration class): configuration; must provide `tokenizer` and `max_len`.
        text (string): patient note.
        annotation_length (int): number of annotations in the location list.
        location_list (list of location strings): locations, each "start end"
            or several such pairs separated by ';'.

    Returns:
       labels: float tensor of token labels, one entry per token position.
    """
    encoded = cfg.tokenizer(text,
                            add_special_tokens=True,
                            max_length=cfg.max_len,
                            padding="max_length",
                            return_offsets_mapping=True)
    offset_mapping = encoded['offset_mapping']
    ignore_idxes = np.where(np.array(encoded.sequence_ids()) != 0)[0] # indexes of special tokens
    label = np.zeros(len(offset_mapping))
    label[ignore_idxes] = -1
    if annotation_length != 0:
        for location in location_list:
            for loc in [s.split() for s in location.split(';')]:
                # loc is a start end pair of a span
                start, end = int(loc[0]), int(loc[1])
                start_idx = -1
                end_idx = -1
                for idx, (tok_start, tok_end) in enumerate(offset_mapping):
                    # First token that begins after `start`: the span starts
                    # in the token just before it.
                    if start_idx == -1 and start < tok_start:
                        start_idx = idx - 1
                    # First token that reaches `end`: exclusive upper bound.
                    if end_idx == -1 and end <= tok_end:
                        end_idx = idx + 1
                if end_idx == -1:
                    # No token reaches the end of the span; nothing to mark.
                    continue
                if start_idx == -1:
                    # No later token begins after `start`, so the span starts
                    # inside the last matched token. Label that single token
                    # (the slice must not be empty).
                    start_idx = end_idx - 1
                label[start_idx:end_idx] = 1
    return torch.tensor(label, dtype=torch.float)


class TrainDataset(Dataset):
    """Dataset yielding (tokenized note/feature pair, token labels) per row of `df`."""

    def __init__(self, cfg, df):
        self.cfg = cfg
        # Pull each needed column out once as a plain array.
        self.feature_texts = df['feature_text'].values
        self.pn_historys = df['pn_history'].values
        self.annotation_lengths = df['annotation_length'].values
        self.locations = df['location'].values

    def __len__(self):
        return len(self.feature_texts)

    def __getitem__(self, item):
        note = self.pn_historys[item]
        inputs = prepare_input(self.cfg, note, self.feature_texts[item])
        label = create_label(self.cfg,
                             note,
                             self.annotation_lengths[item],
                             self.locations[item])
        return inputs, label

In [None]:
# Scratch check: str.split() with no arguments splits on whitespace.
"k k".split()

['k', 'k']

**Model**

In [None]:
# ====================================================
# Model
# ====================================================
class CustomModel(nn.Module):
    """
    Transformer backbone followed by dropout and a linear head that emits one
    logit per token.

    Args:
        cfg: configuration; `cfg.model` names the checkpoint and
            `cfg.fc_dropout` is the dropout probability before the head.
        config_path (str, optional): path to a config object saved with
            torch.save. When None, the config is fetched for `cfg.model`.
        pretrained (bool): load pretrained backbone weights when True;
            otherwise build the backbone from the config alone, with freshly
            initialised weights.
    """
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
        else:
            # NOTE(security): torch.load unpickles the file; only use config
            # files written by this project (e.g. the saved config.pth).
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            # AutoModel cannot be instantiated directly; from_config builds
            # the architecture without loading any weights.
            self.model = AutoModel.from_config(self.config)
        self.fc_dropout = nn.Dropout(cfg.fc_dropout)
        self.fc = nn.Linear(self.config.hidden_size, 1)
        self._init_weights(self.fc)
        
    def _init_weights(self, module):
        """Initialise `module` in place according to its layer type."""
        if isinstance(module, nn.Linear):
            # Initialize linear head with normally distributed weights, as in deberta
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        
    def feature(self, inputs):
        """Run the backbone on the tokenized `inputs` mapping and return its first output."""
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        return last_hidden_states

    def forward(self, inputs):
        """Return the head's output (one logit per token) for `inputs`."""
        feature = self.feature(inputs)
        output = self.fc(self.fc_dropout(feature))
        return output

In [None]:
# Scratch check of how weight initialisation affects nn.Linear: after the
# weights are zeroed, the layer's output is just its bias.
m = nn.Linear(5, 1)
# NOTE: `input` shadows the Python builtin of the same name from here on.
input = torch.tensor([[1.,2.,3.,4.,5.]])
output = m(input)
display(output)
m.weight.data.fill_(0)
display(m.weight)
m(input)

tensor([[2.0136]], grad_fn=<AddmmBackward0>)

Parameter containing:
tensor([[0., 0., 0., 0., 0.]], requires_grad=True)

tensor([[0.0902]], grad_fn=<AddmmBackward0>)

**Helper Functions**

In [None]:
# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Tracks the most recent value together with a weighted running mean."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear the latest value, the mean, the weighted sum and the count."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record `val` as seen `n` times and refresh the running mean."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count


def asMinutes(s):
    """Format a duration in seconds as '<minutes>m <seconds>s' (both truncated to integers)."""
    minutes = math.floor(s / 60)
    seconds = s - minutes * 60
    return '%dm %ds' % (minutes, seconds)


def timeSince(since, percent):
    """
    Report the time elapsed since `since` and a linear estimate of what remains.

    Args:
        since (float): start timestamp from time.time().
        percent (float): fraction of the work completed so far (must be non-zero).

    Returns:
        str: '<elapsed> (remain <remaining>)', both formatted by asMinutes.
    """
    elapsed = time.time() - since
    estimated_total = elapsed / (percent)
    remaining = estimated_total - elapsed
    return '%s (remain %s)' % (asMinutes(elapsed), asMinutes(remaining))

# Trains an epoch and returns the average loss over the epoch.
def train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device):
    """
    Run one training epoch with optional mixed precision and gradient accumulation.

    Gradients are unscaled and clipped only when an optimizer step is about to
    be taken. Clipping before unscaling would compare the threshold against
    gradients still multiplied by the GradScaler's scale factor, and clipping
    on every micro-step would rescale partially accumulated gradients.

    Args:
        fold: fold identifier, used only in the wandb metric names.
        train_loader: iterable of (inputs, labels) batches, where `inputs` is a
            mapping of tensors.
        model: module called as `model(inputs)`.
        criterion: loss returning one value per element.
            NOTE(review): presumably built with reduction='none' — confirm at
            the call site. Entries whose label is -1 are masked out before averaging.
        optimizer: optimizer stepped through the GradScaler.
        epoch (int): zero-based epoch index, used only for progress output.
        scheduler: LR scheduler, stepped after each optimizer step when
            CFG.batch_scheduler is set.
        device: device the batch tensors are moved to.

    Returns:
        float: batch-size-weighted mean of the per-step loss (already divided
        by CFG.gradient_accumulation_steps when that is > 1).
    """
    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = AverageMeter()
    start = time.time()
    # Refreshed on every optimizer step; stays NaN until the first one when
    # gradients are being accumulated.
    grad_norm = float('nan')
    for step, (inputs, labels) in enumerate(train_loader):
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.cuda.amp.autocast(enabled=CFG.apex):
            y_preds = model(inputs)
        loss = criterion(y_preds.view(-1, 1), labels.view(-1, 1))
        # Drop positions labelled -1 (special / padding tokens) from the loss.
        loss = torch.masked_select(loss, labels.view(-1, 1) != -1).mean()
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            # Bring gradients back to their true magnitude before clipping.
            scaler.unscale_(optimizer)
            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            if CFG.batch_scheduler:
                scheduler.step()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, step, len(train_loader), 
                          remain=timeSince(start, float(step+1)/len(train_loader)),
                          loss=losses,
                          grad_norm=grad_norm,
                          lr=scheduler.get_last_lr()[0]))
        if CFG.wandb:
            wandb.log({f"[fold{fold}] loss": losses.val,
                       f"[fold{fold}] lr": scheduler.get_last_lr()[0]})
    return losses.avg

# returns average loss and predicted probabilities on the validation data
def valid_fn(valid_loader, model, criterion, device):
    """
    Evaluate `model` on `valid_loader` without gradient tracking.

    Args:
        valid_loader: iterable of (inputs, labels) batches, where `inputs` is a
            mapping of tensors.
        model: module called as `model(inputs)`.
        criterion: element-wise loss; entries whose label is -1 are masked out
            before averaging.
        device: device the batch tensors are moved to.

    Returns:
        (float, np.ndarray): batch-size-weighted mean loss, and the sigmoid of
        the model outputs for all batches concatenated along the first axis.
    """
    model.eval()
    losses = AverageMeter()
    preds = []
    n_batches = len(valid_loader)
    start = time.time()
    for step, (inputs, labels) in enumerate(valid_loader):
        for key, tensor in list(inputs.items()):
            inputs[key] = tensor.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.no_grad():
            y_preds = model(inputs)
        flat_labels = labels.view(-1, 1)
        loss = criterion(y_preds.view(-1, 1), flat_labels)
        loss = torch.masked_select(loss, flat_labels != -1).mean()
        # Same scaling as in training, so the two reported losses are comparable.
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        preds.append(y_preds.sigmoid().to('cpu').numpy())
        is_last = step == (n_batches-1)
        if step % CFG.print_freq == 0 or is_last:
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, n_batches,
                          loss=losses,
                          remain=timeSince(start, float(step+1)/n_batches)))
    return losses.avg, np.concatenate(preds)


def inference_fn(test_loader, model, device):
    """
    Run `model` over `test_loader` and collect sigmoid probabilities.

    Args:
        test_loader: iterable of input mappings (tensor values), without labels.
        model: module called as `model(inputs)`; it is put in eval mode and
            moved to `device`.
        device: device used for the model and the batches.

    Returns:
        np.ndarray: probabilities for all batches concatenated along the first axis.
    """
    model.eval()
    model.to(device)
    batch_probs = []
    for inputs in tqdm(test_loader, total=len(test_loader)):
        for key, tensor in list(inputs.items()):
            inputs[key] = tensor.to(device)
        with torch.no_grad():
            y_preds = model(inputs)
        batch_probs.append(y_preds.sigmoid().to('cpu').numpy())
    return np.concatenate(batch_probs)

In [None]:
# ====================================================
# train loop
# ====================================================
def train_loop(folds, fold):
    """Train on every fold except `fold`, validate on `fold`, keep the best epoch.

    Args:
        folds: DataFrame with a ``'fold'`` column (used for the split) and a
            ``'pn_history'`` column (texts used for scoring); it is also passed
            to ``TrainDataset`` and ``create_labels_for_scoring``.
        fold: value of the ``'fold'`` column held out for validation.

    Returns:
        The validation rows (index reset) with extra columns
        ``0 .. CFG.max_len - 1`` holding the predictions of the best-scoring
        epoch.

    Side effects:
        Writes ``config.pth`` and ``<model>_fold<fold>_best.pth`` into
        ``OUTPUT_DIR``, logs through ``LOGGER`` and, when ``CFG.wandb`` is set,
        through ``wandb``.
    """
    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    train_folds = folds[folds['fold'] != fold].reset_index(drop=True)
    valid_folds = folds[folds['fold'] == fold].reset_index(drop=True)
    valid_texts = valid_folds['pn_history'].values
    valid_labels = create_labels_for_scoring(valid_folds)

    train_dataset = TrainDataset(CFG, train_folds)
    valid_dataset = TrainDataset(CFG, valid_folds)

    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomModel(CFG, config_path=None, pretrained=True)
    torch.save(model.config, OUTPUT_DIR+'config.pth')
    model.to(device)

    def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
        """Build the three optimizer parameter groups.

        Parameters of ``model.model`` use `encoder_lr`; those whose name
        contains one of the `no_decay` markers get no weight decay, the others
        get `weight_decay`. Parameters of `model` whose name does not contain
        "model" use `decoder_lr` without weight decay (presumably the task
        head -- confirm against CustomModel).
        """
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': 0.0},
            {'params': [p for n, p in model.named_parameters() if "model" not in n],
             'lr': decoder_lr, 'weight_decay': 0.0}
        ]
        return optimizer_parameters

    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=CFG.encoder_lr,
                                                decoder_lr=CFG.decoder_lr,
                                                weight_decay=CFG.weight_decay)
    optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)

    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        """Return the warmup scheduler selected by ``cfg.scheduler``.

        Raises:
            ValueError: if ``cfg.scheduler`` is neither 'linear' nor 'cosine'.
        """
        if cfg.scheduler=='linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
            )
        elif cfg.scheduler=='cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        else:
            # Fail with a clear message instead of an UnboundLocalError below.
            raise ValueError(
                f"Unsupported CFG.scheduler: {cfg.scheduler!r} (expected 'linear' or 'cosine')"
            )
        return scheduler

    # NOTE(review): this counts one scheduler step per batch; confirm against
    # train_fn if gradient accumulation is used.
    num_train_steps = int(len(train_folds) / CFG.batch_size * CFG.epochs)
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    # Unreduced loss: valid_fn masks positions before averaging.
    criterion = nn.BCEWithLogitsLoss(reduction="none")

    # Start below any attainable score so the first epoch always writes a
    # checkpoint; otherwise a run whose score never exceeds 0 would save nothing
    # and the reload below would fail or pick up a stale file from an earlier run.
    best_score = float('-inf')
    best_path = OUTPUT_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth"

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device)

        # eval
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device)
        # One row of CFG.max_len values per validation example.
        predictions = predictions.reshape((len(valid_folds), CFG.max_len))

        # scoring
        char_probs = get_char_probs(valid_texts, predictions, CFG.tokenizer)
        results = get_results(char_probs, th=0.5)
        preds = get_predictions(results)
        score = get_score(valid_labels, preds)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}')
        if CFG.wandb:
            wandb.log({f"[fold{fold}] epoch": epoch+1,
                       f"[fold{fold}] avg_train_loss": avg_loss,
                       f"[fold{fold}] avg_val_loss": avg_val_loss,
                       f"[fold{fold}] score": score})

        if best_score < score:
            best_score = score
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': predictions},
                        best_path)

    # Reload the predictions stored with the best checkpoint (not necessarily
    # those of the last epoch) and attach them as columns 0 .. CFG.max_len - 1.
    predictions = torch.load(best_path,
                             map_location=torch.device('cpu'))['predictions']
    valid_folds[[i for i in range(CFG.max_len)]] = predictions

    torch.cuda.empty_cache()
    gc.collect()

    return valid_folds

In [None]:
if __name__ == '__main__':

    def get_result(oof_df):
        """Score the predictions stored in `oof_df` and log the score.

        Reads columns ``0 .. CFG.max_len - 1`` as predictions and the
        ``'pn_history'`` column as the matching texts; predictions are
        thresholded at 0.5 before scoring.
        """
        labels = create_labels_for_scoring(oof_df)
        predictions = oof_df[[i for i in range(CFG.max_len)]].values
        char_probs = get_char_probs(oof_df['pn_history'].values, predictions, CFG.tokenizer)
        results = get_results(char_probs, th=0.5)
        preds = get_predictions(results)
        score = get_score(labels, preds)
        LOGGER.info(f'Score: {score:<.4f}')

    if CFG.train:
        # Collect the per-fold frames and concatenate once instead of growing a
        # DataFrame inside the loop.
        fold_frames = []
        for fold in range(CFG.n_fold):
            if fold in CFG.trn_fold:
                _oof_df = train_loop(train, fold)
                fold_frames.append(_oof_df)
                LOGGER.info(f"========== fold: {fold} result ==========")
                get_result(_oof_df)
        if fold_frames:
            oof_df = pd.concat(fold_frames).reset_index(drop=True)
            LOGGER.info(f"========== CV ==========")
            get_result(oof_df)
            oof_df.to_pickle(OUTPUT_DIR+'oof_df.pkl')
        else:
            # No fold of range(CFG.n_fold) is listed in CFG.trn_fold: there is
            # nothing to score or save, but keep `oof_df` defined for later cells.
            oof_df = pd.DataFrame()
            LOGGER.info("No fold selected by CFG.trn_fold; skipping CV scoring.")

    if CFG.wandb:
        wandb.finish()



Downloading:   0%|          | 0.00/230M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-v3-xsmall were not used when initializing DebertaV2Model: ['mask_predictions.classifier.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.LayerNorm.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.dense.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.dense.bias', 'deberta.embeddings.word_embeddings._weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequ

Epoch: [1][0/953] Elapsed 0m 0s (remain 13m 1s) Loss: 0.6742(0.6742) Grad: inf  LR: 0.00002000  
Epoch: [1][100/953] Elapsed 0m 26s (remain 3m 42s) Loss: 0.0741(0.1575) Grad: 1514.6948  LR: 0.00001998  
Epoch: [1][200/953] Elapsed 0m 52s (remain 3m 17s) Loss: 0.0803(0.1050) Grad: 3745.3494  LR: 0.00001991  
Epoch: [1][300/953] Elapsed 1m 19s (remain 2m 51s) Loss: 0.0483(0.0818) Grad: 4575.2705  LR: 0.00001980  
Epoch: [1][400/953] Elapsed 1m 45s (remain 2m 24s) Loss: 0.0269(0.0700) Grad: 4979.1367  LR: 0.00001965  
Epoch: [1][500/953] Elapsed 2m 11s (remain 1m 58s) Loss: 0.0340(0.0628) Grad: 3063.6462  LR: 0.00001946  
Epoch: [1][600/953] Elapsed 2m 37s (remain 1m 32s) Loss: 0.0203(0.0575) Grad: 1836.3763  LR: 0.00001923  
Epoch: [1][700/953] Elapsed 3m 4s (remain 1m 6s) Loss: 0.0178(0.0533) Grad: 2749.2512  LR: 0.00001895  
Epoch: [1][800/953] Elapsed 3m 30s (remain 0m 40s) Loss: 0.0148(0.0498) Grad: 3171.3123  LR: 0.00001864  
Epoch: [1][900/953] Elapsed 3m 57s (remain 0m 13s) Loss: 

Epoch 1 - avg_train_loss: 0.0455  avg_val_loss: 0.0211  time: 291s
Epoch 1 - Score: 0.7548
Epoch 1 - Save Best Score: 0.7548 Model


Epoch: [2][0/953] Elapsed 0m 0s (remain 11m 39s) Loss: 0.0218(0.0218) Grad: 17740.8906  LR: 0.00001809  
Epoch: [2][100/953] Elapsed 0m 27s (remain 3m 53s) Loss: 0.0301(0.0229) Grad: 43735.5586  LR: 0.00001768  
Epoch: [2][200/953] Elapsed 0m 54s (remain 3m 24s) Loss: 0.0268(0.0218) Grad: 20503.7012  LR: 0.00001724  
Epoch: [2][300/953] Elapsed 1m 21s (remain 2m 57s) Loss: 0.0162(0.0207) Grad: 20861.1230  LR: 0.00001677  
Epoch: [2][400/953] Elapsed 1m 48s (remain 2m 29s) Loss: 0.0275(0.0205) Grad: 24416.2090  LR: 0.00001627  
Epoch: [2][500/953] Elapsed 2m 15s (remain 2m 2s) Loss: 0.0125(0.0204) Grad: 22091.3691  LR: 0.00001575  
Epoch: [2][600/953] Elapsed 2m 43s (remain 1m 35s) Loss: 0.0162(0.0204) Grad: 12996.3223  LR: 0.00001520  
Epoch: [2][700/953] Elapsed 3m 10s (remain 1m 8s) Loss: 0.0092(0.0200) Grad: 15467.1113  LR: 0.00001462  
Epoch: [2][800/953] Elapsed 3m 37s (remain 0m 41s) Loss: 0.0158(0.0200) Grad: 16694.8672  LR: 0.00001403  
Epoch: [2][900/953] Elapsed 4m 4s (remain

Epoch 2 - avg_train_loss: 0.0200  avg_val_loss: 0.0190  time: 301s
Epoch 2 - Score: 0.7870
Epoch 2 - Save Best Score: 0.7870 Model


Epoch: [3][0/953] Elapsed 0m 0s (remain 10m 46s) Loss: 0.0322(0.0322) Grad: 23660.9062  LR: 0.00001309  
Epoch: [3][100/953] Elapsed 0m 28s (remain 3m 57s) Loss: 0.0158(0.0197) Grad: 26562.1992  LR: 0.00001245  
Epoch: [3][200/953] Elapsed 0m 55s (remain 3m 27s) Loss: 0.0125(0.0194) Grad: 19757.0527  LR: 0.00001181  
Epoch: [3][300/953] Elapsed 1m 22s (remain 2m 59s) Loss: 0.0096(0.0188) Grad: 13362.2119  LR: 0.00001116  
Epoch: [3][400/953] Elapsed 1m 50s (remain 2m 31s) Loss: 0.0323(0.0187) Grad: 52089.9492  LR: 0.00001050  
Epoch: [3][500/953] Elapsed 2m 17s (remain 2m 4s) Loss: 0.0164(0.0186) Grad: 15081.0195  LR: 0.00000984  
Epoch: [3][600/953] Elapsed 2m 45s (remain 1m 36s) Loss: 0.0483(0.0185) Grad: 86609.2344  LR: 0.00000918  
Epoch: [3][700/953] Elapsed 3m 12s (remain 1m 9s) Loss: 0.0214(0.0183) Grad: 18405.8945  LR: 0.00000853  
Epoch: [3][800/953] Elapsed 3m 40s (remain 0m 41s) Loss: 0.0177(0.0182) Grad: 22327.9590  LR: 0.00000788  
Epoch: [3][900/953] Elapsed 4m 7s (remain

Epoch 3 - avg_train_loss: 0.0181  avg_val_loss: 0.0183  time: 303s
Epoch 3 - Score: 0.7968
Epoch 3 - Save Best Score: 0.7968 Model


Epoch: [4][0/953] Elapsed 0m 0s (remain 9m 59s) Loss: 0.0210(0.0210) Grad: 20849.7617  LR: 0.00000691  
Epoch: [4][100/953] Elapsed 0m 27s (remain 3m 52s) Loss: 0.0228(0.0161) Grad: 53765.7109  LR: 0.00000629  
Epoch: [4][200/953] Elapsed 0m 54s (remain 3m 23s) Loss: 0.0176(0.0165) Grad: 28355.1113  LR: 0.00000568  
Epoch: [4][300/953] Elapsed 1m 21s (remain 2m 56s) Loss: 0.0185(0.0170) Grad: 23925.0664  LR: 0.00000510  
Epoch: [4][400/953] Elapsed 1m 48s (remain 2m 29s) Loss: 0.0135(0.0169) Grad: 29594.6289  LR: 0.00000454  
Epoch: [4][500/953] Elapsed 2m 15s (remain 2m 2s) Loss: 0.0395(0.0170) Grad: 36904.1914  LR: 0.00000400  
Epoch: [4][600/953] Elapsed 2m 42s (remain 1m 34s) Loss: 0.0156(0.0170) Grad: 26675.6582  LR: 0.00000348  
Epoch: [4][700/953] Elapsed 3m 9s (remain 1m 7s) Loss: 0.0281(0.0172) Grad: 46544.7227  LR: 0.00000300  
Epoch: [4][800/953] Elapsed 3m 36s (remain 0m 40s) Loss: 0.0104(0.0172) Grad: 12783.0762  LR: 0.00000254  
Epoch: [4][900/953] Elapsed 4m 2s (remain 0

Epoch 4 - avg_train_loss: 0.0170  avg_val_loss: 0.0178  time: 299s
Epoch 4 - Score: 0.7996
Epoch 4 - Save Best Score: 0.7996 Model


Epoch: [5][0/953] Elapsed 0m 0s (remain 13m 22s) Loss: 0.0094(0.0094) Grad: 17879.9355  LR: 0.00000191  
Epoch: [5][100/953] Elapsed 0m 28s (remain 3m 58s) Loss: 0.0070(0.0159) Grad: 18980.7891  LR: 0.00000154  
Epoch: [5][200/953] Elapsed 0m 55s (remain 3m 28s) Loss: 0.0064(0.0152) Grad: 14284.1934  LR: 0.00000121  
Epoch: [5][300/953] Elapsed 1m 23s (remain 3m 0s) Loss: 0.0241(0.0156) Grad: 40932.3984  LR: 0.00000091  
Epoch: [5][400/953] Elapsed 1m 50s (remain 2m 32s) Loss: 0.0108(0.0163) Grad: 10311.9766  LR: 0.00000066  
Epoch: [5][500/953] Elapsed 2m 18s (remain 2m 4s) Loss: 0.0042(0.0166) Grad: 5143.0693  LR: 0.00000044  
Epoch: [5][600/953] Elapsed 2m 45s (remain 1m 37s) Loss: 0.0032(0.0166) Grad: 7383.3237  LR: 0.00000027  
Epoch: [5][700/953] Elapsed 3m 13s (remain 1m 9s) Loss: 0.0101(0.0165) Grad: 11673.1611  LR: 0.00000014  
Epoch: [5][800/953] Elapsed 3m 40s (remain 0m 41s) Loss: 0.0167(0.0165) Grad: 15720.8125  LR: 0.00000005  
Epoch: [5][900/953] Elapsed 4m 8s (remain 0m

Epoch 5 - avg_train_loss: 0.0166  avg_val_loss: 0.0178  time: 305s
Epoch 5 - Score: 0.7989
Score: 0.7996
Some weights of the model checkpoint at microsoft/deberta-v3-xsmall were not used when initializing DebertaV2Model: ['mask_predictions.classifier.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.LayerNorm.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.dense.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.dense.bias', 'deberta.embeddings.word_embeddings._weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that 

Epoch: [1][0/953] Elapsed 0m 0s (remain 10m 38s) Loss: 0.7317(0.7317) Grad: inf  LR: 0.00002000  
Epoch: [1][100/953] Elapsed 0m 27s (remain 3m 51s) Loss: 0.0754(0.1621) Grad: 1090.7075  LR: 0.00001998  
Epoch: [1][200/953] Elapsed 0m 54s (remain 3m 23s) Loss: 0.0311(0.1094) Grad: 2268.3779  LR: 0.00001991  
Epoch: [1][300/953] Elapsed 1m 21s (remain 2m 55s) Loss: 0.0371(0.0871) Grad: 5626.5547  LR: 0.00001980  
Epoch: [1][400/953] Elapsed 1m 48s (remain 2m 28s) Loss: 0.0364(0.0743) Grad: 1952.5448  LR: 0.00001965  
Epoch: [1][500/953] Elapsed 2m 14s (remain 2m 1s) Loss: 0.0295(0.0657) Grad: 4767.0503  LR: 0.00001946  
Epoch: [1][600/953] Elapsed 2m 41s (remain 1m 34s) Loss: 0.0543(0.0599) Grad: 9959.7881  LR: 0.00001923  
Epoch: [1][700/953] Elapsed 3m 8s (remain 1m 7s) Loss: 0.0247(0.0552) Grad: 5366.2812  LR: 0.00001895  
Epoch: [1][800/953] Elapsed 3m 35s (remain 0m 40s) Loss: 0.0207(0.0518) Grad: 3059.5752  LR: 0.00001864  
Epoch: [1][900/953] Elapsed 4m 2s (remain 0m 13s) Loss: 0

Epoch 1 - avg_train_loss: 0.0473  avg_val_loss: 0.0202  time: 297s
Epoch 1 - Score: 0.7582
Epoch 1 - Save Best Score: 0.7582 Model


Epoch: [2][0/953] Elapsed 0m 0s (remain 10m 48s) Loss: 0.0516(0.0516) Grad: 108699.4766  LR: 0.00001809  
Epoch: [2][100/953] Elapsed 0m 27s (remain 3m 52s) Loss: 0.0393(0.0231) Grad: 45961.6992  LR: 0.00001768  
Epoch: [2][200/953] Elapsed 0m 54s (remain 3m 23s) Loss: 0.0163(0.0215) Grad: 22874.1016  LR: 0.00001724  
Epoch: [2][300/953] Elapsed 1m 21s (remain 2m 55s) Loss: 0.0445(0.0213) Grad: 32590.9492  LR: 0.00001677  
Epoch: [2][400/953] Elapsed 1m 48s (remain 2m 28s) Loss: 0.0259(0.0216) Grad: 32214.4805  LR: 0.00001627  
Epoch: [2][500/953] Elapsed 2m 14s (remain 2m 1s) Loss: 0.0137(0.0214) Grad: 15376.0635  LR: 0.00001575  
Epoch: [2][600/953] Elapsed 2m 41s (remain 1m 34s) Loss: 0.0505(0.0213) Grad: 63275.8398  LR: 0.00001520  
Epoch: [2][700/953] Elapsed 3m 8s (remain 1m 7s) Loss: 0.0127(0.0210) Grad: 12509.3633  LR: 0.00001462  
Epoch: [2][800/953] Elapsed 3m 35s (remain 0m 40s) Loss: 0.0277(0.0210) Grad: 28590.7090  LR: 0.00001403  
Epoch: [2][900/953] Elapsed 4m 2s (remain

Epoch 2 - avg_train_loss: 0.0210  avg_val_loss: 0.0193  time: 297s
Epoch 2 - Score: 0.7847
Epoch 2 - Save Best Score: 0.7847 Model


Epoch: [3][0/953] Elapsed 0m 0s (remain 11m 11s) Loss: 0.0206(0.0206) Grad: 21689.3418  LR: 0.00001309  
Epoch: [3][100/953] Elapsed 0m 27s (remain 3m 53s) Loss: 0.0150(0.0192) Grad: 11147.9199  LR: 0.00001245  
Epoch: [3][200/953] Elapsed 0m 54s (remain 3m 24s) Loss: 0.0152(0.0190) Grad: 20381.5137  LR: 0.00001181  
Epoch: [3][300/953] Elapsed 1m 21s (remain 2m 56s) Loss: 0.0102(0.0187) Grad: 16297.6211  LR: 0.00001116  
Epoch: [3][400/953] Elapsed 1m 48s (remain 2m 29s) Loss: 0.0408(0.0189) Grad: 25758.6680  LR: 0.00001050  
Epoch: [3][500/953] Elapsed 2m 15s (remain 2m 2s) Loss: 0.0154(0.0191) Grad: 17237.5527  LR: 0.00000984  
Epoch: [3][600/953] Elapsed 2m 42s (remain 1m 35s) Loss: 0.0044(0.0192) Grad: 11042.5703  LR: 0.00000918  
Epoch: [3][700/953] Elapsed 3m 9s (remain 1m 8s) Loss: 0.0162(0.0192) Grad: 31906.4766  LR: 0.00000853  
Epoch: [3][800/953] Elapsed 3m 36s (remain 0m 41s) Loss: 0.0129(0.0191) Grad: 17158.5527  LR: 0.00000788  
Epoch: [3][900/953] Elapsed 4m 3s (remain 

Epoch 3 - avg_train_loss: 0.0191  avg_val_loss: 0.0179  time: 298s
Epoch 3 - Score: 0.7986
Epoch 3 - Save Best Score: 0.7986 Model


Epoch: [4][0/953] Elapsed 0m 0s (remain 10m 39s) Loss: 0.0155(0.0155) Grad: 22772.6641  LR: 0.00000691  
Epoch: [4][100/953] Elapsed 0m 27s (remain 3m 52s) Loss: 0.0177(0.0175) Grad: 20560.6387  LR: 0.00000629  
Epoch: [4][200/953] Elapsed 0m 54s (remain 3m 23s) Loss: 0.0071(0.0175) Grad: 10545.3037  LR: 0.00000568  
Epoch: [4][300/953] Elapsed 1m 21s (remain 2m 56s) Loss: 0.0167(0.0173) Grad: 26317.2988  LR: 0.00000510  
Epoch: [4][400/953] Elapsed 1m 48s (remain 2m 28s) Loss: 0.0056(0.0173) Grad: 17881.7812  LR: 0.00000454  
Epoch: [4][500/953] Elapsed 2m 15s (remain 2m 1s) Loss: 0.0175(0.0178) Grad: 20753.9395  LR: 0.00000400  
Epoch: [4][600/953] Elapsed 2m 41s (remain 1m 34s) Loss: 0.0757(0.0180) Grad: 77093.8672  LR: 0.00000348  
Epoch: [4][700/953] Elapsed 3m 12s (remain 1m 9s) Loss: 0.0078(0.0177) Grad: 14656.6416  LR: 0.00000300  
Epoch: [4][800/953] Elapsed 3m 40s (remain 0m 41s) Loss: 0.0180(0.0179) Grad: 18349.8848  LR: 0.00000254  
Epoch: [4][900/953] Elapsed 4m 7s (remain

Epoch 4 - avg_train_loss: 0.0180  avg_val_loss: 0.0179  time: 302s
Epoch 4 - Score: 0.8031
Epoch 4 - Save Best Score: 0.8031 Model


Epoch: [5][0/953] Elapsed 0m 0s (remain 11m 8s) Loss: 0.0261(0.0261) Grad: 23971.6758  LR: 0.00000191  
Epoch: [5][100/953] Elapsed 0m 27s (remain 3m 53s) Loss: 0.0109(0.0175) Grad: 19944.1387  LR: 0.00000154  
Epoch: [5][200/953] Elapsed 0m 54s (remain 3m 24s) Loss: 0.0247(0.0173) Grad: 25657.1191  LR: 0.00000121  
Epoch: [5][300/953] Elapsed 1m 21s (remain 2m 56s) Loss: 0.0153(0.0174) Grad: 24218.8242  LR: 0.00000091  
Epoch: [5][400/953] Elapsed 1m 48s (remain 2m 29s) Loss: 0.0069(0.0173) Grad: 12356.4062  LR: 0.00000066  
Epoch: [5][500/953] Elapsed 2m 15s (remain 2m 2s) Loss: 0.0108(0.0174) Grad: 12319.2559  LR: 0.00000044  
Epoch: [5][600/953] Elapsed 2m 42s (remain 1m 35s) Loss: 0.0150(0.0174) Grad: 23490.6016  LR: 0.00000027  
Epoch: [5][700/953] Elapsed 3m 9s (remain 1m 8s) Loss: 0.0060(0.0175) Grad: 15991.6348  LR: 0.00000014  
Epoch: [5][800/953] Elapsed 3m 36s (remain 0m 41s) Loss: 0.0508(0.0175) Grad: 45666.8242  LR: 0.00000005  
Epoch: [5][900/953] Elapsed 4m 3s (remain 0

Epoch 5 - avg_train_loss: 0.0172  avg_val_loss: 0.0176  time: 298s
Epoch 5 - Score: 0.8063
Epoch 5 - Save Best Score: 0.8063 Model
Score: 0.8063
Some weights of the model checkpoint at microsoft/deberta-v3-xsmall were not used when initializing DebertaV2Model: ['mask_predictions.classifier.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.LayerNorm.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.dense.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.dense.bias', 'deberta.embeddings.word_embeddings._weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Mo

Epoch: [1][0/953] Elapsed 0m 0s (remain 10m 16s) Loss: 0.4755(0.4755) Grad: inf  LR: 0.00002000  
Epoch: [1][100/953] Elapsed 0m 27s (remain 3m 51s) Loss: 0.0312(0.1178) Grad: 6631.6650  LR: 0.00001998  
Epoch: [1][200/953] Elapsed 0m 54s (remain 3m 23s) Loss: 0.0493(0.0856) Grad: 4563.2114  LR: 0.00001991  
Epoch: [1][300/953] Elapsed 1m 21s (remain 2m 56s) Loss: 0.0303(0.0708) Grad: 6084.1367  LR: 0.00001980  
Epoch: [1][400/953] Elapsed 1m 48s (remain 2m 28s) Loss: 0.0336(0.0619) Grad: 9490.1797  LR: 0.00001965  
Epoch: [1][500/953] Elapsed 2m 14s (remain 2m 1s) Loss: 0.0099(0.0557) Grad: 2666.0525  LR: 0.00001946  
Epoch: [1][600/953] Elapsed 2m 41s (remain 1m 34s) Loss: 0.0197(0.0516) Grad: 5210.2344  LR: 0.00001923  
Epoch: [1][700/953] Elapsed 3m 8s (remain 1m 7s) Loss: 0.0412(0.0481) Grad: 11330.9189  LR: 0.00001895  
Epoch: [1][800/953] Elapsed 3m 35s (remain 0m 40s) Loss: 0.0348(0.0453) Grad: 11796.2471  LR: 0.00001864  
Epoch: [1][900/953] Elapsed 4m 2s (remain 0m 13s) Loss:

Epoch 1 - avg_train_loss: 0.0421  avg_val_loss: 0.0203  time: 297s
Epoch 1 - Score: 0.7629
Epoch 1 - Save Best Score: 0.7629 Model


Epoch: [2][0/953] Elapsed 0m 0s (remain 11m 26s) Loss: 0.0375(0.0375) Grad: 21758.3301  LR: 0.00001809  
Epoch: [2][100/953] Elapsed 0m 27s (remain 3m 54s) Loss: 0.0371(0.0207) Grad: 26312.5605  LR: 0.00001768  
Epoch: [2][200/953] Elapsed 0m 54s (remain 3m 24s) Loss: 0.0350(0.0210) Grad: 41627.4375  LR: 0.00001724  
Epoch: [2][300/953] Elapsed 1m 21s (remain 2m 57s) Loss: 0.0157(0.0204) Grad: 26368.5430  LR: 0.00001677  
Epoch: [2][400/953] Elapsed 1m 48s (remain 2m 29s) Loss: 0.0093(0.0205) Grad: 18051.1680  LR: 0.00001627  
Epoch: [2][500/953] Elapsed 2m 15s (remain 2m 2s) Loss: 0.0313(0.0204) Grad: 32408.4648  LR: 0.00001575  
Epoch: [2][600/953] Elapsed 2m 42s (remain 1m 35s) Loss: 0.0127(0.0202) Grad: 13189.5459  LR: 0.00001520  
Epoch: [2][700/953] Elapsed 3m 9s (remain 1m 8s) Loss: 0.0438(0.0204) Grad: 47037.8242  LR: 0.00001462  
Epoch: [2][800/953] Elapsed 3m 36s (remain 0m 41s) Loss: 0.0227(0.0203) Grad: 20408.3125  LR: 0.00001403  
Epoch: [2][900/953] Elapsed 4m 3s (remain 

Epoch 2 - avg_train_loss: 0.0203  avg_val_loss: 0.0182  time: 298s
Epoch 2 - Score: 0.7913
Epoch 2 - Save Best Score: 0.7913 Model


Epoch: [3][0/953] Elapsed 0m 0s (remain 10m 55s) Loss: 0.0252(0.0252) Grad: 25394.3223  LR: 0.00001309  
Epoch: [3][100/953] Elapsed 0m 27s (remain 3m 52s) Loss: 0.0342(0.0190) Grad: 49369.9062  LR: 0.00001245  
Epoch: [3][200/953] Elapsed 0m 54s (remain 3m 23s) Loss: 0.0083(0.0183) Grad: 12524.3838  LR: 0.00001181  
Epoch: [3][300/953] Elapsed 1m 21s (remain 2m 56s) Loss: 0.0051(0.0182) Grad: 5408.6279  LR: 0.00001116  
Epoch: [3][400/953] Elapsed 1m 48s (remain 2m 29s) Loss: 0.0288(0.0180) Grad: 23544.4824  LR: 0.00001050  
Epoch: [3][500/953] Elapsed 2m 15s (remain 2m 2s) Loss: 0.0214(0.0180) Grad: 21622.1719  LR: 0.00000984  
Epoch: [3][600/953] Elapsed 2m 42s (remain 1m 34s) Loss: 0.0148(0.0180) Grad: 19211.2734  LR: 0.00000918  
Epoch: [3][700/953] Elapsed 3m 9s (remain 1m 7s) Loss: 0.0091(0.0181) Grad: 13964.6738  LR: 0.00000853  
Epoch: [3][800/953] Elapsed 3m 36s (remain 0m 40s) Loss: 0.0226(0.0179) Grad: 23259.9219  LR: 0.00000788  
Epoch: [3][900/953] Elapsed 4m 2s (remain 0

Epoch 3 - avg_train_loss: 0.0179  avg_val_loss: 0.0176  time: 297s
Epoch 3 - Score: 0.8082
Epoch 3 - Save Best Score: 0.8082 Model


Epoch: [4][0/953] Elapsed 0m 0s (remain 11m 20s) Loss: 0.0080(0.0080) Grad: 15397.6035  LR: 0.00000691  
Epoch: [4][100/953] Elapsed 0m 27s (remain 3m 52s) Loss: 0.0102(0.0172) Grad: 28718.2871  LR: 0.00000629  
Epoch: [4][200/953] Elapsed 0m 54s (remain 3m 23s) Loss: 0.0133(0.0171) Grad: 30997.7539  LR: 0.00000568  
Epoch: [4][300/953] Elapsed 1m 21s (remain 2m 56s) Loss: 0.0090(0.0166) Grad: 13796.9863  LR: 0.00000510  
Epoch: [4][400/953] Elapsed 1m 48s (remain 2m 29s) Loss: 0.0084(0.0167) Grad: 12839.3047  LR: 0.00000454  
Epoch: [4][500/953] Elapsed 2m 15s (remain 2m 1s) Loss: 0.0265(0.0168) Grad: 20339.3301  LR: 0.00000400  
Epoch: [4][600/953] Elapsed 2m 42s (remain 1m 34s) Loss: 0.0261(0.0165) Grad: 43891.0586  LR: 0.00000348  
Epoch: [4][700/953] Elapsed 3m 8s (remain 1m 7s) Loss: 0.0134(0.0164) Grad: 32593.3262  LR: 0.00000300  
Epoch: [4][800/953] Elapsed 3m 35s (remain 0m 40s) Loss: 0.0127(0.0164) Grad: 20906.7422  LR: 0.00000254  
Epoch: [4][900/953] Elapsed 4m 2s (remain 

Epoch 4 - avg_train_loss: 0.0163  avg_val_loss: 0.0174  time: 297s
Epoch 4 - Score: 0.8134
Epoch 4 - Save Best Score: 0.8134 Model


Epoch: [5][0/953] Elapsed 0m 0s (remain 10m 57s) Loss: 0.0109(0.0109) Grad: 13080.4902  LR: 0.00000191  
Epoch: [5][100/953] Elapsed 0m 27s (remain 3m 52s) Loss: 0.0121(0.0154) Grad: 17196.3457  LR: 0.00000154  
Epoch: [5][200/953] Elapsed 0m 54s (remain 3m 24s) Loss: 0.0082(0.0152) Grad: 26002.1289  LR: 0.00000121  
Epoch: [5][300/953] Elapsed 1m 21s (remain 2m 56s) Loss: 0.0169(0.0152) Grad: 15289.7256  LR: 0.00000091  
Epoch: [5][400/953] Elapsed 1m 48s (remain 2m 29s) Loss: 0.0414(0.0151) Grad: 18385.8984  LR: 0.00000066  
Epoch: [5][500/953] Elapsed 2m 15s (remain 2m 1s) Loss: 0.0028(0.0154) Grad: 5430.8477  LR: 0.00000044  
Epoch: [5][600/953] Elapsed 2m 42s (remain 1m 34s) Loss: 0.0101(0.0152) Grad: 14376.4482  LR: 0.00000027  
Epoch: [5][700/953] Elapsed 3m 9s (remain 1m 7s) Loss: 0.0312(0.0156) Grad: 73419.4375  LR: 0.00000014  
Epoch: [5][800/953] Elapsed 3m 35s (remain 0m 40s) Loss: 0.0070(0.0156) Grad: 13232.7070  LR: 0.00000005  
Epoch: [5][900/953] Elapsed 4m 2s (remain 0

Epoch 5 - avg_train_loss: 0.0156  avg_val_loss: 0.0171  time: 298s
Epoch 5 - Score: 0.8134
Score: 0.8134
Some weights of the model checkpoint at microsoft/deberta-v3-xsmall were not used when initializing DebertaV2Model: ['mask_predictions.classifier.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.LayerNorm.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.dense.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.dense.bias', 'deberta.embeddings.word_embeddings._weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that 

Epoch: [1][0/953] Elapsed 0m 0s (remain 10m 9s) Loss: 0.6427(0.6427) Grad: inf  LR: 0.00002000  
Epoch: [1][100/953] Elapsed 0m 27s (remain 3m 51s) Loss: 0.0575(0.1441) Grad: 1717.2068  LR: 0.00001998  
Epoch: [1][200/953] Elapsed 0m 54s (remain 3m 23s) Loss: 0.0419(0.1000) Grad: 3495.2080  LR: 0.00001991  
Epoch: [1][300/953] Elapsed 1m 21s (remain 2m 55s) Loss: 0.0106(0.0805) Grad: 1161.7284  LR: 0.00001980  
Epoch: [1][400/953] Elapsed 1m 48s (remain 2m 28s) Loss: 0.0254(0.0686) Grad: 2494.6938  LR: 0.00001965  
Epoch: [1][500/953] Elapsed 2m 15s (remain 2m 1s) Loss: 0.0271(0.0612) Grad: 3300.4231  LR: 0.00001946  
Epoch: [1][600/953] Elapsed 2m 41s (remain 1m 34s) Loss: 0.0306(0.0561) Grad: 3489.4783  LR: 0.00001923  
Epoch: [1][700/953] Elapsed 3m 8s (remain 1m 7s) Loss: 0.0131(0.0516) Grad: 2245.0664  LR: 0.00001895  
Epoch: [1][800/953] Elapsed 3m 35s (remain 0m 40s) Loss: 0.0074(0.0482) Grad: 1508.3207  LR: 0.00001864  
Epoch: [1][900/953] Elapsed 4m 2s (remain 0m 13s) Loss: 0.

Epoch 1 - avg_train_loss: 0.0443  avg_val_loss: 0.0209  time: 297s
Epoch 1 - Score: 0.7366
Epoch 1 - Save Best Score: 0.7366 Model


Epoch: [2][0/953] Elapsed 0m 0s (remain 10m 37s) Loss: 0.0253(0.0253) Grad: 26829.7129  LR: 0.00001809  
Epoch: [2][100/953] Elapsed 0m 27s (remain 3m 53s) Loss: 0.0235(0.0207) Grad: 19048.4609  LR: 0.00001768  
Epoch: [2][200/953] Elapsed 0m 54s (remain 3m 24s) Loss: 0.0131(0.0194) Grad: 17074.7461  LR: 0.00001724  
Epoch: [2][300/953] Elapsed 1m 21s (remain 2m 56s) Loss: 0.0248(0.0196) Grad: 23628.8262  LR: 0.00001677  
Epoch: [2][400/953] Elapsed 1m 48s (remain 2m 29s) Loss: 0.0159(0.0194) Grad: 22835.2070  LR: 0.00001627  
Epoch: [2][500/953] Elapsed 2m 15s (remain 2m 2s) Loss: 0.0078(0.0194) Grad: 17276.0391  LR: 0.00001575  
Epoch: [2][600/953] Elapsed 2m 42s (remain 1m 35s) Loss: 0.0138(0.0194) Grad: 24287.1543  LR: 0.00001520  
Epoch: [2][700/953] Elapsed 3m 9s (remain 1m 8s) Loss: 0.0397(0.0192) Grad: 42747.5664  LR: 0.00001462  
Epoch: [2][800/953] Elapsed 3m 36s (remain 0m 41s) Loss: 0.0268(0.0196) Grad: 25642.1719  LR: 0.00001403  
Epoch: [2][900/953] Elapsed 4m 3s (remain 

Epoch 2 - avg_train_loss: 0.0195  avg_val_loss: 0.0192  time: 299s
Epoch 2 - Score: 0.7834
Epoch 2 - Save Best Score: 0.7834 Model


Epoch: [3][0/953] Elapsed 0m 0s (remain 11m 53s) Loss: 0.0357(0.0357) Grad: 40195.5039  LR: 0.00001309  
Epoch: [3][100/953] Elapsed 0m 27s (remain 3m 53s) Loss: 0.0299(0.0175) Grad: 48035.7539  LR: 0.00001245  
Epoch: [3][200/953] Elapsed 0m 54s (remain 3m 24s) Loss: 0.0094(0.0181) Grad: 26476.0098  LR: 0.00001181  
Epoch: [3][300/953] Elapsed 1m 21s (remain 2m 56s) Loss: 0.0248(0.0178) Grad: 24064.2129  LR: 0.00001116  
Epoch: [3][400/953] Elapsed 1m 48s (remain 2m 29s) Loss: 0.0129(0.0179) Grad: 14694.7188  LR: 0.00001050  
Epoch: [3][500/953] Elapsed 2m 15s (remain 2m 2s) Loss: 0.0255(0.0182) Grad: 30550.6074  LR: 0.00000984  
Epoch: [3][600/953] Elapsed 2m 42s (remain 1m 35s) Loss: 0.0044(0.0180) Grad: 11549.1494  LR: 0.00000918  
Epoch: [3][700/953] Elapsed 3m 9s (remain 1m 8s) Loss: 0.0174(0.0180) Grad: 19878.7969  LR: 0.00000853  
Epoch: [3][800/953] Elapsed 3m 36s (remain 0m 41s) Loss: 0.0060(0.0179) Grad: 6159.6504  LR: 0.00000788  
Epoch: [3][900/953] Elapsed 4m 3s (remain 0

Epoch 3 - avg_train_loss: 0.0179  avg_val_loss: 0.0181  time: 298s
Epoch 3 - Score: 0.7981
Epoch 3 - Save Best Score: 0.7981 Model


Epoch: [4][0/953] Elapsed 0m 0s (remain 12m 14s) Loss: 0.0168(0.0168) Grad: 17147.9902  LR: 0.00000691  
Epoch: [4][100/953] Elapsed 0m 27s (remain 3m 53s) Loss: 0.0106(0.0171) Grad: 12058.0215  LR: 0.00000629  
Epoch: [4][200/953] Elapsed 0m 54s (remain 3m 24s) Loss: 0.0250(0.0169) Grad: 45192.7305  LR: 0.00000568  
Epoch: [4][300/953] Elapsed 1m 21s (remain 2m 56s) Loss: 0.0196(0.0169) Grad: 26902.9316  LR: 0.00000510  
Epoch: [4][400/953] Elapsed 1m 48s (remain 2m 29s) Loss: 0.0284(0.0171) Grad: 16983.0605  LR: 0.00000454  
Epoch: [4][500/953] Elapsed 2m 15s (remain 2m 2s) Loss: 0.0179(0.0169) Grad: 19988.3164  LR: 0.00000400  
Epoch: [4][600/953] Elapsed 2m 42s (remain 1m 35s) Loss: 0.0119(0.0168) Grad: 34270.8555  LR: 0.00000348  
Epoch: [4][700/953] Elapsed 3m 9s (remain 1m 8s) Loss: 0.0137(0.0168) Grad: 18831.0273  LR: 0.00000300  
Epoch: [4][800/953] Elapsed 3m 36s (remain 0m 41s) Loss: 0.0257(0.0169) Grad: 24968.4375  LR: 0.00000254  
Epoch: [4][900/953] Elapsed 4m 3s (remain 

Epoch 4 - avg_train_loss: 0.0168  avg_val_loss: 0.0178  time: 298s
Epoch 4 - Score: 0.7997
Epoch 4 - Save Best Score: 0.7997 Model


Epoch: [5][0/953] Elapsed 0m 0s (remain 10m 37s) Loss: 0.0018(0.0018) Grad: 5086.6748  LR: 0.00000191  
Epoch: [5][100/953] Elapsed 0m 27s (remain 3m 52s) Loss: 0.0088(0.0160) Grad: 13396.0625  LR: 0.00000154  
Epoch: [5][200/953] Elapsed 0m 54s (remain 3m 23s) Loss: 0.0083(0.0159) Grad: 11912.0674  LR: 0.00000121  
Epoch: [5][300/953] Elapsed 1m 21s (remain 2m 56s) Loss: 0.0147(0.0157) Grad: 11915.6377  LR: 0.00000091  
Epoch: [5][400/953] Elapsed 1m 48s (remain 2m 29s) Loss: 0.0182(0.0160) Grad: 53411.5742  LR: 0.00000066  
Epoch: [5][500/953] Elapsed 2m 15s (remain 2m 1s) Loss: 0.0084(0.0161) Grad: 22407.2031  LR: 0.00000044  
Epoch: [5][600/953] Elapsed 2m 42s (remain 1m 34s) Loss: 0.0175(0.0160) Grad: 23674.7812  LR: 0.00000027  
Epoch: [5][700/953] Elapsed 3m 9s (remain 1m 7s) Loss: 0.0132(0.0162) Grad: 18531.5098  LR: 0.00000014  
Epoch: [5][800/953] Elapsed 3m 35s (remain 0m 40s) Loss: 0.0084(0.0161) Grad: 22939.8750  LR: 0.00000005  
Epoch: [5][900/953] Elapsed 4m 2s (remain 0

Epoch 5 - avg_train_loss: 0.0161  avg_val_loss: 0.0180  time: 297s
Epoch 5 - Score: 0.8002
Epoch 5 - Save Best Score: 0.8002 Model
Score: 0.8002
Some weights of the model checkpoint at microsoft/deberta-v3-xsmall were not used when initializing DebertaV2Model: ['mask_predictions.classifier.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.LayerNorm.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.dense.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.dense.bias', 'deberta.embeddings.word_embeddings._weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Mo

Epoch: [1][0/953] Elapsed 0m 0s (remain 11m 1s) Loss: 0.6633(0.6633) Grad: inf  LR: 0.00002000  
Epoch: [1][100/953] Elapsed 0m 27s (remain 3m 51s) Loss: 0.0417(0.1472) Grad: 5734.2993  LR: 0.00001998  
Epoch: [1][200/953] Elapsed 0m 54s (remain 3m 23s) Loss: 0.0240(0.0994) Grad: 1912.3892  LR: 0.00001991  
Epoch: [1][300/953] Elapsed 1m 21s (remain 2m 55s) Loss: 0.0236(0.0793) Grad: 2164.9587  LR: 0.00001980  
Epoch: [1][400/953] Elapsed 1m 48s (remain 2m 28s) Loss: 0.0315(0.0680) Grad: 2641.0327  LR: 0.00001965  
Epoch: [1][500/953] Elapsed 2m 14s (remain 2m 1s) Loss: 0.0411(0.0611) Grad: 5156.9307  LR: 0.00001946  
Epoch: [1][600/953] Elapsed 2m 41s (remain 1m 34s) Loss: 0.0289(0.0558) Grad: 3925.5107  LR: 0.00001923  
Epoch: [1][700/953] Elapsed 3m 8s (remain 1m 7s) Loss: 0.0156(0.0518) Grad: 3180.8286  LR: 0.00001895  
Epoch: [1][800/953] Elapsed 3m 35s (remain 0m 40s) Loss: 0.0121(0.0486) Grad: 2494.8289  LR: 0.00001864  
Epoch: [1][900/953] Elapsed 4m 2s (remain 0m 13s) Loss: 0.

Epoch 1 - avg_train_loss: 0.0446  avg_val_loss: 0.0207  time: 297s
Epoch 1 - Score: 0.7644
Epoch 1 - Save Best Score: 0.7644 Model


Epoch: [2][0/953] Elapsed 0m 0s (remain 9m 45s) Loss: 0.0138(0.0138) Grad: 18508.4746  LR: 0.00001809  
Epoch: [2][100/953] Elapsed 0m 27s (remain 3m 52s) Loss: 0.0124(0.0212) Grad: 17421.3730  LR: 0.00001768  
Epoch: [2][200/953] Elapsed 0m 54s (remain 3m 24s) Loss: 0.0374(0.0204) Grad: 38305.2539  LR: 0.00001724  
Epoch: [2][300/953] Elapsed 1m 21s (remain 2m 56s) Loss: 0.0127(0.0203) Grad: 14650.0684  LR: 0.00001677  
Epoch: [2][400/953] Elapsed 1m 48s (remain 2m 29s) Loss: 0.0135(0.0202) Grad: 25522.3730  LR: 0.00001627  
Epoch: [2][500/953] Elapsed 2m 15s (remain 2m 2s) Loss: 0.0268(0.0201) Grad: 23183.4238  LR: 0.00001575  
Epoch: [2][600/953] Elapsed 2m 42s (remain 1m 35s) Loss: 0.0123(0.0202) Grad: 17440.8477  LR: 0.00001520  
Epoch: [2][700/953] Elapsed 3m 9s (remain 1m 8s) Loss: 0.0120(0.0199) Grad: 19836.3789  LR: 0.00001462  
Epoch: [2][800/953] Elapsed 3m 36s (remain 0m 41s) Loss: 0.0348(0.0200) Grad: 50464.1719  LR: 0.00001403  
Epoch: [2][900/953] Elapsed 4m 3s (remain 0

Epoch 2 - avg_train_loss: 0.0200  avg_val_loss: 0.0185  time: 298s
Epoch 2 - Score: 0.7921
Epoch 2 - Save Best Score: 0.7921 Model


Epoch: [3][0/953] Elapsed 0m 0s (remain 11m 29s) Loss: 0.0045(0.0045) Grad: 8201.5098  LR: 0.00001309  
Epoch: [3][100/953] Elapsed 0m 27s (remain 3m 54s) Loss: 0.0121(0.0185) Grad: 20288.7910  LR: 0.00001245  
Epoch: [3][200/953] Elapsed 0m 54s (remain 3m 25s) Loss: 0.0596(0.0193) Grad: 25469.4062  LR: 0.00001181  
Epoch: [3][300/953] Elapsed 1m 21s (remain 2m 57s) Loss: 0.0175(0.0198) Grad: 13940.1074  LR: 0.00001116  
Epoch: [3][400/953] Elapsed 1m 48s (remain 2m 29s) Loss: 0.0108(0.0195) Grad: 15851.3174  LR: 0.00001050  
Epoch: [3][500/953] Elapsed 2m 15s (remain 2m 2s) Loss: 0.0127(0.0192) Grad: 20479.9219  LR: 0.00000984  
Epoch: [3][600/953] Elapsed 2m 42s (remain 1m 35s) Loss: 0.0090(0.0188) Grad: 19860.5020  LR: 0.00000918  
Epoch: [3][700/953] Elapsed 3m 9s (remain 1m 8s) Loss: 0.0215(0.0187) Grad: 21047.9395  LR: 0.00000853  
Epoch: [3][800/953] Elapsed 3m 36s (remain 0m 41s) Loss: 0.0045(0.0186) Grad: 10721.0186  LR: 0.00000788  
Epoch: [3][900/953] Elapsed 4m 3s (remain 0

Epoch 3 - avg_train_loss: 0.0185  avg_val_loss: 0.0177  time: 299s
Epoch 3 - Score: 0.8045
Epoch 3 - Save Best Score: 0.8045 Model


Epoch: [4][0/953] Elapsed 0m 0s (remain 10m 52s) Loss: 0.0170(0.0170) Grad: 53445.2891  LR: 0.00000691  
Epoch: [4][100/953] Elapsed 0m 27s (remain 3m 53s) Loss: 0.0176(0.0184) Grad: 15453.7715  LR: 0.00000629  
Epoch: [4][200/953] Elapsed 0m 54s (remain 3m 24s) Loss: 0.0085(0.0187) Grad: 9871.5762  LR: 0.00000568  
Epoch: [4][300/953] Elapsed 1m 21s (remain 2m 56s) Loss: 0.0077(0.0176) Grad: 10115.2422  LR: 0.00000510  
Epoch: [4][400/953] Elapsed 1m 48s (remain 2m 29s) Loss: 0.0095(0.0173) Grad: 12121.6895  LR: 0.00000454  
Epoch: [4][500/953] Elapsed 2m 15s (remain 2m 2s) Loss: 0.0076(0.0175) Grad: 10136.3838  LR: 0.00000400  
Epoch: [4][600/953] Elapsed 2m 42s (remain 1m 35s) Loss: 0.0170(0.0175) Grad: 24298.7363  LR: 0.00000348  
Epoch: [4][700/953] Elapsed 3m 9s (remain 1m 8s) Loss: 0.0149(0.0175) Grad: 27636.5410  LR: 0.00000300  
Epoch: [4][800/953] Elapsed 3m 36s (remain 0m 41s) Loss: 0.0195(0.0175) Grad: 38050.3125  LR: 0.00000254  
Epoch: [4][900/953] Elapsed 4m 3s (remain 0

Epoch 4 - avg_train_loss: 0.0173  avg_val_loss: 0.0173  time: 298s
Epoch 4 - Score: 0.8124
Epoch 4 - Save Best Score: 0.8124 Model


Epoch: [5][0/953] Elapsed 0m 0s (remain 11m 53s) Loss: 0.0129(0.0129) Grad: 21011.4043  LR: 0.00000191  
Epoch: [5][100/953] Elapsed 0m 27s (remain 3m 53s) Loss: 0.0205(0.0176) Grad: 44643.1016  LR: 0.00000154  
Epoch: [5][200/953] Elapsed 0m 54s (remain 3m 24s) Loss: 0.0104(0.0172) Grad: 12042.8506  LR: 0.00000121  
Epoch: [5][300/953] Elapsed 1m 21s (remain 2m 57s) Loss: 0.0151(0.0170) Grad: 21388.1230  LR: 0.00000091  
Epoch: [5][400/953] Elapsed 1m 48s (remain 2m 29s) Loss: 0.0096(0.0167) Grad: 25731.3301  LR: 0.00000066  
Epoch: [5][500/953] Elapsed 2m 15s (remain 2m 2s) Loss: 0.0110(0.0167) Grad: 13108.6562  LR: 0.00000044  
Epoch: [5][600/953] Elapsed 2m 42s (remain 1m 35s) Loss: 0.0206(0.0168) Grad: 31413.4570  LR: 0.00000027  
Epoch: [5][700/953] Elapsed 3m 9s (remain 1m 8s) Loss: 0.0091(0.0168) Grad: 18393.0020  LR: 0.00000014  
Epoch: [5][800/953] Elapsed 3m 36s (remain 0m 41s) Loss: 0.0047(0.0169) Grad: 6908.9985  LR: 0.00000005  
Epoch: [5][900/953] Elapsed 4m 3s (remain 0

Epoch 5 - avg_train_loss: 0.0168  avg_val_loss: 0.0174  time: 299s
Epoch 5 - Score: 0.8098
Score: 0.8124
Score: 0.8064


In [None]:
# Recursively (-r) archive the whole /content/project tree into
# /content/project.zip so the run can be exported as a single file.
# NOTE(review): per the listing printed below, the archive also picks up the
# downloaded input archives under project/input/ — confirm that is intended.
!zip -r /content/project.zip /content/project

  adding: content/project/ (stored 0%)
  adding: content/project/input/ (stored 0%)
  adding: content/project/input/nbme-score-clinical-patient-notes/ (stored 0%)
  adding: content/project/input/nbme-score-clinical-patient-notes/features.csv (deflated 52%)
  adding: content/project/input/nbme-score-clinical-patient-notes/test.csv (deflated 55%)
  adding: content/project/input/nbme-score-clinical-patient-notes/patient_notes.csv (deflated 72%)
  adding: content/project/input/nbme-score-clinical-patient-notes/sample_submission.csv (deflated 34%)
  adding: content/project/input/nbme-score-clinical-patient-notes/train.csv (deflated 74%)
  adding: content/project/input/deberta v2_3 fast tokenizer.zip (stored 0%)
  adding: content/project/input/nbme-score-clinical-patient-notes.zip (stored 0%)
  adding: content/project/input/deberta-v2-3-fast-tokenizer/ (stored 0%)
  adding: content/project/input/deberta-v2-3-fast-tokenizer/transformers__init__.py (deflated 86%)
  adding: content/project/inpu

In [None]:
# Display the current wall-clock time as a Unix timestamp (float seconds).
# NOTE(review): relies on `time` having been imported in an earlier cell that
# is not visible here — presumably the stdlib `time` module; confirm.
time.time()

1651395864.7975044

In [None]:
# Persist the zipped project to Google Drive.
# `google.colab` is only importable inside a Colab runtime.
from google.colab import drive
# Mount the user's Drive at /content/drive; the copy below needs this path.
drive.mount('/content/drive')
# Copy the archive into the top level of "MyDrive". This is a shell escape, so
# a failed copy does not raise a Python exception — check the cell output.
!cp "/content/project.zip" "/content/drive/MyDrive/"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
