In [1]:
# ====================================================
# CFG
# ====================================================
class CFG:
    wandb=True
    competition='FB3'
    _wandb_kernel='nakama'
    debug=False
    apex=True
    print_freq=20
    num_workers=4
    model="microsoft/deberta-v3-base"
    gradient_checkpointing=True
    scheduler='cosine' # ['linear', 'cosine']
    batch_scheduler=True
    num_cycles=0.5
    num_warmup_steps=8
    epochs=3
    encoder_lr=5e-5
    decoder_lr=1e-4
    min_lr=1e-7
    eps=1e-6
    betas=(0.9, 0.999)
    batch_size=1
    max_len=None
    weight_decay=0.01
    gradient_accumulation_steps=1
    max_grad_norm=1000
    target_cols=['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']
    seed=42
    n_fold=4
    trn_fold=[0, 1, 2, 3]
    train=True
    llrd = 0.97
    loss_weight = [0.21,0.16,0.10, 0.16, 0.21,0.16]
    
if CFG.debug:
    CFG.epochs = 2
    CFG.trn_fold = [0]

In [2]:
OUTPUT_DIR = '/home/u210810417/'

In [3]:
# wandb
# ====================================================
if CFG.wandb:
    
    import wandb

    try:
        from kaggle_secrets import UserSecretsClient
        user_secrets = UserSecretsClient()
        secret_value_0 = user_secrets.get_secret("wandb_api")
        wandb.login(key=secret_value_0)
        anony = None
    except:
        anony = "must"
        print('If you want to use your W&B account, go to Add-ons -> Secrets and provide your W&B access token. Use the Label name as wandb_api. \nGet your W&B access token from here: https://wandb.ai/authorize')


    def class2dict(f):
        return dict((name, getattr(f, name)) for name in dir(f) if not name.startswith('__'))

    run = wandb.init(project='FB3-Public', 
                     name=CFG.model,
                     config=class2dict(CFG),
                     group=CFG.model,
                     job_type="train",
                     anonymous=anony)

If you want to use your W&B account, go to Add-ons -> Secrets and provide your W&B access token. Use the Label name as wandb_api. 
Get your W&B access token from here: https://wandb.ai/authorize


[34m[1mwandb[0m: Currently logged in as: [33manony-mouse-430222[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [4]:
# ====================================================
# Library
# ====================================================
import os
import gc
import re
import ast
import sys
import copy
import json
import time
import math
import string
import pickle
import random
import joblib
import itertools
import warnings
warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

os.system('pip install iterative-stratification==0.1.7')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset

# os.system('pip uninstall -y transformers')
# os.system('pip uninstall -y tokenizers')
# os.system('python -m pip install --no-index --find-links=../input/fb3-pip-wheels transformers')
# os.system('python -m pip install --no-index --find-links=../input/fb3-pip-wheels tokenizers')
import tokenizers
import transformers
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
%env TOKENIZERS_PARALLELISM=true

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

Defaulting to user installation because normal site-packages is not writeable
tokenizers.__version__: 0.12.1
transformers.__version__: 4.21.2
env: TOKENIZERS_PARALLELISM=true


In [5]:
# ====================================================
# Utils
# ====================================================
def MCRMSE(y_trues, y_preds):
    scores = []
    idxes = y_trues.shape[1]
    for i in range(idxes):
        y_true = y_trues[:,i]
        y_pred = y_preds[:,i]
        score = mean_squared_error(y_true, y_pred, squared=False) # RMSE
        scores.append(score)
    mcrmse_score = np.mean(scores)
    return mcrmse_score, scores


def get_score(y_trues, y_preds):
    mcrmse_score, scores = MCRMSE(y_trues, y_preds)
    return mcrmse_score, scores


def get_logger(filename=OUTPUT_DIR+'train'):
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()


def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(seed=42)

In [6]:
# ====================================================
# Data Loading
# ====================================================
train = pd.read_csv('/home/u210810417/train.csv')
test = pd.read_csv('/home/u210810417/test.csv')
submission = pd.read_csv('/home/u210810417/sample_submission.csv')

print(f"train.shape: {train.shape}")
display(train.head())
print(f"test.shape: {test.shape}")
display(test.head())
print(f"submission.shape: {submission.shape}")
display(submission.head())

train.shape: (3911, 8)


Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0016926B079C,I think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0
1,0022683E9EA5,When a problem is a change you have to let it ...,2.5,2.5,3.0,2.0,2.0,2.5
2,00299B378633,"Dear, Principal\n\nIf u change the school poli...",3.0,3.5,3.0,3.0,3.0,2.5
3,003885A45F42,The best time in life is when you become yours...,4.5,4.5,4.5,4.5,4.0,5.0
4,0049B1DF5CCC,Small act of kindness can impact in other peop...,2.5,3.0,3.0,3.0,2.5,2.5


test.shape: (3, 2)


Unnamed: 0,text_id,full_text
0,0000C359D63E,when a person has no experience on a job their...
1,000BAD50D026,Do you think students would benefit from being...
2,00367BB2546B,"Thomas Jefferson once states that ""it is wonde..."


submission.shape: (3, 7)


Unnamed: 0,text_id,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0000C359D63E,3.0,3.0,3.0,3.0,3.0,3.0
1,000BAD50D026,3.0,3.0,3.0,3.0,3.0,3.0
2,00367BB2546B,3.0,3.0,3.0,3.0,3.0,3.0


In [7]:
# ====================================================
# CV split
# ====================================================
Fold = MultilabelStratifiedKFold(n_splits=CFG.n_fold, shuffle=True, random_state=CFG.seed)
for n, (train_index, val_index) in enumerate(Fold.split(train, train[CFG.target_cols])):
    train.loc[val_index, 'fold'] = int(n)
train['fold'] = train['fold'].astype(int)
display(train.groupby('fold').size())

fold
0    978
1    977
2    978
3    978
dtype: int64

In [8]:
# ====================================================
# tokenizer
# ====================================================
tokenizer = AutoTokenizer.from_pretrained(CFG.model)
tokenizer.save_pretrained(OUTPUT_DIR)
CFG.tokenizer = tokenizer

wandb: Network error (TransientError), entering retry loop.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [9]:
# ====================================================
# Define max_len
# ====================================================
lengths = []
tk0 = tqdm(train['full_text'].fillna("").values, total=len(train))
for text in tk0:
    length = len(tokenizer(text, add_special_tokens=False)['input_ids'])
    lengths.append(length)
CFG.max_len = max(lengths) + 3 # cls & sep & sep
# CFG.max_len = 512
LOGGER.info(f"max_len: {CFG.max_len}")

  0%|          | 0/3911 [00:00<?, ?it/s]

max_len: 1429


In [10]:
# ====================================================
# Dataset
# ====================================================
def prepare_input(cfg, text):
    inputs = cfg.tokenizer.encode_plus(
        text, 
        return_tensors=None, 
        add_special_tokens=True, 
        max_length=CFG.max_len,
        pad_to_max_length=True,
        truncation=True
    )
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs


class TrainDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.texts = df['full_text'].values
        self.labels = df[cfg.target_cols].values

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, self.texts[item])
        label = torch.tensor(self.labels[item], dtype=torch.float)
        return inputs, label
    

def collate(inputs):
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:,:mask_len]
    return inputs

In [11]:
# ====================================================
# Model
# ====================================================
class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings
    
    
# class WeightedLayerPooling(nn.Module):
#     def __init__(self, num_hidden_layers, layer_start: int = 9, layer_weights = None):
#         super(WeightedLayerPooling, self).__init__()
#         self.layer_start = layer_start
#         self.num_hidden_layers = num_hidden_layers
#         self.layer_weights = layer_weights if layer_weights is not None \
#             else nn.Parameter(
#                 torch.tensor([1] * (num_hidden_layers+1 - layer_start), dtype=torch.float)
#             )

#     def forward(self, features):

#         all_layer_embedding = torch.stack(features)
#         all_layer_embedding = all_layer_embedding[self.layer_start:, :, :, :]

#         weight_factor = self.layer_weights.unsqueeze(-1).unsqueeze(-1).unsqueeze(-1).expand(all_layer_embedding.size())
#         weighted_average = (weight_factor*all_layer_embedding).sum(dim=0) / self.layer_weights.sum()

#         return weighted_average
    

class CustomModel(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            self.config.hidden_dropout = 0.
            self.config.hidden_dropout_prob = 0.
            self.config.attention_dropout = 0.
            self.config.attention_probs_dropout_prob = 0.
            LOGGER.info(self.config)
        else:
            self.config = torch.load(config_path)
            self.config.output_hidden_layers = True
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
            self.config.output_hidden_layers = True
        else:
            self.model = AutoModel(self.config)
        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()
#         self.config.update({'output_hidden_states':True})
        # self.pool = WeightedLayerPooling(self.config.num_hidden_layers)
        self.pool = MeanPooling()
        
        in_dim = self.config.hidden_size
        self.attention = nn.Sequential(
            nn.Linear(in_dim, in_dim),
            nn.LayerNorm(in_dim),
            nn.GELU(),
            nn.Linear(in_dim, 1),
            )
        
        self.fc = nn.Linear(self.config.hidden_size, 6)

        self._init_weights(self.fc)
        
        
        
        for module in self.attention.modules():
            self._init_weights(module)
        
        
        
        
        
        
        REINIT_LAYERS = 1
        for layer in self.model.encoder.layer[-REINIT_LAYERS:]:
            for module in layer.modules():
                self._init_weights(module)
            print('Done')
     
            
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight = torch.nn.init.kaiming_normal_(module.weight)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
            

    
    

##############################################
#     def forward(self, inputs):
# #         self.config.update({'output_hidden_states':True})
#         outputs = self.model(**inputs)
        
        
#         feature = self.pool(outputs.hidden_states)

#         return self.fc(feature)


#######################################################
#     def feature(self, inputs):
#         outputs = self.model(**inputs)
#         last_hidden_states = outputs[0]
#         feature = self.pool(last_hidden_states, inputs['attention_mask'])
#         return feature

#     def forward(self, inputs):
#         feature = self.feature(inputs)
#         output = self.fc(feature)
#         return output

##########################################################
    def forward(self,inputs):

        outputs = self.model(**inputs)
        x = outputs[0]
        mask = inputs['attention_mask']
        w = self.attention(x).float()
        w[mask==0]=float('-inf')
        w = torch.softmax(w,1)
        x = torch.sum(w * x, dim=1)
        return self.fc(x)


In [12]:
# ====================================================
# Loss
# ====================================================
class RMSELoss(nn.Module):
    def __init__(self, reduction='mean', eps=1e-9):
        super().__init__()
        self.mse = nn.MSELoss(reduction='none')
        self.reduction = reduction
        self.eps = eps

    def forward(self, y_pred, y_true):
        loss = torch.sqrt(self.mse(y_pred, y_true) + self.eps)
        if self.reduction == 'none':
            loss = loss
        elif self.reduction == 'sum':
            loss = loss.sum()
        elif self.reduction == 'mean':
            loss = loss.mean()
        return loss

In [13]:
class Weighted_Loss(nn.Module):
    def __init__(self,criterion):
        super().__init__()
        self.criterion = criterion
    def forward(self,y_pred,y_true):
        loss = 0
        for i in range(len(CFG.target_cols)):
            loss += self.criterion(y_pred[:,i],y_true[:,i])*CFG.loss_weight[i]
        return loss
        

In [14]:
# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))


def train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device):
    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = AverageMeter()
    start = end = time.time()
    global_step = 0
    for step, (inputs, labels) in enumerate(train_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.cuda.amp.autocast(enabled=CFG.apex):
            y_preds = model(inputs)
            loss = criterion(y_preds, labels)
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            global_step += 1
            if CFG.batch_scheduler:
                scheduler.step()
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, step, len(train_loader), 
                          remain=timeSince(start, float(step+1)/len(train_loader)),
                          loss=losses,
                          grad_norm=grad_norm,
                          lr=scheduler.get_lr()[0]))
        if CFG.wandb:
            wandb.log({f"[fold{fold}] loss": losses.val,
                       f"[fold{fold}] lr": scheduler.get_lr()[0]})
    return losses.avg


def valid_fn(valid_loader, model, criterion, device):
    losses = AverageMeter()
    model.eval()
    preds = []
    start = end = time.time()
    for step, (inputs, labels) in enumerate(valid_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.no_grad():
            y_preds = model(inputs)
            loss = criterion(y_preds, labels)
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        preds.append(y_preds.to('cpu').numpy())
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, len(valid_loader),
                          loss=losses,
                          remain=timeSince(start, float(step+1)/len(valid_loader))))
    predictions = np.concatenate(preds)
    return losses.avg, predictions

In [15]:
# ====================================================
# train loop
# ====================================================
def train_loop(folds, fold):
    
    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    train_folds = folds[folds['fold'] != fold].reset_index(drop=True)
    valid_folds = folds[folds['fold'] == fold].reset_index(drop=True)
    valid_labels = valid_folds[CFG.target_cols].values
    
    train_dataset = TrainDataset(CFG, train_folds)
    valid_dataset = TrainDataset(CFG, valid_folds)

    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size * 2,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomModel(CFG, config_path=None, pretrained=True)
    torch.save(model.config, OUTPUT_DIR+'config.pth')
    model.to(device)
    
    def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.9):
        
        
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        layer_names = []

        for idx, (name, param) in enumerate(model.model.named_parameters()):
            if not any(nd in name for nd in no_decay):
                layer_names.append(name)
            # print(f'{idx}: {name}')
            
        layer_names.reverse()
        lr      = encoder_lr
        lr_mult = CFG.llrd
        #0.995 0.4444
        # print(layer_names)
        # placeholder
        parameters = []
        
        
        
        # store params & learning rates
        for idx, name in enumerate(layer_names):
    # display info
            # print(f'{idx}: lr = {lr:.6f}, {name}')
    
    # append layer parameters
            parameters += [{'params': [p for n, p in model.model.named_parameters() if n == name],
                    'lr':lr, 'weight_decay':weight_decay}]
    
    # update learning rate
            lr *= lr_mult
        parameters +=[{'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': 0.0},
            {'params': [p for n, p in model.named_parameters() if "model" not in n],
             'lr': decoder_lr, 'weight_decay': 0.0}]
        for n,p in model.named_parameters():
            if "model" not in n:
                if "attention" in n:
                    print(n)
        # print(parameters)
        # print(parameters)
        return parameters

#     def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.9):
        
        
#         no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
#         layer_names = []

#         for idx, (name, param) in enumerate(model.model.named_parameters()):
#             if not any(nd in name for nd in no_decay):
#                 layer_names.append(name)
#             # print(f'{idx}: {name}')
            
#         layer_names.reverse()
#         lr      = encoder_lr
#         lr_mult = 0.993
#         #0.995 0.4444
#         # print(layer_names)
#         # placeholder
#         parameters = []
        
        
        
#         # store params & learning rates
#         for idx, name in enumerate(layer_names):
#     # display info
#             # print(f'{idx}: lr = {lr:.6f}, {name}')
    
#     # append layer parameters
#             parameters += [{'params': [p for n, p in model.model.named_parameters() if n == name],
#                     'lr':lr, 'weight_decay':weight_decay}]
    
#     # update learning rate
#             lr *= lr_mult
#         parameters +=[{'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
#              'lr': encoder_lr, 'weight_decay': 0.0},
#             {'params': [p for n, p in model.named_parameters() if "model" not in n and "attention" in n],
#              'lr': encoder_lr, 'weight_decay': 0.0},
#             {'params': [p for n, p in model.named_parameters() if "model" not in n and "attention" not in n],
#              'lr': decoder_lr, 'weight_decay': 0.0}]
#         for n,p in model.named_parameters():
#             if "model" not in n:
#                 if "attention" in n:
#                     print(n)
#         # print(parameters)
#         # print(parameters)
#         return parameters
    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=CFG.encoder_lr, 
                                                decoder_lr=CFG.decoder_lr,
                                                weight_decay=CFG.weight_decay)
    optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)
    
    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        if cfg.scheduler == 'linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
            )
        elif cfg.scheduler == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        return scheduler
    
    num_train_steps = int(len(train_folds) / CFG.batch_size * CFG.epochs)
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    criterion = Weighted_Loss(nn.SmoothL1Loss(reduction='mean')) # RMSELoss(reduction="mean")
 
    best_score = np.inf

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device)

        # eval
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device)
        
        # scoring
        score, scores = get_score(valid_labels, predictions)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}  Scores: {scores}')
        if CFG.wandb:
            wandb.log({f"[fold{fold}] epoch": epoch+1, 
                       f"[fold{fold}] avg_train_loss": avg_loss, 
                       f"[fold{fold}] avg_val_loss": avg_val_loss,
                       f"[fold{fold}] score": score})
        
        if best_score > score:
            best_score = score
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': predictions},
                        OUTPUT_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth")

    predictions = torch.load(OUTPUT_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth", 
                             map_location=torch.device('cpu'))['predictions']
    valid_folds[[f"pred_{c}" for c in CFG.target_cols]] = predictions

    torch.cuda.empty_cache()
    gc.collect()
    
    return valid_folds

In [16]:
# if __name__ == '__main__':
    
#     def get_result(oof_df):
#         labels = oof_df[CFG.target_cols].values
#         preds = oof_df[[f"pred_{c}" for c in CFG.target_cols]].values
#         score, scores = get_score(labels, preds)
#         LOGGER.info(f'Score: {score:<.4f}  Scores: {scores}')
    
#     if CFG.train:
#         oof_df = pd.DataFrame()
#         for fold in range(CFG.n_fold):
#             if fold in CFG.trn_fold:
#                 _oof_df = train_loop(train, fold)
#                 oof_df = pd.concat([oof_df, _oof_df])
#                 LOGGER.info(f"========== fold: {fold} result ==========")
#                 get_result(_oof_df)
#         oof_df = oof_df.reset_index(drop=True)
#         LOGGER.info(f"========== CV ==========")
#         get_result(oof_df)
#         oof_df.to_pickle(OUTPUT_DIR+'oof_df.pkl')
        
#     if CFG.wandb:
#         wandb.finish()

In [17]:
import optuna
    
def get_result(oof_df):
        labels = oof_df[CFG.target_cols].values
        preds = oof_df[[f"pred_{c}" for c in CFG.target_cols]].values
        score, scores = get_score(labels, preds)
        LOGGER.info(f'Score: {score:<.4f}  Scores: {scores}')
        return score
def objective(trial):

    CFG.print_freq = np.inf
    # CFG.num_warmup_step = trial.suggest_int('num_warmup_step',
    CFG.epoch = trial.suggest_int('epoch',3,4)
    encoder_lr = trial.suggest_float('encoder_lr',5e-6,5e-4)
    decoder_lr = 2*encoder_lr
    llrd = trial.suggest_float('llrd',0.8,0.99)

    oof_df = pd.DataFrame()
    for fold in range(CFG.n_fold):
        if fold in CFG.trn_fold:
            _oof_df = train_loop(train, fold)
            oof_df = pd.concat([oof_df, _oof_df])
            # LOGGER.info(f"========== fold: {fold} result ==========")
            
    oof_df = oof_df.reset_index(drop=True)
    print(11111111111111111111111111111111)
    LOGGER.info(f"========== CV ==========")
    score = get_result(oof_df)
    print(score)
    return score
        
            
if __name__ == "__main__":
    study = optuna.create_study(direction="minimize")
    study.optimize(objective, timeout=40000)

    pruned_trials = study.get_trials(deepcopy=False)
    complete_trials = study.get_trials(deepcopy=False)

    print("Study statistics: ")
    print("  Number of finished trials: ", len(study.trials))
    print("  Number of pruned trials: ", len(pruned_trials))
    print("  Number of complete trials: ", len(complete_trials))

    print("Best trial:")
    trial = study.best_trial

    print("  Value: ", trial.value)

    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

[32m[I 2022-12-14 01:48:52,156][0m A new study created in memory with name: no-name-7f767321-9887-4d3b-8957-db8e28813799[0m
DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.21.2",
  "type_vocab_size": 0,
  "vocab_siz

Done
attention.0.weight
attention.0.bias
attention.1.weight
attention.1.bias
attention.3.weight
attention.3.bias
Epoch: [1][0/2933] Elapsed 0m 1s (remain 88m 37s) Loss: 2.1010(2.1010) Grad: inf  LR: 0.00000625  
Epoch: [1][2932/2933] Elapsed 6m 50s (remain 0m 0s) Loss: 0.0917(0.1762) Grad: 186656.6875  LR: 0.00003754  
EVAL: [0/489] Elapsed 0m 0s (remain 2m 44s) Loss: 0.0836(0.0836) 


Epoch 1 - avg_train_loss: 0.1762  avg_val_loss: 0.1180  time: 450s
Epoch 1 - Score: 0.4783  Scores: [0.5388780363028517, 0.4407438434289395, 0.41600740238790335, 0.4945145057247125, 0.500524024628813, 0.4792564375476278]
Epoch 1 - Save Best Score: 0.4783 Model


EVAL: [488/489] Elapsed 0m 39s (remain 0m 0s) Loss: 0.0891(0.1180) 
Epoch: [2][0/2933] Elapsed 0m 0s (remain 18m 42s) Loss: 0.0764(0.0764) Grad: 360640.2500  LR: 0.00003753  
Epoch: [2][2932/2933] Elapsed 6m 50s (remain 0m 0s) Loss: 0.4474(0.1145) Grad: 406937.9375  LR: 0.00001252  
EVAL: [0/489] Elapsed 0m 0s (remain 2m 35s) Loss: 0.0994(0.0994) 


Epoch 2 - avg_train_loss: 0.1145  avg_val_loss: 0.1094  time: 450s
Epoch 2 - Score: 0.4607  Scores: [0.48272728814153354, 0.44295246831876356, 0.4121399877199979, 0.4548005388909946, 0.5205580824786445, 0.4509041387020568]
Epoch 2 - Save Best Score: 0.4607 Model


EVAL: [488/489] Elapsed 0m 39s (remain 0m 0s) Loss: 0.1718(0.1094) 
Epoch: [3][0/2933] Elapsed 0m 0s (remain 19m 40s) Loss: 0.1107(0.1107) Grad: 397374.0938  LR: 0.00001251  
Epoch: [3][2932/2933] Elapsed 6m 49s (remain 0m 0s) Loss: 0.0237(0.0890) Grad: 91700.8984  LR: 0.00000000  
EVAL: [0/489] Elapsed 0m 0s (remain 2m 31s) Loss: 0.0913(0.0913) 


Epoch 3 - avg_train_loss: 0.0890  avg_val_loss: 0.1030  time: 450s
Epoch 3 - Score: 0.4495  Scores: [0.4811280288916606, 0.4441163215575171, 0.4110403757782563, 0.4533870881421869, 0.4688590498454551, 0.43830030843951695]
Epoch 3 - Save Best Score: 0.4495 Model


EVAL: [488/489] Elapsed 0m 39s (remain 0m 0s) Loss: 0.1122(0.1030) 


DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.21.2",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model:

Done
attention.0.weight
attention.0.bias
attention.1.weight
attention.1.bias
attention.3.weight
attention.3.bias
Epoch: [1][0/2934] Elapsed 0m 0s (remain 18m 5s) Loss: 4.3462(4.3462) Grad: inf  LR: 0.00000625  
Epoch: [1][2933/2934] Elapsed 6m 48s (remain 0m 0s) Loss: 0.0462(0.1807) Grad: 282606.0625  LR: 0.00003754  
EVAL: [0/489] Elapsed 0m 0s (remain 2m 18s) Loss: 0.1055(0.1055) 


Epoch 1 - avg_train_loss: 0.1807  avg_val_loss: 0.1234  time: 448s
Epoch 1 - Score: 0.4968  Scores: [0.51673493800886, 0.49730093920826296, 0.47394026247299686, 0.5727011567283309, 0.4694405477912625, 0.4509633824699739]
Epoch 1 - Save Best Score: 0.4968 Model


EVAL: [488/489] Elapsed 0m 39s (remain 0m 0s) Loss: 0.0971(0.1234) 
Epoch: [2][0/2934] Elapsed 0m 0s (remain 19m 58s) Loss: 0.1161(0.1161) Grad: 408108.3125  LR: 0.00003753  
Epoch: [2][2933/2934] Elapsed 6m 49s (remain 0m 0s) Loss: 0.1033(0.1155) Grad: 420775.7500  LR: 0.00001252  
EVAL: [0/489] Elapsed 0m 0s (remain 2m 24s) Loss: 0.0567(0.0567) 


Epoch 2 - avg_train_loss: 0.1155  avg_val_loss: 0.1117  time: 449s
Epoch 2 - Score: 0.4703  Scores: [0.4890629472804949, 0.4622692624942983, 0.4379964911910309, 0.4821853611819679, 0.48392889906933284, 0.4662387538648796]
Epoch 2 - Save Best Score: 0.4703 Model


EVAL: [488/489] Elapsed 0m 40s (remain 0m 0s) Loss: 0.0419(0.1117) 
Epoch: [3][0/2934] Elapsed 0m 0s (remain 21m 36s) Loss: 0.0357(0.0357) Grad: 259511.8125  LR: 0.00001251  
Epoch: [3][2933/2934] Elapsed 6m 48s (remain 0m 0s) Loss: 0.1169(0.0932) Grad: 465966.0000  LR: 0.00000000  
EVAL: [0/489] Elapsed 0m 0s (remain 2m 24s) Loss: 0.0863(0.0863) 


Epoch 3 - avg_train_loss: 0.0932  avg_val_loss: 0.1042  time: 449s
Epoch 3 - Score: 0.4527  Scores: [0.48920672879097804, 0.44195898061803535, 0.41820679143375455, 0.4503278038647289, 0.4668670196960002, 0.4496337719271793]
Epoch 3 - Save Best Score: 0.4527 Model


EVAL: [488/489] Elapsed 0m 40s (remain 0m 0s) Loss: 0.0425(0.1042) 


DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.21.2",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model:

Done
attention.0.weight
attention.0.bias
attention.1.weight
attention.1.bias
attention.3.weight
attention.3.bias
Epoch: [1][0/2933] Elapsed 0m 0s (remain 18m 50s) Loss: 2.3520(2.3520) Grad: inf  LR: 0.00000625  
Epoch: [1][2932/2933] Elapsed 6m 50s (remain 0m 0s) Loss: 0.0111(0.1776) Grad: 65440.8477  LR: 0.00003754  
EVAL: [0/489] Elapsed 0m 0s (remain 2m 33s) Loss: 0.2617(0.2617) 


Epoch 1 - avg_train_loss: 0.1776  avg_val_loss: 0.1158  time: 447s
Epoch 1 - Score: 0.4791  Scores: [0.49814686945509584, 0.4610343148919261, 0.4409630091356523, 0.5270578848672223, 0.48544386761629854, 0.46165991469839784]
Epoch 1 - Save Best Score: 0.4791 Model


EVAL: [488/489] Elapsed 0m 36s (remain 0m 0s) Loss: 0.1011(0.1158) 
Epoch: [2][0/2933] Elapsed 0m 0s (remain 19m 24s) Loss: 0.1494(0.1494) Grad: 489782.4688  LR: 0.00003753  
Epoch: [2][2932/2933] Elapsed 6m 52s (remain 0m 0s) Loss: 0.1102(0.1141) Grad: 198802.5000  LR: 0.00001252  
EVAL: [0/489] Elapsed 0m 0s (remain 2m 35s) Loss: 0.1867(0.1867) 


Epoch 2 - avg_train_loss: 0.1141  avg_val_loss: 0.1123  time: 449s
Epoch 2 - Score: 0.4701  Scores: [0.5001465287388327, 0.4519217289503702, 0.4249288970994163, 0.4762567705072353, 0.4990014464985306, 0.4684161287185546]
Epoch 2 - Save Best Score: 0.4701 Model


EVAL: [488/489] Elapsed 0m 36s (remain 0m 0s) Loss: 0.0594(0.1123) 
Epoch: [3][0/2933] Elapsed 0m 0s (remain 19m 35s) Loss: 0.1429(0.1429) Grad: inf  LR: 0.00001251  
Epoch: [3][2932/2933] Elapsed 6m 52s (remain 0m 0s) Loss: 0.0598(0.0886) Grad: 153859.1406  LR: 0.00000000  
EVAL: [0/489] Elapsed 0m 0s (remain 2m 32s) Loss: 0.1907(0.1907) 


Epoch 3 - avg_train_loss: 0.0886  avg_val_loss: 0.1073  time: 449s
Epoch 3 - Score: 0.4598  Scores: [0.4808619067652019, 0.4509463579061564, 0.4174169582469625, 0.466814477951571, 0.4854742354098303, 0.45725063395989335]
Epoch 3 - Save Best Score: 0.4598 Model


EVAL: [488/489] Elapsed 0m 36s (remain 0m 0s) Loss: 0.0629(0.1073) 


DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.21.2",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model:

Done
attention.0.weight
attention.0.bias
attention.1.weight
attention.1.bias
attention.3.weight
attention.3.bias
Epoch: [1][0/2933] Elapsed 0m 0s (remain 20m 46s) Loss: 1.8758(1.8758) Grad: inf  LR: 0.00000625  
Epoch: [1][2932/2933] Elapsed 6m 38s (remain 0m 0s) Loss: 0.3153(0.1814) Grad: 387102.5938  LR: 0.00003754  
EVAL: [0/489] Elapsed 0m 0s (remain 3m 4s) Loss: 0.2127(0.2127) 


Epoch 1 - avg_train_loss: 0.1814  avg_val_loss: 0.1290  time: 424s
Epoch 1 - Score: 0.5045  Scores: [0.5567756302532237, 0.48504074754136134, 0.46872509489333924, 0.4693187533023757, 0.5201707239366486, 0.5271573538909787]
Epoch 1 - Save Best Score: 0.5045 Model


EVAL: [488/489] Elapsed 0m 25s (remain 0m 0s) Loss: 0.0439(0.1290) 
Epoch: [2][0/2933] Elapsed 0m 0s (remain 21m 6s) Loss: 0.0690(0.0690) Grad: 348813.2188  LR: 0.00003753  
Epoch: [2][2932/2933] Elapsed 5m 25s (remain 0m 0s) Loss: 0.1278(0.1206) Grad: 242910.6719  LR: 0.00001252  
EVAL: [0/489] Elapsed 0m 0s (remain 3m 5s) Loss: 0.2326(0.2326) 


Epoch 2 - avg_train_loss: 0.1206  avg_val_loss: 0.1049  time: 352s
Epoch 2 - Score: 0.4552  Scores: [0.4972786798414288, 0.45074148783444984, 0.43478927350799174, 0.4403821987453609, 0.4630074298559645, 0.4449899899558672]
Epoch 2 - Save Best Score: 0.4552 Model


EVAL: [488/489] Elapsed 0m 25s (remain 0m 0s) Loss: 0.0486(0.1049) 
Epoch: [3][0/2933] Elapsed 0m 0s (remain 22m 13s) Loss: 0.0501(0.0501) Grad: 278780.5000  LR: 0.00001251  
Epoch: [3][2932/2933] Elapsed 5m 25s (remain 0m 0s) Loss: 0.0384(0.0903) Grad: 115499.2969  LR: 0.00000000  
EVAL: [0/489] Elapsed 0m 0s (remain 3m 8s) Loss: 0.1695(0.1695) 


Epoch 3 - avg_train_loss: 0.0903  avg_val_loss: 0.1007  time: 351s
Epoch 3 - Score: 0.4446  Scores: [0.48021089679527984, 0.4456582386875365, 0.40738386498453666, 0.43613685021756016, 0.46453912332623104, 0.43386311613546075]
Epoch 3 - Save Best Score: 0.4446 Model


EVAL: [488/489] Elapsed 0m 25s (remain 0m 0s) Loss: 0.0477(0.1007) 


Score: 0.4517  Scores: [0.48286431284891274, 0.44568327073251157, 0.41353526708610205, 0.45179847502684845, 0.4715081935079777, 0.44485634353796044]
[32m[I 2022-12-14 03:15:35,099][0m Trial 0 finished with value: 0.4517076437900522 and parameters: {'epoch': 4, 'encoder_lr': 0.00037409434091301814, 'llrd': 0.8885960944661698}. Best is trial 0 with value: 0.4517076437900522.[0m


11111111111111111111111111111111
0.4517076437900522


DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.21.2",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model:

Done
attention.0.weight
attention.0.bias
attention.1.weight
attention.1.bias
attention.3.weight
attention.3.bias
Epoch: [1][0/2933] Elapsed 0m 0s (remain 20m 57s) Loss: 2.0664(2.0664) Grad: inf  LR: 0.00000625  
Epoch: [1][2932/2933] Elapsed 5m 23s (remain 0m 0s) Loss: 0.1119(0.1743) Grad: 451114.3750  LR: 0.00003754  
EVAL: [0/489] Elapsed 0m 0s (remain 2m 42s) Loss: 0.1616(0.1616) 


Epoch 1 - avg_train_loss: 0.1743  avg_val_loss: 0.1428  time: 350s
Epoch 1 - Score: 0.5259  Scores: [0.5172925495775443, 0.47049240179294216, 0.4487275051595862, 0.49131722155170793, 0.595346902213994, 0.6321915305155291]
Epoch 1 - Save Best Score: 0.5259 Model


EVAL: [488/489] Elapsed 0m 27s (remain 0m 0s) Loss: 0.2762(0.1428) 
Epoch: [2][0/2933] Elapsed 0m 0s (remain 18m 48s) Loss: 0.0785(0.0785) Grad: 375796.0625  LR: 0.00003753  
Epoch: [2][2932/2933] Elapsed 5m 23s (remain 0m 0s) Loss: 0.0910(0.1151) Grad: 407038.4062  LR: 0.00001252  
EVAL: [0/489] Elapsed 0m 0s (remain 2m 39s) Loss: 0.0898(0.0898) 


Epoch 2 - avg_train_loss: 0.1151  avg_val_loss: 0.1051  time: 351s
Epoch 2 - Score: 0.4531  Scores: [0.4955007513478271, 0.43976369580647107, 0.4129197366178905, 0.4552416186147978, 0.47949940829845983, 0.4355701009638151]
Epoch 2 - Save Best Score: 0.4531 Model


EVAL: [488/489] Elapsed 0m 26s (remain 0m 0s) Loss: 0.1060(0.1051) 
Epoch: [3][0/2933] Elapsed 0m 0s (remain 20m 9s) Loss: 0.0460(0.0460) Grad: 289529.2188  LR: 0.00001251  
Epoch: [3][2932/2933] Elapsed 5m 24s (remain 0m 0s) Loss: 0.1316(0.0934) Grad: 462550.5000  LR: 0.00000000  
EVAL: [0/489] Elapsed 0m 0s (remain 2m 36s) Loss: 0.0771(0.0771) 


Epoch 3 - avg_train_loss: 0.0934  avg_val_loss: 0.1018  time: 352s
Epoch 3 - Score: 0.4468  Scores: [0.4805009643716676, 0.4399999697808841, 0.40669062342351975, 0.4529585995068709, 0.46345221081353577, 0.4369733074774361]
Epoch 3 - Save Best Score: 0.4468 Model


EVAL: [488/489] Elapsed 0m 27s (remain 0m 0s) Loss: 0.1276(0.1018) 


DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.21.2",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model:

Done
attention.0.weight
attention.0.bias
attention.1.weight
attention.1.bias
attention.3.weight
attention.3.bias
Epoch: [1][0/2934] Elapsed 0m 0s (remain 18m 55s) Loss: 3.0567(3.0567) Grad: inf  LR: 0.00000625  
Epoch: [1][2933/2934] Elapsed 5m 22s (remain 0m 0s) Loss: 0.0699(0.1755) Grad: 328987.5938  LR: 0.00003754  
EVAL: [0/489] Elapsed 0m 0s (remain 2m 38s) Loss: 0.1386(0.1386) 


Epoch 1 - avg_train_loss: 0.1755  avg_val_loss: 0.1432  time: 350s
Epoch 1 - Score: 0.5433  Scores: [0.4957198099544014, 0.4612282632733822, 0.6720436480939915, 0.4593341229582834, 0.5527103576477268, 0.6190037468036172]
Epoch 1 - Save Best Score: 0.5433 Model


EVAL: [488/489] Elapsed 0m 27s (remain 0m 0s) Loss: 0.0695(0.1432) 
Epoch: [2][0/2934] Elapsed 0m 0s (remain 19m 1s) Loss: 0.1108(0.1108) Grad: 399584.8750  LR: 0.00003753  
Epoch: [2][2933/2934] Elapsed 5m 24s (remain 0m 0s) Loss: 0.0444(0.1191) Grad: 142369.1094  LR: 0.00001252  
EVAL: [0/489] Elapsed 0m 0s (remain 2m 34s) Loss: 0.0797(0.0797) 


Epoch 2 - avg_train_loss: 0.1191  avg_val_loss: 0.1312  time: 352s
Epoch 2 - Score: 0.5124  Scores: [0.497307178167604, 0.4916110052881887, 0.4975347021360474, 0.6019615586812828, 0.5259817647936514, 0.45979498987307643]
Epoch 2 - Save Best Score: 0.5124 Model


EVAL: [488/489] Elapsed 0m 27s (remain 0m 0s) Loss: 0.0425(0.1312) 
Epoch: [3][0/2934] Elapsed 0m 0s (remain 19m 27s) Loss: 0.0858(0.0858) Grad: 355359.9688  LR: 0.00001251  
Epoch: [3][2933/2934] Elapsed 5m 22s (remain 0m 0s) Loss: 0.0713(0.0927) Grad: 162784.6250  LR: 0.00000000  
EVAL: [0/489] Elapsed 0m 0s (remain 2m 42s) Loss: 0.0904(0.0904) 


Epoch 3 - avg_train_loss: 0.0927  avg_val_loss: 0.1050  time: 350s
Epoch 3 - Score: 0.4546  Scores: [0.49168866694566293, 0.44196119899718145, 0.41956487157083794, 0.45440562527932804, 0.470203344308001, 0.4498101290624786]
Epoch 3 - Save Best Score: 0.4546 Model


EVAL: [488/489] Elapsed 0m 27s (remain 0m 0s) Loss: 0.0402(0.1050) 


DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.21.2",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model:

Done
attention.0.weight
attention.0.bias
attention.1.weight
attention.1.bias
attention.3.weight
attention.3.bias
Epoch: [1][0/2933] Elapsed 0m 0s (remain 20m 5s) Loss: 2.7799(2.7799) Grad: inf  LR: 0.00000625  
Epoch: [1][2932/2933] Elapsed 5m 25s (remain 0m 0s) Loss: 0.2223(0.1702) Grad: 580507.9375  LR: 0.00003754  
EVAL: [0/489] Elapsed 0m 0s (remain 2m 31s) Loss: 0.1288(0.1288) 


Epoch 1 - avg_train_loss: 0.1702  avg_val_loss: 0.1311  time: 351s
Epoch 1 - Score: 0.5097  Scores: [0.5109555439545256, 0.47272809682366224, 0.4574921959408412, 0.481233155189213, 0.47660170522866774, 0.6590328484021348]
Epoch 1 - Save Best Score: 0.5097 Model


EVAL: [488/489] Elapsed 0m 24s (remain 0m 0s) Loss: 0.0401(0.1311) 
Epoch: [2][0/2933] Elapsed 0m 0s (remain 19m 7s) Loss: 0.0332(0.0332) Grad: 217874.4688  LR: 0.00003753  
Epoch: [2][2932/2933] Elapsed 5m 26s (remain 0m 0s) Loss: 0.1593(0.1182) Grad: 269678.1875  LR: 0.00001252  
EVAL: [0/489] Elapsed 0m 0s (remain 2m 31s) Loss: 0.1713(0.1713) 


Epoch 2 - avg_train_loss: 0.1182  avg_val_loss: 0.1110  time: 364s
Epoch 2 - Score: 0.4689  Scores: [0.4891768190208495, 0.45054446453718144, 0.4437568932629798, 0.46721342974541996, 0.4994608659931428, 0.463352034229049]
Epoch 2 - Save Best Score: 0.4689 Model


EVAL: [488/489] Elapsed 0m 37s (remain 0m 0s) Loss: 0.0813(0.1110) 
Epoch: [3][0/2933] Elapsed 0m 0s (remain 20m 38s) Loss: 0.1419(0.1419) Grad: 449589.7500  LR: 0.00001251  
Epoch: [3][2932/2933] Elapsed 7m 44s (remain 0m 0s) Loss: 0.0438(0.0919) Grad: 134821.9375  LR: 0.00000000  
EVAL: [0/489] Elapsed 0m 0s (remain 2m 41s) Loss: 0.1816(0.1816) 


Epoch 3 - avg_train_loss: 0.0919  avg_val_loss: 0.1065  time: 509s
Epoch 3 - Score: 0.4579  Scores: [0.48269190272524765, 0.44931692231836956, 0.4160471995182135, 0.4668419567768865, 0.4790860560731076, 0.4536699476376514]
Epoch 3 - Save Best Score: 0.4579 Model


EVAL: [488/489] Elapsed 0m 44s (remain 0m 0s) Loss: 0.0777(0.1065) 


DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.21.2",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model:

Done
attention.0.weight
attention.0.bias
attention.1.weight
attention.1.bias
attention.3.weight
attention.3.bias
Epoch: [1][0/2933] Elapsed 0m 0s (remain 20m 49s) Loss: 3.2280(3.2280) Grad: inf  LR: 0.00000625  
Epoch: [1][2932/2933] Elapsed 7m 41s (remain 0m 0s) Loss: 0.0376(0.1817) Grad: 134105.5625  LR: 0.00003754  
EVAL: [0/489] Elapsed 0m 0s (remain 3m 44s) Loss: 0.2079(0.2079) 


Epoch 1 - avg_train_loss: 0.1817  avg_val_loss: 0.1097  time: 507s
Epoch 1 - Score: 0.4663  Scores: [0.4949315323210375, 0.47055972059854423, 0.4556814492589336, 0.44731396152492153, 0.4928946813848514, 0.4363076387835738]
Epoch 1 - Save Best Score: 0.4663 Model


EVAL: [488/489] Elapsed 0m 45s (remain 0m 0s) Loss: 0.0454(0.1097) 
Epoch: [2][0/2933] Elapsed 0m 0s (remain 21m 14s) Loss: 0.1199(0.1199) Grad: 484387.1562  LR: 0.00003753  
Epoch: [2][2932/2933] Elapsed 7m 43s (remain 0m 0s) Loss: 0.1670(0.1195) Grad: 247606.5000  LR: 0.00001252  
EVAL: [0/489] Elapsed 0m 0s (remain 3m 49s) Loss: 0.3308(0.3308) 


Epoch 2 - avg_train_loss: 0.1195  avg_val_loss: 0.1435  time: 505s
Epoch 2 - Score: 0.5191  Scores: [0.69835379048965, 0.48968122714526024, 0.44022327360995844, 0.5547349926265721, 0.5017736566733302, 0.42971118541816095]


EVAL: [488/489] Elapsed 0m 41s (remain 0m 0s) Loss: 0.0357(0.1435) 
Epoch: [3][0/2933] Elapsed 0m 0s (remain 18m 55s) Loss: 0.4311(0.4311) Grad: inf  LR: 0.00001251  
Epoch: [3][2932/2933] Elapsed 7m 42s (remain 0m 0s) Loss: 0.0198(0.0916) Grad: 96150.9531  LR: 0.00000000  
EVAL: [0/489] Elapsed 0m 0s (remain 3m 47s) Loss: 0.1836(0.1836) 


Epoch 3 - avg_train_loss: 0.0916  avg_val_loss: 0.1000  time: 509s
Epoch 3 - Score: 0.4431  Scores: [0.4818257217273145, 0.44168492077621185, 0.4087597411186688, 0.43535810186097806, 0.46186833665873317, 0.4289999755199286]
Epoch 3 - Save Best Score: 0.4431 Model


EVAL: [488/489] Elapsed 0m 46s (remain 0m 0s) Loss: 0.0523(0.1000) 


Score: 0.4507  Scores: [0.4841949358277174, 0.44325560130438735, 0.41279716051579396, 0.4525296440427369, 0.4687012652543952, 0.4424718999493954]
[32m[I 2022-12-14 04:36:57,437][0m Trial 1 finished with value: 0.45065841781573773 and parameters: {'epoch': 4, 'encoder_lr': 0.00014031439009558157, 'llrd': 0.9254840682752528}. Best is trial 1 with value: 0.45065841781573773.[0m


11111111111111111111111111111111
0.45065841781573773


DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.21.2",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model:

Done
attention.0.weight
attention.0.bias
attention.1.weight
attention.1.bias
attention.3.weight
attention.3.bias
Epoch: [1][0/2933] Elapsed 0m 0s (remain 21m 25s) Loss: 4.1739(4.1739) Grad: inf  LR: 0.00000625  
Epoch: [1][2932/2933] Elapsed 7m 38s (remain 0m 0s) Loss: 0.0818(0.1812) Grad: 197642.6562  LR: 0.00003754  
EVAL: [0/489] Elapsed 0m 0s (remain 2m 53s) Loss: 0.1062(0.1062) 


Epoch 1 - avg_train_loss: 0.1812  avg_val_loss: 0.1376  time: 507s
Epoch 1 - Score: 0.5145  Scores: [0.6316107361598808, 0.4543700914886045, 0.44450277048438774, 0.499943073622846, 0.506262242328746, 0.5500731895468994]
Epoch 1 - Save Best Score: 0.5145 Model


EVAL: [488/489] Elapsed 0m 48s (remain 0m 0s) Loss: 0.0887(0.1376) 
Epoch: [2][0/2933] Elapsed 0m 0s (remain 20m 13s) Loss: 0.1256(0.1256) Grad: inf  LR: 0.00003753  
Epoch: [2][2932/2933] Elapsed 7m 41s (remain 0m 0s) Loss: 0.0763(0.1187) Grad: 168462.8750  LR: 0.00001252  
EVAL: [0/489] Elapsed 0m 0s (remain 3m 8s) Loss: 0.0821(0.0821) 


Epoch 2 - avg_train_loss: 0.1187  avg_val_loss: 0.1115  time: 499s
Epoch 2 - Score: 0.4674  Scores: [0.49911010456538035, 0.44367211738581913, 0.40998376320464125, 0.47839465772770545, 0.4679318098336716, 0.5054983083284335]
Epoch 2 - Save Best Score: 0.4674 Model


EVAL: [488/489] Elapsed 0m 37s (remain 0m 0s) Loss: 0.1206(0.1115) 
Epoch: [3][0/2933] Elapsed 0m 0s (remain 18m 35s) Loss: 0.1409(0.1409) Grad: inf  LR: 0.00001251  
Epoch: [3][2932/2933] Elapsed 5m 23s (remain 0m 0s) Loss: 0.0846(0.0913) Grad: 181711.1719  LR: 0.00000000  
EVAL: [0/489] Elapsed 0m 0s (remain 2m 49s) Loss: 0.0862(0.0862) 


Epoch 3 - avg_train_loss: 0.0913  avg_val_loss: 0.1016  time: 351s
Epoch 3 - Score: 0.4462  Scores: [0.4813091055936086, 0.4397801422960884, 0.4075128580989938, 0.4521408286597996, 0.4608050154336848, 0.43569665679661373]
Epoch 3 - Save Best Score: 0.4462 Model


EVAL: [488/489] Elapsed 0m 27s (remain 0m 0s) Loss: 0.1421(0.1016) 


DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.21.2",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model:

Done
attention.0.weight
attention.0.bias
attention.1.weight
attention.1.bias
attention.3.weight
attention.3.bias
Epoch: [1][0/2934] Elapsed 0m 0s (remain 19m 17s) Loss: 2.0190(2.0190) Grad: inf  LR: 0.00000625  
Epoch: [1][2933/2934] Elapsed 5m 21s (remain 0m 0s) Loss: 0.0508(0.1792) Grad: 139156.0469  LR: 0.00003754  
EVAL: [0/489] Elapsed 0m 0s (remain 2m 32s) Loss: 0.0881(0.0881) 


Epoch 1 - avg_train_loss: 0.1792  avg_val_loss: 0.1197  time: 349s
Epoch 1 - Score: 0.4877  Scores: [0.5206245806652157, 0.4613757654463226, 0.49358772413769364, 0.4661502120924575, 0.5222108565809669, 0.4621443885139726]
Epoch 1 - Save Best Score: 0.4877 Model


EVAL: [488/489] Elapsed 0m 27s (remain 0m 0s) Loss: 0.0733(0.1197) 
Epoch: [2][0/2934] Elapsed 0m 0s (remain 21m 11s) Loss: 0.1315(0.1315) Grad: 471248.0625  LR: 0.00003753  
Epoch: [2][2933/2934] Elapsed 5m 23s (remain 0m 0s) Loss: 0.0440(0.1190) Grad: 132958.9219  LR: 0.00001252  
EVAL: [0/489] Elapsed 0m 0s (remain 3m 50s) Loss: 0.0972(0.0972) 


Epoch 2 - avg_train_loss: 0.1190  avg_val_loss: 0.1078  time: 351s
Epoch 2 - Score: 0.4605  Scores: [0.497307008042644, 0.452712209665756, 0.4202407761960833, 0.46675684566326464, 0.4745998575223091, 0.4512267033914086]
Epoch 2 - Save Best Score: 0.4605 Model


EVAL: [488/489] Elapsed 0m 27s (remain 0m 0s) Loss: 0.0353(0.1078) 
Epoch: [3][0/2934] Elapsed 0m 0s (remain 19m 21s) Loss: 0.0423(0.0423) Grad: 265415.3125  LR: 0.00001251  
Epoch: [3][2933/2934] Elapsed 5m 22s (remain 0m 0s) Loss: 0.0406(0.0907) Grad: 120515.5000  LR: 0.00000000  
EVAL: [0/489] Elapsed 0m 0s (remain 2m 33s) Loss: 0.1011(0.1011) 


Epoch 3 - avg_train_loss: 0.0907  avg_val_loss: 0.1062  time: 350s
Epoch 3 - Score: 0.4573  Scores: [0.49266607113804367, 0.4466852592145129, 0.42395670576942385, 0.456457081151788, 0.4731448337396238, 0.450846341325502]
Epoch 3 - Save Best Score: 0.4573 Model


EVAL: [488/489] Elapsed 0m 27s (remain 0m 0s) Loss: 0.0380(0.1062) 


DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.21.2",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model:

Done
attention.0.weight
attention.0.bias
attention.1.weight
attention.1.bias
attention.3.weight
attention.3.bias
Epoch: [1][0/2933] Elapsed 0m 0s (remain 26m 29s) Loss: 2.7384(2.7384) Grad: inf  LR: 0.00000625  
Epoch: [1][2932/2933] Elapsed 5m 26s (remain 0m 0s) Loss: 0.0420(0.1769) Grad: 272049.2812  LR: 0.00003754  
EVAL: [0/489] Elapsed 0m 0s (remain 2m 35s) Loss: 0.2896(0.2896) 


Epoch 1 - avg_train_loss: 0.1769  avg_val_loss: 0.1319  time: 351s
Epoch 1 - Score: 0.4988  Scores: [0.4914710867464422, 0.45950094778643763, 0.43241412539998564, 0.47933414780883077, 0.66796227513352, 0.4620266891637443]
Epoch 1 - Save Best Score: 0.4988 Model


EVAL: [488/489] Elapsed 0m 24s (remain 0m 0s) Loss: 0.1552(0.1319) 
Epoch: [2][0/2933] Elapsed 0m 0s (remain 19m 36s) Loss: 0.0947(0.0947) Grad: 420111.5312  LR: 0.00003753  
Epoch: [2][2932/2933] Elapsed 5m 25s (remain 0m 0s) Loss: 0.0466(0.1141) Grad: 287442.1562  LR: 0.00001252  
EVAL: [0/489] Elapsed 0m 0s (remain 2m 29s) Loss: 0.2508(0.2508) 


Epoch 2 - avg_train_loss: 0.1141  avg_val_loss: 0.1222  time: 350s
Epoch 2 - Score: 0.4856  Scores: [0.5215671245345406, 0.4759792328680206, 0.42130945397283426, 0.4668892403548048, 0.5611007586730267, 0.4666329545121198]
Epoch 2 - Save Best Score: 0.4856 Model


EVAL: [488/489] Elapsed 0m 24s (remain 0m 0s) Loss: 0.1527(0.1222) 
Epoch: [3][0/2933] Elapsed 0m 0s (remain 20m 48s) Loss: 0.0677(0.0677) Grad: 352095.9375  LR: 0.00001251  
Epoch: [3][2932/2933] Elapsed 5m 25s (remain 0m 0s) Loss: 0.0530(0.0922) Grad: 291880.8438  LR: 0.00000000  
EVAL: [0/489] Elapsed 0m 0s (remain 2m 31s) Loss: 0.1868(0.1868) 


Epoch 3 - avg_train_loss: 0.0922  avg_val_loss: 0.1067  time: 350s
Epoch 3 - Score: 0.4580  Scores: [0.4841516254442488, 0.4472475798828593, 0.4161722717945147, 0.46620197402142594, 0.48163440323888534, 0.45252945238841824]
Epoch 3 - Save Best Score: 0.4580 Model


EVAL: [488/489] Elapsed 0m 24s (remain 0m 0s) Loss: 0.0961(0.1067) 


DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.21.2",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model:

Done
attention.0.weight
attention.0.bias
attention.1.weight
attention.1.bias
attention.3.weight
attention.3.bias
Epoch: [1][0/2933] Elapsed 0m 0s (remain 22m 50s) Loss: 3.1727(3.1727) Grad: 1870995.2500  LR: 0.00000625  
Epoch: [1][2932/2933] Elapsed 5m 25s (remain 0m 0s) Loss: 0.0846(0.1746) Grad: 389210.7188  LR: 0.00003754  
EVAL: [0/489] Elapsed 0m 0s (remain 4m 12s) Loss: 0.1740(0.1740) 


Epoch 1 - avg_train_loss: 0.1746  avg_val_loss: 0.1287  time: 352s
Epoch 1 - Score: 0.5144  Scores: [0.4941584628369391, 0.5079670381535866, 0.5947910400934159, 0.4488952684879778, 0.5160785218032955, 0.5244574614737169]
Epoch 1 - Save Best Score: 0.5144 Model


EVAL: [488/489] Elapsed 0m 25s (remain 0m 0s) Loss: 0.0799(0.1287) 
Epoch: [2][0/2933] Elapsed 0m 0s (remain 19m 15s) Loss: 0.0473(0.0473) Grad: 290081.5000  LR: 0.00003753  
Epoch: [2][2932/2933] Elapsed 5m 25s (remain 0m 0s) Loss: 0.1975(0.1174) Grad: 280602.5938  LR: 0.00001252  
EVAL: [0/489] Elapsed 0m 0s (remain 3m 18s) Loss: 0.2156(0.2156) 


Epoch 2 - avg_train_loss: 0.1174  avg_val_loss: 0.1076  time: 351s
Epoch 2 - Score: 0.4604  Scores: [0.4946432506672705, 0.4541273703806408, 0.4233987217220821, 0.4824466148102546, 0.4672034055557276, 0.4404842635755959]
Epoch 2 - Save Best Score: 0.4604 Model


EVAL: [488/489] Elapsed 0m 25s (remain 0m 0s) Loss: 0.0723(0.1076) 
Epoch: [3][0/2933] Elapsed 0m 0s (remain 21m 30s) Loss: 0.0676(0.0676) Grad: 317312.8125  LR: 0.00001251  
Epoch: [3][2932/2933] Elapsed 5m 25s (remain 0m 0s) Loss: 0.0271(0.0927) Grad: 106851.3281  LR: 0.00000000  
EVAL: [0/489] Elapsed 0m 0s (remain 3m 7s) Loss: 0.1600(0.1600) 


Epoch 3 - avg_train_loss: 0.0927  avg_val_loss: 0.1010  time: 351s
Epoch 3 - Score: 0.4450  Scores: [0.48637049477379957, 0.4448585208362517, 0.40875979467288254, 0.4366560591331286, 0.46294545883204197, 0.43054588475526073]
Epoch 3 - Save Best Score: 0.4450 Model


EVAL: [488/489] Elapsed 0m 25s (remain 0m 0s) Loss: 0.0524(0.1010) 


Score: 0.4517  Scores: [0.4861406269172624, 0.4446520950777216, 0.4141501972861887, 0.45298840076109936, 0.46970580599745576, 0.4425039672363737]
[32m[I 2022-12-14 05:52:43,922][0m Trial 2 finished with value: 0.45169018221268353 and parameters: {'epoch': 3, 'encoder_lr': 0.00013261621572574755, 'llrd': 0.9270748439777742}. Best is trial 1 with value: 0.45065841781573773.[0m


11111111111111111111111111111111
0.45169018221268353


DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.21.2",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model:

Done
attention.0.weight
attention.0.bias
attention.1.weight
attention.1.bias
attention.3.weight
attention.3.bias
Epoch: [1][0/2933] Elapsed 0m 0s (remain 20m 14s) Loss: 2.5176(2.5176) Grad: inf  LR: 0.00000625  
Epoch: [1][2932/2933] Elapsed 5m 23s (remain 0m 0s) Loss: 0.0457(0.1801) Grad: 129124.3750  LR: 0.00003754  
EVAL: [0/489] Elapsed 0m 0s (remain 2m 49s) Loss: 0.0844(0.0844) 


Epoch 1 - avg_train_loss: 0.1801  avg_val_loss: 0.1303  time: 351s
Epoch 1 - Score: 0.5146  Scores: [0.5213418737059692, 0.4425718603046448, 0.5557642143466208, 0.5964579743704863, 0.47113149106644614, 0.5006240229579185]
Epoch 1 - Save Best Score: 0.5146 Model


EVAL: [488/489] Elapsed 0m 27s (remain 0m 0s) Loss: 0.0862(0.1303) 
Epoch: [2][0/2933] Elapsed 0m 0s (remain 20m 49s) Loss: 0.1059(0.1059) Grad: 399460.8750  LR: 0.00003753  
Epoch: [2][2932/2933] Elapsed 5m 23s (remain 0m 0s) Loss: 0.0443(0.1188) Grad: 126736.8906  LR: 0.00001252  
EVAL: [0/489] Elapsed 0m 0s (remain 2m 43s) Loss: 0.0943(0.0943) 


Epoch 2 - avg_train_loss: 0.1188  avg_val_loss: 0.1118  time: 351s
Epoch 2 - Score: 0.4738  Scores: [0.48972304287255114, 0.4553688164307012, 0.4743417692991039, 0.48524294716577643, 0.46303980890348495, 0.4749794778930286]
Epoch 2 - Save Best Score: 0.4738 Model


EVAL: [488/489] Elapsed 0m 27s (remain 0m 0s) Loss: 0.1469(0.1118) 
Epoch: [3][0/2933] Elapsed 0m 0s (remain 20m 20s) Loss: 0.2860(0.2860) Grad: 610537.1875  LR: 0.00001251  
Epoch: [3][2932/2933] Elapsed 5m 22s (remain 0m 0s) Loss: 0.0818(0.0893) Grad: 192452.8281  LR: 0.00000000  
EVAL: [0/489] Elapsed 0m 0s (remain 2m 53s) Loss: 0.0730(0.0730) 


Epoch 3 - avg_train_loss: 0.0893  avg_val_loss: 0.1033  time: 350s
Epoch 3 - Score: 0.4501  Scores: [0.48171529408386254, 0.4417385489470384, 0.41128254464154146, 0.45707429731114846, 0.4697309147726542, 0.438908930829927]
Epoch 3 - Save Best Score: 0.4501 Model


EVAL: [488/489] Elapsed 0m 27s (remain 0m 0s) Loss: 0.1160(0.1033) 


DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.21.2",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model:

Done
attention.0.weight
attention.0.bias
attention.1.weight
attention.1.bias
attention.3.weight
attention.3.bias
Epoch: [1][0/2934] Elapsed 0m 0s (remain 19m 32s) Loss: 3.2935(3.2935) Grad: inf  LR: 0.00000625  
Epoch: [1][2933/2934] Elapsed 5m 21s (remain 0m 0s) Loss: 0.1181(0.1771) Grad: 221073.6250  LR: 0.00003754  
EVAL: [0/489] Elapsed 0m 0s (remain 2m 39s) Loss: 0.1016(0.1016) 


Epoch 1 - avg_train_loss: 0.1771  avg_val_loss: 0.1146  time: 349s
Epoch 1 - Score: 0.4789  Scores: [0.4956710619258705, 0.461315175082179, 0.47263159432412377, 0.46018089268239987, 0.48446179936629424, 0.49935280597519216]
Epoch 1 - Save Best Score: 0.4789 Model


EVAL: [488/489] Elapsed 0m 27s (remain 0m 0s) Loss: 0.0439(0.1146) 
Epoch: [2][0/2934] Elapsed 0m 0s (remain 25m 27s) Loss: 0.1113(0.1113) Grad: inf  LR: 0.00003753  
Epoch: [2][2933/2934] Elapsed 5m 23s (remain 0m 0s) Loss: 0.0363(0.1173) Grad: 107890.4219  LR: 0.00001252  
EVAL: [0/489] Elapsed 0m 0s (remain 2m 44s) Loss: 0.1308(0.1308) 


Epoch 2 - avg_train_loss: 0.1173  avg_val_loss: 0.1246  time: 351s
Epoch 2 - Score: 0.4994  Scores: [0.4886274353054846, 0.4915804894656028, 0.471786922118749, 0.45753173645931866, 0.48714134888948274, 0.5994638364938102]


EVAL: [488/489] Elapsed 0m 27s (remain 0m 0s) Loss: 0.0301(0.1246) 
Epoch: [3][0/2934] Elapsed 0m 0s (remain 28m 54s) Loss: 0.0581(0.0581) Grad: 289128.5625  LR: 0.00001251  
Epoch: [3][2933/2934] Elapsed 5m 22s (remain 0m 0s) Loss: 0.0578(0.0898) Grad: 147214.6562  LR: 0.00000000  
EVAL: [0/489] Elapsed 0m 0s (remain 2m 44s) Loss: 0.0899(0.0899) 


Epoch 3 - avg_train_loss: 0.0898  avg_val_loss: 0.1051  time: 350s
Epoch 3 - Score: 0.4546  Scores: [0.48872603236540746, 0.4436187062620155, 0.4202814515721467, 0.4534089412615355, 0.47473587964006453, 0.44655748562054065]
Epoch 3 - Save Best Score: 0.4546 Model


EVAL: [488/489] Elapsed 0m 27s (remain 0m 0s) Loss: 0.0406(0.1051) 


DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.21.2",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model:

Done
attention.0.weight
attention.0.bias
attention.1.weight
attention.1.bias
attention.3.weight
attention.3.bias
Epoch: [1][0/2933] Elapsed 0m 0s (remain 20m 48s) Loss: 3.2353(3.2353) Grad: inf  LR: 0.00000625  
Epoch: [1][2932/2933] Elapsed 5m 25s (remain 0m 0s) Loss: 0.1035(0.1752) Grad: 406972.9375  LR: 0.00003754  
EVAL: [0/489] Elapsed 0m 0s (remain 2m 38s) Loss: 0.0816(0.0816) 


Epoch 1 - avg_train_loss: 0.1752  avg_val_loss: 0.1710  time: 350s
Epoch 1 - Score: 0.5708  Scores: [0.7419943900887902, 0.6724515810643699, 0.45474003479852365, 0.5124870386518994, 0.4950098288521799, 0.5481956270669257]
Epoch 1 - Save Best Score: 0.5708 Model


EVAL: [488/489] Elapsed 0m 24s (remain 0m 0s) Loss: 0.0441(0.1710) 
Epoch: [2][0/2933] Elapsed 0m 0s (remain 19m 37s) Loss: 0.1691(0.1691) Grad: 530808.2500  LR: 0.00003753  
Epoch: [2][2932/2933] Elapsed 5m 25s (remain 0m 0s) Loss: 0.0973(0.1199) Grad: 204520.2031  LR: 0.00001252  
EVAL: [0/489] Elapsed 0m 0s (remain 2m 44s) Loss: 0.1778(0.1778) 


Epoch 2 - avg_train_loss: 0.1199  avg_val_loss: 0.1113  time: 350s
Epoch 2 - Score: 0.4675  Scores: [0.48623497886700007, 0.45567860351345874, 0.41862894557751623, 0.4806976149409129, 0.5039428796346231, 0.45962717173691714]
Epoch 2 - Save Best Score: 0.4675 Model


EVAL: [488/489] Elapsed 0m 24s (remain 0m 0s) Loss: 0.0655(0.1113) 
Epoch: [3][0/2933] Elapsed 0m 0s (remain 20m 11s) Loss: 0.0252(0.0252) Grad: 205921.8906  LR: 0.00001251  
Epoch: [3][2932/2933] Elapsed 5m 25s (remain 0m 0s) Loss: 0.2670(0.0907) Grad: 342759.5938  LR: 0.00000000  
EVAL: [0/489] Elapsed 0m 0s (remain 2m 44s) Loss: 0.2025(0.2025) 


Epoch 3 - avg_train_loss: 0.0907  avg_val_loss: 0.1072  time: 350s
Epoch 3 - Score: 0.4593  Scores: [0.4797936051137622, 0.4485004755406109, 0.41867873905464853, 0.4650448450245268, 0.4892945420881495, 0.45440963631423475]
Epoch 3 - Save Best Score: 0.4593 Model


EVAL: [488/489] Elapsed 0m 24s (remain 0m 0s) Loss: 0.0784(0.1072) 


DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.21.2",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model:

Done
attention.0.weight
attention.0.bias
attention.1.weight
attention.1.bias
attention.3.weight
attention.3.bias
Epoch: [1][0/2933] Elapsed 0m 0s (remain 21m 35s) Loss: 4.1007(4.1007) Grad: inf  LR: 0.00000625  
Epoch: [1][2932/2933] Elapsed 5m 22s (remain 0m 0s) Loss: 0.1221(0.1760) Grad: 206247.1250  LR: 0.00003754  
EVAL: [0/489] Elapsed 0m 0s (remain 3m 13s) Loss: 0.2541(0.2541) 


Epoch 1 - avg_train_loss: 0.1760  avg_val_loss: 0.1104  time: 348s
Epoch 1 - Score: 0.4653  Scores: [0.5045555811532232, 0.4756047736535551, 0.4239720509843639, 0.4512159093235945, 0.4893893162712012, 0.44716471801038177]
Epoch 1 - Save Best Score: 0.4653 Model


EVAL: [488/489] Elapsed 0m 25s (remain 0m 0s) Loss: 0.0691(0.1104) 
Epoch: [2][0/2933] Elapsed 0m 0s (remain 18m 26s) Loss: 0.1137(0.1137) Grad: inf  LR: 0.00003753  
Epoch: [2][2932/2933] Elapsed 5m 21s (remain 0m 0s) Loss: 0.1917(0.1185) Grad: 145997.0469  LR: 0.00001252  
EVAL: [0/489] Elapsed 0m 0s (remain 4m 24s) Loss: 0.1842(0.1842) 


Epoch 2 - avg_train_loss: 0.1185  avg_val_loss: 0.1065  time: 347s
Epoch 2 - Score: 0.4571  Scores: [0.4900594368081922, 0.4886430384484807, 0.4138129900401851, 0.4386609368232969, 0.4733903215264121, 0.43831477840012784]
Epoch 2 - Save Best Score: 0.4571 Model


EVAL: [488/489] Elapsed 0m 25s (remain 0m 0s) Loss: 0.0592(0.1065) 
Epoch: [3][0/2933] Elapsed 0m 0s (remain 18m 45s) Loss: 0.0436(0.0436) Grad: inf  LR: 0.00001251  
Epoch: [3][2932/2933] Elapsed 5m 23s (remain 0m 0s) Loss: 0.0677(0.0904) Grad: 74417.7266  LR: 0.00000000  
EVAL: [0/489] Elapsed 0m 0s (remain 3m 12s) Loss: 0.1835(0.1835) 


Epoch 3 - avg_train_loss: 0.0904  avg_val_loss: 0.1018  time: 349s
Epoch 3 - Score: 0.4470  Scores: [0.4859789942342716, 0.44701407293805107, 0.4097602553633659, 0.4372144063174024, 0.4652053132200701, 0.4365596429513926]
Epoch 3 - Save Best Score: 0.4470 Model


EVAL: [488/489] Elapsed 0m 25s (remain 0m 0s) Loss: 0.0549(0.1018) 


Score: 0.4528  Scores: [0.4840649774001258, 0.4452264090550391, 0.41502430451923333, 0.4532989146930708, 0.4748279976780669, 0.44416351060842435]
[32m[I 2022-12-14 07:03:13,734][0m Trial 3 finished with value: 0.45276768565899334 and parameters: {'epoch': 4, 'encoder_lr': 0.0001859353113930986, 'llrd': 0.8181481143144653}. Best is trial 1 with value: 0.45065841781573773.[0m


11111111111111111111111111111111
0.45276768565899334


DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.21.2",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model:

Done
attention.0.weight
attention.0.bias
attention.1.weight
attention.1.bias
attention.3.weight
attention.3.bias
Epoch: [1][0/2933] Elapsed 0m 0s (remain 20m 11s) Loss: 2.2423(2.2423) Grad: inf  LR: 0.00000625  
Epoch: [1][2932/2933] Elapsed 5m 22s (remain 0m 0s) Loss: 0.0694(0.1807) Grad: 173533.2188  LR: 0.00003754  
EVAL: [0/489] Elapsed 0m 0s (remain 2m 46s) Loss: 0.0935(0.0935) 


Epoch 1 - avg_train_loss: 0.1807  avg_val_loss: 0.1429  time: 349s
Epoch 1 - Score: 0.5324  Scores: [0.5321247753009442, 0.5515747751116109, 0.47677813047794215, 0.5068260059868107, 0.5659841809732384, 0.5610722742877414]
Epoch 1 - Save Best Score: 0.5324 Model


EVAL: [488/489] Elapsed 0m 26s (remain 0m 0s) Loss: 0.1782(0.1429) 
Epoch: [2][0/2933] Elapsed 0m 0s (remain 19m 46s) Loss: 0.1303(0.1303) Grad: 470590.9375  LR: 0.00003753  
Epoch: [2][2932/2933] Elapsed 5m 22s (remain 0m 0s) Loss: 0.0473(0.1190) Grad: 141066.3125  LR: 0.00001252  
EVAL: [0/489] Elapsed 0m 0s (remain 2m 46s) Loss: 0.0926(0.0926) 


Epoch 2 - avg_train_loss: 0.1190  avg_val_loss: 0.1069  time: 350s
Epoch 2 - Score: 0.4584  Scores: [0.48677357049931236, 0.48798784694280667, 0.41524726414668856, 0.4511990428827202, 0.4685757263450551, 0.4405414645875082]
Epoch 2 - Save Best Score: 0.4584 Model


EVAL: [488/489] Elapsed 0m 27s (remain 0m 0s) Loss: 0.1657(0.1069) 
Epoch: [3][0/2933] Elapsed 0m 0s (remain 19m 19s) Loss: 0.1606(0.1606) Grad: inf  LR: 0.00001251  
Epoch: [3][2932/2933] Elapsed 5m 22s (remain 0m 0s) Loss: 0.0870(0.0903) Grad: 179241.7812  LR: 0.00000000  
EVAL: [0/489] Elapsed 0m 0s (remain 2m 45s) Loss: 0.0937(0.0937) 


Epoch 3 - avg_train_loss: 0.0903  avg_val_loss: 0.1020  time: 350s
Epoch 3 - Score: 0.4476  Scores: [0.47572438237099846, 0.4419172479463272, 0.4099416774081043, 0.4527095299872726, 0.46523919665776464, 0.4400595538543649]
Epoch 3 - Save Best Score: 0.4476 Model


EVAL: [488/489] Elapsed 0m 27s (remain 0m 0s) Loss: 0.1163(0.1020) 


DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.21.2",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model:

Done
attention.0.weight
attention.0.bias
attention.1.weight
attention.1.bias
attention.3.weight
attention.3.bias
Epoch: [1][0/2934] Elapsed 0m 0s (remain 21m 4s) Loss: 3.4532(3.4532) Grad: inf  LR: 0.00000625  
Epoch: [1][2933/2934] Elapsed 5m 23s (remain 0m 0s) Loss: 0.1311(0.1774) Grad: 239236.0625  LR: 0.00003754  
EVAL: [0/489] Elapsed 0m 0s (remain 4m 51s) Loss: 0.0908(0.0908) 


Epoch 1 - avg_train_loss: 0.1774  avg_val_loss: 0.1261  time: 351s
Epoch 1 - Score: 0.4967  Scores: [0.5602818340555407, 0.4857470700336868, 0.45638557908492977, 0.4882685612022331, 0.519498948105762, 0.4700438185257302]
Epoch 1 - Save Best Score: 0.4967 Model


EVAL: [488/489] Elapsed 0m 27s (remain 0m 0s) Loss: 0.0229(0.1261) 
Epoch: [2][0/2934] Elapsed 0m 0s (remain 19m 46s) Loss: 0.0789(0.0789) Grad: 381856.7500  LR: 0.00003753  
Epoch: [2][2933/2934] Elapsed 5m 24s (remain 0m 0s) Loss: 0.0388(0.1198) Grad: 119289.4375  LR: 0.00001252  
EVAL: [0/489] Elapsed 0m 0s (remain 2m 49s) Loss: 0.0689(0.0689) 


Epoch 2 - avg_train_loss: 0.1198  avg_val_loss: 0.1145  time: 352s
Epoch 2 - Score: 0.4814  Scores: [0.49362594177740166, 0.46399764457203735, 0.5001400929937603, 0.5050946908073586, 0.46730221226495594, 0.45840879986134175]
Epoch 2 - Save Best Score: 0.4814 Model


EVAL: [488/489] Elapsed 0m 27s (remain 0m 0s) Loss: 0.0707(0.1145) 
Epoch: [3][0/2934] Elapsed 0m 0s (remain 19m 59s) Loss: 0.0812(0.0812) Grad: 371468.1250  LR: 0.00001251  
Epoch: [3][2933/2934] Elapsed 5m 23s (remain 0m 0s) Loss: 0.0567(0.0897) Grad: 163224.9219  LR: 0.00000000  
EVAL: [0/489] Elapsed 0m 0s (remain 2m 44s) Loss: 0.0805(0.0805) 


Epoch 3 - avg_train_loss: 0.0897  avg_val_loss: 0.1050  time: 351s
Epoch 3 - Score: 0.4546  Scores: [0.4894816380945003, 0.44154445898206074, 0.41959452442196454, 0.45570747384955856, 0.46996180653870884, 0.45137372738145826]
Epoch 3 - Save Best Score: 0.4546 Model


EVAL: [488/489] Elapsed 0m 27s (remain 0m 0s) Loss: 0.0379(0.1050) 


DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.21.2",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model:

Done
attention.0.weight
attention.0.bias
attention.1.weight
attention.1.bias
attention.3.weight
attention.3.bias
Epoch: [1][0/2933] Elapsed 0m 0s (remain 22m 30s) Loss: 4.1793(4.1793) Grad: inf  LR: 0.00000625  
Epoch: [1][2932/2933] Elapsed 5m 23s (remain 0m 0s) Loss: 0.1165(0.1741) Grad: 454082.5000  LR: 0.00003754  
EVAL: [0/489] Elapsed 0m 0s (remain 2m 33s) Loss: 0.1944(0.1944) 


Epoch 1 - avg_train_loss: 0.1741  avg_val_loss: 0.1206  time: 348s
Epoch 1 - Score: 0.4859  Scores: [0.5339033001925289, 0.4677371918810751, 0.42359135475025655, 0.47328680523030275, 0.5189797418127945, 0.49789493410278574]
Epoch 1 - Save Best Score: 0.4859 Model


EVAL: [488/489] Elapsed 0m 24s (remain 0m 0s) Loss: 0.0590(0.1206) 
Epoch: [2][0/2933] Elapsed 0m 0s (remain 20m 12s) Loss: 0.0879(0.0879) Grad: 395980.3750  LR: 0.00003753  
Epoch: [2][2932/2933] Elapsed 5m 23s (remain 0m 0s) Loss: 0.0879(0.1128) Grad: 371777.5000  LR: 0.00001252  
EVAL: [0/489] Elapsed 0m 0s (remain 2m 39s) Loss: 0.2208(0.2208) 


Epoch 2 - avg_train_loss: 0.1128  avg_val_loss: 0.1198  time: 348s
Epoch 2 - Score: 0.4827  Scores: [0.525863480488479, 0.46337426626250666, 0.4284900413153502, 0.4797375604885301, 0.5377336219096756, 0.4611920915672263]
Epoch 2 - Save Best Score: 0.4827 Model


EVAL: [488/489] Elapsed 0m 24s (remain 0m 0s) Loss: 0.1225(0.1198) 
Epoch: [3][0/2933] Elapsed 0m 0s (remain 19m 32s) Loss: 0.0473(0.0473) Grad: 285675.5312  LR: 0.00001251  
Epoch: [3][2932/2933] Elapsed 5m 22s (remain 0m 0s) Loss: 0.0491(0.0907) Grad: 304970.6562  LR: 0.00000000  
EVAL: [0/489] Elapsed 0m 0s (remain 2m 42s) Loss: 0.1814(0.1814) 


Epoch 3 - avg_train_loss: 0.0907  avg_val_loss: 0.1069  time: 347s
Epoch 3 - Score: 0.4587  Scores: [0.48296855698014196, 0.44980454116810964, 0.41476715173322104, 0.4662655157530499, 0.4833470983269082, 0.45521327029603403]
Epoch 3 - Save Best Score: 0.4587 Model


EVAL: [488/489] Elapsed 0m 24s (remain 0m 0s) Loss: 0.0660(0.1069) 


DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.21.2",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model:

Done
attention.0.weight
attention.0.bias
attention.1.weight
attention.1.bias
attention.3.weight
attention.3.bias
Epoch: [1][0/2933] Elapsed 0m 0s (remain 21m 17s) Loss: 3.6580(3.6580) Grad: 1763535.8750  LR: 0.00000625  
Epoch: [1][2932/2933] Elapsed 5m 20s (remain 0m 0s) Loss: 0.0930(0.1823) Grad: 204454.3906  LR: 0.00003754  
EVAL: [0/489] Elapsed 0m 0s (remain 3m 13s) Loss: 0.1450(0.1450) 


Epoch 1 - avg_train_loss: 0.1823  avg_val_loss: 0.1302  time: 346s
Epoch 1 - Score: 0.5099  Scores: [0.5008121311267075, 0.5139014613396031, 0.5677200406783475, 0.44328386187701235, 0.5897727235616838, 0.4439572705183704]
Epoch 1 - Save Best Score: 0.5099 Model


EVAL: [488/489] Elapsed 0m 25s (remain 0m 0s) Loss: 0.0698(0.1302) 
Epoch: [2][0/2933] Elapsed 0m 0s (remain 19m 43s) Loss: 0.2603(0.2603) Grad: inf  LR: 0.00003753  
Epoch: [2][2932/2933] Elapsed 5m 20s (remain 0m 0s) Loss: 0.0997(0.1183) Grad: 200279.4844  LR: 0.00001252  
EVAL: [0/489] Elapsed 0m 0s (remain 3m 16s) Loss: 0.1471(0.1471) 


Epoch 2 - avg_train_loss: 0.1183  avg_val_loss: 0.1166  time: 346s
Epoch 2 - Score: 0.4751  Scores: [0.5389268307869152, 0.4544218738576506, 0.4147343035966772, 0.49743549065885156, 0.507528073631663, 0.4376808750430226]
Epoch 2 - Save Best Score: 0.4751 Model


EVAL: [488/489] Elapsed 0m 25s (remain 0m 0s) Loss: 0.1004(0.1166) 
Epoch: [3][0/2933] Elapsed 0m 0s (remain 18m 48s) Loss: 0.1974(0.1974) Grad: inf  LR: 0.00001251  
Epoch: [3][2932/2933] Elapsed 5m 21s (remain 0m 0s) Loss: 0.0877(0.0915) Grad: 181081.5469  LR: 0.00000000  
EVAL: [0/489] Elapsed 0m 0s (remain 3m 10s) Loss: 0.1845(0.1845) 


Epoch 3 - avg_train_loss: 0.0915  avg_val_loss: 0.1006  time: 347s
Epoch 3 - Score: 0.4442  Scores: [0.4836364168301551, 0.4436298914281619, 0.4097932589011007, 0.4352860896095163, 0.4630394879603115, 0.4296404763018155]
Epoch 3 - Save Best Score: 0.4442 Model


EVAL: [488/489] Elapsed 0m 25s (remain 0m 0s) Loss: 0.0541(0.1006) 


Score: 0.4513  Scores: [0.4829757508698728, 0.4442371010674738, 0.41354228720598013, 0.452628399325356, 0.47046309010579, 0.44418299395322347]
[32m[I 2022-12-14 08:13:34,315][0m Trial 4 finished with value: 0.45133827042128266 and parameters: {'epoch': 3, 'encoder_lr': 0.00021056197663927353, 'llrd': 0.8946433972125326}. Best is trial 1 with value: 0.45065841781573773.[0m


11111111111111111111111111111111
0.45133827042128266


DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.21.2",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model:

Done
attention.0.weight
attention.0.bias
attention.1.weight
attention.1.bias
attention.3.weight
attention.3.bias
Epoch: [1][0/2933] Elapsed 0m 0s (remain 20m 17s) Loss: 2.8933(2.8933) Grad: inf  LR: 0.00000625  
Epoch: [1][2932/2933] Elapsed 5m 19s (remain 0m 0s) Loss: 0.0993(0.1773) Grad: 213936.7656  LR: 0.00003754  
EVAL: [0/489] Elapsed 0m 0s (remain 2m 42s) Loss: 0.1035(0.1035) 


Epoch 1 - avg_train_loss: 0.1773  avg_val_loss: 0.1159  time: 347s
Epoch 1 - Score: 0.4751  Scores: [0.49948726316095465, 0.4893178184868957, 0.41206102191823096, 0.4600399320193221, 0.5203573036978474, 0.4694634377403163]
Epoch 1 - Save Best Score: 0.4751 Model


EVAL: [488/489] Elapsed 0m 27s (remain 0m 0s) Loss: 0.0750(0.1159) 
Epoch: [2][0/2933] Elapsed 0m 0s (remain 20m 12s) Loss: 0.1030(0.1030) Grad: 387766.6250  LR: 0.00003753  
Epoch: [2][2932/2933] Elapsed 5m 20s (remain 0m 0s) Loss: 0.0136(0.1151) Grad: 75311.3906  LR: 0.00001252  
EVAL: [0/489] Elapsed 0m 0s (remain 2m 45s) Loss: 0.0782(0.0782) 


Epoch 2 - avg_train_loss: 0.1151  avg_val_loss: 0.1116  time: 348s
Epoch 2 - Score: 0.4659  Scores: [0.47968199188687877, 0.45204782935364124, 0.41042923142941595, 0.4925500567553454, 0.5162524971772287, 0.4445142608129674]
Epoch 2 - Save Best Score: 0.4659 Model


EVAL: [488/489] Elapsed 0m 27s (remain 0m 0s) Loss: 0.1615(0.1116) 
Epoch: [3][0/2933] Elapsed 0m 0s (remain 19m 21s) Loss: 0.0459(0.0459) Grad: 267137.7188  LR: 0.00001251  
Epoch: [3][2932/2933] Elapsed 5m 20s (remain 0m 0s) Loss: 0.0470(0.0898) Grad: 129346.3203  LR: 0.00000000  
EVAL: [0/489] Elapsed 0m 0s (remain 2m 52s) Loss: 0.0775(0.0775) 


Epoch 3 - avg_train_loss: 0.0898  avg_val_loss: 0.1020  time: 347s
Epoch 3 - Score: 0.4474  Scores: [0.4780496863622425, 0.4390593176636923, 0.410635235889175, 0.4519461087983157, 0.46571751251093585, 0.43923472525949625]
Epoch 3 - Save Best Score: 0.4474 Model


EVAL: [488/489] Elapsed 0m 27s (remain 0m 0s) Loss: 0.1188(0.1020) 


DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.21.2",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model:

Done
attention.0.weight
attention.0.bias
attention.1.weight
attention.1.bias
attention.3.weight
attention.3.bias
Epoch: [1][0/2934] Elapsed 0m 0s (remain 25m 22s) Loss: 3.4094(3.4094) Grad: inf  LR: 0.00000625  
Epoch: [1][2933/2934] Elapsed 5m 19s (remain 0m 0s) Loss: 0.1156(0.1788) Grad: 195609.5312  LR: 0.00003754  
EVAL: [0/489] Elapsed 0m 0s (remain 4m 38s) Loss: 0.1255(0.1255) 


Epoch 1 - avg_train_loss: 0.1788  avg_val_loss: 0.1524  time: 347s
Epoch 1 - Score: 0.5310  Scores: [0.7407462551052834, 0.4814167481871011, 0.4429926225620006, 0.46949816655679416, 0.5481210453003261, 0.5034620774457406]
Epoch 1 - Save Best Score: 0.5310 Model


EVAL: [488/489] Elapsed 0m 27s (remain 0m 0s) Loss: 0.0632(0.1524) 
Epoch: [2][0/2934] Elapsed 0m 0s (remain 20m 23s) Loss: 0.1303(0.1303) Grad: inf  LR: 0.00003753  
Epoch: [2][2933/2934] Elapsed 5m 20s (remain 0m 0s) Loss: 0.2377(0.1175) Grad: 313069.5312  LR: 0.00001252  
EVAL: [0/489] Elapsed 0m 0s (remain 2m 36s) Loss: 0.0892(0.0892) 


Epoch 2 - avg_train_loss: 0.1175  avg_val_loss: 0.1166  time: 347s
Epoch 2 - Score: 0.4815  Scores: [0.49059553248892196, 0.44673186512969504, 0.4668317382248286, 0.5213062438492861, 0.5079522807942597, 0.4556962675214505]
Epoch 2 - Save Best Score: 0.4815 Model


EVAL: [488/489] Elapsed 0m 27s (remain 0m 0s) Loss: 0.0745(0.1166) 
Epoch: [3][0/2934] Elapsed 0m 0s (remain 20m 5s) Loss: 0.0881(0.0881) Grad: 372104.3750  LR: 0.00001251  
Epoch: [3][2933/2934] Elapsed 5m 19s (remain 0m 0s) Loss: 0.0569(0.0906) Grad: 148342.0469  LR: 0.00000000  
EVAL: [0/489] Elapsed 0m 0s (remain 2m 38s) Loss: 0.0895(0.0895) 


Epoch 3 - avg_train_loss: 0.0906  avg_val_loss: 0.1042  time: 347s
Epoch 3 - Score: 0.4527  Scores: [0.4913532292871371, 0.44053618070168904, 0.41880659632774964, 0.451373564957113, 0.4667549431825424, 0.44755909316983483]
Epoch 3 - Save Best Score: 0.4527 Model


EVAL: [488/489] Elapsed 0m 27s (remain 0m 0s) Loss: 0.0444(0.1042) 


DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.21.2",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model:

Done
attention.0.weight
attention.0.bias
attention.1.weight
attention.1.bias
attention.3.weight
attention.3.bias
Epoch: [1][0/2933] Elapsed 0m 0s (remain 20m 10s) Loss: 2.3385(2.3385) Grad: inf  LR: 0.00000625  
Epoch: [1][2932/2933] Elapsed 5m 21s (remain 0m 0s) Loss: 0.0500(0.1730) Grad: 148966.6875  LR: 0.00003754  
EVAL: [0/489] Elapsed 0m 0s (remain 2m 46s) Loss: 0.4039(0.4039) 


Epoch 1 - avg_train_loss: 0.1730  avg_val_loss: 0.1738  time: 346s
Epoch 1 - Score: 0.5875  Scores: [0.709289189880479, 0.5030538816457579, 0.5877875363931852, 0.5509975483541382, 0.55850915200385, 0.6152128493373634]
Epoch 1 - Save Best Score: 0.5875 Model


EVAL: [488/489] Elapsed 0m 24s (remain 0m 0s) Loss: 0.4092(0.1738) 
Epoch: [2][0/2933] Elapsed 0m 0s (remain 20m 12s) Loss: 0.2322(0.2322) Grad: inf  LR: 0.00003753  
Epoch: [2][2932/2933] Elapsed 5m 22s (remain 0m 0s) Loss: 0.0722(0.1137) Grad: 175085.9062  LR: 0.00001252  
EVAL: [0/489] Elapsed 0m 0s (remain 2m 36s) Loss: 0.1957(0.1957) 


Epoch 2 - avg_train_loss: 0.1137  avg_val_loss: 0.1081  time: 347s
Epoch 2 - Score: 0.4648  Scores: [0.4838869118529056, 0.4510574891552395, 0.45948694756208724, 0.46837691169838586, 0.4742297605191585, 0.4516995109573594]
Epoch 2 - Save Best Score: 0.4648 Model


EVAL: [488/489] Elapsed 0m 24s (remain 0m 0s) Loss: 0.1068(0.1081) 
Epoch: [3][0/2933] Elapsed 0m 0s (remain 19m 59s) Loss: 0.0923(0.0923) Grad: 374714.2500  LR: 0.00001251  
Epoch: [3][2932/2933] Elapsed 5m 21s (remain 0m 0s) Loss: 0.0483(0.0888) Grad: 140327.5469  LR: 0.00000000  
EVAL: [0/489] Elapsed 0m 0s (remain 2m 45s) Loss: 0.2132(0.2132) 


Epoch 3 - avg_train_loss: 0.0888  avg_val_loss: 0.1066  time: 346s
Epoch 3 - Score: 0.4579  Scores: [0.48095925414402074, 0.44895790819779424, 0.41642501413612387, 0.46539953713825005, 0.4841813401603877, 0.4517483716488569]
Epoch 3 - Save Best Score: 0.4579 Model


EVAL: [488/489] Elapsed 0m 24s (remain 0m 0s) Loss: 0.0779(0.1066) 


DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.21.2",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model:

Done
attention.0.weight
attention.0.bias
attention.1.weight
attention.1.bias
attention.3.weight
attention.3.bias
Epoch: [1][0/2933] Elapsed 0m 0s (remain 18m 45s) Loss: 3.2275(3.2275) Grad: inf  LR: 0.00000625  
Epoch: [1][2932/2933] Elapsed 5m 19s (remain 0m 0s) Loss: 0.0698(0.1708) Grad: 332490.2188  LR: 0.00003754  
EVAL: [0/489] Elapsed 0m 0s (remain 3m 8s) Loss: 0.3414(0.3414) 


Epoch 1 - avg_train_loss: 0.1708  avg_val_loss: 0.1445  time: 345s
Epoch 1 - Score: 0.5291  Scores: [0.5495151528514929, 0.7510931355745231, 0.43247165916447494, 0.4906714600716729, 0.46642974399895715, 0.4844975228376682]
Epoch 1 - Save Best Score: 0.5291 Model


EVAL: [488/489] Elapsed 0m 25s (remain 0m 0s) Loss: 0.0473(0.1445) 
Epoch: [2][0/2933] Elapsed 0m 0s (remain 19m 50s) Loss: 0.0559(0.0559) Grad: 309358.5000  LR: 0.00003753  
Epoch: [2][2932/2933] Elapsed 5m 20s (remain 0m 0s) Loss: 0.0545(0.1154) Grad: 312996.9688  LR: 0.00001252  
EVAL: [0/489] Elapsed 0m 0s (remain 3m 12s) Loss: 0.2435(0.2435) 


Epoch 2 - avg_train_loss: 0.1154  avg_val_loss: 0.1066  time: 346s
Epoch 2 - Score: 0.4574  Scores: [0.502876582580452, 0.44365097394425984, 0.41523936347914736, 0.45188063108837456, 0.4652430082504392, 0.4655865710388337]
Epoch 2 - Save Best Score: 0.4574 Model


EVAL: [488/489] Elapsed 0m 25s (remain 0m 0s) Loss: 0.0658(0.1066) 
Epoch: [3][0/2933] Elapsed 0m 0s (remain 19m 0s) Loss: 0.1087(0.1087) Grad: 420485.5625  LR: 0.00001251  
Epoch: [3][2932/2933] Elapsed 5m 20s (remain 0m 0s) Loss: 0.0434(0.0929) Grad: 302936.3438  LR: 0.00000000  
EVAL: [0/489] Elapsed 0m 0s (remain 3m 18s) Loss: 0.1857(0.1857) 


Epoch 3 - avg_train_loss: 0.0929  avg_val_loss: 0.0998  time: 346s
Epoch 3 - Score: 0.4426  Scores: [0.48040270623135267, 0.44356232142022395, 0.40566510867429306, 0.43690202901514724, 0.45878594684529156, 0.4303368282226238]
Epoch 3 - Save Best Score: 0.4426 Model


EVAL: [488/489] Elapsed 0m 25s (remain 0m 0s) Loss: 0.0623(0.0998) 


Score: 0.4502  Scores: [0.4827161330636967, 0.4430457692382024, 0.41291319395053366, 0.45151789160805145, 0.46895394164441534, 0.4422945491157398]
[32m[I 2022-12-14 09:23:27,585][0m Trial 5 finished with value: 0.45024024643677324 and parameters: {'epoch': 3, 'encoder_lr': 3.007918911639952e-05, 'llrd': 0.8329610635006021}. Best is trial 5 with value: 0.45024024643677324.[0m


11111111111111111111111111111111
0.45024024643677324


DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.21.2",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model:

Done
attention.0.weight
attention.0.bias
attention.1.weight
attention.1.bias
attention.3.weight
attention.3.bias
Epoch: [1][0/2933] Elapsed 0m 0s (remain 19m 42s) Loss: 3.0329(3.0329) Grad: inf  LR: 0.00000625  
Epoch: [1][2932/2933] Elapsed 5m 19s (remain 0m 0s) Loss: 0.1411(0.1845) Grad: 238735.0625  LR: 0.00003754  
EVAL: [0/489] Elapsed 0m 0s (remain 2m 43s) Loss: 0.1042(0.1042) 


Epoch 1 - avg_train_loss: 0.1845  avg_val_loss: 0.1461  time: 346s
Epoch 1 - Score: 0.5370  Scores: [0.5139165383173847, 0.4788407301557295, 0.522828784494016, 0.7696697438327399, 0.49374618096427025, 0.4430235120110605]
Epoch 1 - Save Best Score: 0.5370 Model


EVAL: [488/489] Elapsed 0m 27s (remain 0m 0s) Loss: 0.1538(0.1461) 
Epoch: [2][0/2933] Elapsed 0m 0s (remain 20m 16s) Loss: 0.1008(0.1008) Grad: 449103.6250  LR: 0.00003753  
Epoch: [2][2932/2933] Elapsed 5m 17s (remain 0m 0s) Loss: 0.0467(0.1187) Grad: 133882.7812  LR: 0.00001252  
EVAL: [0/489] Elapsed 0m 0s (remain 2m 52s) Loss: 0.0845(0.0845) 


Epoch 2 - avg_train_loss: 0.1187  avg_val_loss: 0.1059  time: 344s
Epoch 2 - Score: 0.4563  Scores: [0.48381886888021086, 0.4548722585003184, 0.4203152215446149, 0.46591046862337077, 0.47411893018242623, 0.43889882700512756]
Epoch 2 - Save Best Score: 0.4563 Model


EVAL: [488/489] Elapsed 0m 26s (remain 0m 0s) Loss: 0.1393(0.1059) 
Epoch: [3][0/2933] Elapsed 0m 0s (remain 18m 47s) Loss: 0.1734(0.1734) Grad: inf  LR: 0.00001251  
Epoch: [3][2932/2933] Elapsed 5m 19s (remain 0m 0s) Loss: 0.0581(0.0910) Grad: 146521.7344  LR: 0.00000000  
EVAL: [0/489] Elapsed 0m 0s (remain 2m 51s) Loss: 0.0797(0.0797) 


Epoch 3 - avg_train_loss: 0.0910  avg_val_loss: 0.1023  time: 346s
Epoch 3 - Score: 0.4483  Scores: [0.4780990380544392, 0.44213198660602454, 0.40963605193022884, 0.45661408177293145, 0.464141749103897, 0.43917488611553773]
Epoch 3 - Save Best Score: 0.4483 Model


EVAL: [488/489] Elapsed 0m 26s (remain 0m 0s) Loss: 0.1281(0.1023) 


DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.21.2",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model:

Done
attention.0.weight
attention.0.bias
attention.1.weight
attention.1.bias
attention.3.weight
attention.3.bias
Epoch: [1][0/2934] Elapsed 0m 0s (remain 20m 12s) Loss: 2.7532(2.7532) Grad: inf  LR: 0.00000625  
Epoch: [1][2933/2934] Elapsed 5m 19s (remain 0m 0s) Loss: 0.0807(0.1750) Grad: 193144.5156  LR: 0.00003754  
EVAL: [0/489] Elapsed 0m 0s (remain 4m 50s) Loss: 0.1016(0.1016) 


Epoch 1 - avg_train_loss: 0.1750  avg_val_loss: 0.1219  time: 347s
Epoch 1 - Score: 0.4862  Scores: [0.49370619024589496, 0.4736693441058998, 0.4397178363082871, 0.4546538094473245, 0.5783572990075301, 0.4773105413045735]
Epoch 1 - Save Best Score: 0.4862 Model


EVAL: [488/489] Elapsed 0m 27s (remain 0m 0s) Loss: 0.0345(0.1219) 
Epoch: [2][0/2934] Elapsed 0m 0s (remain 19m 35s) Loss: 0.4023(0.4023) Grad: inf  LR: 0.00003753  
Epoch: [2][2933/2934] Elapsed 5m 18s (remain 0m 0s) Loss: 0.1340(0.1149) Grad: 226164.6250  LR: 0.00001252  
EVAL: [0/489] Elapsed 0m 0s (remain 2m 40s) Loss: 0.1326(0.1326) 


Epoch 2 - avg_train_loss: 0.1149  avg_val_loss: 0.1182  time: 346s
Epoch 2 - Score: 0.4790  Scores: [0.5346811673123614, 0.4529020738568246, 0.42277199928274223, 0.45889854588353696, 0.5163922837590598, 0.4881954989182572]
Epoch 2 - Save Best Score: 0.4790 Model


EVAL: [488/489] Elapsed 0m 27s (remain 0m 0s) Loss: 0.0411(0.1182) 
Epoch: [3][0/2934] Elapsed 0m 0s (remain 21m 20s) Loss: 0.0476(0.0476) Grad: 262911.1562  LR: 0.00001251  
Epoch: [3][2933/2934] Elapsed 5m 20s (remain 0m 0s) Loss: 0.0853(0.0892) Grad: 186389.2031  LR: 0.00000000  
EVAL: [0/489] Elapsed 0m 0s (remain 2m 46s) Loss: 0.0759(0.0759) 


Epoch 3 - avg_train_loss: 0.0892  avg_val_loss: 0.1032  time: 348s
Epoch 3 - Score: 0.4506  Scores: [0.4830526425653089, 0.44119883027202655, 0.4175186009553713, 0.44925697563633016, 0.4679162673723383, 0.4449050459338728]
Epoch 3 - Save Best Score: 0.4506 Model


EVAL: [488/489] Elapsed 0m 27s (remain 0m 0s) Loss: 0.0374(0.1032) 


DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.21.2",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model:

Done
attention.0.weight
attention.0.bias
attention.1.weight
attention.1.bias
attention.3.weight
attention.3.bias
Epoch: [1][0/2933] Elapsed 0m 0s (remain 19m 51s) Loss: 3.1147(3.1147) Grad: inf  LR: 0.00000625  
Epoch: [1][2932/2933] Elapsed 5m 20s (remain 0m 0s) Loss: 0.0426(0.1746) Grad: 126278.9219  LR: 0.00003754  
EVAL: [0/489] Elapsed 0m 0s (remain 2m 39s) Loss: 0.2997(0.2997) 


Epoch 1 - avg_train_loss: 0.1746  avg_val_loss: 0.1384  time: 345s
Epoch 1 - Score: 0.5231  Scores: [0.6060418966945579, 0.4756822919422532, 0.49872920459132886, 0.5683201806650514, 0.49535856762703434, 0.4943783175473378]
Epoch 1 - Save Best Score: 0.5231 Model


EVAL: [488/489] Elapsed 0m 24s (remain 0m 0s) Loss: 0.2805(0.1384) 
Epoch: [2][0/2933] Elapsed 0m 0s (remain 19m 21s) Loss: 0.0968(0.0968) Grad: 379773.9062  LR: 0.00003753  
Epoch: [2][2932/2933] Elapsed 5m 20s (remain 0m 0s) Loss: 0.0382(0.1129) Grad: 123043.5547  LR: 0.00001252  
EVAL: [0/489] Elapsed 0m 0s (remain 2m 38s) Loss: 0.1721(0.1721) 


Epoch 2 - avg_train_loss: 0.1129  avg_val_loss: 0.1136  time: 345s
Epoch 2 - Score: 0.4748  Scores: [0.4888514046116735, 0.4593363391959792, 0.4527538209958638, 0.4701288604441095, 0.5074030480200942, 0.47042142425842015]
Epoch 2 - Save Best Score: 0.4748 Model


EVAL: [488/489] Elapsed 0m 24s (remain 0m 0s) Loss: 0.0877(0.1136) 
Epoch: [3][0/2933] Elapsed 0m 0s (remain 19m 7s) Loss: 0.0361(0.0361) Grad: 227393.2969  LR: 0.00001251  
Epoch: [3][2932/2933] Elapsed 5m 21s (remain 0m 0s) Loss: 0.1415(0.0881) Grad: 251803.8906  LR: 0.00000000  
EVAL: [0/489] Elapsed 0m 0s (remain 2m 39s) Loss: 0.1939(0.1939) 


Epoch 3 - avg_train_loss: 0.0881  avg_val_loss: 0.1071  time: 347s
Epoch 3 - Score: 0.4590  Scores: [0.4820820765779571, 0.44994186531098473, 0.417744781564979, 0.4649935318825981, 0.4876864041930345, 0.4517005121883835]
Epoch 3 - Save Best Score: 0.4590 Model


EVAL: [488/489] Elapsed 0m 24s (remain 0m 0s) Loss: 0.0588(0.1071) 


DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.21.2",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model:

Done
attention.0.weight
attention.0.bias
attention.1.weight
attention.1.bias
attention.3.weight
attention.3.bias
Epoch: [1][0/2933] Elapsed 0m 0s (remain 20m 52s) Loss: 2.9316(2.9316) Grad: 1728637.7500  LR: 0.00000625  
Epoch: [1][2932/2933] Elapsed 5m 21s (remain 0m 0s) Loss: 0.0350(0.1812) Grad: 127223.2344  LR: 0.00003754  
EVAL: [0/489] Elapsed 0m 0s (remain 3m 6s) Loss: 0.2227(0.2227) 


Epoch 1 - avg_train_loss: 0.1812  avg_val_loss: 0.1156  time: 347s
Epoch 1 - Score: 0.4730  Scores: [0.5600533446892185, 0.4613790518467088, 0.4211237020823644, 0.4503993094925236, 0.48781898564339304, 0.45714834045473646]
Epoch 1 - Save Best Score: 0.4730 Model


EVAL: [488/489] Elapsed 0m 25s (remain 0m 0s) Loss: 0.0720(0.1156) 
Epoch: [2][0/2933] Elapsed 0m 0s (remain 19m 14s) Loss: 0.1144(0.1144) Grad: inf  LR: 0.00003753  
Epoch: [2][2932/2933] Elapsed 5m 23s (remain 0m 0s) Loss: 0.0462(0.1200) Grad: 130134.3203  LR: 0.00001252  
EVAL: [0/489] Elapsed 0m 0s (remain 3m 8s) Loss: 0.1849(0.1849) 


Epoch 2 - avg_train_loss: 0.1200  avg_val_loss: 0.1154  time: 348s
Epoch 2 - Score: 0.4725  Scores: [0.5482359877593409, 0.4458798281416888, 0.414557203264818, 0.5014676048060924, 0.4892761166106943, 0.43531576759218565]
Epoch 2 - Save Best Score: 0.4725 Model


EVAL: [488/489] Elapsed 0m 25s (remain 0m 0s) Loss: 0.0644(0.1154) 
Epoch: [3][0/2933] Elapsed 0m 0s (remain 18m 14s) Loss: 0.3064(0.3064) Grad: inf  LR: 0.00001251  
Epoch: [3][2932/2933] Elapsed 5m 21s (remain 0m 0s) Loss: 0.2436(0.0919) Grad: 313826.5938  LR: 0.00000000  
EVAL: [0/489] Elapsed 0m 0s (remain 3m 7s) Loss: 0.1839(0.1839) 


Epoch 3 - avg_train_loss: 0.0919  avg_val_loss: 0.1004  time: 347s
Epoch 3 - Score: 0.4441  Scores: [0.4834708704547975, 0.44273810640797784, 0.41184560668629167, 0.43766901039089723, 0.4595603403764521, 0.42908810493148813]
Epoch 3 - Save Best Score: 0.4441 Model


EVAL: [488/489] Elapsed 0m 25s (remain 0m 0s) Loss: 0.0547(0.1004) 


Score: 0.4506  Scores: [0.4816804966161599, 0.4440169943932173, 0.4142004833515022, 0.45224555055693544, 0.46994916780702223, 0.44129405246185643]
[32m[I 2022-12-14 10:33:18,926][0m Trial 6 finished with value: 0.45056445753111557 and parameters: {'epoch': 4, 'encoder_lr': 0.0002259875438626923, 'llrd': 0.8349310270411133}. Best is trial 5 with value: 0.45024024643677324.[0m


11111111111111111111111111111111
0.45056445753111557


DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.21.2",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model:

Done
attention.0.weight
attention.0.bias
attention.1.weight
attention.1.bias
attention.3.weight
attention.3.bias
Epoch: [1][0/2933] Elapsed 0m 0s (remain 24m 56s) Loss: 2.3857(2.3857) Grad: 7455847.5000  LR: 0.00000625  
Epoch: [1][2932/2933] Elapsed 5m 21s (remain 0m 0s) Loss: 0.1518(0.1767) Grad: 505189.7500  LR: 0.00003754  
EVAL: [0/489] Elapsed 0m 0s (remain 2m 40s) Loss: 0.1126(0.1126) 


Epoch 1 - avg_train_loss: 0.1767  avg_val_loss: 0.1169  time: 349s
Epoch 1 - Score: 0.4838  Scores: [0.48803070450807784, 0.5454331577411479, 0.46733538377081846, 0.4647515141747743, 0.47108069727842256, 0.46613838603225144]
Epoch 1 - Save Best Score: 0.4838 Model


EVAL: [488/489] Elapsed 0m 26s (remain 0m 0s) Loss: 0.1147(0.1169) 
Epoch: [2][0/2933] Elapsed 0m 0s (remain 19m 36s) Loss: 0.0493(0.0493) Grad: 277640.5938  LR: 0.00003753  
Epoch: [2][2932/2933] Elapsed 5m 20s (remain 0m 0s) Loss: 0.0488(0.1215) Grad: 136103.4375  LR: 0.00001252  
EVAL: [0/489] Elapsed 0m 0s (remain 2m 39s) Loss: 0.0875(0.0875) 


Epoch 2 - avg_train_loss: 0.1215  avg_val_loss: 0.1118  time: 348s
Epoch 2 - Score: 0.4687  Scores: [0.5056891926689322, 0.45831217509009886, 0.44143239956593994, 0.4581931572918767, 0.49714254115898143, 0.451324963922285]
Epoch 2 - Save Best Score: 0.4687 Model


EVAL: [488/489] Elapsed 0m 27s (remain 0m 0s) Loss: 0.1851(0.1118) 
Epoch: [3][0/2933] Elapsed 0m 0s (remain 20m 16s) Loss: 0.0426(0.0426) Grad: 258435.4531  LR: 0.00001251  
Epoch: [3][2932/2933] Elapsed 5m 20s (remain 0m 0s) Loss: 0.0304(0.0915) Grad: 93213.8516  LR: 0.00000000  
EVAL: [0/489] Elapsed 0m 0s (remain 2m 38s) Loss: 0.0801(0.0801) 


Epoch 3 - avg_train_loss: 0.0915  avg_val_loss: 0.1031  time: 348s
Epoch 3 - Score: 0.4498  Scores: [0.478569821255421, 0.4437032652753412, 0.41320757404215763, 0.45643643032963116, 0.46895908445622564, 0.4380499701074223]
Epoch 3 - Save Best Score: 0.4498 Model


EVAL: [488/489] Elapsed 0m 26s (remain 0m 0s) Loss: 0.1361(0.1031) 


DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.21.2",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model:

Done
attention.0.weight
attention.0.bias
attention.1.weight
attention.1.bias
attention.3.weight
attention.3.bias
Epoch: [1][0/2934] Elapsed 0m 0s (remain 21m 46s) Loss: 4.0935(4.0935) Grad: 1703175.8750  LR: 0.00000625  
Epoch: [1][2933/2934] Elapsed 5m 20s (remain 0m 0s) Loss: 0.0723(0.1769) Grad: 166702.5469  LR: 0.00003754  
EVAL: [0/489] Elapsed 0m 0s (remain 5m 43s) Loss: 0.0423(0.0423) 


Epoch 1 - avg_train_loss: 0.1769  avg_val_loss: 0.1243  time: 348s
Epoch 1 - Score: 0.4920  Scores: [0.5698605253085111, 0.44632146733011, 0.43702309358716784, 0.46869019838004583, 0.4797343527840959, 0.5504799136125166]
Epoch 1 - Save Best Score: 0.4920 Model


EVAL: [488/489] Elapsed 0m 27s (remain 0m 0s) Loss: 0.1143(0.1243) 
Epoch: [2][0/2934] Elapsed 0m 0s (remain 19m 57s) Loss: 0.1369(0.1369) Grad: inf  LR: 0.00003753  
Epoch: [2][2933/2934] Elapsed 5m 20s (remain 0m 0s) Loss: 0.2600(0.1123) Grad: 343927.6875  LR: 0.00001252  
EVAL: [0/489] Elapsed 0m 0s (remain 2m 40s) Loss: 0.0946(0.0946) 


Epoch 2 - avg_train_loss: 0.1123  avg_val_loss: 0.1058  time: 348s
Epoch 2 - Score: 0.4564  Scores: [0.4884121366447059, 0.451597453676487, 0.4198180945692375, 0.45348086175645724, 0.46967650866198873, 0.4553607170910343]
Epoch 2 - Save Best Score: 0.4564 Model


EVAL: [488/489] Elapsed 0m 27s (remain 0m 0s) Loss: 0.0492(0.1058) 
Epoch: [3][0/2934] Elapsed 0m 0s (remain 20m 47s) Loss: 0.0514(0.0514) Grad: 284776.6875  LR: 0.00001251  
Epoch: [3][2933/2934] Elapsed 5m 21s (remain 0m 0s) Loss: 0.0718(0.0893) Grad: 184784.7031  LR: 0.00000000  
EVAL: [0/489] Elapsed 0m 0s (remain 2m 43s) Loss: 0.0819(0.0819) 


Epoch 3 - avg_train_loss: 0.0893  avg_val_loss: 0.1048  time: 348s
Epoch 3 - Score: 0.4541  Scores: [0.4883614145178865, 0.4422303411675017, 0.4198413387896622, 0.4526747399088859, 0.47032723404744814, 0.45143539211811706]
Epoch 3 - Save Best Score: 0.4541 Model


EVAL: [488/489] Elapsed 0m 27s (remain 0m 0s) Loss: 0.0417(0.1048) 


DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.21.2",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model:

Done
attention.0.weight
attention.0.bias
attention.1.weight
attention.1.bias
attention.3.weight
attention.3.bias
Epoch: [1][0/2933] Elapsed 0m 0s (remain 21m 14s) Loss: 2.0723(2.0723) Grad: inf  LR: 0.00000625  
Epoch: [1][2932/2933] Elapsed 5m 23s (remain 0m 0s) Loss: 0.1538(0.1824) Grad: 247629.7969  LR: 0.00003754  
EVAL: [0/489] Elapsed 0m 0s (remain 2m 33s) Loss: 0.2367(0.2367) 


Epoch 1 - avg_train_loss: 0.1824  avg_val_loss: 0.1329  time: 348s
Epoch 1 - Score: 0.5236  Scores: [0.5157741767541592, 0.5433350975335417, 0.5896734673475126, 0.4894401124604954, 0.4925721578051583, 0.5105823954496267]
Epoch 1 - Save Best Score: 0.5236 Model


EVAL: [488/489] Elapsed 0m 24s (remain 0m 0s) Loss: 0.1854(0.1329) 
Epoch: [2][0/2933] Elapsed 0m 0s (remain 19m 52s) Loss: 0.1119(0.1119) Grad: 381611.2500  LR: 0.00003753  
Epoch: [2][2932/2933] Elapsed 5m 23s (remain 0m 0s) Loss: 0.1777(0.1167) Grad: 307351.4375  LR: 0.00001252  
EVAL: [0/489] Elapsed 0m 0s (remain 2m 36s) Loss: 0.1576(0.1576) 


Epoch 2 - avg_train_loss: 0.1167  avg_val_loss: 0.1157  time: 348s
Epoch 2 - Score: 0.4777  Scores: [0.511900192063541, 0.46902927020699, 0.42099753824775143, 0.4872595047521829, 0.47951178254696925, 0.49738550021249184]
Epoch 2 - Save Best Score: 0.4777 Model


EVAL: [488/489] Elapsed 0m 24s (remain 0m 0s) Loss: 0.0554(0.1157) 
Epoch: [3][0/2933] Elapsed 0m 0s (remain 24m 26s) Loss: 0.1111(0.1111) Grad: 407140.3438  LR: 0.00001251  
Epoch: [3][2932/2933] Elapsed 5m 23s (remain 0m 0s) Loss: 0.0246(0.0877) Grad: 90521.5156  LR: 0.00000000  
EVAL: [0/489] Elapsed 0m 0s (remain 2m 41s) Loss: 0.2009(0.2009) 


Epoch 3 - avg_train_loss: 0.0877  avg_val_loss: 0.1073  time: 348s
Epoch 3 - Score: 0.4596  Scores: [0.48059826111245446, 0.4498540921495614, 0.41789178365111485, 0.46586526428794384, 0.4872550712097543, 0.45606954458992494]
Epoch 3 - Save Best Score: 0.4596 Model


EVAL: [488/489] Elapsed 0m 24s (remain 0m 0s) Loss: 0.0768(0.1073) 


DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.21.2",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model:

Done
attention.0.weight
attention.0.bias
attention.1.weight
attention.1.bias
attention.3.weight
attention.3.bias
Epoch: [1][0/2933] Elapsed 0m 0s (remain 19m 54s) Loss: 1.9190(1.9190) Grad: inf  LR: 0.00000625  
Epoch: [1][2932/2933] Elapsed 5m 21s (remain 0m 0s) Loss: 0.1375(0.1811) Grad: 247409.4375  LR: 0.00003754  
EVAL: [0/489] Elapsed 0m 0s (remain 3m 14s) Loss: 0.2611(0.2611) 


Epoch 1 - avg_train_loss: 0.1811  avg_val_loss: 0.1177  time: 347s
Epoch 1 - Score: 0.4806  Scores: [0.5063534182970807, 0.49195190843056114, 0.42297986886932404, 0.4459388146805447, 0.49866228937208995, 0.5174834560433352]
Epoch 1 - Save Best Score: 0.4806 Model


EVAL: [488/489] Elapsed 0m 25s (remain 0m 0s) Loss: 0.0784(0.1177) 
Epoch: [2][0/2933] Elapsed 0m 0s (remain 20m 8s) Loss: 0.1144(0.1144) Grad: 425961.7188  LR: 0.00003753  
Epoch: [2][2932/2933] Elapsed 5m 20s (remain 0m 0s) Loss: 0.0124(0.1139) Grad: 57407.2227  LR: 0.00001252  
EVAL: [0/489] Elapsed 0m 0s (remain 3m 14s) Loss: 0.1807(0.1807) 


Epoch 2 - avg_train_loss: 0.1139  avg_val_loss: 0.1135  time: 346s
Epoch 2 - Score: 0.4707  Scores: [0.5615651638336483, 0.44944729333435307, 0.443732418926062, 0.4695169766298437, 0.4680821143393613, 0.4317341268199231]
Epoch 2 - Save Best Score: 0.4707 Model


EVAL: [488/489] Elapsed 0m 25s (remain 0m 0s) Loss: 0.0819(0.1135) 
Epoch: [3][0/2933] Elapsed 0m 0s (remain 19m 55s) Loss: 0.0951(0.0951) Grad: 412739.0312  LR: 0.00001251  
Epoch: [3][2932/2933] Elapsed 5m 21s (remain 0m 0s) Loss: 0.0389(0.0904) Grad: 127670.5000  LR: 0.00000000  
EVAL: [0/489] Elapsed 0m 0s (remain 3m 15s) Loss: 0.1893(0.1893) 


Epoch 3 - avg_train_loss: 0.0904  avg_val_loss: 0.1010  time: 347s
Epoch 3 - Score: 0.4454  Scores: [0.48562962666166687, 0.4424161564443774, 0.4107116429931964, 0.4384357339666918, 0.4620633432688018, 0.4329820571657352]
Epoch 3 - Save Best Score: 0.4454 Model


EVAL: [488/489] Elapsed 0m 25s (remain 0m 0s) Loss: 0.0711(0.1010) 


Score: 0.4523  Scores: [0.48330418563479793, 0.4445624637222645, 0.4154278144660297, 0.4534605000421865, 0.47224257896511, 0.44473262725260043]
[32m[I 2022-12-14 11:43:25,698][0m Trial 7 finished with value: 0.45228836168049824 and parameters: {'epoch': 3, 'encoder_lr': 0.0004596564335419012, 'llrd': 0.9410059335933649}. Best is trial 5 with value: 0.45024024643677324.[0m


11111111111111111111111111111111
0.45228836168049824


DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.21.2",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model:

Done
attention.0.weight
attention.0.bias
attention.1.weight
attention.1.bias
attention.3.weight
attention.3.bias
Epoch: [1][0/2933] Elapsed 0m 0s (remain 20m 36s) Loss: 3.6337(3.6337) Grad: inf  LR: 0.00000625  
Epoch: [1][2932/2933] Elapsed 5m 20s (remain 0m 0s) Loss: 0.0328(0.1775) Grad: 233481.4844  LR: 0.00003754  
EVAL: [0/489] Elapsed 0m 0s (remain 2m 39s) Loss: 0.1302(0.1302) 


Epoch 1 - avg_train_loss: 0.1775  avg_val_loss: 0.1278  time: 348s
Epoch 1 - Score: 0.4996  Scores: [0.5473628005548506, 0.4529038173654368, 0.4330159621496183, 0.5413836913693465, 0.505828775706767, 0.5168767995616693]
Epoch 1 - Save Best Score: 0.4996 Model


EVAL: [488/489] Elapsed 0m 26s (remain 0m 0s) Loss: 0.1844(0.1278) 
Epoch: [2][0/2933] Elapsed 0m 0s (remain 18m 50s) Loss: 0.2715(0.2715) Grad: 698529.8750  LR: 0.00003753  
Epoch: [2][2932/2933] Elapsed 5m 20s (remain 0m 0s) Loss: 0.0540(0.1148) Grad: 282233.5312  LR: 0.00001252  
EVAL: [0/489] Elapsed 0m 0s (remain 2m 38s) Loss: 0.0948(0.0948) 


Epoch 2 - avg_train_loss: 0.1148  avg_val_loss: 0.1105  time: 348s
Epoch 2 - Score: 0.4616  Scores: [0.5371740378289024, 0.4418641705711978, 0.4068432780544594, 0.460046277652631, 0.4815428712443458, 0.44189722990911323]
Epoch 2 - Save Best Score: 0.4616 Model


EVAL: [488/489] Elapsed 0m 26s (remain 0m 0s) Loss: 0.0972(0.1105) 
Epoch: [3][0/2933] Elapsed 0m 0s (remain 20m 8s) Loss: 0.0201(0.0201) Grad: 171478.6406  LR: 0.00001251  
Epoch: [3][2932/2933] Elapsed 5m 20s (remain 0m 0s) Loss: 0.0809(0.0939) Grad: 394164.0938  LR: 0.00000000  
EVAL: [0/489] Elapsed 0m 0s (remain 2m 43s) Loss: 0.0788(0.0788) 


Epoch 3 - avg_train_loss: 0.0939  avg_val_loss: 0.1025  time: 347s
Epoch 3 - Score: 0.4483  Scores: [0.47983628579898235, 0.44243498750739335, 0.4075380621727886, 0.45411947843842715, 0.4670218259685753, 0.4388466961932903]
Epoch 3 - Save Best Score: 0.4483 Model


EVAL: [488/489] Elapsed 0m 26s (remain 0m 0s) Loss: 0.1202(0.1025) 


DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.21.2",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model:

Done
attention.0.weight
attention.0.bias
attention.1.weight
attention.1.bias
attention.3.weight
attention.3.bias
Epoch: [1][0/2934] Elapsed 0m 0s (remain 18m 52s) Loss: 2.8276(2.8276) Grad: inf  LR: 0.00000625  
Epoch: [1][2933/2934] Elapsed 5m 20s (remain 0m 0s) Loss: 0.0489(0.1743) Grad: 132347.4531  LR: 0.00003754  
EVAL: [0/489] Elapsed 0m 0s (remain 3m 50s) Loss: 0.0475(0.0475) 


Epoch 1 - avg_train_loss: 0.1743  avg_val_loss: 0.1281  time: 348s
Epoch 1 - Score: 0.5006  Scores: [0.524734275818752, 0.45431344813518776, 0.4334246190144043, 0.5319306052180206, 0.5334931838781944, 0.5255997781127707]
Epoch 1 - Save Best Score: 0.5006 Model


EVAL: [488/489] Elapsed 0m 27s (remain 0m 0s) Loss: 0.0746(0.1281) 
Epoch: [2][0/2934] Elapsed 0m 0s (remain 19m 51s) Loss: 0.0951(0.0951) Grad: 384102.6875  LR: 0.00003753  
Epoch: [2][2933/2934] Elapsed 5m 20s (remain 0m 0s) Loss: 0.0441(0.1177) Grad: 125283.3359  LR: 0.00001252  
EVAL: [0/489] Elapsed 0m 0s (remain 2m 48s) Loss: 0.0952(0.0952) 


Epoch 2 - avg_train_loss: 0.1177  avg_val_loss: 0.1149  time: 348s
Epoch 2 - Score: 0.4735  Scores: [0.5053046058957499, 0.4446725112266623, 0.42243973453173866, 0.45874705653834325, 0.513531578616305, 0.4961816724136345]
Epoch 2 - Save Best Score: 0.4735 Model


EVAL: [488/489] Elapsed 0m 27s (remain 0m 0s) Loss: 0.0562(0.1149) 
Epoch: [3][0/2934] Elapsed 0m 0s (remain 19m 2s) Loss: 0.0546(0.0546) Grad: 307735.5938  LR: 0.00001251  
Epoch: [3][2933/2934] Elapsed 5m 20s (remain 0m 0s) Loss: 0.0499(0.0901) Grad: 149514.1094  LR: 0.00000000  
EVAL: [0/489] Elapsed 0m 0s (remain 2m 38s) Loss: 0.0805(0.0805) 


Epoch 3 - avg_train_loss: 0.0901  avg_val_loss: 0.1049  time: 348s
Epoch 3 - Score: 0.4542  Scores: [0.49115892767611125, 0.44414963965087084, 0.4201513743427517, 0.450739918891948, 0.4702892549132214, 0.44871069826572024]
Epoch 3 - Save Best Score: 0.4542 Model


EVAL: [488/489] Elapsed 0m 27s (remain 0m 0s) Loss: 0.0485(0.1049) 


DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.21.2",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model:

Done
attention.0.weight
attention.0.bias
attention.1.weight
attention.1.bias
attention.3.weight
attention.3.bias
Epoch: [1][0/2933] Elapsed 0m 0s (remain 19m 53s) Loss: 2.7283(2.7283) Grad: inf  LR: 0.00000625  
Epoch: [1][2932/2933] Elapsed 5m 22s (remain 0m 0s) Loss: 0.2604(0.1798) Grad: 328051.5625  LR: 0.00003754  
EVAL: [0/489] Elapsed 0m 0s (remain 2m 37s) Loss: 0.1088(0.1088) 


Epoch 1 - avg_train_loss: 0.1798  avg_val_loss: 0.2433  time: 347s
Epoch 1 - Score: 0.6921  Scores: [0.6340075037387738, 0.5675727831017525, 0.8442164832306411, 0.6415061278854826, 0.9905295640382908, 0.47468586810924807]
Epoch 1 - Save Best Score: 0.6921 Model


EVAL: [488/489] Elapsed 0m 24s (remain 0m 0s) Loss: 0.1114(0.2433) 
Epoch: [2][0/2933] Elapsed 0m 0s (remain 18m 35s) Loss: 0.5781(0.5781) Grad: inf  LR: 0.00003753  
Epoch: [2][2932/2933] Elapsed 5m 22s (remain 0m 0s) Loss: 0.0796(0.1187) Grad: 173605.3750  LR: 0.00001252  
EVAL: [0/489] Elapsed 0m 0s (remain 2m 41s) Loss: 0.2252(0.2252) 


Epoch 2 - avg_train_loss: 0.1187  avg_val_loss: 0.1155  time: 347s
Epoch 2 - Score: 0.4802  Scores: [0.4911824361694115, 0.45287855554223877, 0.4647526898607107, 0.4673148932488088, 0.4907008303518718, 0.5144393586008082]
Epoch 2 - Save Best Score: 0.4802 Model


EVAL: [488/489] Elapsed 0m 24s (remain 0m 0s) Loss: 0.1039(0.1155) 
Epoch: [3][0/2933] Elapsed 0m 0s (remain 19m 44s) Loss: 0.1799(0.1799) Grad: inf  LR: 0.00001251  
Epoch: [3][2932/2933] Elapsed 5m 20s (remain 0m 0s) Loss: 0.1473(0.0904) Grad: 251000.3750  LR: 0.00000000  
EVAL: [0/489] Elapsed 0m 0s (remain 2m 40s) Loss: 0.1855(0.1855) 


Epoch 3 - avg_train_loss: 0.0904  avg_val_loss: 0.1071  time: 345s
Epoch 3 - Score: 0.4591  Scores: [0.4798415173422024, 0.45129601038791217, 0.41540452850764736, 0.4667405974945159, 0.4875595150531324, 0.4534842352116871]
Epoch 3 - Save Best Score: 0.4591 Model


EVAL: [488/489] Elapsed 0m 24s (remain 0m 0s) Loss: 0.0724(0.1071) 


DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.21.2",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model:

Done
attention.0.weight
attention.0.bias
attention.1.weight
attention.1.bias
attention.3.weight
attention.3.bias
Epoch: [1][0/2933] Elapsed 0m 0s (remain 21m 29s) Loss: 2.5515(2.5515) Grad: 1659668.6250  LR: 0.00000625  
Epoch: [1][2932/2933] Elapsed 5m 22s (remain 0m 0s) Loss: 0.0864(0.1815) Grad: 181157.3281  LR: 0.00003754  
EVAL: [0/489] Elapsed 0m 0s (remain 3m 11s) Loss: 0.3137(0.3137) 


Epoch 1 - avg_train_loss: 0.1815  avg_val_loss: 0.1249  time: 348s
Epoch 1 - Score: 0.4909  Scores: [0.5856330362953542, 0.46795679756554814, 0.42621628434589853, 0.5049949532442929, 0.5036456341128028, 0.45695722287777624]
Epoch 1 - Save Best Score: 0.4909 Model


EVAL: [488/489] Elapsed 0m 25s (remain 0m 0s) Loss: 0.0333(0.1249) 
Epoch: [2][0/2933] Elapsed 0m 0s (remain 19m 53s) Loss: 0.0373(0.0373) Grad: 233497.4062  LR: 0.00003753  
Epoch: [2][2932/2933] Elapsed 5m 22s (remain 0m 0s) Loss: 0.0429(0.1199) Grad: 123940.4844  LR: 0.00001252  
EVAL: [0/489] Elapsed 0m 0s (remain 3m 13s) Loss: 0.2052(0.2052) 


Epoch 2 - avg_train_loss: 0.1199  avg_val_loss: 0.1034  time: 348s
Epoch 2 - Score: 0.4503  Scores: [0.4912487590471541, 0.4616500134513257, 0.4136843843133541, 0.4379425957762976, 0.46617816907277976, 0.43138151978259603]
Epoch 2 - Save Best Score: 0.4503 Model


EVAL: [488/489] Elapsed 0m 25s (remain 0m 0s) Loss: 0.0464(0.1034) 
Epoch: [3][0/2933] Elapsed 0m 0s (remain 19m 40s) Loss: 0.0361(0.0361) Grad: 239963.5156  LR: 0.00001251  
Epoch: [3][2932/2933] Elapsed 5m 21s (remain 0m 0s) Loss: 0.0315(0.0898) Grad: 105502.0000  LR: 0.00000000  
EVAL: [0/489] Elapsed 0m 0s (remain 3m 10s) Loss: 0.1734(0.1734) 


Epoch 3 - avg_train_loss: 0.0898  avg_val_loss: 0.1013  time: 347s
Epoch 3 - Score: 0.4459  Scores: [0.48492124927266317, 0.44284406755052763, 0.4102225618427285, 0.43820976340258605, 0.46440545143156636, 0.4345969502109852]
Epoch 3 - Save Best Score: 0.4459 Model


EVAL: [488/489] Elapsed 0m 25s (remain 0m 0s) Loss: 0.0608(0.1013) 


Score: 0.4519  Scores: [0.48396003850637304, 0.44519589203527365, 0.41335581841558844, 0.45256691348673395, 0.4724061054036974, 0.44397236660578077]
[32m[I 2022-12-14 12:53:26,438][0m Trial 8 finished with value: 0.45190952240890786 and parameters: {'epoch': 3, 'encoder_lr': 9.285261289172484e-05, 'llrd': 0.9641014940094981}. Best is trial 5 with value: 0.45024024643677324.[0m


11111111111111111111111111111111
0.45190952240890786


DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.21.2",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model:

Done
attention.0.weight
attention.0.bias
attention.1.weight
attention.1.bias
attention.3.weight
attention.3.bias
Epoch: [1][0/2933] Elapsed 0m 0s (remain 19m 43s) Loss: 1.9886(1.9886) Grad: inf  LR: 0.00000625  
Epoch: [1][2932/2933] Elapsed 5m 21s (remain 0m 0s) Loss: 0.1455(0.1758) Grad: 242756.0781  LR: 0.00003754  
EVAL: [0/489] Elapsed 0m 0s (remain 2m 47s) Loss: 0.0892(0.0892) 


Epoch 1 - avg_train_loss: 0.1758  avg_val_loss: 0.1104  time: 349s
Epoch 1 - Score: 0.4636  Scores: [0.4925511473589047, 0.44970703358796615, 0.4144324444678159, 0.4579226387466673, 0.5085390288753759, 0.4585310193141039]
Epoch 1 - Save Best Score: 0.4636 Model


EVAL: [488/489] Elapsed 0m 27s (remain 0m 0s) Loss: 0.1214(0.1104) 
Epoch: [2][0/2933] Elapsed 0m 0s (remain 19m 58s) Loss: 0.0360(0.0360) Grad: 242302.7969  LR: 0.00003753  
Epoch: [2][2932/2933] Elapsed 5m 21s (remain 0m 0s) Loss: 0.0250(0.1148) Grad: 97176.7031  LR: 0.00001252  
EVAL: [0/489] Elapsed 0m 0s (remain 2m 53s) Loss: 0.0727(0.0727) 


Epoch 2 - avg_train_loss: 0.1148  avg_val_loss: 0.1071  time: 349s
Epoch 2 - Score: 0.4573  Scores: [0.4806559703323133, 0.4420633136219639, 0.4084533447119033, 0.4891894760718548, 0.49039806976279793, 0.43275236976157366]
Epoch 2 - Save Best Score: 0.4573 Model


EVAL: [488/489] Elapsed 0m 27s (remain 0m 0s) Loss: 0.1097(0.1071) 
Epoch: [3][0/2933] Elapsed 0m 0s (remain 20m 49s) Loss: 0.1954(0.1954) Grad: 577561.4375  LR: 0.00001251  
Epoch: [3][2932/2933] Elapsed 5m 23s (remain 0m 0s) Loss: 0.0374(0.0894) Grad: 114010.3750  LR: 0.00000000  
EVAL: [0/489] Elapsed 0m 0s (remain 2m 47s) Loss: 0.0806(0.0806) 


Epoch 3 - avg_train_loss: 0.0894  avg_val_loss: 0.1030  time: 350s
Epoch 3 - Score: 0.4495  Scores: [0.48158808179896523, 0.44304819833666576, 0.4117665658570596, 0.45406636458330574, 0.47011193257892225, 0.43661001285914597]
Epoch 3 - Save Best Score: 0.4495 Model


EVAL: [488/489] Elapsed 0m 27s (remain 0m 0s) Loss: 0.1073(0.1030) 


DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.21.2",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model:

Done
attention.0.weight
attention.0.bias
attention.1.weight
attention.1.bias
attention.3.weight
attention.3.bias
Epoch: [1][0/2934] Elapsed 0m 0s (remain 21m 22s) Loss: 2.8493(2.8493) Grad: inf  LR: 0.00000625  
Epoch: [1][2933/2934] Elapsed 5m 22s (remain 0m 0s) Loss: 0.1361(0.1780) Grad: 222052.3125  LR: 0.00003754  
EVAL: [0/489] Elapsed 0m 0s (remain 3m 46s) Loss: 0.1360(0.1360) 


Epoch 1 - avg_train_loss: 0.1780  avg_val_loss: 0.1256  time: 350s
Epoch 1 - Score: 0.4971  Scores: [0.5009151934521958, 0.4487419339040933, 0.423511027460066, 0.5944004342771521, 0.4984237720662074, 0.5165034192770426]
Epoch 1 - Save Best Score: 0.4971 Model


EVAL: [488/489] Elapsed 0m 27s (remain 0m 0s) Loss: 0.0764(0.1256) 
Epoch: [2][0/2934] Elapsed 0m 0s (remain 20m 43s) Loss: 0.0515(0.0515) Grad: 300675.0625  LR: 0.00003753  
Epoch: [2][2933/2934] Elapsed 5m 21s (remain 0m 0s) Loss: 0.0728(0.1134) Grad: 173330.9219  LR: 0.00001252  
EVAL: [0/489] Elapsed 0m 0s (remain 2m 33s) Loss: 0.0677(0.0677) 


Epoch 2 - avg_train_loss: 0.1134  avg_val_loss: 0.1218  time: 348s
Epoch 2 - Score: 0.4944  Scores: [0.505482332797298, 0.5331656263074394, 0.48128295987482944, 0.4610733819828682, 0.4821892685663741, 0.5031317790511235]
Epoch 2 - Save Best Score: 0.4944 Model


EVAL: [488/489] Elapsed 0m 27s (remain 0m 0s) Loss: 0.1040(0.1218) 
Epoch: [3][0/2934] Elapsed 0m 0s (remain 19m 43s) Loss: 0.1177(0.1177) Grad: 437673.3438  LR: 0.00001251  
Epoch: [3][2933/2934] Elapsed 5m 21s (remain 0m 0s) Loss: 0.1083(0.0900) Grad: 211001.0469  LR: 0.00000000  
EVAL: [0/489] Elapsed 0m 0s (remain 2m 43s) Loss: 0.0738(0.0738) 


Epoch 3 - avg_train_loss: 0.0900  avg_val_loss: 0.1044  time: 349s
Epoch 3 - Score: 0.4532  Scores: [0.48870800954486926, 0.44075948310594687, 0.41996461772940485, 0.45237295931231625, 0.4698932064961157, 0.4475213099038892]
Epoch 3 - Save Best Score: 0.4532 Model


EVAL: [488/489] Elapsed 0m 27s (remain 0m 0s) Loss: 0.0453(0.1044) 


DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.21.2",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model:

Done
attention.0.weight
attention.0.bias
attention.1.weight
attention.1.bias
attention.3.weight
attention.3.bias
Epoch: [1][0/2933] Elapsed 0m 0s (remain 20m 41s) Loss: 1.1914(1.1914) Grad: 1813047.7500  LR: 0.00000625  
Epoch: [1][2932/2933] Elapsed 5m 24s (remain 0m 0s) Loss: 0.0764(0.1785) Grad: 181447.7969  LR: 0.00003754  
EVAL: [0/489] Elapsed 0m 0s (remain 2m 37s) Loss: 0.3730(0.3730) 


Epoch 1 - avg_train_loss: 0.1785  avg_val_loss: 0.1550  time: 349s
Epoch 1 - Score: 0.5484  Scores: [0.6953253926155508, 0.5015849868012198, 0.5249772960032247, 0.47633663289286693, 0.5385063745436284, 0.5533866351517914]
Epoch 1 - Save Best Score: 0.5484 Model


EVAL: [488/489] Elapsed 0m 24s (remain 0m 0s) Loss: 0.2819(0.1550) 
Epoch: [2][0/2933] Elapsed 0m 0s (remain 19m 45s) Loss: 0.0360(0.0360) Grad: 220014.8125  LR: 0.00003753  
Epoch: [2][2932/2933] Elapsed 5m 24s (remain 0m 0s) Loss: 0.1379(0.1154) Grad: 230074.5312  LR: 0.00001252  
EVAL: [0/489] Elapsed 0m 0s (remain 2m 33s) Loss: 0.2656(0.2656) 


Epoch 2 - avg_train_loss: 0.1154  avg_val_loss: 0.1362  time: 350s
Epoch 2 - Score: 0.5235  Scores: [0.5550163717850229, 0.4558705039382968, 0.5361513650711325, 0.5036747116816747, 0.5055509048804121, 0.5846053923454266]
Epoch 2 - Save Best Score: 0.5235 Model


EVAL: [488/489] Elapsed 0m 24s (remain 0m 0s) Loss: 0.2021(0.1362) 
Epoch: [3][0/2933] Elapsed 0m 0s (remain 19m 30s) Loss: 0.2488(0.2488) Grad: inf  LR: 0.00001251  
Epoch: [3][2932/2933] Elapsed 5m 23s (remain 0m 0s) Loss: 0.0483(0.0891) Grad: 144386.2188  LR: 0.00000000  
EVAL: [0/489] Elapsed 0m 0s (remain 2m 39s) Loss: 0.1703(0.1703) 


Epoch 3 - avg_train_loss: 0.0891  avg_val_loss: 0.1072  time: 348s
Epoch 3 - Score: 0.4594  Scores: [0.480207408619614, 0.4489605502226418, 0.41818598109799604, 0.4672863379572216, 0.4877316522775543, 0.45406055951722823]
Epoch 3 - Save Best Score: 0.4594 Model


EVAL: [488/489] Elapsed 0m 24s (remain 0m 0s) Loss: 0.0738(0.1072) 


DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.21.2",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model:

Done
attention.0.weight
attention.0.bias
attention.1.weight
attention.1.bias
attention.3.weight
attention.3.bias
Epoch: [1][0/2933] Elapsed 0m 0s (remain 22m 40s) Loss: 2.2425(2.2425) Grad: inf  LR: 0.00000625  
Epoch: [1][2932/2933] Elapsed 5m 23s (remain 0m 0s) Loss: 0.0610(0.1752) Grad: 347151.7812  LR: 0.00003754  
EVAL: [0/489] Elapsed 0m 0s (remain 3m 9s) Loss: 0.1199(0.1199) 


Epoch 1 - avg_train_loss: 0.1752  avg_val_loss: 0.1494  time: 349s
Epoch 1 - Score: 0.5321  Scores: [0.564431939326412, 0.4609599197940835, 0.46049733954475175, 0.46273008477991306, 0.6744385217093577, 0.5696571863057972]
Epoch 1 - Save Best Score: 0.5321 Model


EVAL: [488/489] Elapsed 0m 25s (remain 0m 0s) Loss: 0.1092(0.1494) 
Epoch: [2][0/2933] Elapsed 0m 0s (remain 19m 51s) Loss: 0.0973(0.0973) Grad: 375117.2812  LR: 0.00003753  
Epoch: [2][2932/2933] Elapsed 5m 20s (remain 0m 0s) Loss: 0.0469(0.1155) Grad: 264605.5625  LR: 0.00001252  
EVAL: [0/489] Elapsed 0m 0s (remain 3m 3s) Loss: 0.1888(0.1888) 


Epoch 2 - avg_train_loss: 0.1155  avg_val_loss: 0.1034  time: 346s
Epoch 2 - Score: 0.4524  Scores: [0.48823698667384946, 0.4432369930720677, 0.4358960499990727, 0.44452162421593433, 0.4628015290709445, 0.4398998397312368]
Epoch 2 - Save Best Score: 0.4524 Model


EVAL: [488/489] Elapsed 0m 25s (remain 0m 0s) Loss: 0.0473(0.1034) 
Epoch: [3][0/2933] Elapsed 0m 0s (remain 20m 30s) Loss: 0.3832(0.3832) Grad: 806098.5000  LR: 0.00001251  
Epoch: [3][2932/2933] Elapsed 5m 22s (remain 0m 0s) Loss: 0.0731(0.0918) Grad: 180559.8594  LR: 0.00000000  
EVAL: [0/489] Elapsed 0m 0s (remain 3m 9s) Loss: 0.1630(0.1630) 


Epoch 3 - avg_train_loss: 0.0918  avg_val_loss: 0.1004  time: 348s
Epoch 3 - Score: 0.4440  Scores: [0.4807670287807327, 0.4434742693413729, 0.40963217531780977, 0.43559801052061237, 0.464886314218015, 0.42945756491663684]
Epoch 3 - Save Best Score: 0.4440 Model


EVAL: [488/489] Elapsed 0m 25s (remain 0m 0s) Loss: 0.0546(0.1004) 


Score: 0.4516  Scores: [0.4828283467175599, 0.44407167999176844, 0.41490833398215304, 0.45247093741560873, 0.4732360743123661, 0.44201340710333087]
[32m[I 2022-12-14 14:03:43,234][0m Trial 9 finished with value: 0.4515881299204645 and parameters: {'epoch': 4, 'encoder_lr': 0.000134831353541911, 'llrd': 0.9103563486857054}. Best is trial 5 with value: 0.45024024643677324.[0m


11111111111111111111111111111111
0.4515881299204645


NameError: name 'TrialState' is not defined

In [18]:
pruned_trials = study.get_trials(deepcopy=False)
complete_trials = study.get_trials(deepcopy=False)

print("Study statistics: ")
print("  Number of finished trials: ", len(study.trials))
print("  Number of pruned trials: ", len(pruned_trials))
print("  Number of complete trials: ", len(complete_trials))

print("Best trial:")
trial = study.best_trial

print("  Value: ", trial.value)

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

Study statistics: 
  Number of finished trials:  10
  Number of pruned trials:  10
  Number of complete trials:  10
Best trial:
  Value:  0.45024024643677324
  Params: 
    epoch: 3
    encoder_lr: 3.007918911639952e-05
    llrd: 0.8329610635006021
