In [1]:
import json
import pandas as pd
import numpy as np
import os
import csv
import os
import gc
gc.enable()
import math
import json
import time
import random
import multiprocessing
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

import numpy as np
import pandas as pd
from tqdm import tqdm, trange
from sklearn import model_selection

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import Parameter
import torch.optim as optim
from torch.utils.data import (
    Dataset, DataLoader,
    SequentialSampler, RandomSampler
)

try:
    from apex import amp
    APEX_INSTALLED = True
except ImportError:
    APEX_INSTALLED = False

from madgrad import MADGRAD

try:
    from torch.optim.swa_utils import (
        AveragedModel, update_bn, SWALR
    )
    SWA_AVAILABLE = True
except ImportError:
    SWA_AVAILABLE = False

import transformers
from transformers import (
    WEIGHTS_NAME,
    AdamW,
    AutoConfig,
    AutoModel,
    AutoTokenizer,
    get_cosine_schedule_with_warmup,
    logging,
    MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
)
logging.set_verbosity_warning()
logging.set_verbosity_error()

def fix_all_seeds(seed):
    np.random.seed(seed)
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

def optimal_num_of_loader_workers():
    num_cpus = multiprocessing.cpu_count()
    num_gpus = torch.cuda.device_count()
    optimal_value = min(num_cpus, num_gpus*4) if num_gpus else num_cpus - 1
    return optimal_value

print(f"Apex AMP Installed :: {APEX_INSTALLED}")
print(f"SWA Available :: {SWA_AVAILABLE}")
MODEL_CONFIG_CLASSES = list(MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING.keys())
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)

  from .autonotebook import tqdm as notebook_tqdm
2023-06-02 12:26:53.143157: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-06-02 12:26:54.830300: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-06-02 12:26:54.830613: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory


Apex AMP Installed :: False
SWA Available :: True


In [2]:
all_data = pd.read_csv('../data/forauto.csv')
alltags = all_data[['Title','label']]
train = alltags[:-180581]
test = alltags[-180581:]

In [3]:
train

Unnamed: 0,Title,label
0,Luis Drayton - Edinburgh shoot #6,11.18
1,Arena da Barra - Arena HSBC - Arena do PAN #...,15.15
2,MARILYN 2015,10.99
3,Knikkertijd - 1959,8.63
4,CAMELS01,11.16
...,...,...
305608,Mushrooms,4.00
305609,Evie,1.00
305610,BSSLS+R8GT!!,2.00
305611,C to A1,6.89


In [4]:
# train = pd.read_csv('../input/commonlitreadabilityprize/train.csv', low_memory=False)
def create_folds(data, num_splits):
    data["kfold"] = -1
    kf = model_selection.KFold(n_splits=num_splits, shuffle=True, random_state=2021)
    for f, (t_, v_) in enumerate(kf.split(X=data)):
        data.loc[v_, 'kfold'] = f
    return data
train = create_folds(train, num_splits=5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["kfold"] = -1


In [5]:
class Config:
    # model
    num_labels = 1
    model_type = 'roberta'
    model_name_or_path = 'roberta-base'
    config_name = 'roberta-base'
    fp16 = True if APEX_INSTALLED else False
    fp16_opt_level = "O1"

    # tokenizer
    tokenizer_name = 'roberta-base'
    max_seq_length = 250

    # train
    epochs = 6
    train_batch_size = 64
    eval_batch_size = 32

    # optimizer
    optimizer_type = 'MADGRAD'
    learning_rate = 2e-5
    weight_decay = 1e-5
    epsilon = 1e-6
    max_grad_norm = 1.0

    # stochastic weight averaging
    swa = True
    swa_start = 4
    swa_learning_rate = 1e-4
    anneal_epochs=2
    anneal_strategy='cos'

    # scheduler
    decay_name = 'cosine-warmup'
    warmup_ratio = 0.03

    # logging
    logging_steps = 10

    # evaluate
    output_dir = 'output'
    seed = 2020

In [6]:
class AverageMeter(object):
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0
        self.max = 0
        self.min = 1e5

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count
        if val > self.max:
            self.max = val
        if val < self.min:
            self.min = val

In [7]:
class DatasetRetriever(Dataset):
    def __init__(self, data, tokenizer, max_len, is_test=False):
        super(DatasetRetriever, self).__init__()
        self.data = data
        self.is_test = is_test
        self.excerpts = self.data.Title.values.tolist()
        if not self.is_test:
            self.targets = self.data.label.values.tolist()
        self.tokenizer = tokenizer
        self.max_len = max_len
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, item):
        excerpt = self.excerpts[item]
        features = self.convert_examples_to_features(
            excerpt, self.tokenizer, 
            self.max_len
        )
        features = {key : torch.tensor(value, dtype=torch.long) for key, value in features.items()}
        if not self.is_test:
            label = self.targets[item]
            features['labels'] = torch.tensor(label, dtype=torch.double)
        return features
    
    def convert_examples_to_features(self, example, tokenizer, max_len):
        features = tokenizer.encode_plus(
            example.replace('\n', ''), 
            max_length=max_len, 
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
        )
        return features

In [8]:
class Model(nn.Module):
    def __init__(
        self, model_name, 
        config
    ):
        super(Model, self).__init__()
        self.config = config
        self.roberta = AutoModel.from_pretrained(
            model_name, 
            config=config
        )
        self.dropout = nn.Dropout(p=0.0)
        self.high_dropout = nn.Dropout(p=0.0)
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=1e-5)
        self._init_weights(self.layer_norm)
        self.regressor = nn.Linear(config.hidden_size, config.num_labels)
        self._init_weights(self.regressor)
        
        weights_init = torch.zeros(config.num_hidden_layers + 1).float()
        weights_init.data[:-1] = -3
        self.layer_weights = torch.nn.Parameter(weights_init)
 
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
 
    def forward(
        self, input_ids=None,
        attention_mask=None, labels=None
    ):
        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
        )
        all_hidden_states = outputs[2]
        
        # weighted layer pooling
        cls_embeddings = torch.stack(
            [self.dropout(layer[:, 0]) for layer in all_hidden_states], 
            dim=2
        )
        cls_output = (
            torch.softmax(self.layer_weights, dim=0) * cls_embeddings
        ).sum(-1)
        cls_output = self.layer_norm(cls_output)
        
        # multi-sample dropout
        logits = torch.mean(
            torch.stack(
                [self.regressor(self.high_dropout(cls_output)) for _ in range(5)],
                dim=0,
            ),
            dim=0,
        )
 
        # calculate loss
        loss = None
        if labels is not None:
            # regression task
            loss_fn = torch.nn.MSELoss()
            logits = logits.view(-1).to(labels.dtype)
            loss = torch.sqrt(loss_fn(logits, labels.view(-1)))
        output = (logits,) + outputs[2:]
        
        del all_hidden_states, cls_embeddings
        del cls_output, logits
        gc.collect();
        
        return ((loss,) + output) if loss is not None else output

In [9]:
def get_optimizer_grouped_parameters(args, model):
    no_decay = ["bias", "LayerNorm.weight"]
    group1=['layer.0.','layer.1.','layer.2.','layer.3.']
    group2=['layer.4.','layer.5.','layer.6.','layer.7.']    
    group3=['layer.8.','layer.9.','layer.10.','layer.11.']
    group_all=['layer.0.','layer.1.','layer.2.','layer.3.','layer.4.','layer.5.','layer.6.','layer.7.','layer.8.','layer.9.','layer.10.','layer.11.']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.roberta.named_parameters() if not any(nd in n for nd in no_decay) and not any(nd in n for nd in group_all)],'weight_decay': args.weight_decay},
        {'params': [p for n, p in model.roberta.named_parameters() if not any(nd in n for nd in no_decay) and any(nd in n for nd in group1)],'weight_decay': args.weight_decay, 'lr': args.learning_rate/2.6},
        {'params': [p for n, p in model.roberta.named_parameters() if not any(nd in n for nd in no_decay) and any(nd in n for nd in group2)],'weight_decay': args.weight_decay, 'lr': args.learning_rate},
        {'params': [p for n, p in model.roberta.named_parameters() if not any(nd in n for nd in no_decay) and any(nd in n for nd in group3)],'weight_decay': args.weight_decay, 'lr': args.learning_rate*2.6},
        {'params': [p for n, p in model.roberta.named_parameters() if any(nd in n for nd in no_decay) and not any(nd in n for nd in group_all)],'weight_decay': 0.0},
        {'params': [p for n, p in model.roberta.named_parameters() if any(nd in n for nd in no_decay) and any(nd in n for nd in group1)],'weight_decay': 0.0, 'lr': args.learning_rate/2.6},
        {'params': [p for n, p in model.roberta.named_parameters() if any(nd in n for nd in no_decay) and any(nd in n for nd in group2)],'weight_decay': 0.0, 'lr': args.learning_rate},
        {'params': [p for n, p in model.roberta.named_parameters() if any(nd in n for nd in no_decay) and any(nd in n for nd in group3)],'weight_decay': 0.0, 'lr': args.learning_rate*2.6},
        {'params': [p for n, p in model.named_parameters() if args.model_type not in n], 'lr':args.learning_rate*20, "weight_decay": 0.0},
    ]
    return optimizer_grouped_parameters

In [10]:
def make_model(args, output_attentions=False):
    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name)
    config = AutoConfig.from_pretrained(args.config_name)
    config.update({'num_labels':args.num_labels})
    config.update({"output_hidden_states":True})
    if output_attentions:
        config.update({"output_attentions":True})
    model = Model(args.model_name_or_path, config=config)
    return model, config, tokenizer

def make_optimizer(args, model):
    optimizer_grouped_parameters = get_optimizer_grouped_parameters(args, model)
    if args.optimizer_type == "AdamW":
        optimizer = AdamW(
            optimizer_grouped_parameters,
            lr=args.learning_rate,
            eps=args.epsilon,
            correct_bias=not args.use_bertadam
        )
    else:
        optimizer = MADGRAD(
            optimizer_grouped_parameters,
            lr=args.learning_rate,
            eps=args.epsilon,
            weight_decay=args.weight_decay
        )
    return optimizer

def make_scheduler(
    args, optimizer, 
    num_warmup_steps, 
    num_training_steps
):
    if args.decay_name == "cosine-warmup":
        scheduler = get_cosine_schedule_with_warmup(
            optimizer,
            num_warmup_steps=num_warmup_steps,
            num_training_steps=num_training_steps
        )
    else:
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=num_warmup_steps,
            num_training_steps=num_training_steps
        )
    return scheduler    

def make_loader(
    args, data, 
    tokenizer, fold
):
    train_set, valid_set = data[data['kfold']!=fold], data[data['kfold']==fold]

    train_dataset = DatasetRetriever(train_set, tokenizer, args.max_seq_length)
    valid_dataset = DatasetRetriever(valid_set, tokenizer, args.max_seq_length)
    print(f"Num examples Train= {len(train_dataset)}, Num examples Valid={len(valid_dataset)}")
    
    train_sampler = RandomSampler(train_dataset)
    valid_sampler = SequentialSampler(valid_dataset)

    train_dataloader = DataLoader(
        train_dataset,
        batch_size=args.train_batch_size,
        sampler=train_sampler,
        num_workers=optimal_num_of_loader_workers(),
        pin_memory=True,
        drop_last=False 
    )

    valid_dataloader = DataLoader(
        valid_dataset,
        batch_size=args.eval_batch_size, 
        sampler=valid_sampler,
        num_workers=optimal_num_of_loader_workers(),
        pin_memory=True, 
        drop_last=False
    )

    return train_dataloader, valid_dataloader

In [11]:
class Trainer:
    def __init__(
        self, model, tokenizer, 
        optimizer, scheduler, 
        swa_model=None, swa_scheduler=None
    ):
        self.model = model
        self.tokenizer = tokenizer
        self.optimizer = optimizer
        self.scheduler = scheduler
        self.swa_model = swa_model
        self.swa_scheduler = swa_scheduler

    def train(
        self, args, 
        train_dataloader, 
        epoch, result_dict
    ):
        count = 0
        losses = AverageMeter()
        
        self.model.zero_grad()
        self.model.train()
        
        fix_all_seeds(args.seed)
        for batch_idx, batch_data in enumerate(train_dataloader):
            input_ids, attention_mask, labels = \
                batch_data['input_ids'], batch_data['attention_mask'], batch_data['labels']
            input_ids, attention_mask, labels = \
                input_ids.cuda(), attention_mask.cuda(), labels.cuda()

            outputs = self.model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

            loss, logits = outputs[:2]
            
            if args.fp16:
                with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            count += labels.size(0)
            losses.update(loss.item(), input_ids.size(0))

            if args.fp16:
                torch.nn.utils.clip_grad_norm_(amp.master_params(self.optimizer), args.max_grad_norm)
            else:
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), args.max_grad_norm)

            self.optimizer.step()
            if not args.swa:
                self.scheduler.step()
            else:
                if (epoch+1) < args.swa_start:
                    self.scheduler.step()
            self.optimizer.zero_grad()

            if (batch_idx % args.logging_steps == 0) or (batch_idx+1)==len(train_dataloader):
                _s = str(len(str(len(train_dataloader.sampler))))
                ret = [
                    ('Epoch: {:0>2} [{: >' + _s + '}/{} ({: >3.0f}%)]').format(epoch, count, len(train_dataloader.sampler), 100 * count / len(train_dataloader.sampler)),
                    'Train Loss: {: >4.5f}'.format(losses.avg),
                ]
                print(', '.join(ret))
            
        if args.swa and (epoch+1) >= args.swa_start:
            self.swa_model.update_parameters(self.model)
            self.swa_scheduler.step()

        result_dict['train_loss'].append(losses.avg)
        return result_dict

In [12]:
class Evaluator:
    def __init__(self, model, swa_model):
        self.model = model
        self.swa_model = swa_model
    
    def save(self, result, output_dir):
        with open(f'{output_dir}/result_dict.json', 'w') as f:
            f.write(json.dumps(result, sort_keys=True, indent=4, ensure_ascii=False))

    def evaluate(self, valid_dataloader, epoch, result_dict):
        losses = AverageMeter()
        for batch_idx, batch_data in enumerate(valid_dataloader):
            self.model = self.model.eval()
            input_ids, attention_mask, labels = \
                batch_data['input_ids'], batch_data['attention_mask'], batch_data['labels']
            input_ids, attention_mask, labels = \
                input_ids.cuda(), attention_mask.cuda(), labels.cuda()
            with torch.no_grad():            
                outputs = self.model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels
                )
                loss, logits = outputs[:2]
                losses.update(loss.item(), input_ids.size(0))
        print('----Validation Results Summary----')
        print('Epoch: [{}] Valid Loss: {: >4.5f}'.format(epoch, losses.avg))
        result_dict['val_loss'].append(losses.avg)        
        return result_dict
    
    def swa_evaluate(self, valid_dataloader, epoch, result_dict):
        losses = AverageMeter()
        for batch_idx, batch_data in enumerate(valid_dataloader):
            self.swa_model = self.swa_model.eval()
            input_ids, attention_mask, labels = \
                batch_data['input_ids'], batch_data['attention_mask'], batch_data['labels']
            input_ids, attention_mask, labels = \
                input_ids.cuda(), attention_mask.cuda(), labels.cuda()
            with torch.no_grad():            
                outputs = self.swa_model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels
                )
                loss, logits = outputs[:2]
                losses.update(loss.item(), input_ids.size(0))
        print('----SWA Validation Results Summary----')
        print('Epoch: [{}] Valid Loss: {: >4.5f}'.format(epoch, losses.avg))
        result_dict['swa_loss'].append(losses.avg)        
        return result_dict

In [13]:
def init_training(args, data, fold):
    fix_all_seeds(args.seed)
    
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    
    # model
    model, model_config, tokenizer = make_model(args)
    if torch.cuda.device_count() >= 1:
        print('Model pushed to {} GPU(s), type {}.'.format(
            torch.cuda.device_count(), 
            torch.cuda.get_device_name(0))
        )
        model = model.cuda() 
    else:
        raise ValueError('CPU training is not supported')
    
    # data loaders for training and evaluation
    train_dataloader, valid_dataloader = make_loader(args, data, tokenizer, fold)

    # optimizer
    optimizer = make_optimizer(args, model)

    # scheduler
    num_training_steps = len(train_dataloader) * args.epochs
    if args.warmup_ratio > 0:
        num_warmup_steps = int(args.warmup_ratio * num_training_steps)
    else:
        num_warmup_steps = 0
    print(f"Total Training Steps: {num_training_steps}, Total Warmup Steps: {num_warmup_steps}")
    scheduler = make_scheduler(args, optimizer, num_warmup_steps, num_training_steps)

    # stochastic weight averaging
    swa_model = AveragedModel(model)
    swa_scheduler = SWALR(
        optimizer, swa_lr=args.swa_learning_rate, 
        anneal_epochs=args.anneal_epochs, 
        anneal_strategy=args.anneal_strategy
    )

    print(f"Total Training Steps: {num_training_steps}, Total Warmup Steps: {num_warmup_steps}, SWA Start Step: {args.swa_start}")

    # mixed precision training with NVIDIA Apex
    if args.fp16:
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
    
    result_dict = {
        'epoch':[], 
        'train_loss': [], 
        'val_loss' : [], 
        'swa_loss': [],
        'best_val_loss': np.inf
    }

    return (
        model, model_config, tokenizer, optimizer, scheduler, 
        train_dataloader, valid_dataloader, result_dict,
        swa_model, swa_scheduler
    )

In [14]:
def run(data, fold):
    args = Config()
    model, model_config, tokenizer, optimizer, scheduler, train_dataloader, \
        valid_dataloader, result_dict, swa_model, swa_scheduler = init_training(args, data, fold)
    
    trainer = Trainer(model, tokenizer, optimizer, scheduler, swa_model, swa_scheduler)
    evaluator = Evaluator(model, swa_model)

    train_time_list = []
    valid_time_list = []

    for epoch in range(args.epochs):
        result_dict['epoch'].append(epoch)

        # Train
        torch.cuda.synchronize()
        tic1 = time.time()
        result_dict = trainer.train(
            args, train_dataloader, 
            epoch, result_dict
        )
        torch.cuda.synchronize()
        tic2 = time.time() 
        train_time_list.append(tic2 - tic1)
        
        # Evaluate
        torch.cuda.synchronize()
        tic3 = time.time()
        result_dict = evaluator.evaluate(
            valid_dataloader, epoch, result_dict
        )
        torch.cuda.synchronize()
        tic4 = time.time() 
        valid_time_list.append(tic4 - tic3)
            
        output_dir = os.path.join(args.output_dir, f"checkpoint-fold-{fold}")
        if result_dict['val_loss'][-1] < result_dict['best_val_loss']:
            print("{} Epoch, Best epoch was updated! Valid Loss: {: >4.5f}".format(epoch, result_dict['val_loss'][-1]))
            result_dict["best_val_loss"] = result_dict['val_loss'][-1]        
            
            os.makedirs(output_dir, exist_ok=True)
            torch.save(model.state_dict(), f"{output_dir}/pytorch_model.bin")
            model_config.save_pretrained(output_dir)
            tokenizer.save_pretrained(output_dir)
            print(f"Saving model checkpoint to {output_dir}.")
    
            #torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
            #torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
            #print(f"Saving optimizer and scheduler states to {output_dir}.")
        print()
        
    if args.swa:
        update_bn(train_dataloader, swa_model, device=torch.device('cuda'))
    result_dict = evaluator.swa_evaluate(valid_dataloader, epoch, result_dict)
    
    evaluator.save(result_dict, output_dir)
    torch.save(swa_model.state_dict(), f"{output_dir}/swa_pytorch_model.bin")
    
    print()
    print(f"Total Training Time: {np.sum(train_time_list)}secs, Average Training Time per Epoch: {np.mean(train_time_list)}secs.")
    print(f"Total Validation Time: {np.sum(valid_time_list)}secs, Average Validation Time per Epoch: {np.mean(valid_time_list)}secs.")
    
    torch.cuda.empty_cache()
    del trainer, evaluator
    del model, model_config, tokenizer
    del optimizer, scheduler
    del train_dataloader, valid_dataloader, result_dict
    del swa_model, swa_scheduler
    gc.collect()

In [None]:
import os 
os.environ['CUDA_VISIBLE_DEVICES'] = '2,3'
for fold in range(5):
    print();print()
    print('-'*50)
    print(f'FOLD: {fold}')
    print('-'*50)
    run(train, fold)



--------------------------------------------------
FOLD: 0
--------------------------------------------------
Model pushed to 2 GPU(s), type NVIDIA GeForce RTX 3090.
Num examples Train= 244490, Num examples Valid=61123
Total Training Steps: 22926, Total Warmup Steps: 687
Total Training Steps: 22926, Total Warmup Steps: 687, SWA Start Step: 4
----Validation Results Summary----
Epoch: [0] Valid Loss: 1.91271
0 Epoch, Best epoch was updated! Valid Loss: 1.91271
Saving model checkpoint to output/checkpoint-fold-0.

----Validation Results Summary----
Epoch: [1] Valid Loss: 1.79845
1 Epoch, Best epoch was updated! Valid Loss: 1.79845
Saving model checkpoint to output/checkpoint-fold-0.

----Validation Results Summary----
Epoch: [2] Valid Loss: 1.76508
2 Epoch, Best epoch was updated! Valid Loss: 1.76508
Saving model checkpoint to output/checkpoint-fold-0.

----Validation Results Summary----
Epoch: [3] Valid Loss: 1.74029
3 Epoch, Best epoch was updated! Valid Loss: 1.74029
Saving model che