In [1]:
!nvidia-smi

Fri Jul 23 16:19:53 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.42.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   40C    P0    49W / 300W |      0MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## Directory 설정, 구글 드라이브 import

In [2]:
cur_dir = '/content/drive/MyDrive/KLUE_TC'

## Utils

In [3]:
!pip install adamp
!pip install transformers



In [4]:
import os
import random
import torch
import numpy as np
from torch import nn

from torch.optim import Adam, AdamW, SGD
from adamp import AdamP
from torch.optim.lr_scheduler import StepLR, ReduceLROnPlateau, CosineAnnealingLR, ExponentialLR, \
    CosineAnnealingWarmRestarts
from transformers import get_linear_schedule_with_warmup
from transformers import AutoConfig, AutoTokenizer, AutoModelForSequenceClassification


def set_seeds(seed=42):
    # 랜덤 시드를 설정하여 매 코드를 실행할 때마다 동일한 결과를 얻게 합니다.
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.cuda.manual_seed_all(seed)  # if use multi-GPU
    torch.backends.cudnn.benchmark = False


def save_checkpoint(state, model_dir, model_filename):
    print('saving model ...')
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    torch.save(state, os.path.join(model_dir, model_filename))


def get_optimizer(model, args):
    if args.optimizer == 'adam':
        optimizer = Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    elif args.optimizer == 'adamW':
        optimizer = AdamW(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    elif args.optimizer == 'adamP':
        optimizer = AdamP(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    elif args.optimizer == 'SGD':
        optimizer = SGD(model.parameters(), lr=args.lr, momentum=args.momentum)

    # 모든 parameter들의 grad값을 0으로 초기화
    optimizer.zero_grad()

    return optimizer


def get_scheduler(optimizer, args):
    if args.scheduler == 'plateau':
        scheduler = ReduceLROnPlateau(optimizer, patience=args.plateau_patience, factor=args.plateau_factor, mode='max',
                                      verbose=True)
    elif args.scheduler == 'linear_warmup':
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps,
                                                    num_training_steps=args.total_steps)
    elif args.scheduler == 'step_lr':
        scheduler = StepLR(optimizer, step_size=args.step_size, gamma=args.gamma)
    elif args.scheduler == 'exp_lr':
        scheduler = ExponentialLR(optimizer, gamma=args.gamma)
    elif args.scheduler == 'cosine_annealing':
        scheduler = CosineAnnealingLR(optimizer, T_max=args.t_max, eta_min=args.eta_min)
    elif args.scheduler == 'cosine_annealing_warmstart':
        scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=args.T_0, T_mult=args.T_mult, eta_min=args.eta_min,
                                                last_epoch=-1)

    return scheduler


def update_params(loss, model, optimizer, args):
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip_grad)
    optimizer.step()
    optimizer.zero_grad()


def load_tokenizer(args):
    tokenizer = AutoTokenizer.from_pretrained(
        args.tokenizer_name
        if args.tokenizer_name
        else args.model_name_or_path,
        use_fast=True,
    )

    return tokenizer


def load_model(args, model_name=None):
    if not model_name:
        model_name = args.model_name
    model_path = os.path.join(args.model_dir, model_name)
    print("Loading Model from:", model_path)
    load_state = torch.load(model_path)

    # Load pretrained model and tokenizer
    config = AutoConfig.from_pretrained(
        args.config_name
        if args.config_name
        else args.model_name_or_path,
    )

    config.num_labels = 7

    model = AutoModelForSequenceClassification.from_pretrained(
        model_path,
        from_tf=bool(".ckpt" in model_path),
        config=config
    ).to(args.device)

    model.load_state_dict(load_state['state_dict'], strict=True)

    print(model)

    print("Loading Model from:", model_path, "...Finished.")

    return model


def get_model(args):
    # Load pretrained model and tokenizer
    config = AutoConfig.from_pretrained(
        args.config_name
        if args.config_name
        else args.model_name_or_path,
    )

    config.num_labels = 7
    model = AutoModelForSequenceClassification.from_pretrained(
        args.model_name_or_path,
        from_tf=bool(".ckpt" in args.model_name_or_path),
        config=config,
    ).to(args.device)

    model.classifier.dropout = nn.Dropout(p=0.4, inplace = False)
    # print(model)
    return model


def get_loaders(args, train, valid, is_inference=False):
    pin_memory = True
    train_loader, valid_loader = None, None

    if is_inference:
        test_dataset = YNAT_dataset(args, valid, is_inference)
        test_loader = torch.utils.data.DataLoader(test_dataset, num_workers=args.num_workers, shuffle=False,
                                                  batch_size=args.batch_size, pin_memory=pin_memory)
        return test_loader

    if train is not None:
        train_dataset = YNAT_dataset(args, train, is_inference)
        train_loader = torch.utils.data.DataLoader(train_dataset, num_workers=args.num_workers, shuffle=True,
                                                   batch_size=args.batch_size, pin_memory=pin_memory)
    if valid is not None:
        valid_dataset = YNAT_dataset(args, valid, is_inference)
        valid_loader = torch.utils.data.DataLoader(valid_dataset, num_workers=args.num_workers, shuffle=False,
                                                   batch_size=args.batch_size, pin_memory=pin_memory)

    return train_loader, valid_loader


# loss계산하고 parameter update!
def compute_loss(preds, targets, args):
    """
    Args :
        preds   : (batch_size, max_seq_len)
        targets : (batch_size, max_seq_len)
    """
    # print(preds, targets)
    loss = get_criterion(preds, targets, args)
    # 마지막 시퀀스에 대한 값만 loss 계산
    # loss = loss[:, -1]
    # loss = torch.mean(loss)
    return loss


def get_criterion(pred, target, args):
    if args.criterion == 'BCE':
        loss = nn.BCELoss(reduction="none")
    elif args.criterion == "BCELogit":
        loss = nn.BCEWithLogitsLoss(reduction="none")
    elif args.criterion == "MSE":
        loss = nn.MSELoss(reduction="none")
    elif args.criterion == "L1":
        loss = nn.L1Loss(reduction="none")
    elif args.criterion == "CE":
        loss = nn.CrossEntropyLoss()
    # NLL, CrossEntropy not available
    return loss(pred, target)


## Dataloader

In [5]:
import os
import torch
import pandas as pd


class Preprocess:
    def __init__(self, args):
        self.args = args
        self.train_data = None
        self.test_data = None

    def load_data(self, file_name):
        csv_file_name = os.path.join(self.args.data_dir, file_name)
        df = pd.read_csv(csv_file_name)
        del df['Unnamed: 0']
        return df.values

    def load_train_data(self):
        self.train_data = self.load_data('train_data_translated.csv')

    def load_test_data(self):
        self.test_data = self.load_data('test_data_translated.csv')


class YNAT_dataset(torch.utils.data.Dataset):
    def __init__(self, args, data, is_inference):
        self.args = args
        self.data = data
        self.is_inference = is_inference

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        row = self.data[index]
        element = [row[i] for i in range(len(row))]
        #print(type(row))
        # np.array -> torch.tensor 형변환
        #for i, col in enumerate(row):
        #    if type(col) == str:
        #        pass
        #    else:
        #        row[i] = torch.tensor(col)

        return element



## Trainer

In [6]:
from sklearn.metrics import accuracy_score
from torch.nn.functional import one_hot
from tqdm import tqdm


def run(args, tokenizer, train_data, valid_data, cv_count):
    train_loader, valid_loader = get_loaders(args, train_data, valid_data)

    # only when using warmup scheduler
    # args.total_steps = int(len(train_loader.dataset) / args.batch_size) * args.n_epochs
    # args.warmup_steps = int(args.total_steps * args.warmup_ratio)

    model = get_model(args)
    optimizer = get_optimizer(model, args)
    scheduler = get_scheduler(optimizer, args)

    best_acc = -1
    early_stopping_counter = 0
    for epoch in range(args.n_epochs):

        print(f"Start Training: Epoch {epoch + 1}")

        if not args.cv_strategy:
            model_name = args.run_name
        else:
            model_name = f"{args.run_name.split('.pt')[0]}_{cv_count}.pt"

        # TRAIN
        train_acc, train_loss = train(args, model, tokenizer, train_loader, optimizer)

        # VALID
        acc, val_loss = validate(args, model, tokenizer, valid_loader)

        # TODO: model save or early stopping
        if args.scheduler == 'plateau':
            last_lr = optimizer.param_groups[0]['lr']
        else:
            last_lr = scheduler.get_last_lr()[0]

        print({"epoch": epoch, "train_loss": train_loss, "train_acc": train_acc,
                   "valid_acc": acc, "val_loss": val_loss, "learning_rate": last_lr})

        if acc > best_acc:
            best_acc = acc
            # torch.nn.DataParallel로 감싸진 경우 원래의 model을 가져옵니다.
            model_to_save = model.module if hasattr(model, 'module') else model
            save_checkpoint({
                'epoch': epoch + 1,
                'state_dict': model_to_save.state_dict(),
            },
                args.model_dir, model_name,
            )
            early_stopping_counter = 0
        else:
            early_stopping_counter += 1
            if early_stopping_counter >= args.patience:
                print(f'EarlyStopping counter: {early_stopping_counter} out of {args.patience}')
                break

        # scheduler
        if args.scheduler == 'plateau':
            scheduler.step(best_acc)
        else:
            scheduler.step()

    return best_acc


def inference(args, test_data):
    # ckpt_file_names = []
    all_fold_preds = []
    all_fold_argmax_preds = []

    if not args.cv_strategy:
        ckpt_file_names = [args.model_name]
    else:
        ckpt_file_names = [f"{args.model_name.split('.pt')[0]}_{i + 1}.pt" for i in range(args.fold_num)]

    tokenizer = load_tokenizer(args)

    for fold_idx, ckpt in enumerate(ckpt_file_names):
        model = load_model(args, ckpt)
        model.eval()
        test_loader = get_loaders(args, None, test_data, True)

        total_preds = []
        total_argmax_preds = []
        total_ids = []

        for step, batch in tqdm(enumerate(test_loader), desc='Inferencing', total=len(test_loader)):
            idx, text, text_en = batch
            tokenized_examples = tokenizer(
                text,
                text_en,
                max_length=args.max_seq_len,
                padding="max_length",
                return_tensors="pt"
            ).to(args.device)

            preds = model(**tokenized_examples)

            logits = preds['logits']
            argmax_logits = torch.argmax(logits, dim=1)

            if args.device == 'cuda':
                argmax_preds = argmax_logits.to('cpu').detach().numpy()
                preds = logits.to('cpu').detach().numpy()
            else:  # cpu
                argmax_preds = argmax_logits.detach().numpy()
                preds = logits.detach().numpy()

            total_preds += list(preds)
            total_argmax_preds += list(argmax_preds)
            total_ids += list(idx)

        all_fold_preds.append(total_preds)
        all_fold_argmax_preds.append(total_argmax_preds)

        output_file_name = "output.csv" if not args.cv_strategy else f"output_{fold_idx + 1}.csv"
        write_path = os.path.join(args.output_dir, output_file_name)
        if not os.path.exists(args.output_dir):
            os.makedirs(args.output_dir)
        with open(write_path, 'w', encoding='utf8') as w:
            print("writing prediction : {}".format(write_path))
            w.write("index,topic_idx\n")
            for index, p in zip(total_ids, total_argmax_preds):
                w.write('{},{}\n'.format(index, p))

    if len(all_fold_preds) > 1:
        # Soft voting ensemble
        votes = np.sum(all_fold_preds, axis=0)
        votes = np.argmax(votes, axis=1)

        write_path = os.path.join(args.output_dir, "output_softvote.csv")
        if not os.path.exists(args.output_dir):
            os.makedirs(args.output_dir)
        with open(write_path, 'w', encoding='utf8') as w:
            print("writing prediction : {}".format(write_path))
            w.write("index,topic_idx\n")
            for id, p in zip(total_ids, votes):
                w.write('{},{}\n'.format(id, p))


def train(args, model, tokenizer, train_loader, optimizer):
    model.train()

    total_preds = []
    total_targets = []
    losses = []
    for step, batch in tqdm(enumerate(train_loader), desc='Training', total=len(train_loader)):
        idx, text, text_en, label = batch
        label = label.to(args.device)
        # print(idx[:10])
        # print(text[:10])
        # print(label[:10])
        tokenized_examples = tokenizer(
            text,
            text_en,
            max_length=args.max_seq_len,
            padding="max_length",
            return_tensors="pt"
        ).to(args.device)

        # tokenize
        # 모델의 입력으로
        # label은 one-hot?
        # loss 주고
        # argmax를 golden

        preds = model(**tokenized_examples)
        logits = preds['logits']
        softmax_logits = nn.Softmax(dim=1)(logits)
        argmax_logits = torch.argmax(logits, dim=1)

        # one_hot_logits = one_hot(argmax_logits, num_classes=7).float()
        # print(one_hot(argmax_logits, num_classes=7).type(torch.FloatTensor))
        loss = compute_loss(logits,
                            label, args)

        # print(loss)

        update_params(loss, model, optimizer, args)

        if step % args.log_steps == 0:
            print(f"Training steps: {step} Loss: {str(loss.item())}")

        if args.device == 'cuda':
            argmax_logits = argmax_logits.to('cpu').detach().numpy()
            label = label.to('cpu').detach().numpy()
            loss = loss.to('cpu').detach().numpy()
            tokenized_examples = tokenized_examples.to('cpu')
            logits = logits.to('cpu').detach().numpy()
        else:  # cpu
            argmax_logits = argmax_logits.detach().numpy()
            label = label.detach().numpy()
            loss = loss.detach().numpy()

        total_preds.append(argmax_logits)
        total_targets.append(label)
        losses.append(loss)

    total_preds = np.concatenate(total_preds)
    total_targets = np.concatenate(total_targets)

    # Train AUC / ACC
    acc = accuracy_score(total_targets, total_preds)
    loss_avg = sum(losses) / len(losses)
    print(f'TRAIN ACC : {acc}, TRAIN LOSS : {loss_avg}')
    return acc, loss_avg


def validate(args, model, tokenizer, valid_loader):
    model.eval()

    total_preds = []
    total_targets = []
    losses = []
    for step, batch in tqdm(enumerate(valid_loader), desc='Training', total=len(valid_loader)):
        idx, text, text_en, label = batch
        label = label.to(args.device)
        tokenized_examples = tokenizer(
            text,
            text_en,
            max_length=args.max_seq_len,
            padding="max_length",
            return_tensors="pt"
        ).to(args.device)

        # tokenize
        # 모델의 입력으로
        # label은 one-hot?
        # loss 주고
        # argmax를 golden

        preds = model(**tokenized_examples)
        logits = preds['logits']
        softmax_logits = nn.Softmax(dim=1)(logits)
        argmax_logits = torch.argmax(logits, dim=1)

        # one_hot_logits = one_hot(argmax_logits, num_classes=7).float()
        # print(one_hot(argmax_logits, num_classes=7).type(torch.FloatTensor))
        loss = compute_loss(logits,
                            label, args)

        if step % args.log_steps == 0:
            print(f"Validation steps: {step} Loss: {str(loss.item())}")

        if args.device == 'cuda':
            argmax_logits = argmax_logits.to('cpu').detach().numpy()
            label = label.to('cpu').detach().numpy()
            loss = loss.to('cpu').detach().numpy()
            tokenized_examples = tokenized_examples.to('cpu')
            logits = logits.to('cpu').detach().numpy()
        else:  # cpu
            argmax_logits = argmax_logits.detach().numpy()
            label = label.detach().numpy()
            loss = loss.detach().numpy()

        total_preds.append(argmax_logits)
        total_targets.append(label)
        losses.append(loss)

    total_preds = np.concatenate(total_preds)
    total_targets = np.concatenate(total_targets)

    # Train AUC / ACC
    acc = accuracy_score(total_targets, total_preds)
    loss_avg = sum(losses) / len(losses)
    print(f'VALID ACC : {acc}, VALID LOSS : {loss_avg}')
    return acc, loss_avg


## Train

In [7]:
import torch
from sklearn.model_selection import KFold, StratifiedKFold
from transformers import AutoConfig, AutoTokenizer, AutoModelForSequenceClassification
from datetime import datetime
from pytz import timezone


def main(args):
    if not args.run_name:
        args.run_name = datetime.now(timezone("Asia/Seoul")).strftime("%Y-%m-%d-%H:%M:%S")

    set_seeds(args.seed)

    device = "cuda" if torch.cuda.is_available() else "cpu"
    args.device = device

    tokenizer = AutoTokenizer.from_pretrained(
        args.tokenizer_name
        if args.tokenizer_name
        else args.model_name_or_path,
        use_fast=True,
    )

    preprocess = Preprocess(args)
    preprocess.load_train_data()
    train_data_origin = preprocess.train_data

    print(f"Size of train data : {len(train_data_origin)}")
    # print(f"size of test data : {len(test_data)}")

    if args.cv_strategy == 'random':
        kf = KFold(n_splits=args.fold_num, shuffle=True)
        splits = kf.split(X=train_data_origin)
    else:
        # default
        # 여기 각 label로 바꿔야됨
        train_labels = [sequence[-1] for sequence in train_data_origin]
        skf = StratifiedKFold(n_splits=args.fold_num, shuffle=True)
        splits = skf.split(X=train_data_origin, y=train_labels)

    acc_avg = 0
    for fold_num, (train_index, valid_index) in enumerate(splits):
        train_data = train_data_origin[train_index]
        valid_data = train_data_origin[valid_index]
        best_acc = run(args, tokenizer, train_data, valid_data, fold_num + 1)

        if not args.cv_strategy:
            break

        acc_avg += best_acc

    if args.cv_strategy:
        acc_avg /= args.fold_num

        print("*" * 50, 'auc_avg', "*" * 50)
        print(acc_avg)


## Run

In [8]:
import argparse
import easydict

def parse_args():
    args = easydict.EasyDict({'run_name' : 'temp',
                             'seed':42,
                             'device' :'cuda',
                             'data_dir': cur_dir + '/data/open/',
                             'model_dir' : '/content/drive/MyDrive/KLUE_TC/models/',
                             'model_name_or_path' : 'xlm-roberta-large',
                             'config_name' : None,
                             'tokenizer_name' : None,
                             'output_dir' : '/content/drive/MyDrive/KLUE_TC/output/translation/',

                             'cv_strategy' : 'stratified',
                             'fold_num' : 4,

                             'num_workers' : 1,

                             # 훈련
                             'n_epochs' : 3,
                             'batch_size' : 16,
                             'lr' : 5e-6,
                             'clip_grad' : 10,
                             'patience' : 5,
                             'max_seq_len' : 80,

                             # Optimizer
                             'optimizer' : 'adamW',

                             # Optimizer-parameters
                             'weight_decay' : 0.01,
                             'momentum' : 0.9,

                             # Scheduler
                             'scheduler' : 'step_lr',

                             # Scheduler-parameters
                             # plateau
                             'plateau_patience' : 10,
                             'plateau_factor' : 0.5,
                              
                             't_max' : 10,
                             'T_0' : 10,
                             'T_mult' : 2,
                             '--eta_min' : 0.01,

                             # linear_warmup
                             'warmup_ratio' : 0.3,

                             # Step LR
                             'step_size' : 50,
                             'gamma' : 0.1,

                             'criterion' : 'CE',

                             'log_steps' : 100})
    
    return args

In [9]:
if __name__ == '__main__':
    args = parse_args()
    main(args)

Size of train data : 45654


Some weights of the model checkpoint at xlm-roberta-large were not used when initializing XLMRobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.den

Start Training: Epoch 1


Training:   0%|          | 1/2140 [00:00<11:42,  3.04it/s]

Training steps: 0 Loss: 1.9897428750991821


Training:   5%|▍         | 101/2140 [00:30<10:25,  3.26it/s]

Training steps: 100 Loss: 1.7341123819351196


Training:   9%|▉         | 201/2140 [01:00<09:45,  3.31it/s]

Training steps: 200 Loss: 0.9641258120536804


Training:  14%|█▍        | 301/2140 [01:30<09:13,  3.32it/s]

Training steps: 300 Loss: 0.5547016859054565


Training:  19%|█▊        | 401/2140 [02:00<08:44,  3.32it/s]

Training steps: 400 Loss: 0.4257950484752655


Training:  23%|██▎       | 501/2140 [02:31<08:14,  3.32it/s]

Training steps: 500 Loss: 0.1729910522699356


Training:  28%|██▊       | 601/2140 [03:01<07:45,  3.30it/s]

Training steps: 600 Loss: 0.28079795837402344


Training:  33%|███▎      | 701/2140 [03:31<07:14,  3.31it/s]

Training steps: 700 Loss: 0.8450873494148254


Training:  37%|███▋      | 801/2140 [04:01<06:46,  3.29it/s]

Training steps: 800 Loss: 0.186244398355484


Training:  42%|████▏     | 901/2140 [04:31<06:14,  3.30it/s]

Training steps: 900 Loss: 0.2889895737171173


Training:  47%|████▋     | 1001/2140 [05:01<05:44,  3.31it/s]

Training steps: 1000 Loss: 0.4411683678627014


Training:  51%|█████▏    | 1101/2140 [05:32<05:12,  3.33it/s]

Training steps: 1100 Loss: 0.2728343605995178


Training:  56%|█████▌    | 1201/2140 [06:02<04:42,  3.32it/s]

Training steps: 1200 Loss: 0.3200002908706665


Training:  61%|██████    | 1301/2140 [06:32<04:13,  3.31it/s]

Training steps: 1300 Loss: 0.42877987027168274


Training:  65%|██████▌   | 1401/2140 [07:02<03:42,  3.31it/s]

Training steps: 1400 Loss: 0.5136380195617676


Training:  70%|███████   | 1501/2140 [07:32<03:13,  3.31it/s]

Training steps: 1500 Loss: 0.4456239342689514


Training:  75%|███████▍  | 1601/2140 [08:02<02:42,  3.32it/s]

Training steps: 1600 Loss: 0.23705464601516724


Training:  79%|███████▉  | 1701/2140 [08:33<02:12,  3.30it/s]

Training steps: 1700 Loss: 0.4352942109107971


Training:  84%|████████▍ | 1801/2140 [09:03<01:42,  3.31it/s]

Training steps: 1800 Loss: 0.779090166091919


Training:  89%|████████▉ | 1901/2140 [09:33<01:11,  3.33it/s]

Training steps: 1900 Loss: 0.2852676212787628


Training:  94%|█████████▎| 2001/2140 [10:03<00:42,  3.30it/s]

Training steps: 2000 Loss: 0.6911035180091858


Training:  98%|█████████▊| 2101/2140 [10:33<00:11,  3.30it/s]

Training steps: 2100 Loss: 0.46843504905700684


Training: 100%|██████████| 2140/2140 [10:45<00:00,  3.31it/s]

TRAIN ACC : 0.8168224299065421, TRAIN LOSS : 0.5582443712914589



Training:   0%|          | 0/714 [00:00<?, ?it/s]

Validation steps: 0 Loss: 0.3306698203086853


Training:  14%|█▍        | 102/714 [00:07<00:44, 13.86it/s]

Validation steps: 100 Loss: 0.6280769109725952


Training:  28%|██▊       | 202/714 [00:14<00:36, 13.91it/s]

Validation steps: 200 Loss: 0.560386598110199


Training:  42%|████▏     | 302/714 [00:21<00:29, 13.96it/s]

Validation steps: 300 Loss: 0.1931847482919693


Training:  56%|█████▋    | 402/714 [00:28<00:22, 13.98it/s]

Validation steps: 400 Loss: 0.5967592597007751


Training:  70%|███████   | 502/714 [00:35<00:15, 13.99it/s]

Validation steps: 500 Loss: 0.2451133131980896


Training:  84%|████████▍ | 602/714 [00:43<00:08, 13.96it/s]

Validation steps: 600 Loss: 0.08712708950042725


Training:  98%|█████████▊| 702/714 [00:50<00:00, 13.99it/s]

Validation steps: 700 Loss: 0.08300890028476715


Training: 100%|██████████| 714/714 [00:51<00:00, 13.95it/s]


VALID ACC : 0.8769055545820922, VALID LOSS : 0.4290382828288229
{'epoch': 0, 'train_loss': 0.5582443712914589, 'train_acc': 0.8168224299065421, 'valid_acc': 0.8769055545820922, 'val_loss': 0.4290382828288229, 'learning_rate': 5e-06}
saving model ...
Start Training: Epoch 2


Training:   0%|          | 1/2140 [00:00<11:58,  2.98it/s]

Training steps: 0 Loss: 0.4141247868537903


Training:   5%|▍         | 101/2140 [00:30<10:18,  3.30it/s]

Training steps: 100 Loss: 0.24946463108062744


Training:   9%|▉         | 201/2140 [01:00<09:43,  3.32it/s]

Training steps: 200 Loss: 0.10348251461982727


Training:  14%|█▍        | 301/2140 [01:30<09:13,  3.32it/s]

Training steps: 300 Loss: 0.05872631445527077


Training:  19%|█▊        | 401/2140 [02:01<08:45,  3.31it/s]

Training steps: 400 Loss: 0.057598553597927094


Training:  23%|██▎       | 501/2140 [02:31<08:13,  3.32it/s]

Training steps: 500 Loss: 0.7009661197662354


Training:  28%|██▊       | 601/2140 [03:01<07:44,  3.31it/s]

Training steps: 600 Loss: 0.2570623457431793


Training:  33%|███▎      | 701/2140 [03:31<07:16,  3.30it/s]

Training steps: 700 Loss: 0.4390595555305481


Training:  37%|███▋      | 801/2140 [04:01<06:44,  3.31it/s]

Training steps: 800 Loss: 0.1896304041147232


Training:  42%|████▏     | 901/2140 [04:31<06:14,  3.31it/s]

Training steps: 900 Loss: 0.2654356062412262


Training:  47%|████▋     | 1001/2140 [05:02<05:43,  3.32it/s]

Training steps: 1000 Loss: 0.23753830790519714


Training:  51%|█████▏    | 1101/2140 [05:32<05:13,  3.31it/s]

Training steps: 1100 Loss: 0.7125740051269531


Training:  56%|█████▌    | 1201/2140 [06:02<04:44,  3.30it/s]

Training steps: 1200 Loss: 0.12812848389148712


Training:  61%|██████    | 1301/2140 [06:32<04:12,  3.32it/s]

Training steps: 1300 Loss: 0.1644313633441925


Training:  65%|██████▌   | 1401/2140 [07:02<03:43,  3.31it/s]

Training steps: 1400 Loss: 0.7269488573074341


Training:  70%|███████   | 1501/2140 [07:33<03:13,  3.30it/s]

Training steps: 1500 Loss: 0.1565258800983429


Training:  75%|███████▍  | 1601/2140 [08:03<02:42,  3.31it/s]

Training steps: 1600 Loss: 0.16285711526870728


Training:  79%|███████▉  | 1701/2140 [08:33<02:12,  3.31it/s]

Training steps: 1700 Loss: 0.26216214895248413


Training:  84%|████████▍ | 1801/2140 [09:03<01:42,  3.31it/s]

Training steps: 1800 Loss: 0.3501555025577545


Training:  89%|████████▉ | 1901/2140 [09:33<01:12,  3.31it/s]

Training steps: 1900 Loss: 0.6464861035346985


Training:  94%|█████████▎| 2001/2140 [10:04<00:41,  3.31it/s]

Training steps: 2000 Loss: 0.19527855515480042


Training:  98%|█████████▊| 2101/2140 [10:34<00:11,  3.31it/s]

Training steps: 2100 Loss: 0.3786007761955261


Training: 100%|██████████| 2140/2140 [10:46<00:00,  3.31it/s]

TRAIN ACC : 0.890303738317757, TRAIN LOSS : 0.34002564149934833



Training:   0%|          | 0/714 [00:00<?, ?it/s]

Validation steps: 0 Loss: 0.5913920402526855


Training:  14%|█▍        | 102/714 [00:07<00:43, 13.96it/s]

Validation steps: 100 Loss: 0.3875342309474945


Training:  28%|██▊       | 202/714 [00:14<00:36, 13.90it/s]

Validation steps: 200 Loss: 0.5109320282936096


Training:  42%|████▏     | 302/714 [00:21<00:29, 13.95it/s]

Validation steps: 300 Loss: 0.363810658454895


Training:  56%|█████▋    | 402/714 [00:28<00:22, 13.87it/s]

Validation steps: 400 Loss: 0.5289619565010071


Training:  70%|███████   | 502/714 [00:36<00:15, 13.95it/s]

Validation steps: 500 Loss: 0.25204575061798096


Training:  84%|████████▍ | 602/714 [00:43<00:08, 13.88it/s]

Validation steps: 600 Loss: 0.13212142884731293


Training:  98%|█████████▊| 702/714 [00:50<00:00, 13.84it/s]

Validation steps: 700 Loss: 0.28712883591651917


Training: 100%|██████████| 714/714 [00:51<00:00, 13.92it/s]


VALID ACC : 0.8850534431400036, VALID LOSS : 0.37585159776281013
{'epoch': 1, 'train_loss': 0.34002564149934833, 'train_acc': 0.890303738317757, 'valid_acc': 0.8850534431400036, 'val_loss': 0.37585159776281013, 'learning_rate': 5e-06}
saving model ...
Start Training: Epoch 3


Training:   0%|          | 1/2140 [00:00<11:39,  3.06it/s]

Training steps: 0 Loss: 0.25894033908843994


Training:   5%|▍         | 101/2140 [00:30<10:15,  3.31it/s]

Training steps: 100 Loss: 0.5096768736839294


Training:   9%|▉         | 201/2140 [01:00<09:44,  3.32it/s]

Training steps: 200 Loss: 0.41200461983680725


Training:  14%|█▍        | 301/2140 [01:30<09:12,  3.33it/s]

Training steps: 300 Loss: 0.10452470183372498


Training:  19%|█▊        | 401/2140 [02:01<08:46,  3.30it/s]

Training steps: 400 Loss: 0.11743313074111938


Training:  23%|██▎       | 501/2140 [02:31<08:14,  3.32it/s]

Training steps: 500 Loss: 0.5552412867546082


Training:  28%|██▊       | 601/2140 [03:01<07:44,  3.32it/s]

Training steps: 600 Loss: 0.3423633575439453


Training:  33%|███▎      | 701/2140 [03:31<07:16,  3.30it/s]

Training steps: 700 Loss: 0.5977783799171448


Training:  37%|███▋      | 801/2140 [04:01<06:43,  3.32it/s]

Training steps: 800 Loss: 0.4266216456890106


Training:  42%|████▏     | 901/2140 [04:31<06:13,  3.32it/s]

Training steps: 900 Loss: 0.029345616698265076


Training:  47%|████▋     | 1001/2140 [05:02<05:43,  3.32it/s]

Training steps: 1000 Loss: 0.03743874654173851


Training:  51%|█████▏    | 1101/2140 [05:32<05:14,  3.31it/s]

Training steps: 1100 Loss: 0.26818642020225525


Training:  56%|█████▌    | 1201/2140 [06:02<04:43,  3.31it/s]

Training steps: 1200 Loss: 0.11583337187767029


Training:  61%|██████    | 1301/2140 [06:32<04:14,  3.29it/s]

Training steps: 1300 Loss: 0.17409613728523254


Training:  65%|██████▌   | 1401/2140 [07:02<03:43,  3.31it/s]

Training steps: 1400 Loss: 0.5255331993103027


Training:  70%|███████   | 1501/2140 [07:32<03:14,  3.29it/s]

Training steps: 1500 Loss: 0.4368504285812378


Training:  75%|███████▍  | 1601/2140 [08:03<02:43,  3.30it/s]

Training steps: 1600 Loss: 0.44165897369384766


Training:  79%|███████▉  | 1701/2140 [08:33<02:12,  3.31it/s]

Training steps: 1700 Loss: 0.40463897585868835


Training:  84%|████████▍ | 1801/2140 [09:03<01:42,  3.30it/s]

Training steps: 1800 Loss: 0.5188642144203186


Training:  89%|████████▉ | 1901/2140 [09:33<01:12,  3.31it/s]

Training steps: 1900 Loss: 0.3658648133277893


Training:  94%|█████████▎| 2001/2140 [10:03<00:42,  3.30it/s]

Training steps: 2000 Loss: 0.34665748476982117


Training:  98%|█████████▊| 2101/2140 [10:33<00:11,  3.31it/s]

Training steps: 2100 Loss: 0.2601730525493622


Training: 100%|██████████| 2140/2140 [10:45<00:00,  3.31it/s]

TRAIN ACC : 0.9076810747663552, TRAIN LOSS : 0.2870838575961643



Training:   0%|          | 0/714 [00:00<?, ?it/s]

Validation steps: 0 Loss: 1.2986257076263428


Training:  14%|█▍        | 102/714 [00:07<00:44, 13.88it/s]

Validation steps: 100 Loss: 0.2099766731262207


Training:  28%|██▊       | 202/714 [00:14<00:36, 13.95it/s]

Validation steps: 200 Loss: 0.4684719443321228


Training:  42%|████▏     | 302/714 [00:21<00:29, 13.97it/s]

Validation steps: 300 Loss: 0.33055561780929565


Training:  56%|█████▋    | 402/714 [00:28<00:22, 13.83it/s]

Validation steps: 400 Loss: 0.4659135341644287


Training:  70%|███████   | 502/714 [00:36<00:15, 13.89it/s]

Validation steps: 500 Loss: 0.4280887842178345


Training:  84%|████████▍ | 602/714 [00:43<00:08, 13.79it/s]

Validation steps: 600 Loss: 0.2422022521495819


Training:  98%|█████████▊| 702/714 [00:50<00:00, 13.93it/s]

Validation steps: 700 Loss: 0.14184635877609253


Training: 100%|██████████| 714/714 [00:51<00:00, 13.91it/s]


VALID ACC : 0.8824250919922901, VALID LOSS : 0.40241467805594955
{'epoch': 2, 'train_loss': 0.2870838575961643, 'train_acc': 0.9076810747663552, 'valid_acc': 0.8824250919922901, 'val_loss': 0.40241467805594955, 'learning_rate': 5e-06}


Some weights of the model checkpoint at xlm-roberta-large were not used when initializing XLMRobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.den

Start Training: Epoch 1


Training:   0%|          | 1/2140 [00:00<11:26,  3.11it/s]

Training steps: 0 Loss: 1.9261094331741333


Training:   5%|▍         | 101/2140 [00:30<10:16,  3.31it/s]

Training steps: 100 Loss: 1.9245063066482544


Training:   9%|▉         | 201/2140 [01:00<09:45,  3.31it/s]

Training steps: 200 Loss: 0.9124456644058228


Training:  14%|█▍        | 301/2140 [01:31<09:16,  3.30it/s]

Training steps: 300 Loss: 0.44590920209884644


Training:  19%|█▊        | 401/2140 [02:01<08:47,  3.30it/s]

Training steps: 400 Loss: 0.7293850183486938


Training:  23%|██▎       | 501/2140 [02:31<08:18,  3.29it/s]

Training steps: 500 Loss: 0.3051021099090576


Training:  28%|██▊       | 601/2140 [03:01<07:44,  3.31it/s]

Training steps: 600 Loss: 0.40683916211128235


Training:  33%|███▎      | 701/2140 [03:32<07:16,  3.30it/s]

Training steps: 700 Loss: 0.3692683279514313


Training:  37%|███▋      | 801/2140 [04:02<06:46,  3.30it/s]

Training steps: 800 Loss: 0.39839649200439453


Training:  42%|████▏     | 901/2140 [04:32<06:15,  3.30it/s]

Training steps: 900 Loss: 0.2729201316833496


Training:  47%|████▋     | 1001/2140 [05:02<05:44,  3.31it/s]

Training steps: 1000 Loss: 0.18370383977890015


Training:  51%|█████▏    | 1101/2140 [05:33<05:14,  3.30it/s]

Training steps: 1100 Loss: 0.29469746351242065


Training:  56%|█████▌    | 1201/2140 [06:03<04:44,  3.29it/s]

Training steps: 1200 Loss: 0.27185487747192383


Training:  61%|██████    | 1301/2140 [06:33<04:13,  3.31it/s]

Training steps: 1300 Loss: 0.27431440353393555


Training:  65%|██████▌   | 1401/2140 [07:03<03:43,  3.31it/s]

Training steps: 1400 Loss: 0.297188937664032


Training:  70%|███████   | 1501/2140 [07:33<03:13,  3.30it/s]

Training steps: 1500 Loss: 0.3202686607837677


Training:  75%|███████▍  | 1601/2140 [08:04<02:42,  3.31it/s]

Training steps: 1600 Loss: 0.3076959550380707


Training:  79%|███████▉  | 1701/2140 [08:34<02:13,  3.30it/s]

Training steps: 1700 Loss: 0.48654526472091675


Training:  84%|████████▍ | 1801/2140 [09:04<01:42,  3.31it/s]

Training steps: 1800 Loss: 0.34483978152275085


Training:  89%|████████▉ | 1901/2140 [09:34<01:12,  3.30it/s]

Training steps: 1900 Loss: 0.5504066944122314


Training:  94%|█████████▎| 2001/2140 [10:04<00:42,  3.29it/s]

Training steps: 2000 Loss: 0.9984256625175476


Training:  98%|█████████▊| 2101/2140 [10:35<00:11,  3.30it/s]

Training steps: 2100 Loss: 0.5052536129951477


Training: 100%|██████████| 2140/2140 [10:47<00:00,  3.31it/s]

TRAIN ACC : 0.8114778037383178, TRAIN LOSS : 0.5614246631093393



Training:   0%|          | 0/714 [00:00<?, ?it/s]

Validation steps: 0 Loss: 0.36025938391685486


Training:  14%|█▍        | 102/714 [00:07<00:43, 14.06it/s]

Validation steps: 100 Loss: 0.38324132561683655


Training:  28%|██▊       | 202/714 [00:14<00:36, 14.00it/s]

Validation steps: 200 Loss: 1.0261690616607666


Training:  42%|████▏     | 302/714 [00:21<00:29, 13.93it/s]

Validation steps: 300 Loss: 0.5923615097999573


Training:  56%|█████▋    | 402/714 [00:28<00:22, 14.09it/s]

Validation steps: 400 Loss: 0.07529950886964798


Training:  70%|███████   | 502/714 [00:35<00:15, 14.06it/s]

Validation steps: 500 Loss: 0.15947791934013367


Training:  84%|████████▍ | 602/714 [00:42<00:07, 14.00it/s]

Validation steps: 600 Loss: 0.9353556632995605


Training:  98%|█████████▊| 702/714 [00:50<00:00, 13.89it/s]

Validation steps: 700 Loss: 0.1138557717204094


Training: 100%|██████████| 714/714 [00:51<00:00, 13.99it/s]


VALID ACC : 0.8735763097949886, VALID LOSS : 0.41570401104644505
{'epoch': 0, 'train_loss': 0.5614246631093393, 'train_acc': 0.8114778037383178, 'valid_acc': 0.8735763097949886, 'val_loss': 0.41570401104644505, 'learning_rate': 5e-06}
saving model ...
Start Training: Epoch 2


Training:   0%|          | 1/2140 [00:00<20:56,  1.70it/s]

Training steps: 0 Loss: 0.07444313168525696


Training:   5%|▍         | 101/2140 [00:30<10:20,  3.29it/s]

Training steps: 100 Loss: 0.25323402881622314


Training:   9%|▉         | 201/2140 [01:01<09:46,  3.30it/s]

Training steps: 200 Loss: 0.805570662021637


Training:  14%|█▍        | 301/2140 [01:31<09:17,  3.30it/s]

Training steps: 300 Loss: 0.5163930654525757


Training:  19%|█▊        | 401/2140 [02:01<08:45,  3.31it/s]

Training steps: 400 Loss: 0.5426090359687805


Training:  23%|██▎       | 501/2140 [02:31<08:16,  3.30it/s]

Training steps: 500 Loss: 0.30474767088890076


Training:  28%|██▊       | 601/2140 [03:01<07:43,  3.32it/s]

Training steps: 600 Loss: 0.07140367478132248


Training:  33%|███▎      | 701/2140 [03:32<07:16,  3.30it/s]

Training steps: 700 Loss: 0.44133061170578003


Training:  37%|███▋      | 801/2140 [04:02<06:45,  3.30it/s]

Training steps: 800 Loss: 0.23886796832084656


Training:  42%|████▏     | 901/2140 [04:32<06:13,  3.32it/s]

Training steps: 900 Loss: 0.4925195872783661


Training:  47%|████▋     | 1001/2140 [05:02<05:44,  3.30it/s]

Training steps: 1000 Loss: 0.38675248622894287


Training:  51%|█████▏    | 1101/2140 [05:32<05:12,  3.33it/s]

Training steps: 1100 Loss: 0.02265198528766632


Training:  56%|█████▌    | 1201/2140 [06:03<04:43,  3.31it/s]

Training steps: 1200 Loss: 0.5998172163963318


Training:  61%|██████    | 1301/2140 [06:33<04:13,  3.31it/s]

Training steps: 1300 Loss: 0.21370428800582886


Training:  65%|██████▌   | 1401/2140 [07:03<03:43,  3.31it/s]

Training steps: 1400 Loss: 0.12714040279388428


Training:  70%|███████   | 1501/2140 [07:33<03:14,  3.29it/s]

Training steps: 1500 Loss: 0.18414686620235443


Training:  75%|███████▍  | 1601/2140 [08:03<02:41,  3.34it/s]

Training steps: 1600 Loss: 0.028110966086387634


Training:  79%|███████▉  | 1701/2140 [08:34<02:12,  3.32it/s]

Training steps: 1700 Loss: 0.5289005041122437


Training:  84%|████████▍ | 1801/2140 [09:04<01:42,  3.31it/s]

Training steps: 1800 Loss: 0.06774347275495529


Training:  89%|████████▉ | 1901/2140 [09:34<01:12,  3.30it/s]

Training steps: 1900 Loss: 0.19492468237876892


Training:  94%|█████████▎| 2001/2140 [10:04<00:41,  3.32it/s]

Training steps: 2000 Loss: 0.04771619662642479


Training:  98%|█████████▊| 2101/2140 [10:34<00:11,  3.30it/s]

Training steps: 2100 Loss: 0.7364579439163208


Training: 100%|██████████| 2140/2140 [10:46<00:00,  3.31it/s]

TRAIN ACC : 0.8908878504672897, TRAIN LOSS : 0.33287469067776176



Training:   0%|          | 0/714 [00:00<?, ?it/s]

Validation steps: 0 Loss: 0.2383602261543274


Training:  14%|█▍        | 102/714 [00:07<00:44, 13.86it/s]

Validation steps: 100 Loss: 0.5219048857688904


Training:  28%|██▊       | 202/714 [00:14<00:36, 13.89it/s]

Validation steps: 200 Loss: 0.5701873898506165


Training:  42%|████▏     | 302/714 [00:21<00:29, 13.94it/s]

Validation steps: 300 Loss: 0.37409508228302


Training:  56%|█████▋    | 402/714 [00:28<00:22, 13.88it/s]

Validation steps: 400 Loss: 0.17208991944789886


Training:  70%|███████   | 502/714 [00:36<00:15, 13.95it/s]

Validation steps: 500 Loss: 0.03822615370154381


Training:  84%|████████▍ | 602/714 [00:43<00:08, 13.94it/s]

Validation steps: 600 Loss: 0.846095085144043


Training:  98%|█████████▊| 702/714 [00:50<00:00, 14.00it/s]

Validation steps: 700 Loss: 0.20597510039806366


Training: 100%|██████████| 714/714 [00:51<00:00, 13.91it/s]


VALID ACC : 0.8876817942877169, VALID LOSS : 0.3865162525704282
{'epoch': 1, 'train_loss': 0.33287469067776176, 'train_acc': 0.8908878504672897, 'valid_acc': 0.8876817942877169, 'val_loss': 0.3865162525704282, 'learning_rate': 5e-06}
saving model ...
Start Training: Epoch 3


Training:   0%|          | 1/2140 [00:00<11:38,  3.06it/s]

Training steps: 0 Loss: 0.2834719717502594


Training:   5%|▍         | 101/2140 [00:30<10:20,  3.29it/s]

Training steps: 100 Loss: 0.4028545618057251


Training:   9%|▉         | 201/2140 [01:00<09:47,  3.30it/s]

Training steps: 200 Loss: 0.21911029517650604


Training:  14%|█▍        | 301/2140 [01:31<09:15,  3.31it/s]

Training steps: 300 Loss: 0.0884355828166008


Training:  19%|█▊        | 401/2140 [02:01<08:47,  3.29it/s]

Training steps: 400 Loss: 0.10509134083986282


Training:  23%|██▎       | 501/2140 [02:31<08:15,  3.31it/s]

Training steps: 500 Loss: 0.11999189108610153


Training:  28%|██▊       | 601/2140 [03:01<07:42,  3.32it/s]

Training steps: 600 Loss: 0.022268878296017647


Training:  33%|███▎      | 701/2140 [03:31<07:14,  3.31it/s]

Training steps: 700 Loss: 0.4562908113002777


Training:  37%|███▋      | 801/2140 [04:02<06:42,  3.33it/s]

Training steps: 800 Loss: 0.5570234060287476


Training:  42%|████▏     | 901/2140 [04:32<06:13,  3.31it/s]

Training steps: 900 Loss: 0.1478678286075592


Training:  47%|████▋     | 1001/2140 [05:02<05:45,  3.30it/s]

Training steps: 1000 Loss: 0.3036172389984131


Training:  51%|█████▏    | 1101/2140 [05:32<05:15,  3.30it/s]

Training steps: 1100 Loss: 0.42070087790489197


Training:  56%|█████▌    | 1201/2140 [06:02<04:43,  3.31it/s]

Training steps: 1200 Loss: 0.5967422127723694


Training:  61%|██████    | 1301/2140 [06:33<04:12,  3.33it/s]

Training steps: 1300 Loss: 0.31630465388298035


Training:  65%|██████▌   | 1401/2140 [07:03<03:43,  3.30it/s]

Training steps: 1400 Loss: 0.7564688324928284


Training:  70%|███████   | 1501/2140 [07:33<03:12,  3.32it/s]

Training steps: 1500 Loss: 0.3778105676174164


Training:  75%|███████▍  | 1601/2140 [08:03<02:42,  3.32it/s]

Training steps: 1600 Loss: 0.05675987899303436


Training:  79%|███████▉  | 1701/2140 [08:33<02:12,  3.31it/s]

Training steps: 1700 Loss: 0.40071621537208557


Training:  84%|████████▍ | 1801/2140 [09:03<01:43,  3.29it/s]

Training steps: 1800 Loss: 0.3073666989803314


Training:  89%|████████▉ | 1901/2140 [09:34<01:11,  3.32it/s]

Training steps: 1900 Loss: 0.05771259590983391


Training:  94%|█████████▎| 2001/2140 [10:04<00:42,  3.29it/s]

Training steps: 2000 Loss: 1.140985131263733


Training:  98%|█████████▊| 2101/2140 [10:34<00:11,  3.30it/s]

Training steps: 2100 Loss: 0.2697735130786896


Training: 100%|██████████| 2140/2140 [10:46<00:00,  3.31it/s]

TRAIN ACC : 0.9052570093457943, TRAIN LOSS : 0.2859406569339422



Training:   0%|          | 0/714 [00:00<?, ?it/s]

Validation steps: 0 Loss: 0.31054940819740295


Training:  14%|█▍        | 102/714 [00:07<00:43, 13.94it/s]

Validation steps: 100 Loss: 0.4565780758857727


Training:  28%|██▊       | 202/714 [00:14<00:36, 13.93it/s]

Validation steps: 200 Loss: 0.34713464975357056


Training:  42%|████▏     | 302/714 [00:21<00:29, 13.91it/s]

Validation steps: 300 Loss: 0.3649244010448456


Training:  56%|█████▋    | 402/714 [00:28<00:22, 13.88it/s]

Validation steps: 400 Loss: 0.13976003229618073


Training:  70%|███████   | 502/714 [00:36<00:15, 13.83it/s]

Validation steps: 500 Loss: 0.0283679086714983


Training:  84%|████████▍ | 602/714 [00:43<00:08, 13.97it/s]

Validation steps: 600 Loss: 0.9161330461502075


Training:  98%|█████████▊| 702/714 [00:50<00:00, 13.94it/s]

Validation steps: 700 Loss: 0.2534182667732239


Training: 100%|██████████| 714/714 [00:51<00:00, 13.90it/s]


VALID ACC : 0.8861923953040126, VALID LOSS : 0.40078553763943453
{'epoch': 2, 'train_loss': 0.2859406569339422, 'train_acc': 0.9052570093457943, 'valid_acc': 0.8861923953040126, 'val_loss': 0.40078553763943453, 'learning_rate': 5e-06}


Some weights of the model checkpoint at xlm-roberta-large were not used when initializing XLMRobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.den

Start Training: Epoch 1


Training:   0%|          | 1/2141 [00:00<11:56,  2.99it/s]

Training steps: 0 Loss: 1.8811229467391968


Training:   5%|▍         | 101/2141 [00:30<10:16,  3.31it/s]

Training steps: 100 Loss: 1.8895790576934814


Training:   9%|▉         | 201/2141 [01:00<09:48,  3.30it/s]

Training steps: 200 Loss: 0.9796773791313171


Training:  14%|█▍        | 301/2141 [01:31<09:15,  3.31it/s]

Training steps: 300 Loss: 0.5594099164009094


Training:  19%|█▊        | 401/2141 [02:01<08:46,  3.30it/s]

Training steps: 400 Loss: 0.6963773369789124


Training:  23%|██▎       | 501/2141 [02:32<08:14,  3.31it/s]

Training steps: 500 Loss: 0.8195551633834839


Training:  28%|██▊       | 601/2141 [03:02<07:45,  3.31it/s]

Training steps: 600 Loss: 0.3299283981323242


Training:  33%|███▎      | 701/2141 [03:32<07:16,  3.30it/s]

Training steps: 700 Loss: 0.8234213590621948


Training:  37%|███▋      | 801/2141 [04:02<06:46,  3.30it/s]

Training steps: 800 Loss: 0.4926972985267639


Training:  42%|████▏     | 901/2141 [04:33<06:15,  3.30it/s]

Training steps: 900 Loss: 0.1894388645887375


Training:  47%|████▋     | 1001/2141 [05:03<05:45,  3.30it/s]

Training steps: 1000 Loss: 0.8835515379905701


Training:  51%|█████▏    | 1101/2141 [05:33<05:15,  3.30it/s]

Training steps: 1100 Loss: 0.25986772775650024


Training:  56%|█████▌    | 1201/2141 [06:04<04:45,  3.29it/s]

Training steps: 1200 Loss: 0.9311360120773315


Training:  61%|██████    | 1301/2141 [06:34<04:12,  3.32it/s]

Training steps: 1300 Loss: 0.29713568091392517


Training:  65%|██████▌   | 1401/2141 [07:04<03:44,  3.30it/s]

Training steps: 1400 Loss: 0.36657020449638367


Training:  70%|███████   | 1501/2141 [07:34<03:13,  3.31it/s]

Training steps: 1500 Loss: 0.4779397249221802


Training:  75%|███████▍  | 1601/2141 [08:04<02:44,  3.28it/s]

Training steps: 1600 Loss: 0.6354064345359802


Training:  79%|███████▉  | 1701/2141 [08:35<02:13,  3.30it/s]

Training steps: 1700 Loss: 0.4084497094154358


Training:  84%|████████▍ | 1801/2141 [09:05<01:42,  3.31it/s]

Training steps: 1800 Loss: 0.4312728941440582


Training:  89%|████████▉ | 1901/2141 [09:35<01:12,  3.30it/s]

Training steps: 1900 Loss: 0.10772377997636795


Training:  93%|█████████▎| 2001/2141 [10:05<00:42,  3.31it/s]

Training steps: 2000 Loss: 0.5456793904304504


Training:  98%|█████████▊| 2101/2141 [10:36<00:12,  3.31it/s]

Training steps: 2100 Loss: 0.5402238965034485


Training: 100%|██████████| 2141/2141 [10:48<00:00,  3.30it/s]

TRAIN ACC : 0.810169095528752, TRAIN LOSS : 0.5711602736299267



Training:   0%|          | 0/714 [00:00<?, ?it/s]

Validation steps: 0 Loss: 0.5877512693405151


Training:  14%|█▍        | 102/714 [00:07<00:44, 13.82it/s]

Validation steps: 100 Loss: 0.9711741209030151


Training:  28%|██▊       | 202/714 [00:14<00:36, 13.90it/s]

Validation steps: 200 Loss: 0.4672229290008545


Training:  42%|████▏     | 302/714 [00:21<00:29, 13.81it/s]

Validation steps: 300 Loss: 0.4242030680179596


Training:  56%|█████▋    | 402/714 [00:28<00:22, 13.99it/s]

Validation steps: 400 Loss: 0.2157527059316635


Training:  70%|███████   | 502/714 [00:36<00:15, 13.84it/s]

Validation steps: 500 Loss: 0.009954674169421196


Training:  84%|████████▍ | 602/714 [00:43<00:07, 14.03it/s]

Validation steps: 600 Loss: 1.1460134983062744


Training:  98%|█████████▊| 702/714 [00:50<00:00, 13.86it/s]

Validation steps: 700 Loss: 0.12319979816675186


Training: 100%|██████████| 714/714 [00:51<00:00, 13.89it/s]


VALID ACC : 0.8811881188118812, VALID LOSS : 0.38382761795347664
{'epoch': 0, 'train_loss': 0.5711602736299267, 'train_acc': 0.810169095528752, 'valid_acc': 0.8811881188118812, 'val_loss': 0.38382761795347664, 'learning_rate': 5e-06}
saving model ...
Start Training: Epoch 2


Training:   0%|          | 1/2141 [00:00<11:35,  3.08it/s]

Training steps: 0 Loss: 0.24837109446525574


Training:   5%|▍         | 101/2141 [00:30<10:21,  3.28it/s]

Training steps: 100 Loss: 0.19071967899799347


Training:   9%|▉         | 201/2141 [01:00<09:46,  3.31it/s]

Training steps: 200 Loss: 0.2863507568836212


Training:  14%|█▍        | 301/2141 [01:31<09:15,  3.31it/s]

Training steps: 300 Loss: 0.15223845839500427


Training:  19%|█▊        | 401/2141 [02:01<08:46,  3.31it/s]

Training steps: 400 Loss: 0.40556421875953674


Training:  23%|██▎       | 501/2141 [02:31<08:16,  3.30it/s]

Training steps: 500 Loss: 0.35795751214027405


Training:  28%|██▊       | 601/2141 [03:01<07:44,  3.32it/s]

Training steps: 600 Loss: 0.0782284215092659


Training:  33%|███▎      | 701/2141 [03:32<07:12,  3.33it/s]

Training steps: 700 Loss: 0.07291832566261292


Training:  37%|███▋      | 801/2141 [04:02<06:46,  3.29it/s]

Training steps: 800 Loss: 0.11296039819717407


Training:  42%|████▏     | 901/2141 [04:32<06:15,  3.30it/s]

Training steps: 900 Loss: 0.09791231155395508


Training:  47%|████▋     | 1001/2141 [05:02<05:44,  3.31it/s]

Training steps: 1000 Loss: 0.16578394174575806


Training:  51%|█████▏    | 1101/2141 [05:32<05:13,  3.32it/s]

Training steps: 1100 Loss: 0.17241746187210083


Training:  56%|█████▌    | 1201/2141 [06:03<04:44,  3.31it/s]

Training steps: 1200 Loss: 0.08708691596984863


Training:  61%|██████    | 1301/2141 [06:33<04:13,  3.31it/s]

Training steps: 1300 Loss: 0.13239124417304993


Training:  65%|██████▌   | 1401/2141 [07:03<03:43,  3.31it/s]

Training steps: 1400 Loss: 0.6963334083557129


Training:  70%|███████   | 1501/2141 [07:33<03:14,  3.30it/s]

Training steps: 1500 Loss: 0.15026408433914185


Training:  75%|███████▍  | 1601/2141 [08:03<02:43,  3.31it/s]

Training steps: 1600 Loss: 0.6306202411651611


Training:  79%|███████▉  | 1701/2141 [08:34<02:12,  3.33it/s]

Training steps: 1700 Loss: 0.19636766612529755


Training:  84%|████████▍ | 1801/2141 [09:04<01:43,  3.30it/s]

Training steps: 1800 Loss: 0.7991535663604736


Training:  89%|████████▉ | 1901/2141 [09:34<01:12,  3.30it/s]

Training steps: 1900 Loss: 0.27557820081710815


Training:  93%|█████████▎| 2001/2141 [10:04<00:42,  3.31it/s]

Training steps: 2000 Loss: 0.12922316789627075


Training:  98%|█████████▊| 2101/2141 [10:35<00:12,  3.31it/s]

Training steps: 2100 Loss: 0.12244214862585068


Training: 100%|██████████| 2141/2141 [10:46<00:00,  3.31it/s]

TRAIN ACC : 0.8879705616074297, TRAIN LOSS : 0.35089807107388654



Training:   0%|          | 0/714 [00:00<?, ?it/s]

Validation steps: 0 Loss: 0.5842822790145874


Training:  14%|█▍        | 102/714 [00:07<00:44, 13.87it/s]

Validation steps: 100 Loss: 1.0419609546661377


Training:  28%|██▊       | 202/714 [00:14<00:36, 13.99it/s]

Validation steps: 200 Loss: 0.44112861156463623


Training:  42%|████▏     | 302/714 [00:21<00:29, 13.86it/s]

Validation steps: 300 Loss: 0.3603275418281555


Training:  56%|█████▋    | 402/714 [00:28<00:22, 13.87it/s]

Validation steps: 400 Loss: 0.16815711557865143


Training:  70%|███████   | 502/714 [00:36<00:15, 13.92it/s]

Validation steps: 500 Loss: 0.008044867776334286


Training:  84%|████████▍ | 602/714 [00:43<00:08, 13.99it/s]

Validation steps: 600 Loss: 1.0366328954696655


Training:  98%|█████████▊| 702/714 [00:50<00:00, 13.94it/s]

Validation steps: 700 Loss: 0.15446403622627258


Training: 100%|██████████| 714/714 [00:51<00:00, 13.91it/s]


VALID ACC : 0.8880224305616402, VALID LOSS : 0.3689098885936365
{'epoch': 1, 'train_loss': 0.35089807107388654, 'train_acc': 0.8879705616074297, 'valid_acc': 0.8880224305616402, 'val_loss': 0.3689098885936365, 'learning_rate': 5e-06}
saving model ...
Start Training: Epoch 3


Training:   0%|          | 1/2141 [00:05<3:18:12,  5.56s/it]

Training steps: 0 Loss: 0.3822324573993683


Training:   5%|▍         | 101/2141 [00:35<10:21,  3.28it/s]

Training steps: 100 Loss: 0.23589156568050385


Training:   9%|▉         | 201/2141 [01:06<09:46,  3.31it/s]

Training steps: 200 Loss: 0.13740698993206024


Training:  14%|█▍        | 301/2141 [01:36<09:16,  3.30it/s]

Training steps: 300 Loss: 0.06265519559383392


Training:  19%|█▊        | 401/2141 [02:06<08:43,  3.32it/s]

Training steps: 400 Loss: 0.017479265108704567


Training:  23%|██▎       | 501/2141 [02:36<08:13,  3.33it/s]

Training steps: 500 Loss: 0.34987398982048035


Training:  28%|██▊       | 601/2141 [03:06<07:45,  3.31it/s]

Training steps: 600 Loss: 0.16932426393032074


Training:  33%|███▎      | 701/2141 [03:37<07:14,  3.32it/s]

Training steps: 700 Loss: 0.09487131983041763


Training:  37%|███▋      | 801/2141 [04:07<06:44,  3.32it/s]

Training steps: 800 Loss: 0.6017641425132751


Training:  42%|████▏     | 901/2141 [04:37<06:14,  3.31it/s]

Training steps: 900 Loss: 0.3037278652191162


Training:  47%|████▋     | 1001/2141 [05:07<05:44,  3.31it/s]

Training steps: 1000 Loss: 0.6651172041893005


Training:  51%|█████▏    | 1101/2141 [05:37<05:15,  3.29it/s]

Training steps: 1100 Loss: 0.5318798422813416


Training:  56%|█████▌    | 1201/2141 [06:08<04:44,  3.31it/s]

Training steps: 1200 Loss: 0.17648963630199432


Training:  61%|██████    | 1301/2141 [06:38<04:12,  3.32it/s]

Training steps: 1300 Loss: 0.3290310502052307


Training:  65%|██████▌   | 1401/2141 [07:08<03:43,  3.31it/s]

Training steps: 1400 Loss: 0.31623825430870056


Training:  70%|███████   | 1501/2141 [07:38<03:14,  3.30it/s]

Training steps: 1500 Loss: 0.1901938021183014


Training:  75%|███████▍  | 1601/2141 [08:08<02:43,  3.31it/s]

Training steps: 1600 Loss: 0.0859891027212143


Training:  79%|███████▉  | 1701/2141 [08:39<02:12,  3.31it/s]

Training steps: 1700 Loss: 0.1989666223526001


Training:  84%|████████▍ | 1801/2141 [09:09<01:42,  3.32it/s]

Training steps: 1800 Loss: 0.05832965672016144


Training:  89%|████████▉ | 1901/2141 [09:39<01:12,  3.31it/s]

Training steps: 1900 Loss: 0.4737023711204529


Training:  93%|█████████▎| 2001/2141 [10:09<00:42,  3.30it/s]

Training steps: 2000 Loss: 0.38601037859916687


Training:  98%|█████████▊| 2101/2141 [10:40<00:12,  3.31it/s]

Training steps: 2100 Loss: 0.19669252634048462


Training: 100%|██████████| 2141/2141 [10:52<00:00,  3.28it/s]

TRAIN ACC : 0.9038871528284804, TRAIN LOSS : 0.29183466899387506



Training:   0%|          | 0/714 [00:00<?, ?it/s]

Validation steps: 0 Loss: 0.8039504289627075


Training:  14%|█▍        | 102/714 [00:07<00:44, 13.89it/s]

Validation steps: 100 Loss: 1.044792890548706


Training:  28%|██▊       | 202/714 [00:14<00:36, 14.00it/s]

Validation steps: 200 Loss: 0.21242831647396088


Training:  42%|████▏     | 302/714 [00:21<00:29, 13.87it/s]

Validation steps: 300 Loss: 0.42950335144996643


Training:  56%|█████▋    | 402/714 [00:28<00:22, 13.92it/s]

Validation steps: 400 Loss: 0.19241847097873688


Training:  70%|███████   | 502/714 [00:35<00:15, 13.88it/s]

Validation steps: 500 Loss: 0.008978331461548805


Training:  84%|████████▍ | 602/714 [00:43<00:08, 13.88it/s]

Validation steps: 600 Loss: 1.670079231262207


Training:  98%|█████████▊| 702/714 [00:50<00:00, 13.78it/s]

Validation steps: 700 Loss: 0.10365107655525208


Training: 100%|██████████| 714/714 [00:51<00:00, 13.92it/s]


VALID ACC : 0.8884605274686761, VALID LOSS : 0.37798752112794115
{'epoch': 2, 'train_loss': 0.29183466899387506, 'train_acc': 0.9038871528284804, 'valid_acc': 0.8884605274686761, 'val_loss': 0.37798752112794115, 'learning_rate': 5e-06}
saving model ...


Some weights of the model checkpoint at xlm-roberta-large were not used when initializing XLMRobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.den

Start Training: Epoch 1


Training:   0%|          | 1/2141 [00:00<31:20,  1.14it/s]

Training steps: 0 Loss: 2.0064868927001953


Training:   5%|▍         | 101/2141 [00:31<10:22,  3.28it/s]

Training steps: 100 Loss: 1.8158384561538696


Training:   9%|▉         | 201/2141 [01:01<09:45,  3.31it/s]

Training steps: 200 Loss: 0.7586265206336975


Training:  14%|█▍        | 301/2141 [01:31<09:16,  3.30it/s]

Training steps: 300 Loss: 0.40253207087516785


Training:  19%|█▊        | 401/2141 [02:01<08:47,  3.30it/s]

Training steps: 400 Loss: 0.35808265209198


Training:  23%|██▎       | 501/2141 [02:32<08:15,  3.31it/s]

Training steps: 500 Loss: 0.2793596386909485


Training:  28%|██▊       | 601/2141 [03:02<07:44,  3.32it/s]

Training steps: 600 Loss: 0.30841943621635437


Training:  33%|███▎      | 701/2141 [03:32<07:14,  3.32it/s]

Training steps: 700 Loss: 0.8591518402099609


Training:  37%|███▋      | 801/2141 [04:02<06:45,  3.31it/s]

Training steps: 800 Loss: 0.35357528924942017


Training:  42%|████▏     | 901/2141 [04:33<06:15,  3.30it/s]

Training steps: 900 Loss: 0.4491084814071655


Training:  47%|████▋     | 1001/2141 [05:03<05:44,  3.31it/s]

Training steps: 1000 Loss: 0.09831059724092484


Training:  51%|█████▏    | 1101/2141 [05:33<05:15,  3.30it/s]

Training steps: 1100 Loss: 0.6010152101516724


Training:  56%|█████▌    | 1201/2141 [06:03<04:43,  3.31it/s]

Training steps: 1200 Loss: 0.736746609210968


Training:  61%|██████    | 1301/2141 [06:34<04:15,  3.29it/s]

Training steps: 1300 Loss: 0.35815876722335815


Training:  65%|██████▌   | 1401/2141 [07:04<03:43,  3.31it/s]

Training steps: 1400 Loss: 0.4035674035549164


Training:  70%|███████   | 1501/2141 [07:34<03:14,  3.30it/s]

Training steps: 1500 Loss: 0.4062543511390686


Training:  75%|███████▍  | 1601/2141 [08:04<02:43,  3.31it/s]

Training steps: 1600 Loss: 0.515329897403717


Training:  79%|███████▉  | 1701/2141 [08:34<02:12,  3.32it/s]

Training steps: 1700 Loss: 0.7883517146110535


Training:  84%|████████▍ | 1801/2141 [09:05<01:42,  3.33it/s]

Training steps: 1800 Loss: 0.6653663516044617


Training:  89%|████████▉ | 1901/2141 [09:35<01:12,  3.30it/s]

Training steps: 1900 Loss: 0.05338694155216217


Training:  93%|█████████▎| 2001/2141 [10:05<00:42,  3.31it/s]

Training steps: 2000 Loss: 0.5256528854370117


Training:  98%|█████████▊| 2101/2141 [10:35<00:12,  3.31it/s]

Training steps: 2100 Loss: 0.6044194102287292


Training: 100%|██████████| 2141/2141 [10:47<00:00,  3.30it/s]

TRAIN ACC : 0.8194854122251103, TRAIN LOSS : 0.5477244263059873



Training:   0%|          | 0/714 [00:00<?, ?it/s]

Validation steps: 0 Loss: 0.021326981484889984


Training:  14%|█▍        | 102/714 [00:07<00:44, 13.90it/s]

Validation steps: 100 Loss: 0.16184721887111664


Training:  28%|██▊       | 202/714 [00:14<00:36, 13.87it/s]

Validation steps: 200 Loss: 0.3742023706436157


Training:  42%|████▏     | 302/714 [00:21<00:29, 13.81it/s]

Validation steps: 300 Loss: 0.6117756962776184


Training:  56%|█████▋    | 402/714 [00:28<00:22, 14.01it/s]

Validation steps: 400 Loss: 0.5419984459877014


Training:  70%|███████   | 502/714 [00:36<00:15, 13.89it/s]

Validation steps: 500 Loss: 0.009459269233047962


Training:  84%|████████▍ | 602/714 [00:43<00:08, 13.75it/s]

Validation steps: 600 Loss: 0.8273007273674011


Training:  98%|█████████▊| 702/714 [00:50<00:00, 13.85it/s]

Validation steps: 700 Loss: 0.12869815528392792


Training: 100%|██████████| 714/714 [00:51<00:00, 13.89it/s]


VALID ACC : 0.8741785682993078, VALID LOSS : 0.3841330075573896
{'epoch': 0, 'train_loss': 0.5477244263059873, 'train_acc': 0.8194854122251103, 'valid_acc': 0.8741785682993078, 'val_loss': 0.3841330075573896, 'learning_rate': 5e-06}
saving model ...
Start Training: Epoch 2


Training:   0%|          | 1/2141 [00:00<11:34,  3.08it/s]

Training steps: 0 Loss: 0.4772661030292511


Training:   5%|▍         | 101/2141 [00:30<10:19,  3.29it/s]

Training steps: 100 Loss: 0.38322582840919495


Training:   9%|▉         | 201/2141 [01:00<09:49,  3.29it/s]

Training steps: 200 Loss: 0.34168216586112976


Training:  14%|█▍        | 301/2141 [01:31<09:18,  3.29it/s]

Training steps: 300 Loss: 0.23446588218212128


Training:  19%|█▊        | 401/2141 [02:01<08:45,  3.31it/s]

Training steps: 400 Loss: 0.6078871488571167


Training:  23%|██▎       | 501/2141 [02:31<08:17,  3.30it/s]

Training steps: 500 Loss: 0.31473496556282043


Training:  28%|██▊       | 601/2141 [03:01<07:47,  3.29it/s]

Training steps: 600 Loss: 0.07040185481309891


Training:  33%|███▎      | 701/2141 [03:32<07:15,  3.30it/s]

Training steps: 700 Loss: 0.533821165561676


Training:  37%|███▋      | 801/2141 [04:02<06:45,  3.30it/s]

Training steps: 800 Loss: 0.42027485370635986


Training:  42%|████▏     | 901/2141 [04:32<06:15,  3.30it/s]

Training steps: 900 Loss: 0.22990164160728455


Training:  47%|████▋     | 1001/2141 [05:02<05:43,  3.32it/s]

Training steps: 1000 Loss: 0.3614605963230133


Training:  51%|█████▏    | 1101/2141 [05:32<05:13,  3.32it/s]

Training steps: 1100 Loss: 0.2526368200778961


Training:  56%|█████▌    | 1201/2141 [06:03<04:43,  3.31it/s]

Training steps: 1200 Loss: 0.6615341901779175


Training:  61%|██████    | 1301/2141 [06:33<04:15,  3.29it/s]

Training steps: 1300 Loss: 0.3380335867404938


Training:  65%|██████▌   | 1401/2141 [07:03<03:42,  3.32it/s]

Training steps: 1400 Loss: 0.7381388545036316


Training:  70%|███████   | 1501/2141 [07:33<03:14,  3.30it/s]

Training steps: 1500 Loss: 0.34515029191970825


Training:  75%|███████▍  | 1601/2141 [08:03<02:43,  3.30it/s]

Training steps: 1600 Loss: 0.17564928531646729


Training:  79%|███████▉  | 1701/2141 [08:34<02:12,  3.31it/s]

Training steps: 1700 Loss: 0.4551556706428528


Training:  84%|████████▍ | 1801/2141 [09:04<01:42,  3.31it/s]

Training steps: 1800 Loss: 0.6995732188224792


Training:  89%|████████▉ | 1901/2141 [09:34<01:13,  3.28it/s]

Training steps: 1900 Loss: 0.6448945999145508


Training:  93%|█████████▎| 2001/2141 [10:04<00:42,  3.32it/s]

Training steps: 2000 Loss: 0.4502802789211273


Training:  98%|█████████▊| 2101/2141 [10:34<00:12,  3.30it/s]

Training steps: 2100 Loss: 0.1745445877313614


Training: 100%|██████████| 2141/2141 [10:46<00:00,  3.31it/s]

TRAIN ACC : 0.890452965742823, TRAIN LOSS : 0.33614104268276374



Training:   0%|          | 0/714 [00:00<?, ?it/s]

Validation steps: 0 Loss: 0.02586800791323185


Training:  14%|█▍        | 102/714 [00:07<00:44, 13.81it/s]

Validation steps: 100 Loss: 0.1690395474433899


Training:  28%|██▊       | 202/714 [00:14<00:37, 13.84it/s]

Validation steps: 200 Loss: 0.4177192449569702


Training:  42%|████▏     | 302/714 [00:21<00:29, 13.90it/s]

Validation steps: 300 Loss: 0.6718763709068298


Training:  56%|█████▋    | 402/714 [00:28<00:22, 13.92it/s]

Validation steps: 400 Loss: 0.5719658732414246


Training:  70%|███████   | 502/714 [00:36<00:15, 13.96it/s]

Validation steps: 500 Loss: 0.005379393696784973


Training:  84%|████████▍ | 602/714 [00:43<00:08, 13.93it/s]

Validation steps: 600 Loss: 0.754355788230896


Training:  98%|█████████▊| 702/714 [00:50<00:00, 13.89it/s]

Validation steps: 700 Loss: 0.145980566740036


Training: 100%|██████████| 714/714 [00:51<00:00, 13.92it/s]


VALID ACC : 0.8849557522123894, VALID LOSS : 0.3650467043034263
{'epoch': 1, 'train_loss': 0.33614104268276374, 'train_acc': 0.890452965742823, 'valid_acc': 0.8849557522123894, 'val_loss': 0.3650467043034263, 'learning_rate': 5e-06}
saving model ...
Start Training: Epoch 3


Training:   0%|          | 1/2141 [00:00<11:56,  2.99it/s]

Training steps: 0 Loss: 0.23724734783172607


Training:   5%|▍         | 101/2141 [00:30<10:19,  3.29it/s]

Training steps: 100 Loss: 0.20180627703666687


Training:   9%|▉         | 201/2141 [01:00<09:44,  3.32it/s]

Training steps: 200 Loss: 0.37199917435646057


Training:  14%|█▍        | 301/2141 [01:31<09:15,  3.31it/s]

Training steps: 300 Loss: 0.1678474396467209


Training:  19%|█▊        | 401/2141 [02:01<08:44,  3.32it/s]

Training steps: 400 Loss: 0.25203976035118103


Training:  23%|██▎       | 501/2141 [02:31<08:16,  3.30it/s]

Training steps: 500 Loss: 0.3342229127883911


Training:  28%|██▊       | 601/2141 [03:01<07:46,  3.30it/s]

Training steps: 600 Loss: 0.060539357364177704


Training:  33%|███▎      | 701/2141 [03:31<07:17,  3.29it/s]

Training steps: 700 Loss: 0.39473098516464233


Training:  37%|███▋      | 801/2141 [04:01<06:45,  3.30it/s]

Training steps: 800 Loss: 0.4048183262348175


Training:  42%|████▏     | 901/2141 [04:32<06:13,  3.32it/s]

Training steps: 900 Loss: 0.08846389502286911


Training:  47%|████▋     | 1001/2141 [05:02<05:44,  3.31it/s]

Training steps: 1000 Loss: 0.1695086508989334


Training:  51%|█████▏    | 1101/2141 [05:32<05:15,  3.30it/s]

Training steps: 1100 Loss: 0.3836090564727783


Training:  56%|█████▌    | 1201/2141 [06:02<04:40,  3.35it/s]

Training steps: 1200 Loss: 0.03833664208650589


Training:  61%|██████    | 1301/2141 [06:32<04:12,  3.32it/s]

Training steps: 1300 Loss: 0.04005080834031105


Training:  65%|██████▌   | 1401/2141 [07:03<03:42,  3.33it/s]

Training steps: 1400 Loss: 0.05914650112390518


Training:  70%|███████   | 1501/2141 [07:33<03:13,  3.30it/s]

Training steps: 1500 Loss: 0.37903258204460144


Training:  75%|███████▍  | 1601/2141 [08:03<02:43,  3.31it/s]

Training steps: 1600 Loss: 0.3592953383922577


Training:  79%|███████▉  | 1701/2141 [08:33<02:13,  3.31it/s]

Training steps: 1700 Loss: 0.1956956833600998


Training:  84%|████████▍ | 1801/2141 [09:04<01:42,  3.30it/s]

Training steps: 1800 Loss: 0.06098828464746475


Training:  89%|████████▉ | 1901/2141 [09:34<01:12,  3.30it/s]

Training steps: 1900 Loss: 0.3512175381183624


Training:  93%|█████████▎| 2001/2141 [10:04<00:42,  3.33it/s]

Training steps: 2000 Loss: 0.2855571508407593


Training:  98%|█████████▊| 2101/2141 [10:34<00:11,  3.33it/s]

Training steps: 2100 Loss: 0.399824857711792


Training: 100%|██████████| 2141/2141 [10:46<00:00,  3.31it/s]

TRAIN ACC : 0.908121842235916, TRAIN LOSS : 0.2831622151260354



Training:   0%|          | 0/714 [00:00<?, ?it/s]

Validation steps: 0 Loss: 0.023209353908896446


Training:  14%|█▍        | 102/714 [00:07<00:44, 13.90it/s]

Validation steps: 100 Loss: 0.1618785709142685


Training:  28%|██▊       | 202/714 [00:14<00:36, 13.89it/s]

Validation steps: 200 Loss: 0.3742953836917877


Training:  42%|████▏     | 302/714 [00:21<00:29, 13.93it/s]

Validation steps: 300 Loss: 0.7475258708000183


Training:  56%|█████▋    | 402/714 [00:28<00:22, 13.96it/s]

Validation steps: 400 Loss: 0.5344774723052979


Training:  70%|███████   | 502/714 [00:36<00:15, 13.87it/s]

Validation steps: 500 Loss: 0.006976488512009382


Training:  84%|████████▍ | 602/714 [00:43<00:08, 13.89it/s]

Validation steps: 600 Loss: 0.8003606796264648


Training:  98%|█████████▊| 702/714 [00:50<00:00, 13.94it/s]

Validation steps: 700 Loss: 0.044075172394514084


Training: 100%|██████████| 714/714 [00:51<00:00, 13.94it/s]


VALID ACC : 0.8860071847892754, VALID LOSS : 0.3759598066601172
{'epoch': 2, 'train_loss': 0.2831622151260354, 'train_acc': 0.908121842235916, 'valid_acc': 0.8860071847892754, 'val_loss': 0.3759598066601172, 'learning_rate': 5e-06}
saving model ...
************************************************** auc_avg **************************************************
0.886800737421418


In [10]:
# torch.cuda.empty_cache()

In [11]:
!nvidia-smi

Fri Jul 23 18:42:18 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.42.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   41C    P0    39W / 300W |  14817MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## Inference

In [12]:
def inference_main():
    args = parse_args()
    args.model_name = "temp"
    preprocess = Preprocess(args)
    preprocess.load_test_data()
    test_data = preprocess.test_data

    print(f"size of test data : {len(test_data)}")
    torch.cuda.empty_cache()
    # del model
    inference(args, test_data)

inference_main()

size of test data : 9131
Loading Model from: /content/drive/MyDrive/KLUE_TC/models/temp_1.pt


Some weights of the model checkpoint at /content/drive/MyDrive/KLUE_TC/models/temp_1.pt were not used when initializing XLMRobertaForSequenceClassification: ['state_dict', 'epoch']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at /content/drive/MyDrive/KLUE_TC/models/temp_1.pt and are newly initialized: ['encoder.layer.1.attention.output.LayerNorm.weight', 'encoder.layer.19.attention.self.query.bias', 'encoder.layer.11.output.LayerNorm.bi

XLMRobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(250002, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
     

Inferencing: 100%|██████████| 571/571 [00:40<00:00, 14.06it/s]


writing prediction : /content/drive/MyDrive/KLUE_TC/output/translation/output_1.csv
Loading Model from: /content/drive/MyDrive/KLUE_TC/models/temp_2.pt


Some weights of the model checkpoint at /content/drive/MyDrive/KLUE_TC/models/temp_2.pt were not used when initializing XLMRobertaForSequenceClassification: ['state_dict', 'epoch']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at /content/drive/MyDrive/KLUE_TC/models/temp_2.pt and are newly initialized: ['encoder.layer.1.attention.output.LayerNorm.weight', 'encoder.layer.19.attention.self.query.bias', 'encoder.layer.11.output.LayerNorm.bi

XLMRobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(250002, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
     

Inferencing: 100%|██████████| 571/571 [00:44<00:00, 12.89it/s]


writing prediction : /content/drive/MyDrive/KLUE_TC/output/translation/output_2.csv
Loading Model from: /content/drive/MyDrive/KLUE_TC/models/temp_3.pt


Some weights of the model checkpoint at /content/drive/MyDrive/KLUE_TC/models/temp_3.pt were not used when initializing XLMRobertaForSequenceClassification: ['state_dict', 'epoch']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at /content/drive/MyDrive/KLUE_TC/models/temp_3.pt and are newly initialized: ['encoder.layer.1.attention.output.LayerNorm.weight', 'encoder.layer.19.attention.self.query.bias', 'encoder.layer.11.output.LayerNorm.bi

XLMRobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(250002, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
     

Inferencing: 100%|██████████| 571/571 [00:42<00:00, 13.36it/s]


writing prediction : /content/drive/MyDrive/KLUE_TC/output/translation/output_3.csv
Loading Model from: /content/drive/MyDrive/KLUE_TC/models/temp_4.pt


Some weights of the model checkpoint at /content/drive/MyDrive/KLUE_TC/models/temp_4.pt were not used when initializing XLMRobertaForSequenceClassification: ['state_dict', 'epoch']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at /content/drive/MyDrive/KLUE_TC/models/temp_4.pt and are newly initialized: ['encoder.layer.1.attention.output.LayerNorm.weight', 'encoder.layer.19.attention.self.query.bias', 'encoder.layer.11.output.LayerNorm.bi

XLMRobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(250002, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
     

Inferencing: 100%|██████████| 571/571 [00:42<00:00, 13.32it/s]


writing prediction : /content/drive/MyDrive/KLUE_TC/output/translation/output_4.csv
writing prediction : /content/drive/MyDrive/KLUE_TC/output/translation/output_softvote.csv
