In [10]:
!nvidia-smi

Tue Jul 27 17:42:33 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.42.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   45C    P0    34W / 250W |   1035MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## Directory 설정, 구글 드라이브 import

In [11]:
cur_dir = '/content/drive/MyDrive/KLUE_TC'

## Utils

In [12]:
!pip install adamp
!pip install transformers



In [13]:
import os
import random
import torch
import numpy as np
from torch import nn

from torch.optim import Adam, AdamW, SGD
from adamp import AdamP
from torch.optim.lr_scheduler import StepLR, ReduceLROnPlateau, CosineAnnealingLR, ExponentialLR, \
    CosineAnnealingWarmRestarts
from transformers import get_linear_schedule_with_warmup
from transformers import AutoConfig, AutoTokenizer, AutoModelForSequenceClassification


def set_seeds(seed=42):
    # 랜덤 시드를 설정하여 매 코드를 실행할 때마다 동일한 결과를 얻게 합니다.
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.cuda.manual_seed_all(seed)  # if use multi-GPU
    torch.backends.cudnn.benchmark = False


def save_checkpoint(state, model_dir, model_filename):
    print('saving model ...')
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    # torch.save(state, os.path.join(model_dir, model_filename))
    torch.save(state, model_filename)


def get_optimizer(model, args):
    if args.optimizer == 'adam':
        optimizer = Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    elif args.optimizer == 'adamW':
        optimizer = AdamW(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    elif args.optimizer == 'adamP':
        optimizer = AdamP(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    elif args.optimizer == 'SGD':
        optimizer = SGD(model.parameters(), lr=args.lr, momentum=args.momentum)

    # 모든 parameter들의 grad값을 0으로 초기화
    optimizer.zero_grad()

    return optimizer


def get_scheduler(optimizer, args):
    if args.scheduler == 'plateau':
        scheduler = ReduceLROnPlateau(optimizer, patience=args.plateau_patience, factor=args.plateau_factor, mode='max',
                                      verbose=True)
    elif args.scheduler == 'linear_warmup':
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps,
                                                    num_training_steps=args.total_steps)
    elif args.scheduler == 'step_lr':
        scheduler = StepLR(optimizer, step_size=args.step_size, gamma=args.gamma)
    elif args.scheduler == 'exp_lr':
        scheduler = ExponentialLR(optimizer, gamma=args.gamma)
    elif args.scheduler == 'cosine_annealing':
        scheduler = CosineAnnealingLR(optimizer, T_max=args.t_max, eta_min=args.eta_min)
    elif args.scheduler == 'cosine_annealing_warmstart':
        scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=args.T_0, T_mult=args.T_mult, eta_min=args.eta_min,
                                                last_epoch=-1)

    return scheduler


def update_params(loss, model, optimizer, batch_idx, max_len, args):
    if args.gradient_accumulation:
        # normalize loss to account for batch accumulation
        loss = loss / args.accum_iter 

        # backward pass
        loss.backward()

        # weights update
        if ((batch_idx + 1) % args.accum_iter == 0) or (batch_idx + 1 == max_len):
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip_grad)
            optimizer.step()
            optimizer.zero_grad()
    else:
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip_grad)
        optimizer.step()
        optimizer.zero_grad()


def load_tokenizer(args):
    tokenizer = AutoTokenizer.from_pretrained(
        args.tokenizer_name
        if args.tokenizer_name
        else args.model_name_or_path,
        use_fast=True,
    )

    return tokenizer


def load_model(args, model_name=None):
    if not model_name:
        model_name = args.model_name
    model_path = os.path.join(args.model_dir, model_name)
    print("Loading Model from:", model_path)
    load_state = torch.load(model_path)

    # Load pretrained model and tokenizer
    config = AutoConfig.from_pretrained(
        args.config_name
        if args.config_name
        else args.model_name_or_path,
    )

    config.num_labels = 7

    model = AutoModelForSequenceClassification.from_pretrained(
        model_path,
        from_tf=bool(".ckpt" in model_path),
        config=config
    ).to(args.device)

    model.load_state_dict(load_state['state_dict'], strict=True)

    print("Loading Model from:", model_path, "...Finished.")

    return model


def get_model(args):
    # Load pretrained model and tokenizer
    config = AutoConfig.from_pretrained(
        args.config_name
        if args.config_name
        else args.model_name_or_path,
    )

    config.num_labels = 7
    model = AutoModelForSequenceClassification.from_pretrained(
        args.model_name_or_path,
        from_tf=bool(".ckpt" in args.model_name_or_path),
        config=config,
    ).to(args.device)

    return model


def get_loaders(args, train, valid, is_inference=False):
    pin_memory = True
    train_loader, valid_loader = None, None

    if is_inference:
        test_dataset = YNAT_dataset(args, valid, is_inference)
        test_loader = torch.utils.data.DataLoader(test_dataset, num_workers=args.num_workers, shuffle=False,
                                                  batch_size=args.batch_size, pin_memory=pin_memory)
        return test_loader

    if train is not None:
        train_dataset = YNAT_dataset(args, train, is_inference)
        train_loader = torch.utils.data.DataLoader(train_dataset, num_workers=args.num_workers, shuffle=True,
                                                   batch_size=args.batch_size, pin_memory=pin_memory)
    if valid is not None:
        valid_dataset = YNAT_dataset(args, valid, is_inference)
        valid_loader = torch.utils.data.DataLoader(valid_dataset, num_workers=args.num_workers, shuffle=False,
                                                   batch_size=args.batch_size, pin_memory=pin_memory)

    return train_loader, valid_loader


# loss계산하고 parameter update!
def compute_loss(preds, targets, args):
    """
    Args :
        preds   : (batch_size, max_seq_len)
        targets : (batch_size, max_seq_len)
    """
    # print(preds, targets)
    loss = get_criterion(preds, targets, args)
    # 마지막 시퀀스에 대한 값만 loss 계산
    # loss = loss[:, -1]
    # loss = torch.mean(loss)
    return loss


def get_criterion(pred, target, args):
    if args.criterion == 'BCE':
        loss = nn.BCELoss(reduction="none")
    elif args.criterion == "BCELogit":
        loss = nn.BCEWithLogitsLoss(reduction="none")
    elif args.criterion == "MSE":
        loss = nn.MSELoss(reduction="none")
    elif args.criterion == "L1":
        loss = nn.L1Loss(reduction="none")
    elif args.criterion == "CE":
        weights = [1,1,2,1,1,1,1] #as class distribution
        class_weights = torch.FloatTensor(weights).cuda()
        loss = nn.CrossEntropyLoss(weight=class_weights)
    # NLL, CrossEntropy not available
    return loss(pred, target)


## Dataloader

In [14]:
import os
import torch
import pandas as pd


class Preprocess:
    def __init__(self, args):
        self.args = args
        self.train_data = None
        self.test_data = None

    def load_data(self, file_name):
        csv_file_name = os.path.join(self.args.data_dir, file_name)
        df = pd.read_csv(csv_file_name)
        del df['Unnamed: 0']
        return df.values

    def load_train_data(self):
        self.train_data = self.load_data('train_data_translated.csv')

    def load_test_data(self):
        self.test_data = self.load_data('test_data_translated.csv')


class YNAT_dataset(torch.utils.data.Dataset):
    def __init__(self, args, data, is_inference):
        self.args = args
        self.data = data
        self.is_inference = is_inference

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        row = self.data[index]
        element = [row[i] for i in range(len(row))]
        #print(type(row))
        # np.array -> torch.tensor 형변환
        #for i, col in enumerate(row):
        #    if type(col) == str:
        #        pass
        #    else:
        #        row[i] = torch.tensor(col)

        return element



## Trainer

In [15]:
from sklearn.metrics import accuracy_score
from torch.nn.functional import one_hot
from tqdm import tqdm
from sklearn import metrics


def run(args, tokenizer, train_data, valid_data, cv_count):
    train_loader, valid_loader = get_loaders(args, train_data, valid_data)

    # only when using warmup scheduler
    # args.total_steps = int(len(train_loader.dataset) / args.batch_size) * args.n_epochs
    # args.warmup_steps = int(args.total_steps * args.warmup_ratio)

    model = get_model(args)
    optimizer = get_optimizer(model, args)
    scheduler = get_scheduler(optimizer, args)

    best_acc = -1
    early_stopping_counter = 0
    for epoch in range(args.n_epochs):

        print(f"Start Training: Epoch {epoch + 1}")

        if not args.cv_strategy:
            model_name = args.run_name
        else:
            model_name = f"{args.run_name.split('.pt')[0]}_{cv_count}.pt"

        # TRAIN
        train_acc, train_loss = train(args, model, tokenizer, train_loader, optimizer)

        # VALID
        acc, val_loss = validate(args, model, tokenizer, valid_loader)

        # TODO: model save or early stopping
        if args.scheduler == 'plateau':
            last_lr = optimizer.param_groups[0]['lr']
        else:
            last_lr = scheduler.get_last_lr()[0]

        print({"epoch": epoch, "train_loss": train_loss, "train_acc": train_acc,
                   "valid_acc": acc, "val_loss": val_loss, "learning_rate": last_lr})

        if acc > best_acc:
            best_acc = acc
            # torch.nn.DataParallel로 감싸진 경우 원래의 model을 가져옵니다.
            model_to_save = model.module if hasattr(model, 'module') else model
            save_checkpoint({
                'epoch': epoch + 1,
                'state_dict': model_to_save.state_dict(),
            },
                args.model_dir, model_name,
            )
            early_stopping_counter = 0
        else:
            early_stopping_counter += 1
            if early_stopping_counter >= args.patience:
                print(f'EarlyStopping counter: {early_stopping_counter} out of {args.patience}')
                break

        # scheduler
        if args.scheduler == 'plateau':
            scheduler.step(best_acc)
        else:
            scheduler.step()

    return best_acc


def inference(args, test_data):
    # ckpt_file_names = []
    all_fold_preds = []
    all_fold_argmax_preds = []

    if not args.cv_strategy:
        ckpt_file_names = [args.model_name]
    else:
        ckpt_file_names = [f"{args.model_name.split('.pt')[0]}_{i + 1}.pt" for i in range(args.fold_num)]

    tokenizer = load_tokenizer(args)

    for fold_idx, ckpt in enumerate(ckpt_file_names):
        model = load_model(args, ckpt)
        model.eval()
        test_loader = get_loaders(args, None, test_data, True)

        total_preds = []
        total_argmax_preds = []
        total_ids = []

        for step, batch in tqdm(enumerate(test_loader), desc='Inferencing', total=len(test_loader)):
            idx, text, en_text = batch
            tokenized_examples = tokenizer(
                text,
                en_text,
                max_length=args.max_seq_len,
                padding="max_length",
                return_tensors="pt"
            ).to(args.device)

            preds = model(input_ids = tokenized_examples['input_ids'],
                          attention_mask = tokenized_examples['attention_mask'])

            logits = preds['logits']
            argmax_logits = torch.argmax(logits, dim=1)

            if args.device == 'cuda':
                argmax_preds = argmax_logits.to('cpu').detach().numpy()
                preds = logits.to('cpu').detach().numpy()
            else:  # cpu
                argmax_preds = argmax_logits.detach().numpy()
                preds = logits.detach().numpy()

            total_preds += list(preds)
            total_argmax_preds += list(argmax_preds)
            total_ids += list(idx)

        all_fold_preds.append(total_preds)
        all_fold_argmax_preds.append(total_argmax_preds)

        output_file_name = "output.csv" if not args.cv_strategy else f"output_{fold_idx + 1}.csv"
        write_path = os.path.join(args.output_dir, output_file_name)
        if not os.path.exists(args.output_dir):
            os.makedirs(args.output_dir)
        with open(write_path, 'w', encoding='utf8') as w:
            print("writing prediction : {}".format(write_path))
            w.write("index,topic_idx\n")
            for index, p in zip(total_ids, total_argmax_preds):
                w.write('{},{}\n'.format(index, p))

    if len(all_fold_preds) > 1:
        # Soft voting ensemble
        votes = np.sum(all_fold_preds, axis=0)
        votes = np.argmax(votes, axis=1)

        write_path = os.path.join(args.output_dir, "output_softvote.csv")
        if not os.path.exists(args.output_dir):
            os.makedirs(args.output_dir)
        with open(write_path, 'w', encoding='utf8') as w:
            print("writing prediction : {}".format(write_path))
            w.write("index,topic_idx\n")
            for id, p in zip(total_ids, votes):
                w.write('{},{}\n'.format(id, p))


def train(args, model, tokenizer, train_loader, optimizer):
    model.train()

    total_preds = []
    total_targets = []
    losses = []
    for step, batch in tqdm(enumerate(train_loader), desc='Training', total=len(train_loader)):
        idx, text, text_en, label = batch
        label = label.to(args.device)
        # print(idx[:10])
        # print(text[:10])
        # print(label[:10])
        tokenized_examples = tokenizer(
            text,
            text_en,
            max_length=args.max_seq_len,
            padding="max_length",
            return_tensors="pt"
        ).to(args.device)

        # tokenize
        # 모델의 입력으로
        # label은 one-hot?
        # loss 주고
        # argmax를 golden

        #print(tokenized_examples)
        #print(tokenized_examples['input_ids'][:5])
        #print(tokenized_examples['attention_mask'][:5])
        #print(tokenized_examples['token_type_ids'][:5])

        #preds = model(**tokenized_examples)

        
        preds = model(input_ids = tokenized_examples['input_ids'],
                        attention_mask = tokenized_examples['attention_mask'])
            

        logits = preds['logits']
        softmax_logits = nn.Softmax(dim=1)(logits)
        argmax_logits = torch.argmax(logits, dim=1)

        # one_hot_logits = one_hot(argmax_logits, num_classes=7).float()
        # print(one_hot(argmax_logits, num_classes=7).type(torch.FloatTensor))
        loss = compute_loss(logits,
                            label, args)

        # print(loss)

        update_params(loss, model, optimizer, step, len(train_loader), args)

        if step % args.log_steps == 0:
            print(f"Training steps: {step} Loss: {str(loss.item())}")

        if args.device == 'cuda':
            argmax_logits = argmax_logits.to('cpu').detach().numpy()
            label = label.to('cpu').detach().numpy()
            loss = loss.to('cpu').detach().numpy()
            tokenized_examples = tokenized_examples.to('cpu')
            logits = logits.to('cpu').detach().numpy()
        else:  # cpu
            argmax_logits = argmax_logits.detach().numpy()
            label = label.detach().numpy()
            loss = loss.detach().numpy()

        total_preds.append(argmax_logits)
        total_targets.append(label)
        losses.append(loss)

    total_preds = np.concatenate(total_preds)
    total_targets = np.concatenate(total_targets)

    # Train AUC / ACC
    acc = accuracy_score(total_targets, total_preds)
    loss_avg = sum(losses) / len(losses)
    print(f'TRAIN ACC : {acc}, TRAIN LOSS : {loss_avg}')
    return acc, loss_avg


def validate(args, model, tokenizer, valid_loader):
    model.eval()

    total_preds = []
    total_targets = []
    losses = []
    for step, batch in tqdm(enumerate(valid_loader), desc='Training', total=len(valid_loader)):
        idx, text, text_en, label = batch
        label = label.to(args.device)
        tokenized_examples = tokenizer(
            text,
            text_en,
            max_length=args.max_seq_len,
            padding="max_length",
            return_tensors="pt"
        ).to(args.device)

        # tokenize
        # 모델의 입력으로
        # label은 one-hot?
        # loss 주고
        # argmax를 golden
        #print(tokenized_examples)
        #print(tokenized_examples['input_ids'][:5])
        #print(tokenized_examples['attention_mask'][:5])
        #print(tokenized_examples['token_type_ids'][:5])

        #preds = model(**tokenized_examples)

        
        preds = model(input_ids = tokenized_examples['input_ids'],
                        attention_mask = tokenized_examples['attention_mask'])
        
        logits = preds['logits']
        softmax_logits = nn.Softmax(dim=1)(logits)
        argmax_logits = torch.argmax(logits, dim=1)

        # one_hot_logits = one_hot(argmax_logits, num_classes=7).float()
        # print(one_hot(argmax_logits, num_classes=7).type(torch.FloatTensor))
        loss = compute_loss(logits,
                            label, args)

        if step % args.log_steps == 0:
            print(f"Validation steps: {step} Loss: {str(loss.item())}")

        if args.device == 'cuda':
            argmax_logits = argmax_logits.to('cpu').detach().numpy()
            label = label.to('cpu').detach().numpy()
            loss = loss.to('cpu').detach().numpy()
            tokenized_examples = tokenized_examples.to('cpu')
            logits = logits.to('cpu').detach().numpy()
        else:  # cpu
            argmax_logits = argmax_logits.detach().numpy()
            label = label.detach().numpy()
            loss = loss.detach().numpy()

        total_preds.append(argmax_logits)
        total_targets.append(label)
        losses.append(loss)

    total_preds = np.concatenate(total_preds)
    total_targets = np.concatenate(total_targets)

    # Train AUC / ACC
    target_names = ['IT과학', '경제', '사회', '생활문화', '세계', '스포츠', '정치']
    print(metrics.classification_report(total_targets, total_preds, target_names=target_names))
    matrix = metrics.confusion_matrix(total_targets, total_preds)
    print(matrix.diagonal()/matrix.sum(axis=1))

    # Train AUC / ACC
    acc = accuracy_score(total_targets, total_preds)
    loss_avg = sum(losses) / len(losses)
    print(f'VALID ACC : {acc}, VALID LOSS : {loss_avg}')
    return acc, loss_avg


## Train

In [16]:
import torch
from sklearn.model_selection import KFold, StratifiedKFold
from transformers import AutoConfig, AutoTokenizer, AutoModelForSequenceClassification
from datetime import datetime
from pytz import timezone


def main(args):
    if not args.run_name:
        args.run_name = datetime.now(timezone("Asia/Seoul")).strftime("%Y-%m-%d-%H:%M:%S")

    set_seeds(args.seed)

    device = "cuda" if torch.cuda.is_available() else "cpu"
    args.device = device

    tokenizer = AutoTokenizer.from_pretrained(
        args.tokenizer_name
        if args.tokenizer_name
        else args.model_name_or_path,
        #use_fast=True,
    )

    preprocess = Preprocess(args)
    preprocess.load_train_data()
    train_data_origin = preprocess.train_data

    print(f"Size of train data : {len(train_data_origin)}")
    # print(f"size of test data : {len(test_data)}")

    if args.cv_strategy == 'random':
        kf = KFold(n_splits=args.fold_num, shuffle=True)
        splits = kf.split(X=train_data_origin)
    else:
        # default
        # 여기 각 label로 바꿔야됨
        train_labels = [sequence[-1] for sequence in train_data_origin]
        skf = StratifiedKFold(n_splits=args.fold_num, shuffle=True)
        splits = skf.split(X=train_data_origin, y=train_labels)

    acc_avg = 0
    for fold_num, (train_index, valid_index) in enumerate(splits):
        train_data = train_data_origin[train_index]
        valid_data = train_data_origin[valid_index]
        best_acc = run(args, tokenizer, train_data, valid_data, fold_num + 1)

        if not args.cv_strategy:
            break

        acc_avg += best_acc

    if args.cv_strategy:
        acc_avg /= args.fold_num

        print("*" * 50, 'auc_avg', "*" * 50)
        print(acc_avg)


## Run

In [17]:
import argparse
import easydict

def parse_args():
    args = easydict.EasyDict({'run_name' : 'temp',
                             'seed':42,
                             'device' :'cuda',
                             'data_dir': cur_dir + '/data/open/',
                             'model_dir' : '/content/drive/MyDrive/KLUE_TC/models/',
                             'model_name_or_path' : 'klue/roberta-large',
                             #'model_name_or_path' : 'xlm-roberta-large',
                             'config_name' : None,
                             'tokenizer_name' : None,
                             'output_dir' : '/content/drive/MyDrive/KLUE_TC/output/translation/',
                             
                             'accum_iter' : 8,
                             'gradient_accumulation' : True,

                             'cv_strategy' : 'stratified',
                             'fold_num' : 4,

                             'num_workers' : 1,

                             # 훈련
                             'n_epochs' : 3,
                             'batch_size' : 16,
                             'lr' : 5e-6,
                             'clip_grad' : 10,
                             'patience' : 5,
                             'max_seq_len' : 110,

                             # Optimizer
                             'optimizer' : 'adamW',

                             # Optimizer-parameters
                             'weight_decay' : 0.05,
                             'momentum' : 0.9,

                             # Scheduler
                             'scheduler' : 'step_lr',

                             # Scheduler-parameters
                             # plateau
                             'plateau_patience' : 10,
                             'plateau_factor' : 0.5,
                              
                             't_max' : 10,
                             'T_0' : 10,
                             'T_mult' : 2,
                             '--eta_min' : 0.01,

                             # linear_warmup
                             'warmup_ratio' : 0.3,

                             # Step LR
                             'step_size' : 50,
                             'gamma' : 0.1,

                             'criterion' : 'CE',

                             'log_steps' : 100})
    
    return args

In [18]:
if __name__ == '__main__':
    args = parse_args()
    main(args)

Size of train data : 45654


Some weights of the model checkpoint at klue/roberta-large were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.bias', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.bias', 'classif

Start Training: Epoch 1


Training:   0%|          | 1/2140 [00:00<18:45,  1.90it/s]

Training steps: 0 Loss: 1.9368716478347778


Training:   5%|▍         | 101/2140 [00:51<17:05,  1.99it/s]

Training steps: 100 Loss: 1.7498630285263062


Training:   9%|▉         | 201/2140 [01:42<16:38,  1.94it/s]

Training steps: 200 Loss: 1.8373318910598755


Training:  14%|█▍        | 301/2140 [02:33<15:25,  1.99it/s]

Training steps: 300 Loss: 1.1045527458190918


Training:  19%|█▊        | 401/2140 [03:24<14:55,  1.94it/s]

Training steps: 400 Loss: 0.5639625191688538


Training:  23%|██▎       | 501/2140 [04:14<13:43,  1.99it/s]

Training steps: 500 Loss: 0.49279266595840454


Training:  28%|██▊       | 601/2140 [05:05<13:12,  1.94it/s]

Training steps: 600 Loss: 0.5215424299240112


Training:  33%|███▎      | 701/2140 [05:56<12:02,  1.99it/s]

Training steps: 700 Loss: 0.39748960733413696


Training:  37%|███▋      | 801/2140 [06:47<11:28,  1.94it/s]

Training steps: 800 Loss: 0.16743890941143036


Training:  42%|████▏     | 901/2140 [07:38<10:26,  1.98it/s]

Training steps: 900 Loss: 0.43384048342704773


Training:  47%|████▋     | 1001/2140 [08:28<09:44,  1.95it/s]

Training steps: 1000 Loss: 0.22007212042808533


Training:  51%|█████▏    | 1101/2140 [09:19<08:43,  1.99it/s]

Training steps: 1100 Loss: 0.39740827679634094


Training:  56%|█████▌    | 1201/2140 [10:10<08:04,  1.94it/s]

Training steps: 1200 Loss: 0.9159243702888489


Training:  61%|██████    | 1301/2140 [11:01<07:01,  1.99it/s]

Training steps: 1300 Loss: 0.4144302308559418


Training:  65%|██████▌   | 1401/2140 [11:51<06:20,  1.94it/s]

Training steps: 1400 Loss: 0.8003654479980469


Training:  70%|███████   | 1501/2140 [12:42<05:21,  1.99it/s]

Training steps: 1500 Loss: 0.09011437743902206


Training:  75%|███████▍  | 1601/2140 [13:33<04:37,  1.94it/s]

Training steps: 1600 Loss: 0.1571245640516281


Training:  79%|███████▉  | 1701/2140 [14:24<03:40,  1.99it/s]

Training steps: 1700 Loss: 0.2719777226448059


Training:  84%|████████▍ | 1801/2140 [15:15<02:54,  1.94it/s]

Training steps: 1800 Loss: 0.40875327587127686


Training:  89%|████████▉ | 1901/2140 [16:05<02:00,  1.99it/s]

Training steps: 1900 Loss: 0.3863888084888458


Training:  94%|█████████▎| 2001/2140 [16:56<01:11,  1.94it/s]

Training steps: 2000 Loss: 1.0237284898757935


Training:  98%|█████████▊| 2101/2140 [17:47<00:19,  1.99it/s]

Training steps: 2100 Loss: 0.7932698726654053


Training: 100%|██████████| 2140/2140 [18:07<00:00,  1.97it/s]

TRAIN ACC : 0.7808703271028037, TRAIN LOSS : 0.632146699026784



Training:   0%|          | 2/714 [00:00<02:07,  5.58it/s]

Validation steps: 0 Loss: 0.3478621542453766


Training:  14%|█▍        | 102/714 [00:17<01:46,  5.77it/s]

Validation steps: 100 Loss: 0.21762678027153015


Training:  28%|██▊       | 202/714 [00:34<01:28,  5.79it/s]

Validation steps: 200 Loss: 0.5130056142807007


Training:  42%|████▏     | 302/714 [00:51<01:11,  5.76it/s]

Validation steps: 300 Loss: 0.35162341594696045


Training:  56%|█████▋    | 402/714 [01:09<00:53,  5.79it/s]

Validation steps: 400 Loss: 0.6862462759017944


Training:  70%|███████   | 502/714 [01:26<00:36,  5.81it/s]

Validation steps: 500 Loss: 0.18365836143493652


Training:  84%|████████▍ | 602/714 [01:43<00:19,  5.85it/s]

Validation steps: 600 Loss: 0.16125072538852692


Training:  98%|█████████▊| 702/714 [02:00<00:02,  5.70it/s]

Validation steps: 700 Loss: 0.10223781317472458


Training: 100%|██████████| 714/714 [02:02<00:00,  5.82it/s]


              precision    recall  f1-score   support

        IT과학       0.82      0.89      0.85      1206
          경제       0.85      0.84      0.84      1555
          사회       0.80      0.78      0.79      1841
        생활문화       0.91      0.90      0.91      1483
          세계       0.93      0.93      0.93      1908
         스포츠       0.97      0.98      0.97      1734
          정치       0.93      0.91      0.92      1687

    accuracy                           0.89     11414
   macro avg       0.89      0.89      0.89     11414
weighted avg       0.89      0.89      0.89     11414

[0.89220564 0.83794212 0.77729495 0.89952798 0.93081761 0.97923875
 0.90871369]
VALID ACC : 0.8893464166812686, VALID LOSS : 0.3496647546281751
{'epoch': 0, 'train_loss': 0.632146699026784, 'train_acc': 0.7808703271028037, 'valid_acc': 0.8893464166812686, 'val_loss': 0.3496647546281751, 'learning_rate': 5e-06}
saving model ...
Start Training: Epoch 2


Training:   0%|          | 1/2140 [00:00<18:49,  1.89it/s]

Training steps: 0 Loss: 0.2776041030883789


Training:   5%|▍         | 101/2140 [00:51<17:05,  1.99it/s]

Training steps: 100 Loss: 0.33108529448509216


Training:   9%|▉         | 201/2140 [01:42<16:37,  1.94it/s]

Training steps: 200 Loss: 0.15718574821949005


Training:  14%|█▍        | 301/2140 [02:32<15:24,  1.99it/s]

Training steps: 300 Loss: 0.21763600409030914


Training:  19%|█▊        | 401/2140 [03:23<14:55,  1.94it/s]

Training steps: 400 Loss: 0.33193275332450867


Training:  23%|██▎       | 501/2140 [04:14<13:45,  1.99it/s]

Training steps: 500 Loss: 0.4713955521583557


Training:  28%|██▊       | 601/2140 [05:05<13:14,  1.94it/s]

Training steps: 600 Loss: 0.0845748707652092


Training:  33%|███▎      | 701/2140 [05:56<12:04,  1.99it/s]

Training steps: 700 Loss: 0.4217678904533386


Training:  37%|███▋      | 801/2140 [06:46<11:29,  1.94it/s]

Training steps: 800 Loss: 0.19552171230316162


Training:  42%|████▏     | 901/2140 [07:37<10:24,  1.99it/s]

Training steps: 900 Loss: 0.09262977540493011


Training:  47%|████▋     | 1001/2140 [08:28<09:47,  1.94it/s]

Training steps: 1000 Loss: 0.47631993889808655


Training:  51%|█████▏    | 1101/2140 [09:19<08:43,  1.98it/s]

Training steps: 1100 Loss: 0.1282646805047989


Training:  56%|█████▌    | 1201/2140 [10:10<08:03,  1.94it/s]

Training steps: 1200 Loss: 0.2732006013393402


Training:  61%|██████    | 1301/2140 [11:00<07:01,  1.99it/s]

Training steps: 1300 Loss: 0.12715353071689606


Training:  65%|██████▌   | 1401/2140 [11:51<06:21,  1.94it/s]

Training steps: 1400 Loss: 0.6241583824157715


Training:  70%|███████   | 1501/2140 [12:42<05:21,  1.98it/s]

Training steps: 1500 Loss: 0.3480980098247528


Training:  75%|███████▍  | 1601/2140 [13:33<04:37,  1.94it/s]

Training steps: 1600 Loss: 0.5039451718330383


Training:  79%|███████▉  | 1701/2140 [14:24<03:40,  1.99it/s]

Training steps: 1700 Loss: 0.3761443495750427


Training:  84%|████████▍ | 1801/2140 [15:14<02:54,  1.94it/s]

Training steps: 1800 Loss: 0.28863176703453064


Training:  89%|████████▉ | 1901/2140 [16:05<02:00,  1.99it/s]

Training steps: 1900 Loss: 0.1161569356918335


Training:  94%|█████████▎| 2001/2140 [16:56<01:11,  1.94it/s]

Training steps: 2000 Loss: 0.10038566589355469


Training:  98%|█████████▊| 2101/2140 [17:47<00:19,  1.99it/s]

Training steps: 2100 Loss: 0.25516149401664734


Training: 100%|██████████| 2140/2140 [18:07<00:00,  1.97it/s]

TRAIN ACC : 0.89375, TRAIN LOSS : 0.32856439185180814



Training:   0%|          | 2/714 [00:00<02:06,  5.61it/s]

Validation steps: 0 Loss: 0.3788854479789734


Training:  14%|█▍        | 102/714 [00:17<01:44,  5.85it/s]

Validation steps: 100 Loss: 0.2857610881328583


Training:  28%|██▊       | 202/714 [00:34<01:28,  5.79it/s]

Validation steps: 200 Loss: 0.4030283987522125


Training:  42%|████▏     | 302/714 [00:51<01:10,  5.87it/s]

Validation steps: 300 Loss: 0.338022381067276


Training:  56%|█████▋    | 402/714 [01:09<00:53,  5.83it/s]

Validation steps: 400 Loss: 0.4754464328289032


Training:  70%|███████   | 502/714 [01:26<00:36,  5.83it/s]

Validation steps: 500 Loss: 0.11042875051498413


Training:  84%|████████▍ | 602/714 [01:43<00:19,  5.81it/s]

Validation steps: 600 Loss: 0.09063142538070679


Training:  98%|█████████▊| 702/714 [02:00<00:02,  5.74it/s]

Validation steps: 700 Loss: 0.1489429771900177


Training: 100%|██████████| 714/714 [02:02<00:00,  5.82it/s]


              precision    recall  f1-score   support

        IT과학       0.87      0.81      0.84      1206
          경제       0.91      0.76      0.83      1555
          사회       0.70      0.85      0.77      1841
        생활문화       0.92      0.87      0.90      1483
          세계       0.93      0.94      0.93      1908
         스포츠       0.98      0.97      0.97      1734
          정치       0.91      0.93      0.92      1687

    accuracy                           0.88     11414
   macro avg       0.89      0.87      0.88     11414
weighted avg       0.89      0.88      0.88     11414

[0.80845771 0.75562701 0.85497012 0.8705327  0.93501048 0.96828143
 0.93123889]
VALID ACC : 0.8804100227790432, VALID LOSS : 0.3481475175658361
{'epoch': 1, 'train_loss': 0.32856439185180814, 'train_acc': 0.89375, 'valid_acc': 0.8804100227790432, 'val_loss': 0.3481475175658361, 'learning_rate': 5e-06}
Start Training: Epoch 3


Training:   0%|          | 1/2140 [00:00<17:59,  1.98it/s]

Training steps: 0 Loss: 0.22879916429519653


Training:   5%|▍         | 101/2140 [00:51<17:04,  1.99it/s]

Training steps: 100 Loss: 0.49957624077796936


Training:   9%|▉         | 201/2140 [01:42<16:39,  1.94it/s]

Training steps: 200 Loss: 0.12006013840436935


Training:  14%|█▍        | 301/2140 [02:32<15:24,  1.99it/s]

Training steps: 300 Loss: 0.08308196812868118


Training:  19%|█▊        | 401/2140 [03:23<14:55,  1.94it/s]

Training steps: 400 Loss: 0.8532280325889587


Training:  23%|██▎       | 501/2140 [04:14<13:46,  1.98it/s]

Training steps: 500 Loss: 0.14400605857372284


Training:  28%|██▊       | 601/2140 [05:05<13:14,  1.94it/s]

Training steps: 600 Loss: 0.33843016624450684


Training:  33%|███▎      | 701/2140 [05:56<12:03,  1.99it/s]

Training steps: 700 Loss: 0.16928035020828247


Training:  37%|███▋      | 801/2140 [06:47<11:31,  1.94it/s]

Training steps: 800 Loss: 0.35535144805908203


Training:  42%|████▏     | 901/2140 [07:37<10:23,  1.99it/s]

Training steps: 900 Loss: 0.3408161401748657


Training:  47%|████▋     | 1001/2140 [08:28<09:46,  1.94it/s]

Training steps: 1000 Loss: 0.5236793756484985


Training:  51%|█████▏    | 1101/2140 [09:19<08:42,  1.99it/s]

Training steps: 1100 Loss: 0.25241994857788086


Training:  56%|█████▌    | 1201/2140 [10:10<08:04,  1.94it/s]

Training steps: 1200 Loss: 0.08461210876703262


Training:  61%|██████    | 1301/2140 [11:01<07:01,  1.99it/s]

Training steps: 1300 Loss: 0.3314885199069977


Training:  65%|██████▌   | 1401/2140 [11:51<06:21,  1.94it/s]

Training steps: 1400 Loss: 0.5433880686759949


Training:  70%|███████   | 1501/2140 [12:42<05:21,  1.99it/s]

Training steps: 1500 Loss: 0.21840259432792664


Training:  75%|███████▍  | 1601/2140 [13:33<04:38,  1.94it/s]

Training steps: 1600 Loss: 0.04914820194244385


Training:  79%|███████▉  | 1701/2140 [14:24<03:40,  1.99it/s]

Training steps: 1700 Loss: 0.5024186968803406


Training:  84%|████████▍ | 1801/2140 [15:15<02:54,  1.95it/s]

Training steps: 1800 Loss: 0.2120150923728943


Training:  89%|████████▉ | 1901/2140 [16:06<02:00,  1.99it/s]

Training steps: 1900 Loss: 0.20087368786334991


Training:  94%|█████████▎| 2001/2140 [16:56<01:11,  1.94it/s]

Training steps: 2000 Loss: 0.48349806666374207


Training:  98%|█████████▊| 2101/2140 [17:47<00:19,  1.99it/s]

Training steps: 2100 Loss: 0.04769303277134895


Training: 100%|██████████| 2140/2140 [18:07<00:00,  1.97it/s]

TRAIN ACC : 0.9095210280373832, TRAIN LOSS : 0.2768378154444291



Training:   0%|          | 2/714 [00:00<02:07,  5.57it/s]

Validation steps: 0 Loss: 0.5061116218566895


Training:  14%|█▍        | 102/714 [00:17<01:44,  5.84it/s]

Validation steps: 100 Loss: 0.23209033906459808


Training:  28%|██▊       | 202/714 [00:34<01:28,  5.80it/s]

Validation steps: 200 Loss: 0.6010313034057617


Training:  42%|████▏     | 302/714 [00:51<01:11,  5.79it/s]

Validation steps: 300 Loss: 0.34724780917167664


Training:  56%|█████▋    | 402/714 [01:09<00:53,  5.87it/s]

Validation steps: 400 Loss: 0.5341745018959045


Training:  70%|███████   | 502/714 [01:26<00:36,  5.79it/s]

Validation steps: 500 Loss: 0.09867288917303085


Training:  84%|████████▍ | 602/714 [01:43<00:19,  5.82it/s]

Validation steps: 600 Loss: 0.13558639585971832


Training:  98%|█████████▊| 702/714 [02:00<00:02,  5.82it/s]

Validation steps: 700 Loss: 0.04673229157924652


Training: 100%|██████████| 714/714 [02:02<00:00,  5.82it/s]


              precision    recall  f1-score   support

        IT과학       0.82      0.88      0.85      1206
          경제       0.86      0.82      0.84      1555
          사회       0.81      0.78      0.80      1841
        생활문화       0.91      0.91      0.91      1483
          세계       0.93      0.94      0.93      1908
         스포츠       0.98      0.97      0.97      1734
          정치       0.92      0.92      0.92      1687

    accuracy                           0.89     11414
   macro avg       0.89      0.89      0.89     11414
weighted avg       0.89      0.89      0.89     11414

[0.88225539 0.82186495 0.78326996 0.91368847 0.93972746 0.97404844
 0.91819798]
VALID ACC : 0.8910110390748204, VALID LOSS : 0.3526197511713211
{'epoch': 2, 'train_loss': 0.2768378154444291, 'train_acc': 0.9095210280373832, 'valid_acc': 0.8910110390748204, 'val_loss': 0.3526197511713211, 'learning_rate': 5e-06}
saving model ...


Some weights of the model checkpoint at klue/roberta-large were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.bias', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.bias', 'classif

Start Training: Epoch 1


Training:   0%|          | 1/2140 [00:00<18:20,  1.94it/s]

Training steps: 0 Loss: 1.9782353639602661


Training:   5%|▍         | 101/2140 [00:51<17:06,  1.99it/s]

Training steps: 100 Loss: 2.03049635887146


Training:   9%|▉         | 201/2140 [01:42<16:43,  1.93it/s]

Training steps: 200 Loss: 1.5524286031723022


Training:  14%|█▍        | 301/2140 [02:33<15:28,  1.98it/s]

Training steps: 300 Loss: 1.0923594236373901


Training:  19%|█▊        | 401/2140 [03:23<14:57,  1.94it/s]

Training steps: 400 Loss: 0.6426074504852295


Training:  23%|██▎       | 501/2140 [04:14<13:43,  1.99it/s]

Training steps: 500 Loss: 0.26287421584129333


Training:  28%|██▊       | 601/2140 [05:05<13:12,  1.94it/s]

Training steps: 600 Loss: 0.4879920184612274


Training:  33%|███▎      | 701/2140 [05:56<12:04,  1.99it/s]

Training steps: 700 Loss: 0.6740790009498596


Training:  37%|███▋      | 801/2140 [06:47<11:30,  1.94it/s]

Training steps: 800 Loss: 0.3623389005661011


Training:  42%|████▏     | 901/2140 [07:37<10:24,  1.99it/s]

Training steps: 900 Loss: 0.5696316361427307


Training:  47%|████▋     | 1001/2140 [08:28<09:47,  1.94it/s]

Training steps: 1000 Loss: 0.46836450695991516


Training:  51%|█████▏    | 1101/2140 [09:19<08:43,  1.99it/s]

Training steps: 1100 Loss: 0.7569010853767395


Training:  56%|█████▌    | 1201/2140 [10:10<08:03,  1.94it/s]

Training steps: 1200 Loss: 0.2385009229183197


Training:  61%|██████    | 1301/2140 [11:01<07:01,  1.99it/s]

Training steps: 1300 Loss: 0.5142331719398499


Training:  65%|██████▌   | 1401/2140 [11:52<06:20,  1.94it/s]

Training steps: 1400 Loss: 0.4028877019882202


Training:  70%|███████   | 1501/2140 [12:42<05:21,  1.99it/s]

Training steps: 1500 Loss: 0.14852404594421387


Training:  75%|███████▍  | 1601/2140 [13:33<04:38,  1.94it/s]

Training steps: 1600 Loss: 0.8508303165435791


Training:  79%|███████▉  | 1701/2140 [14:24<03:40,  1.99it/s]

Training steps: 1700 Loss: 0.7148768305778503


Training:  84%|████████▍ | 1801/2140 [15:15<02:54,  1.94it/s]

Training steps: 1800 Loss: 0.2390211969614029


Training:  89%|████████▉ | 1901/2140 [16:06<02:00,  1.99it/s]

Training steps: 1900 Loss: 0.37778329849243164


Training:  94%|█████████▎| 2001/2140 [16:56<01:12,  1.93it/s]

Training steps: 2000 Loss: 0.40755563974380493


Training:  98%|█████████▊| 2101/2140 [17:47<00:19,  1.98it/s]

Training steps: 2100 Loss: 0.267194926738739


Training: 100%|██████████| 2140/2140 [18:07<00:00,  1.97it/s]

TRAIN ACC : 0.7867990654205608, TRAIN LOSS : 0.6185753647168385



Training:   0%|          | 2/714 [00:00<02:06,  5.64it/s]

Validation steps: 0 Loss: 0.40503308176994324


Training:  14%|█▍        | 102/714 [00:17<01:46,  5.75it/s]

Validation steps: 100 Loss: 0.5399025082588196


Training:  28%|██▊       | 202/714 [00:34<01:27,  5.83it/s]

Validation steps: 200 Loss: 0.5878603458404541


Training:  42%|████▏     | 302/714 [00:52<01:10,  5.81it/s]

Validation steps: 300 Loss: 0.5481908917427063


Training:  56%|█████▋    | 402/714 [01:09<00:53,  5.82it/s]

Validation steps: 400 Loss: 0.2829934358596802


Training:  70%|███████   | 502/714 [01:26<00:36,  5.78it/s]

Validation steps: 500 Loss: 0.1732712984085083


Training:  84%|████████▍ | 602/714 [01:43<00:19,  5.85it/s]

Validation steps: 600 Loss: 0.47095757722854614


Training:  98%|█████████▊| 702/714 [02:00<00:02,  5.77it/s]

Validation steps: 700 Loss: 0.2009667307138443


Training: 100%|██████████| 714/714 [02:02<00:00,  5.81it/s]


              precision    recall  f1-score   support

        IT과학       0.80      0.89      0.84      1206
          경제       0.84      0.84      0.84      1556
          사회       0.79      0.78      0.78      1841
        생활문화       0.89      0.92      0.91      1483
          세계       0.95      0.86      0.90      1907
         스포츠       0.96      0.98      0.97      1733
          정치       0.92      0.90      0.91      1688

    accuracy                           0.88     11414
   macro avg       0.88      0.88      0.88     11414
weighted avg       0.88      0.88      0.88     11414

[0.8946932  0.83997429 0.7789245  0.92245448 0.86103828 0.98153491
 0.89632701]
VALID ACC : 0.8799719642544244, VALID LOSS : 0.3745224628576628
{'epoch': 0, 'train_loss': 0.6185753647168385, 'train_acc': 0.7867990654205608, 'valid_acc': 0.8799719642544244, 'val_loss': 0.3745224628576628, 'learning_rate': 5e-06}
saving model ...
Start Training: Epoch 2


Training:   0%|          | 1/2140 [00:00<18:27,  1.93it/s]

Training steps: 0 Loss: 0.292519211769104


Training:   5%|▍         | 101/2140 [00:51<17:05,  1.99it/s]

Training steps: 100 Loss: 0.17224381864070892


Training:   9%|▉         | 201/2140 [01:42<16:39,  1.94it/s]

Training steps: 200 Loss: 0.32985177636146545


Training:  14%|█▍        | 301/2140 [02:33<15:27,  1.98it/s]

Training steps: 300 Loss: 0.20894822478294373


Training:  19%|█▊        | 401/2140 [03:23<14:56,  1.94it/s]

Training steps: 400 Loss: 0.3102926015853882


Training:  23%|██▎       | 501/2140 [04:14<13:43,  1.99it/s]

Training steps: 500 Loss: 0.7097106575965881


Training:  28%|██▊       | 601/2140 [05:05<13:14,  1.94it/s]

Training steps: 600 Loss: 0.38383322954177856


Training:  33%|███▎      | 701/2140 [05:56<12:05,  1.98it/s]

Training steps: 700 Loss: 0.14214834570884705


Training:  37%|███▋      | 801/2140 [06:47<11:28,  1.94it/s]

Training steps: 800 Loss: 0.403164803981781


Training:  42%|████▏     | 901/2140 [07:38<10:24,  1.98it/s]

Training steps: 900 Loss: 0.15836359560489655


Training:  47%|████▋     | 1001/2140 [08:28<09:48,  1.94it/s]

Training steps: 1000 Loss: 0.2684457004070282


Training:  51%|█████▏    | 1101/2140 [09:19<08:42,  1.99it/s]

Training steps: 1100 Loss: 0.10641496628522873


Training:  56%|█████▌    | 1201/2140 [10:10<08:05,  1.93it/s]

Training steps: 1200 Loss: 0.15959689021110535


Training:  61%|██████    | 1301/2140 [11:01<07:01,  1.99it/s]

Training steps: 1300 Loss: 0.4713374078273773


Training:  65%|██████▌   | 1401/2140 [11:52<06:19,  1.95it/s]

Training steps: 1400 Loss: 0.07550571113824844


Training:  70%|███████   | 1501/2140 [12:43<05:22,  1.98it/s]

Training steps: 1500 Loss: 0.520344078540802


Training:  75%|███████▍  | 1601/2140 [13:34<04:38,  1.94it/s]

Training steps: 1600 Loss: 0.22988909482955933


Training:  79%|███████▉  | 1701/2140 [14:24<03:40,  1.99it/s]

Training steps: 1700 Loss: 0.13247351348400116


Training:  84%|████████▍ | 1801/2140 [15:15<02:54,  1.94it/s]

Training steps: 1800 Loss: 0.23368069529533386


Training:  89%|████████▉ | 1901/2140 [16:06<02:00,  1.99it/s]

Training steps: 1900 Loss: 0.0748046264052391


Training:  94%|█████████▎| 2001/2140 [16:57<01:11,  1.94it/s]

Training steps: 2000 Loss: 0.10114271193742752


Training:  98%|█████████▊| 2101/2140 [17:48<00:19,  1.99it/s]

Training steps: 2100 Loss: 0.42809754610061646


Training: 100%|██████████| 2140/2140 [18:08<00:00,  1.97it/s]

TRAIN ACC : 0.8949474299065421, TRAIN LOSS : 0.3304353560660487



Training:   0%|          | 2/714 [00:00<02:07,  5.58it/s]

Validation steps: 0 Loss: 0.25134244561195374


Training:  14%|█▍        | 102/714 [00:17<01:46,  5.76it/s]

Validation steps: 100 Loss: 0.3409470319747925


Training:  28%|██▊       | 202/714 [00:34<01:27,  5.85it/s]

Validation steps: 200 Loss: 0.6860376596450806


Training:  42%|████▏     | 302/714 [00:52<01:10,  5.80it/s]

Validation steps: 300 Loss: 0.4955792725086212


Training:  56%|█████▋    | 402/714 [01:09<00:53,  5.81it/s]

Validation steps: 400 Loss: 0.20666195452213287


Training:  70%|███████   | 502/714 [01:26<00:36,  5.79it/s]

Validation steps: 500 Loss: 0.10569154471158981


Training:  84%|████████▍ | 602/714 [01:43<00:19,  5.86it/s]

Validation steps: 600 Loss: 0.462196946144104


Training:  98%|█████████▊| 702/714 [02:00<00:02,  5.77it/s]

Validation steps: 700 Loss: 0.11884594708681107


Training: 100%|██████████| 714/714 [02:03<00:00,  5.80it/s]


              precision    recall  f1-score   support

        IT과학       0.83      0.87      0.85      1206
          경제       0.86      0.84      0.85      1556
          사회       0.79      0.80      0.79      1841
        생활문화       0.93      0.88      0.91      1483
          세계       0.93      0.93      0.93      1907
         스포츠       0.97      0.98      0.97      1733
          정치       0.91      0.92      0.92      1688

    accuracy                           0.89     11414
   macro avg       0.89      0.89      0.89     11414
weighted avg       0.89      0.89      0.89     11414

[0.87396352 0.83804627 0.79684954 0.88064734 0.92606188 0.98095788
 0.9235782 ]
VALID ACC : 0.8897844752058875, VALID LOSS : 0.3517816122516948
{'epoch': 1, 'train_loss': 0.3304353560660487, 'train_acc': 0.8949474299065421, 'valid_acc': 0.8897844752058875, 'val_loss': 0.3517816122516948, 'learning_rate': 5e-06}
saving model ...
Start Training: Epoch 3


Training:   0%|          | 1/2140 [00:00<18:31,  1.93it/s]

Training steps: 0 Loss: 0.4193726181983948


Training:   5%|▍         | 101/2140 [00:51<17:07,  1.98it/s]

Training steps: 100 Loss: 0.17177829146385193


Training:   9%|▉         | 201/2140 [01:42<16:42,  1.93it/s]

Training steps: 200 Loss: 0.20586661994457245


Training:  14%|█▍        | 301/2140 [02:33<15:26,  1.99it/s]

Training steps: 300 Loss: 0.06401001662015915


Training:  19%|█▊        | 401/2140 [03:24<14:58,  1.94it/s]

Training steps: 400 Loss: 0.44610923528671265


Training:  23%|██▎       | 501/2140 [04:14<13:46,  1.98it/s]

Training steps: 500 Loss: 0.3480013608932495


Training:  28%|██▊       | 601/2140 [05:05<13:13,  1.94it/s]

Training steps: 600 Loss: 0.1064426526427269


Training:  33%|███▎      | 701/2140 [05:56<12:08,  1.97it/s]

Training steps: 700 Loss: 0.2901146411895752


Training:  37%|███▋      | 801/2140 [06:47<11:31,  1.94it/s]

Training steps: 800 Loss: 0.30984076857566833


Training:  42%|████▏     | 901/2140 [07:38<10:24,  1.98it/s]

Training steps: 900 Loss: 0.30205777287483215


Training:  47%|████▋     | 1001/2140 [08:29<09:47,  1.94it/s]

Training steps: 1000 Loss: 0.6936576962471008


Training:  51%|█████▏    | 1101/2140 [09:20<08:42,  1.99it/s]

Training steps: 1100 Loss: 0.10539943724870682


Training:  56%|█████▌    | 1201/2140 [10:10<08:03,  1.94it/s]

Training steps: 1200 Loss: 0.27459198236465454


Training:  61%|██████    | 1301/2140 [11:01<07:03,  1.98it/s]

Training steps: 1300 Loss: 0.43237048387527466


Training:  65%|██████▌   | 1401/2140 [11:52<06:21,  1.94it/s]

Training steps: 1400 Loss: 0.2834928631782532


Training:  70%|███████   | 1501/2140 [12:43<05:21,  1.99it/s]

Training steps: 1500 Loss: 0.10558229684829712


Training:  75%|███████▍  | 1601/2140 [13:34<04:38,  1.93it/s]

Training steps: 1600 Loss: 0.07956473529338837


Training:  79%|███████▉  | 1701/2140 [14:25<03:40,  1.99it/s]

Training steps: 1700 Loss: 0.4122280180454254


Training:  84%|████████▍ | 1801/2140 [15:16<02:55,  1.94it/s]

Training steps: 1800 Loss: 0.18119098246097565


Training:  89%|████████▉ | 1901/2140 [16:07<02:00,  1.98it/s]

Training steps: 1900 Loss: 0.2805542051792145


Training:  94%|█████████▎| 2001/2140 [16:58<01:11,  1.93it/s]

Training steps: 2000 Loss: 0.3090938627719879


Training:  98%|█████████▊| 2101/2140 [17:48<00:19,  1.99it/s]

Training steps: 2100 Loss: 0.35087400674819946


Training: 100%|██████████| 2140/2140 [18:08<00:00,  1.97it/s]

TRAIN ACC : 0.9082359813084112, TRAIN LOSS : 0.2813713302130802



Training:   0%|          | 2/714 [00:00<02:07,  5.60it/s]

Validation steps: 0 Loss: 0.2487310767173767


Training:  14%|█▍        | 102/714 [00:17<01:44,  5.84it/s]

Validation steps: 100 Loss: 0.4223249554634094


Training:  28%|██▊       | 202/714 [00:34<01:28,  5.77it/s]

Validation steps: 200 Loss: 0.6706560254096985


Training:  42%|████▏     | 302/714 [00:52<01:11,  5.74it/s]

Validation steps: 300 Loss: 0.5305514931678772


Training:  56%|█████▋    | 402/714 [01:09<00:53,  5.80it/s]

Validation steps: 400 Loss: 0.29670852422714233


Training:  70%|███████   | 502/714 [01:26<00:36,  5.84it/s]

Validation steps: 500 Loss: 0.13412638008594513


Training:  84%|████████▍ | 602/714 [01:43<00:19,  5.80it/s]

Validation steps: 600 Loss: 0.6195914149284363


Training:  98%|█████████▊| 702/714 [02:01<00:02,  5.81it/s]

Validation steps: 700 Loss: 0.2094474881887436


Training: 100%|██████████| 714/714 [02:03<00:00,  5.80it/s]


              precision    recall  f1-score   support

        IT과학       0.84      0.85      0.85      1206
          경제       0.91      0.76      0.82      1556
          사회       0.76      0.80      0.78      1841
        생활문화       0.88      0.94      0.91      1483
          세계       0.92      0.92      0.92      1907
         스포츠       0.95      0.99      0.97      1733
          정치       0.92      0.90      0.91      1688

    accuracy                           0.88     11414
   macro avg       0.88      0.88      0.88     11414
weighted avg       0.88      0.88      0.88     11414

[0.85323383 0.75578406 0.80391092 0.93728928 0.92134242 0.99307559
 0.90106635]
VALID ACC : 0.8826003154021377, VALID LOSS : 0.36886556113284885
{'epoch': 2, 'train_loss': 0.2813713302130802, 'train_acc': 0.9082359813084112, 'valid_acc': 0.8826003154021377, 'val_loss': 0.36886556113284885, 'learning_rate': 5e-06}


Some weights of the model checkpoint at klue/roberta-large were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.bias', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.bias', 'classif

Start Training: Epoch 1


Training:   0%|          | 1/2141 [00:00<18:33,  1.92it/s]

Training steps: 0 Loss: 1.9640291929244995


Training:   5%|▍         | 101/2141 [00:51<17:08,  1.98it/s]

Training steps: 100 Loss: 1.8890615701675415


Training:   9%|▉         | 201/2141 [01:42<16:42,  1.94it/s]

Training steps: 200 Loss: 1.6669684648513794


Training:  14%|█▍        | 301/2141 [02:33<15:28,  1.98it/s]

Training steps: 300 Loss: 1.1429795026779175


Training:  19%|█▊        | 401/2141 [03:24<14:57,  1.94it/s]

Training steps: 400 Loss: 0.43257811665534973


Training:  23%|██▎       | 501/2141 [04:15<13:46,  1.98it/s]

Training steps: 500 Loss: 0.7050406336784363


Training:  28%|██▊       | 601/2141 [05:06<13:16,  1.93it/s]

Training steps: 600 Loss: 0.2433127611875534


Training:  33%|███▎      | 701/2141 [05:57<12:05,  1.98it/s]

Training steps: 700 Loss: 0.21976107358932495


Training:  37%|███▋      | 801/2141 [06:48<11:31,  1.94it/s]

Training steps: 800 Loss: 0.513734757900238


Training:  42%|████▏     | 901/2141 [07:38<10:26,  1.98it/s]

Training steps: 900 Loss: 1.2588415145874023


Training:  47%|████▋     | 1001/2141 [08:29<09:47,  1.94it/s]

Training steps: 1000 Loss: 0.42588257789611816


Training:  51%|█████▏    | 1101/2141 [09:20<08:44,  1.98it/s]

Training steps: 1100 Loss: 0.5368363261222839


Training:  56%|█████▌    | 1201/2141 [10:11<08:06,  1.93it/s]

Training steps: 1200 Loss: 0.39608433842658997


Training:  61%|██████    | 1301/2141 [11:02<07:02,  1.99it/s]

Training steps: 1300 Loss: 0.6726480722427368


Training:  65%|██████▌   | 1401/2141 [11:53<06:21,  1.94it/s]

Training steps: 1400 Loss: 0.7001802921295166


Training:  70%|███████   | 1501/2141 [12:44<05:23,  1.98it/s]

Training steps: 1500 Loss: 0.3270600438117981


Training:  75%|███████▍  | 1601/2141 [13:35<04:38,  1.94it/s]

Training steps: 1600 Loss: 0.49018749594688416


Training:  79%|███████▉  | 1701/2141 [14:25<03:41,  1.99it/s]

Training steps: 1700 Loss: 0.8489689826965332


Training:  84%|████████▍ | 1801/2141 [15:16<02:56,  1.93it/s]

Training steps: 1800 Loss: 0.5390361547470093


Training:  89%|████████▉ | 1901/2141 [16:07<02:00,  1.99it/s]

Training steps: 1900 Loss: 0.078391432762146


Training:  93%|█████████▎| 2001/2141 [16:58<01:12,  1.94it/s]

Training steps: 2000 Loss: 0.134877547621727


Training:  98%|█████████▊| 2101/2141 [17:49<00:20,  1.99it/s]

Training steps: 2100 Loss: 0.49543020129203796


Training: 100%|██████████| 2141/2141 [18:09<00:00,  1.96it/s]

TRAIN ACC : 0.7778102275050378, TRAIN LOSS : 0.6388645696356746



Training:   0%|          | 2/714 [00:00<02:06,  5.61it/s]

Validation steps: 0 Loss: 0.43922269344329834


Training:  14%|█▍        | 102/714 [00:17<01:44,  5.86it/s]

Validation steps: 100 Loss: 1.0204598903656006


Training:  28%|██▊       | 202/714 [00:34<01:28,  5.77it/s]

Validation steps: 200 Loss: 0.43655264377593994


Training:  42%|████▏     | 302/714 [00:51<01:10,  5.82it/s]

Validation steps: 300 Loss: 0.5337651968002319


Training:  56%|█████▋    | 402/714 [01:09<00:52,  5.89it/s]

Validation steps: 400 Loss: 0.24541454017162323


Training:  70%|███████   | 502/714 [01:26<00:36,  5.85it/s]

Validation steps: 500 Loss: 0.07315061241388321


Training:  84%|████████▍ | 602/714 [01:43<00:19,  5.72it/s]

Validation steps: 600 Loss: 0.8655748963356018


Training:  98%|█████████▊| 702/714 [02:00<00:02,  5.76it/s]

Validation steps: 700 Loss: 0.08903587609529495


Training: 100%|██████████| 714/714 [02:02<00:00,  5.81it/s]


              precision    recall  f1-score   support

        IT과학       0.86      0.81      0.83      1206
          경제       0.93      0.76      0.84      1556
          사회       0.70      0.87      0.77      1840
        생활문화       0.92      0.88      0.90      1483
          세계       0.94      0.92      0.93      1907
         스포츠       0.97      0.97      0.97      1733
          정치       0.92      0.92      0.92      1688

    accuracy                           0.88     11413
   macro avg       0.89      0.88      0.88     11413
weighted avg       0.89      0.88      0.88     11413

[0.80845771 0.76478149 0.86630435 0.88334457 0.92448873 0.97172533
 0.91706161]
VALID ACC : 0.8818014544817313, VALID LOSS : 0.3545834574253619
{'epoch': 0, 'train_loss': 0.6388645696356746, 'train_acc': 0.7778102275050378, 'valid_acc': 0.8818014544817313, 'val_loss': 0.3545834574253619, 'learning_rate': 5e-06}
saving model ...
Start Training: Epoch 2


Training:   0%|          | 1/2141 [00:00<19:11,  1.86it/s]

Training steps: 0 Loss: 0.16663913428783417


Training:   5%|▍         | 101/2141 [00:51<17:09,  1.98it/s]

Training steps: 100 Loss: 0.5236068964004517


Training:   9%|▉         | 201/2141 [01:42<16:43,  1.93it/s]

Training steps: 200 Loss: 0.42013776302337646


Training:  14%|█▍        | 301/2141 [02:33<15:26,  1.99it/s]

Training steps: 300 Loss: 0.23972289264202118


Training:  19%|█▊        | 401/2141 [03:24<14:58,  1.94it/s]

Training steps: 400 Loss: 0.35434192419052124


Training:  23%|██▎       | 501/2141 [04:14<13:46,  1.98it/s]

Training steps: 500 Loss: 0.7699450850486755


Training:  28%|██▊       | 601/2141 [05:05<13:13,  1.94it/s]

Training steps: 600 Loss: 0.31230059266090393


Training:  33%|███▎      | 701/2141 [05:56<12:04,  1.99it/s]

Training steps: 700 Loss: 0.06574886292219162


Training:  37%|███▋      | 801/2141 [06:47<11:32,  1.94it/s]

Training steps: 800 Loss: 0.382998526096344


Training:  42%|████▏     | 901/2141 [07:38<10:23,  1.99it/s]

Training steps: 900 Loss: 0.8306833505630493


Training:  47%|████▋     | 1001/2141 [08:29<09:48,  1.94it/s]

Training steps: 1000 Loss: 0.20591261982917786


Training:  51%|█████▏    | 1101/2141 [09:20<08:44,  1.98it/s]

Training steps: 1100 Loss: 0.33595573902130127


Training:  56%|█████▌    | 1201/2141 [10:11<08:04,  1.94it/s]

Training steps: 1200 Loss: 0.3451897203922272


Training:  61%|██████    | 1301/2141 [11:02<07:03,  1.99it/s]

Training steps: 1300 Loss: 0.16435302793979645


Training:  65%|██████▌   | 1401/2141 [11:53<06:23,  1.93it/s]

Training steps: 1400 Loss: 0.46934762597084045


Training:  70%|███████   | 1501/2141 [12:43<05:22,  1.99it/s]

Training steps: 1500 Loss: 0.14825038611888885


Training:  75%|███████▍  | 1601/2141 [13:34<04:38,  1.94it/s]

Training steps: 1600 Loss: 0.4044865071773529


Training:  79%|███████▉  | 1701/2141 [14:25<03:41,  1.99it/s]

Training steps: 1700 Loss: 0.1998993158340454


Training:  84%|████████▍ | 1801/2141 [15:16<02:55,  1.94it/s]

Training steps: 1800 Loss: 0.26252231001853943


Training:  89%|████████▉ | 1901/2141 [16:07<02:00,  1.99it/s]

Training steps: 1900 Loss: 0.36355748772621155


Training:  93%|█████████▎| 2001/2141 [16:58<01:12,  1.94it/s]

Training steps: 2000 Loss: 0.6949017643928528


Training:  98%|█████████▊| 2101/2141 [17:49<00:20,  1.98it/s]

Training steps: 2100 Loss: 0.509442150592804


Training: 100%|██████████| 2141/2141 [18:09<00:00,  1.97it/s]

TRAIN ACC : 0.8930521888963523, TRAIN LOSS : 0.3313435685191441



Training:   0%|          | 2/714 [00:00<02:10,  5.47it/s]

Validation steps: 0 Loss: 0.32530122995376587


Training:  14%|█▍        | 102/714 [00:17<01:45,  5.80it/s]

Validation steps: 100 Loss: 0.9729326963424683


Training:  28%|██▊       | 202/714 [00:34<01:28,  5.79it/s]

Validation steps: 200 Loss: 0.2761904299259186


Training:  42%|████▏     | 302/714 [00:52<01:10,  5.87it/s]

Validation steps: 300 Loss: 0.6573759317398071


Training:  56%|█████▋    | 402/714 [01:09<00:54,  5.75it/s]

Validation steps: 400 Loss: 0.1866668462753296


Training:  70%|███████   | 502/714 [01:26<00:36,  5.75it/s]

Validation steps: 500 Loss: 0.02402294985949993


Training:  84%|████████▍ | 602/714 [01:43<00:19,  5.88it/s]

Validation steps: 600 Loss: 0.917819082736969


Training:  98%|█████████▊| 702/714 [02:01<00:02,  5.71it/s]

Validation steps: 700 Loss: 0.057365551590919495


Training: 100%|██████████| 714/714 [02:03<00:00,  5.80it/s]


              precision    recall  f1-score   support

        IT과학       0.80      0.92      0.86      1206
          경제       0.89      0.85      0.87      1556
          사회       0.81      0.79      0.80      1840
        생활문화       0.91      0.89      0.90      1483
          세계       0.93      0.94      0.93      1907
         스포츠       0.96      0.98      0.97      1733
          정치       0.94      0.90      0.92      1688

    accuracy                           0.89     11413
   macro avg       0.89      0.89      0.89     11413
weighted avg       0.90      0.89      0.89     11413

[0.91625207 0.84640103 0.78695652 0.89008766 0.94284216 0.98384305
 0.89691943]
VALID ACC : 0.8943310260229563, VALID LOSS : 0.34085313910070586
{'epoch': 1, 'train_loss': 0.3313435685191441, 'train_acc': 0.8930521888963523, 'valid_acc': 0.8943310260229563, 'val_loss': 0.34085313910070586, 'learning_rate': 5e-06}
saving model ...
Start Training: Epoch 3


Training:   0%|          | 1/2141 [00:00<18:42,  1.91it/s]

Training steps: 0 Loss: 0.0566750168800354


Training:   5%|▍         | 101/2141 [00:51<17:10,  1.98it/s]

Training steps: 100 Loss: 0.10626346617937088


Training:   9%|▉         | 201/2141 [01:42<16:43,  1.93it/s]

Training steps: 200 Loss: 0.12839464843273163


Training:  14%|█▍        | 301/2141 [02:33<15:24,  1.99it/s]

Training steps: 300 Loss: 0.20186495780944824


Training:  19%|█▊        | 401/2141 [03:24<15:02,  1.93it/s]

Training steps: 400 Loss: 0.4948241114616394


Training:  23%|██▎       | 501/2141 [04:15<13:46,  1.99it/s]

Training steps: 500 Loss: 0.12634232640266418


Training:  28%|██▊       | 601/2141 [05:06<13:15,  1.94it/s]

Training steps: 600 Loss: 0.08780556172132492


Training:  33%|███▎      | 701/2141 [05:57<12:07,  1.98it/s]

Training steps: 700 Loss: 0.10493675619363785


Training:  37%|███▋      | 801/2141 [06:47<11:31,  1.94it/s]

Training steps: 800 Loss: 0.3542768955230713


Training:  42%|████▏     | 901/2141 [07:38<10:24,  1.99it/s]

Training steps: 900 Loss: 0.49877142906188965


Training:  47%|████▋     | 1001/2141 [08:29<09:48,  1.94it/s]

Training steps: 1000 Loss: 0.1319836676120758


Training:  51%|█████▏    | 1101/2141 [09:20<08:43,  1.99it/s]

Training steps: 1100 Loss: 0.2149183303117752


Training:  56%|█████▌    | 1201/2141 [10:11<08:05,  1.94it/s]

Training steps: 1200 Loss: 0.42449751496315


Training:  61%|██████    | 1301/2141 [11:02<07:04,  1.98it/s]

Training steps: 1300 Loss: 0.15053372085094452


Training:  65%|██████▌   | 1401/2141 [11:53<06:22,  1.94it/s]

Training steps: 1400 Loss: 0.15079520642757416


Training:  70%|███████   | 1501/2141 [12:44<05:22,  1.98it/s]

Training steps: 1500 Loss: 0.12139756232500076


Training:  75%|███████▍  | 1601/2141 [13:35<04:39,  1.93it/s]

Training steps: 1600 Loss: 0.37298619747161865


Training:  79%|███████▉  | 1701/2141 [14:26<03:41,  1.98it/s]

Training steps: 1700 Loss: 0.27363133430480957


Training:  84%|████████▍ | 1801/2141 [15:17<02:55,  1.93it/s]

Training steps: 1800 Loss: 0.3784947693347931


Training:  89%|████████▉ | 1901/2141 [16:08<02:01,  1.98it/s]

Training steps: 1900 Loss: 0.08426228165626526


Training:  93%|█████████▎| 2001/2141 [16:59<01:12,  1.94it/s]

Training steps: 2000 Loss: 0.08972626179456711


Training:  98%|█████████▊| 2101/2141 [17:50<00:20,  1.98it/s]

Training steps: 2100 Loss: 0.283307820558548


Training: 100%|██████████| 2141/2141 [18:10<00:00,  1.96it/s]

TRAIN ACC : 0.9078882041996437, TRAIN LOSS : 0.2846744859461498



Training:   0%|          | 2/714 [00:00<02:08,  5.56it/s]

Validation steps: 0 Loss: 0.2825583219528198


Training:  14%|█▍        | 102/714 [00:17<01:46,  5.72it/s]

Validation steps: 100 Loss: 0.9144029021263123


Training:  28%|██▊       | 202/714 [00:34<01:27,  5.84it/s]

Validation steps: 200 Loss: 0.47278574109077454


Training:  42%|████▏     | 302/714 [00:52<01:09,  5.90it/s]

Validation steps: 300 Loss: 0.5550320744514465


Training:  56%|█████▋    | 402/714 [01:09<00:53,  5.82it/s]

Validation steps: 400 Loss: 0.17969389259815216


Training:  70%|███████   | 502/714 [01:26<00:36,  5.79it/s]

Validation steps: 500 Loss: 0.025723254308104515


Training:  84%|████████▍ | 602/714 [01:43<00:19,  5.85it/s]

Validation steps: 600 Loss: 0.8609151244163513


Training:  98%|█████████▊| 702/714 [02:01<00:02,  5.75it/s]

Validation steps: 700 Loss: 0.061576295644044876


Training: 100%|██████████| 714/714 [02:03<00:00,  5.79it/s]


              precision    recall  f1-score   support

        IT과학       0.83      0.89      0.86      1206
          경제       0.87      0.86      0.87      1556
          사회       0.81      0.81      0.81      1840
        생활문화       0.92      0.89      0.91      1483
          세계       0.93      0.94      0.93      1907
         스포츠       0.97      0.98      0.97      1733
          정치       0.93      0.91      0.92      1688

    accuracy                           0.90     11413
   macro avg       0.89      0.90      0.90     11413
weighted avg       0.90      0.90      0.90     11413

[0.89054726 0.86053985 0.80706522 0.89008766 0.94022024 0.97691864
 0.90699052]
VALID ACC : 0.896784368702357, VALID LOSS : 0.33248234890839634
{'epoch': 2, 'train_loss': 0.2846744859461498, 'train_acc': 0.9078882041996437, 'valid_acc': 0.896784368702357, 'val_loss': 0.33248234890839634, 'learning_rate': 5e-06}
saving model ...


Some weights of the model checkpoint at klue/roberta-large were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.bias', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.bias', 'classif

Start Training: Epoch 1


Training:   0%|          | 1/2141 [00:00<18:19,  1.95it/s]

Training steps: 0 Loss: 1.928367257118225


Training:   5%|▍         | 101/2141 [00:51<17:14,  1.97it/s]

Training steps: 100 Loss: 1.7773587703704834


Training:   9%|▉         | 201/2141 [01:42<16:43,  1.93it/s]

Training steps: 200 Loss: 1.533544659614563


Training:  14%|█▍        | 301/2141 [02:33<15:27,  1.98it/s]

Training steps: 300 Loss: 1.1203267574310303


Training:  19%|█▊        | 401/2141 [03:24<14:59,  1.94it/s]

Training steps: 400 Loss: 0.6033369898796082


Training:  23%|██▎       | 501/2141 [04:15<13:43,  1.99it/s]

Training steps: 500 Loss: 0.4672258794307709


Training:  28%|██▊       | 601/2141 [05:06<13:14,  1.94it/s]

Training steps: 600 Loss: 0.6354444622993469


Training:  33%|███▎      | 701/2141 [05:57<12:05,  1.98it/s]

Training steps: 700 Loss: 0.3433629274368286


Training:  37%|███▋      | 801/2141 [06:48<11:29,  1.94it/s]

Training steps: 800 Loss: 0.23510238528251648


Training:  42%|████▏     | 901/2141 [07:39<10:25,  1.98it/s]

Training steps: 900 Loss: 0.5015973448753357


Training:  47%|████▋     | 1001/2141 [08:30<09:49,  1.93it/s]

Training steps: 1000 Loss: 0.28876006603240967


Training:  51%|█████▏    | 1101/2141 [09:21<08:43,  1.99it/s]

Training steps: 1100 Loss: 0.5567362904548645


Training:  56%|█████▌    | 1201/2141 [10:12<08:06,  1.93it/s]

Training steps: 1200 Loss: 0.5367820262908936


Training:  61%|██████    | 1301/2141 [11:03<07:05,  1.97it/s]

Training steps: 1300 Loss: 0.12158849835395813


Training:  65%|██████▌   | 1401/2141 [11:54<06:23,  1.93it/s]

Training steps: 1400 Loss: 0.22372125089168549


Training:  70%|███████   | 1501/2141 [12:44<05:22,  1.98it/s]

Training steps: 1500 Loss: 0.12088773399591446


Training:  75%|███████▍  | 1601/2141 [13:35<04:38,  1.94it/s]

Training steps: 1600 Loss: 0.634015679359436


Training:  79%|███████▉  | 1701/2141 [14:26<03:42,  1.98it/s]

Training steps: 1700 Loss: 0.8529894351959229


Training:  84%|████████▍ | 1801/2141 [15:17<02:55,  1.94it/s]

Training steps: 1800 Loss: 0.3089272081851959


Training:  89%|████████▉ | 1901/2141 [16:08<02:00,  1.99it/s]

Training steps: 1900 Loss: 0.09423370659351349


Training:  93%|█████████▎| 2001/2141 [16:59<01:12,  1.93it/s]

Training steps: 2000 Loss: 0.25211331248283386


Training:  98%|█████████▊| 2101/2141 [17:50<00:20,  1.98it/s]

Training steps: 2100 Loss: 0.1032308042049408


Training: 100%|██████████| 2141/2141 [18:10<00:00,  1.96it/s]

TRAIN ACC : 0.786367220583511, TRAIN LOSS : 0.6251561731510716



Training:   0%|          | 2/714 [00:00<02:09,  5.49it/s]

Validation steps: 0 Loss: 0.05984318256378174


Training:  14%|█▍        | 102/714 [00:17<01:45,  5.81it/s]

Validation steps: 100 Loss: 0.28135693073272705


Training:  28%|██▊       | 202/714 [00:34<01:29,  5.72it/s]

Validation steps: 200 Loss: 0.2276356816291809


Training:  42%|████▏     | 302/714 [00:52<01:10,  5.89it/s]

Validation steps: 300 Loss: 0.481628954410553


Training:  56%|█████▋    | 402/714 [01:09<00:54,  5.75it/s]

Validation steps: 400 Loss: 0.7310012578964233


Training:  70%|███████   | 502/714 [01:26<00:36,  5.85it/s]

Validation steps: 500 Loss: 0.027949875220656395


Training:  84%|████████▍ | 602/714 [01:43<00:19,  5.80it/s]

Validation steps: 600 Loss: 0.6448305249214172


Training:  98%|█████████▊| 702/714 [02:01<00:02,  5.80it/s]

Validation steps: 700 Loss: 0.16107872128486633


Training: 100%|██████████| 714/714 [02:03<00:00,  5.80it/s]


              precision    recall  f1-score   support

        IT과학       0.83      0.88      0.86      1206
          경제       0.84      0.85      0.85      1555
          사회       0.81      0.77      0.79      1840
        생활문화       0.87      0.92      0.90      1484
          세계       0.94      0.93      0.93      1907
         스포츠       0.97      0.97      0.97      1733
          정치       0.94      0.90      0.92      1688

    accuracy                           0.89     11413
   macro avg       0.89      0.89      0.89     11413
weighted avg       0.89      0.89      0.89     11413

[0.8814262  0.85466238 0.76521739 0.92250674 0.92763503 0.97057126
 0.89751185]
VALID ACC : 0.8880224305616402, VALID LOSS : 0.3700927235525237
{'epoch': 0, 'train_loss': 0.6251561731510716, 'train_acc': 0.786367220583511, 'valid_acc': 0.8880224305616402, 'val_loss': 0.3700927235525237, 'learning_rate': 5e-06}
saving model ...
Start Training: Epoch 2


Training:   0%|          | 1/2141 [00:00<19:00,  1.88it/s]

Training steps: 0 Loss: 0.34874317049980164


Training:   5%|▍         | 101/2141 [00:51<17:11,  1.98it/s]

Training steps: 100 Loss: 0.536344051361084


Training:   9%|▉         | 201/2141 [01:42<16:42,  1.94it/s]

Training steps: 200 Loss: 0.477652907371521


Training:  14%|█▍        | 301/2141 [02:33<15:26,  1.99it/s]

Training steps: 300 Loss: 0.08252378553152084


Training:  19%|█▊        | 401/2141 [03:24<14:59,  1.93it/s]

Training steps: 400 Loss: 0.7285423874855042


Training:  23%|██▎       | 501/2141 [04:15<13:52,  1.97it/s]

Training steps: 500 Loss: 0.2362859696149826


Training:  28%|██▊       | 601/2141 [05:06<13:15,  1.94it/s]

Training steps: 600 Loss: 0.19590789079666138


Training:  33%|███▎      | 701/2141 [05:57<12:06,  1.98it/s]

Training steps: 700 Loss: 0.42416608333587646


Training:  37%|███▋      | 801/2141 [06:48<11:33,  1.93it/s]

Training steps: 800 Loss: 0.09667113423347473


Training:  42%|████▏     | 901/2141 [07:39<10:24,  1.98it/s]

Training steps: 900 Loss: 0.5218254923820496


Training:  47%|████▋     | 1001/2141 [08:30<09:49,  1.93it/s]

Training steps: 1000 Loss: 0.07559362053871155


Training:  51%|█████▏    | 1101/2141 [09:21<08:45,  1.98it/s]

Training steps: 1100 Loss: 0.1710226684808731


Training:  56%|█████▌    | 1201/2141 [10:12<08:05,  1.93it/s]

Training steps: 1200 Loss: 0.19885802268981934


Training:  61%|██████    | 1301/2141 [11:03<07:03,  1.98it/s]

Training steps: 1300 Loss: 0.3253248929977417


Training:  65%|██████▌   | 1401/2141 [11:54<06:22,  1.94it/s]

Training steps: 1400 Loss: 0.28647637367248535


Training:  70%|███████   | 1501/2141 [12:45<05:21,  1.99it/s]

Training steps: 1500 Loss: 0.17788073420524597


Training:  75%|███████▍  | 1601/2141 [13:36<04:40,  1.93it/s]

Training steps: 1600 Loss: 0.3709111213684082


Training:  79%|███████▉  | 1701/2141 [14:26<03:43,  1.97it/s]

Training steps: 1700 Loss: 0.8599578142166138


Training:  84%|████████▍ | 1801/2141 [15:17<02:54,  1.94it/s]

Training steps: 1800 Loss: 0.18381808698177338


Training:  89%|████████▉ | 1901/2141 [16:08<02:00,  1.99it/s]

Training steps: 1900 Loss: 0.06922803819179535


Training:  93%|█████████▎| 2001/2141 [16:59<01:12,  1.94it/s]

Training steps: 2000 Loss: 0.29194214940071106


Training:  98%|█████████▊| 2101/2141 [17:50<00:20,  1.99it/s]

Training steps: 2100 Loss: 0.18968354165554047


Training: 100%|██████████| 2141/2141 [18:10<00:00,  1.96it/s]

TRAIN ACC : 0.89433719809585, TRAIN LOSS : 0.3287172179181078



Training:   0%|          | 2/714 [00:00<02:07,  5.56it/s]

Validation steps: 0 Loss: 0.040273942053318024


Training:  14%|█▍        | 102/714 [00:17<01:45,  5.80it/s]

Validation steps: 100 Loss: 0.17849385738372803


Training:  28%|██▊       | 202/714 [00:34<01:28,  5.76it/s]

Validation steps: 200 Loss: 0.18056975305080414


Training:  42%|████▏     | 302/714 [00:52<01:10,  5.84it/s]

Validation steps: 300 Loss: 0.7236324548721313


Training:  56%|█████▋    | 402/714 [01:09<00:53,  5.79it/s]

Validation steps: 400 Loss: 0.8166411519050598


Training:  70%|███████   | 502/714 [01:26<00:36,  5.75it/s]

Validation steps: 500 Loss: 0.023804761469364166


Training:  84%|████████▍ | 602/714 [01:44<00:19,  5.82it/s]

Validation steps: 600 Loss: 0.2705468535423279


Training:  98%|█████████▊| 702/714 [02:01<00:02,  5.86it/s]

Validation steps: 700 Loss: 0.1269570142030716


Training: 100%|██████████| 714/714 [02:03<00:00,  5.79it/s]


              precision    recall  f1-score   support

        IT과학       0.85      0.82      0.83      1206
          경제       0.90      0.78      0.84      1555
          사회       0.71      0.86      0.77      1840
        생활문화       0.95      0.84      0.89      1484
          세계       0.91      0.95      0.93      1907
         스포츠       0.97      0.97      0.97      1733
          정치       0.93      0.91      0.92      1688

    accuracy                           0.88     11413
   macro avg       0.89      0.87      0.88     11413
weighted avg       0.89      0.88      0.88     11413

[0.81509121 0.77813505 0.8576087  0.8402965  0.94598846 0.9728794
 0.90876777]
VALID ACC : 0.8798738280907736, VALID LOSS : 0.34643936469754416
{'epoch': 1, 'train_loss': 0.3287172179181078, 'train_acc': 0.89433719809585, 'valid_acc': 0.8798738280907736, 'val_loss': 0.34643936469754416, 'learning_rate': 5e-06}
Start Training: Epoch 3


Training:   0%|          | 1/2141 [00:00<18:05,  1.97it/s]

Training steps: 0 Loss: 0.14982327818870544


Training:   5%|▍         | 101/2141 [00:51<17:08,  1.98it/s]

Training steps: 100 Loss: 0.1872488111257553


Training:   9%|▉         | 201/2141 [01:42<16:38,  1.94it/s]

Training steps: 200 Loss: 0.23994937539100647


Training:  14%|█▍        | 301/2141 [02:33<15:30,  1.98it/s]

Training steps: 300 Loss: 0.07569500058889389


Training:  19%|█▊        | 401/2141 [03:24<14:54,  1.95it/s]

Training steps: 400 Loss: 0.43756020069122314


Training:  23%|██▎       | 501/2141 [04:15<13:50,  1.98it/s]

Training steps: 500 Loss: 0.1131746768951416


Training:  28%|██▊       | 601/2141 [05:06<13:21,  1.92it/s]

Training steps: 600 Loss: 0.06797235459089279


Training:  33%|███▎      | 701/2141 [05:57<12:08,  1.98it/s]

Training steps: 700 Loss: 0.21727539598941803


Training:  37%|███▋      | 801/2141 [06:48<11:34,  1.93it/s]

Training steps: 800 Loss: 0.30950310826301575


Training:  42%|████▏     | 901/2141 [07:39<10:26,  1.98it/s]

Training steps: 900 Loss: 0.10927730798721313


Training:  47%|████▋     | 1001/2141 [08:30<09:46,  1.94it/s]

Training steps: 1000 Loss: 0.739419162273407


Training:  51%|█████▏    | 1101/2141 [09:21<08:45,  1.98it/s]

Training steps: 1100 Loss: 0.2964863181114197


Training:  56%|█████▌    | 1201/2141 [10:12<08:03,  1.94it/s]

Training steps: 1200 Loss: 0.07923267781734467


Training:  61%|██████    | 1301/2141 [11:02<07:04,  1.98it/s]

Training steps: 1300 Loss: 0.3017936050891876


Training:  65%|██████▌   | 1401/2141 [11:53<06:20,  1.94it/s]

Training steps: 1400 Loss: 0.03498613089323044


Training:  70%|███████   | 1501/2141 [12:44<05:21,  1.99it/s]

Training steps: 1500 Loss: 0.09699902683496475


Training:  75%|███████▍  | 1601/2141 [13:35<04:38,  1.94it/s]

Training steps: 1600 Loss: 0.22120176255702972


Training:  79%|███████▉  | 1701/2141 [14:26<03:41,  1.98it/s]

Training steps: 1700 Loss: 0.546047031879425


Training:  84%|████████▍ | 1801/2141 [15:17<02:55,  1.93it/s]

Training steps: 1800 Loss: 0.37550055980682373


Training:  89%|████████▉ | 1901/2141 [16:08<02:01,  1.98it/s]

Training steps: 1900 Loss: 0.035043761134147644


Training:  93%|█████████▎| 2001/2141 [16:59<01:12,  1.94it/s]

Training steps: 2000 Loss: 0.229032963514328


Training:  98%|█████████▊| 2101/2141 [17:50<00:20,  1.98it/s]

Training steps: 2100 Loss: 0.11251960694789886


Training: 100%|██████████| 2141/2141 [18:10<00:00,  1.96it/s]

TRAIN ACC : 0.9091732133991414, TRAIN LOSS : 0.27970237067053666



Training:   0%|          | 2/714 [00:00<02:08,  5.54it/s]

Validation steps: 0 Loss: 0.04296819865703583


Training:  14%|█▍        | 102/714 [00:17<01:46,  5.76it/s]

Validation steps: 100 Loss: 0.1015344187617302


Training:  28%|██▊       | 202/714 [00:35<01:29,  5.75it/s]

Validation steps: 200 Loss: 0.20561626553535461


Training:  42%|████▏     | 302/714 [00:52<01:11,  5.79it/s]

Validation steps: 300 Loss: 0.6045036911964417


Training:  56%|█████▋    | 402/714 [01:09<00:53,  5.88it/s]

Validation steps: 400 Loss: 0.8005814552307129


Training:  70%|███████   | 502/714 [01:26<00:36,  5.73it/s]

Validation steps: 500 Loss: 0.011866857297718525


Training:  84%|████████▍ | 602/714 [01:44<00:19,  5.78it/s]

Validation steps: 600 Loss: 0.5913500785827637


Training:  98%|█████████▊| 702/714 [02:01<00:02,  5.84it/s]

Validation steps: 700 Loss: 0.12172214686870575


Training: 100%|██████████| 714/714 [02:03<00:00,  5.78it/s]


              precision    recall  f1-score   support

        IT과학       0.85      0.86      0.85      1206
          경제       0.85      0.83      0.84      1555
          사회       0.81      0.76      0.79      1840
        생활문화       0.92      0.88      0.90      1484
          세계       0.92      0.94      0.93      1907
         스포츠       0.96      0.98      0.97      1733
          정치       0.89      0.94      0.92      1688

    accuracy                           0.89     11413
   macro avg       0.89      0.89      0.89     11413
weighted avg       0.89      0.89      0.89     11413

[0.85737977 0.83022508 0.76467391 0.88140162 0.94336654 0.97922677
 0.94253555]
VALID ACC : 0.8873214755103829, VALID LOSS : 0.36324753507799584
{'epoch': 2, 'train_loss': 0.27970237067053666, 'train_acc': 0.9091732133991414, 'valid_acc': 0.8873214755103829, 'val_loss': 0.36324753507799584, 'learning_rate': 5e-06}
************************************************** auc_avg ****************************

In [19]:
# torch.cuda.empty_cache()

In [20]:
!nvidia-smi

Tue Jul 27 21:46:13 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.42.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   69C    P0    49W / 250W |  14045MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## Inference

In [21]:
def inference_main():
    args = parse_args()
    args.model_name = "temp"
    preprocess = Preprocess(args)
    preprocess.load_test_data()
    test_data = preprocess.test_data

    print(f"size of test data : {len(test_data)}")
    torch.cuda.empty_cache()
    # del model
    inference(args, test_data)

inference_main()

size of test data : 9131
Loading Model from: /content/drive/MyDrive/KLUE_TC/models/temp_1.pt


Some weights of the model checkpoint at /content/drive/MyDrive/KLUE_TC/models/temp_1.pt were not used when initializing RobertaForSequenceClassification: ['epoch', 'state_dict']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at /content/drive/MyDrive/KLUE_TC/models/temp_1.pt and are newly initialized: ['encoder.layer.3.output.dense.weight', 'encoder.layer.15.attention.self.key.bias', 'encoder.layer.3.attention.output.LayerNorm.weight', 'encoder.lay

RuntimeError: ignored