In [1]:
!nvidia-smi

Thu Jul 29 03:03:42 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.42.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   47C    P0    54W / 300W |      0MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## Directory 설정, 구글 드라이브 import

In [2]:
# Project root used by parse_args() to build data_dir.
# NOTE(review): the path assumes Google Drive is already mounted at /content/drive — confirm.
cur_dir = '/content/drive/MyDrive/KLUE_TC'

## Utils

In [3]:
!pip install adamp
!pip install transformers



In [4]:
import os
import random
import torch
import numpy as np
from torch import nn

from torch.optim import Adam, AdamW, SGD
from adamp import AdamP
from torch.optim.lr_scheduler import StepLR, ReduceLROnPlateau, CosineAnnealingLR, ExponentialLR, \
    CosineAnnealingWarmRestarts
from transformers import get_linear_schedule_with_warmup
from transformers import AutoConfig, AutoTokenizer, AutoModelForSequenceClassification


def set_seeds(seed=42):
    """Seed every random number generator the notebook relies on.

    Covers Python hashing, ``random``, NumPy and PyTorch (CPU, the current
    CUDA device and all CUDA devices) and puts cuDNN into deterministic,
    non-benchmarking mode so repeated runs give the same results.

    Args:
        seed: integer seed shared by all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # CPU-side generators.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    # CUDA generators: the current device first, then every device (multi-GPU).
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


def save_checkpoint(state, model_dir, model_filename):
    """Serialize a checkpoint to ``model_dir/model_filename``.

    The directory is created when missing. The file is written inside
    ``model_dir`` so that load_model(), which reads from
    ``os.path.join(args.model_dir, model_name)``, can find it again.

    Args:
        state: picklable checkpoint object (run() passes a dict holding
            ``epoch`` and ``state_dict``).
        model_dir: directory that receives the checkpoint.
        model_filename: file name of the checkpoint inside ``model_dir``.
    """
    print('saving model ...')
    os.makedirs(model_dir, exist_ok=True)
    torch.save(state, os.path.join(model_dir, model_filename))


def get_optimizer(model, args):
    """Build the optimizer selected by ``args.optimizer``.

    Args:
        model: module whose ``parameters()`` are optimized.
        args: configuration with ``optimizer`` ('adam', 'adamW', 'adamP' or
            'SGD'), ``lr`` and either ``weight_decay`` or, for SGD,
            ``momentum``.

    Returns:
        The optimizer, with gradients already zeroed.

    Raises:
        ValueError: if ``args.optimizer`` is not one of the supported names.
    """
    if args.optimizer == 'adam':
        optimizer = Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    elif args.optimizer == 'adamW':
        optimizer = AdamW(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    elif args.optimizer == 'adamP':
        optimizer = AdamP(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    elif args.optimizer == 'SGD':
        optimizer = SGD(model.parameters(), lr=args.lr, momentum=args.momentum)
    else:
        # Fail with a readable message instead of an UnboundLocalError below.
        raise ValueError(
            f"Unknown optimizer {args.optimizer!r}; expected one of 'adam', 'adamW', 'adamP', 'SGD'"
        )

    # Start from a clean slate: reset the gradients of every parameter.
    optimizer.zero_grad()

    return optimizer


def get_scheduler(optimizer, args):
    """Build the learning-rate scheduler selected by ``args.scheduler``.

    Supported names and the ``args`` fields each one reads:
        'plateau'                    -> plateau_patience, plateau_factor (mode='max')
        'linear_warmup'              -> warmup_steps, total_steps
        'step_lr'                    -> step_size, gamma
        'exp_lr'                     -> gamma
        'cosine_annealing'           -> t_max, eta_min
        'cosine_annealing_warmstart' -> T_0, T_mult, eta_min

    Args:
        optimizer: optimizer whose learning rate is scheduled.
        args: configuration object holding the fields listed above.

    Returns:
        The scheduler instance.

    Raises:
        ValueError: if ``args.scheduler`` is not one of the supported names.
    """
    if args.scheduler == 'plateau':
        scheduler = ReduceLROnPlateau(optimizer, patience=args.plateau_patience, factor=args.plateau_factor, mode='max',
                                      verbose=True)
    elif args.scheduler == 'linear_warmup':
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps,
                                                    num_training_steps=args.total_steps)
    elif args.scheduler == 'step_lr':
        scheduler = StepLR(optimizer, step_size=args.step_size, gamma=args.gamma)
    elif args.scheduler == 'exp_lr':
        scheduler = ExponentialLR(optimizer, gamma=args.gamma)
    elif args.scheduler == 'cosine_annealing':
        scheduler = CosineAnnealingLR(optimizer, T_max=args.t_max, eta_min=args.eta_min)
    elif args.scheduler == 'cosine_annealing_warmstart':
        scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=args.T_0, T_mult=args.T_mult, eta_min=args.eta_min,
                                                last_epoch=-1)
    else:
        # Fail with a readable message instead of an UnboundLocalError below.
        raise ValueError(
            f"Unknown scheduler {args.scheduler!r}; expected one of 'plateau', 'linear_warmup', "
            "'step_lr', 'exp_lr', 'cosine_annealing', 'cosine_annealing_warmstart'"
        )

    return scheduler


def update_params(loss, model, optimizer, batch_idx, max_len, args):
    """Back-propagate ``loss`` and, when due, apply one optimizer step.

    With ``args.gradient_accumulation`` enabled the loss is divided by
    ``args.accum_iter`` and the optimizer only steps every ``accum_iter``
    batches or on the final batch; otherwise it steps on every call.
    Gradients are clipped to ``args.clip_grad`` before each step and zeroed
    afterwards.

    Args:
        loss: scalar loss tensor of the current batch.
        model: module whose gradients are clipped.
        optimizer: optimizer to step.
        batch_idx: zero-based index of the current batch.
        max_len: total number of batches in the epoch.
        args: configuration (gradient_accumulation, accum_iter, clip_grad).
    """
    accumulate = args.gradient_accumulation

    if accumulate:
        # Scale so the accumulated gradient matches a single large batch.
        loss = loss / args.accum_iter

    loss.backward()

    if accumulate:
        batches_seen = batch_idx + 1
        window_full = batches_seen % args.accum_iter == 0
        final_batch = batches_seen == max_len
        if not (window_full or final_batch):
            # Keep accumulating; the step happens on a later batch.
            return

    torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip_grad)
    optimizer.step()
    optimizer.zero_grad()


def load_tokenizer(args):
    """Load the fast tokenizer for the configured model.

    ``args.tokenizer_name`` takes precedence; when it is empty the tokenizer
    that belongs to ``args.model_name_or_path`` is used.
    """
    tokenizer_source = args.tokenizer_name or args.model_name_or_path
    return AutoTokenizer.from_pretrained(tokenizer_source, use_fast=True)


def load_model(args, model_name=None):
    """Restore a fine-tuned model from a checkpoint written by save_checkpoint().

    The architecture (backbone plus custom classification head) is rebuilt
    with get_model(), so training and inference always share one definition,
    and the saved ``state_dict`` is then loaded strictly on top of it.

    Args:
        args: configuration; reads ``model_dir``, ``device`` and, when
            ``model_name`` is not given, ``model_name`` (plus whatever
            get_model() reads).
        model_name: checkpoint file name inside ``args.model_dir``.

    Returns:
        The model on ``args.device`` with the checkpoint weights loaded.
    """
    if not model_name:
        model_name = args.model_name
    model_path = os.path.join(args.model_dir, model_name)
    print("Loading Model from:", model_path)

    # map_location lets a checkpoint saved on a GPU be restored on a CPU-only host.
    load_state = torch.load(model_path, map_location=args.device)

    # Build the network from the pretrained identifier rather than from the raw
    # .pt file: the checkpoint is a plain dict, not a Hugging Face model directory.
    model = get_model(args)

    # strict=True guarantees every weight comes from the checkpoint.
    model.load_state_dict(load_state['state_dict'], strict=True)

    model = model.to(args.device)

    print("Loading Model from:", model_path, "...Finished.")

    return model


def get_model(args):
    """Create the pretrained backbone with a fresh 7-class classification head.

    The stock ``classifier`` of the sequence-classification model is replaced
    by a three-layer MLP. The head's input width follows the backbone's
    ``hidden_size`` (1024 is the fallback and matches the configured
    roberta-large), so smaller or larger backbones also work.

    Args:
        args: configuration; reads ``config_name``, ``model_name_or_path``
            and ``device``.

    Returns:
        The model moved to ``args.device``.
    """
    config = AutoConfig.from_pretrained(args.config_name or args.model_name_or_path)
    config.num_labels = 7

    model = AutoModelForSequenceClassification.from_pretrained(
        args.model_name_or_path,
        from_tf=bool(".ckpt" in args.model_name_or_path),
        config=config,
    )

    # Derive the input width from the backbone instead of hard-coding 1024.
    hidden_size = getattr(config, 'hidden_size', 1024)

    # train(), validate() and inference() index this head's output as logits[:, 0, :].
    model.classifier = nn.Sequential(
        nn.Linear(hidden_size, 1024),
        nn.Dropout(p=0.3, inplace=False),
        nn.Linear(1024, 512),
        nn.Dropout(p=0.3, inplace=False),
        nn.Linear(512, config.num_labels),
    )

    model = model.to(args.device)

    return model


def get_loaders(args, train, valid, is_inference=False):
    """Wrap raw data arrays in ``DataLoader`` objects.

    Args:
        args: configuration; reads ``num_workers`` and ``batch_size``.
        train: training rows, or ``None`` to skip the training loader.
        valid: validation rows (or the test rows when ``is_inference`` is set),
            or ``None`` to skip the validation loader.
        is_inference: when true, only ``valid`` is used and a single
            unshuffled loader is returned.

    Returns:
        ``(train_loader, valid_loader)`` in training mode, where either entry
        may be ``None``, or one loader in inference mode.
    """
    def _make_loader(rows, shuffle):
        # Every loader shares the batch size, worker count and pinned memory.
        dataset = YNAT_dataset(args, rows, is_inference)
        return torch.utils.data.DataLoader(dataset, batch_size=args.batch_size, shuffle=shuffle,
                                           num_workers=args.num_workers, pin_memory=True)

    if is_inference:
        return _make_loader(valid, False)

    # Only the training loader is shuffled.
    train_loader = _make_loader(train, True) if train is not None else None
    valid_loader = _make_loader(valid, False) if valid is not None else None

    return train_loader, valid_loader


# Computes the batch loss only; the parameter update itself happens in update_params().
def compute_loss(preds, targets, args):
    """Return the loss of one batch using the criterion named in ``args``.

    Args:
        preds: model outputs; train() and validate() pass the logits taken
            at sequence position 0.
        targets: ground-truth labels of the batch.
        args: configuration; ``args.criterion`` selects the loss function.

    Returns:
        Whatever get_criterion() returns for these inputs.
    """
    return get_criterion(preds, targets, args)


def get_criterion(pred, target, args):
    """Evaluate the loss function selected by ``args.criterion``.

    'BCE', 'BCELogit', 'MSE' and 'L1' are built with ``reduction="none"`` and
    therefore return one loss value per element; 'CE' uses the default mean
    reduction and returns a scalar.

    Args:
        pred: predictions in the format the chosen loss expects.
        target: targets in the format the chosen loss expects.
        args: configuration; reads ``criterion``.

    Returns:
        The loss tensor.

    Raises:
        ValueError: if ``args.criterion`` is not one of the supported names.
    """
    if args.criterion == 'BCE':
        loss = nn.BCELoss(reduction="none")
    elif args.criterion == "BCELogit":
        loss = nn.BCEWithLogitsLoss(reduction="none")
    elif args.criterion == "MSE":
        loss = nn.MSELoss(reduction="none")
    elif args.criterion == "L1":
        loss = nn.L1Loss(reduction="none")
    elif args.criterion == "CE":
        # Unweighted cross-entropy over the class logits.
        loss = nn.CrossEntropyLoss()
    else:
        # Fail with a readable message instead of an UnboundLocalError below.
        raise ValueError(
            f"Unknown criterion {args.criterion!r}; expected one of 'BCE', 'BCELogit', 'MSE', 'L1', 'CE'"
        )
    return loss(pred, target)


## Dataloader

In [5]:
import os
import torch
import pandas as pd


class Preprocess:
    """Loads the train and test CSV files from ``args.data_dir`` as NumPy arrays."""

    def __init__(self, args):
        self.args = args
        # Filled by load_train_data() / load_test_data().
        self.train_data = None
        self.test_data = None

    def load_data(self, file_name):
        """Read ``file_name`` from the data directory and return its rows.

        A leftover ``'Unnamed: 0'`` column (the index pandas writes when a
        frame is saved with its index) is dropped when present; files without
        that column load unchanged instead of raising ``KeyError``.

        Args:
            file_name: CSV file name relative to ``args.data_dir``.

        Returns:
            ``DataFrame.values`` of the remaining columns.
        """
        csv_file_name = os.path.join(self.args.data_dir, file_name)
        df = pd.read_csv(csv_file_name)
        df = df.drop(columns=['Unnamed: 0'], errors='ignore')
        return df.values

    def load_train_data(self):
        """Load the training set into ``self.train_data``."""
        self.train_data = self.load_data('train_data_kr.csv')

    def load_test_data(self):
        """Load the test set into ``self.test_data``."""
        self.test_data = self.load_data('test_data_translated.csv')


class YNAT_dataset(torch.utils.data.Dataset):
    """Map-style dataset over an indexable collection of rows.

    Each item is the selected row copied into a plain Python list; the fields
    are passed through untouched (strings stay strings, no tensor
    conversion), leaving batching to the DataLoader's collate function.
    """

    def __init__(self, args, data, is_inference):
        self.args = args
        self.data = data
        self.is_inference = is_inference

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        row = self.data[index]
        fields = []
        for position in range(len(row)):
            fields.append(row[position])
        return fields



## Trainer

In [6]:
from sklearn.metrics import accuracy_score
from torch.nn.functional import one_hot
from tqdm import tqdm
from sklearn import metrics


def run(args, tokenizer, train_data, valid_data, cv_count):
    """Train one model (one cross-validation fold) and return its best accuracy.

    Each epoch trains, validates, saves a checkpoint whenever validation
    accuracy improves, stops early after ``args.patience`` epochs without
    improvement, and finally advances the learning-rate scheduler.

    Args:
        args: configuration object.
        tokenizer: tokenizer handed to train() and validate().
        train_data: training rows for this fold.
        valid_data: validation rows for this fold.
        cv_count: 1-based fold number, used in the checkpoint file name.

    Returns:
        The best validation accuracy seen.
    """
    train_loader, valid_loader = get_loaders(args, train_data, valid_data)

    # NOTE: the 'linear_warmup' scheduler reads args.total_steps and
    # args.warmup_steps, which are not computed here.

    model = get_model(args)
    optimizer = get_optimizer(model, args)
    scheduler = get_scheduler(optimizer, args)

    best_acc = -1
    stale_epochs = 0
    for epoch in range(args.n_epochs):

        print(f"Start Training: Epoch {epoch + 1}")

        # One checkpoint file per fold when cross-validating.
        if args.cv_strategy:
            stem = args.run_name.split('.pt')[0]
            model_name = f"{stem}_{cv_count}.pt"
        else:
            model_name = args.run_name

        train_acc, train_loss = train(args, model, tokenizer, train_loader, optimizer)
        acc, val_loss = validate(args, model, tokenizer, valid_loader)

        # This function reads the plateau scheduler's rate from the optimizer
        # instead of calling get_last_lr().
        on_plateau = args.scheduler == 'plateau'
        last_lr = optimizer.param_groups[0]['lr'] if on_plateau else scheduler.get_last_lr()[0]

        print({"epoch": epoch, "train_loss": train_loss, "train_acc": train_acc,
               "valid_acc": acc, "val_loss": val_loss, "learning_rate": last_lr})

        improved = acc > best_acc
        if improved:
            best_acc = acc
            # Unwrap torch.nn.DataParallel so the saved keys carry no 'module.' prefix.
            unwrapped = getattr(model, 'module', model)
            checkpoint = {
                'epoch': epoch + 1,
                'state_dict': unwrapped.state_dict(),
            }
            save_checkpoint(checkpoint, args.model_dir, model_name)
            stale_epochs = 0
        else:
            stale_epochs += 1
            if stale_epochs >= args.patience:
                print(f'EarlyStopping counter: {stale_epochs} out of {args.patience}')
                break

        # Advance the schedule; the plateau scheduler is driven by the best accuracy.
        if on_plateau:
            scheduler.step(best_acc)
        else:
            scheduler.step()

    return best_acc


def _write_predictions(write_path, ids, labels):
    """Write an ``index,topic_idx`` CSV with one line per prediction."""
    with open(write_path, 'w', encoding='utf8') as w:
        print("writing prediction : {}".format(write_path))
        w.write("index,topic_idx\n")
        for index, p in zip(ids, labels):
            w.write('{},{}\n'.format(index, p))


def inference(args, test_data):
    """Predict topics for ``test_data`` with every saved checkpoint.

    One CSV is written per checkpoint (``output.csv`` without cross
    validation, ``output_<fold>.csv`` otherwise). When more than one
    checkpoint is used, the per-fold logits are summed and the arg-max of the
    sum is written to ``output_softvote.csv``.

    Args:
        args: configuration; reads ``cv_strategy``, ``model_name``,
            ``fold_num``, ``max_seq_len``, ``device`` and ``output_dir``.
        test_data: rows of ``(index, text, text_kr)``.
    """
    all_fold_preds = []

    if not args.cv_strategy:
        ckpt_file_names = [args.model_name]
    else:
        ckpt_file_names = [f"{args.model_name.split('.pt')[0]}_{i + 1}.pt" for i in range(args.fold_num)]

    tokenizer = load_tokenizer(args)
    # The loader is unshuffled and identical for every fold, so build it once.
    test_loader = get_loaders(args, None, test_data, True)
    os.makedirs(args.output_dir, exist_ok=True)

    total_ids = []
    for fold_idx, ckpt in enumerate(ckpt_file_names):
        model = load_model(args, ckpt)
        model.eval()

        total_preds = []
        total_argmax_preds = []
        total_ids = []

        # No gradients are needed for prediction; this avoids building autograd graphs.
        with torch.no_grad():
            for step, batch in tqdm(enumerate(test_loader), desc='Inferencing', total=len(test_loader)):
                idx, text, text_kr = batch
                # Only the Korean text is fed to the model. truncation=True keeps
                # over-length inputs at max_seq_len so the batch stays rectangular.
                tokenized_examples = tokenizer(
                    text_kr,
                    max_length=args.max_seq_len,
                    padding="max_length",
                    truncation=True,
                    return_tensors="pt"
                ).to(args.device)

                preds = model(input_ids=tokenized_examples['input_ids'],
                              attention_mask=tokenized_examples['attention_mask'])

                # The custom head emits one logit vector per token; position 0 is used.
                logits = preds['logits'][:, 0, :]
                argmax_logits = torch.argmax(logits, dim=1)

                # .cpu() is a no-op for CPU tensors, so no device branch is needed.
                total_preds += list(logits.cpu().numpy())
                total_argmax_preds += list(argmax_logits.cpu().numpy())
                total_ids += list(idx)

        all_fold_preds.append(total_preds)

        output_file_name = "output.csv" if not args.cv_strategy else f"output_{fold_idx + 1}.csv"
        _write_predictions(os.path.join(args.output_dir, output_file_name), total_ids, total_argmax_preds)

    if len(all_fold_preds) > 1:
        # Soft-voting ensemble: add the logits of all folds, then take the arg-max.
        votes = np.sum(all_fold_preds, axis=0)
        votes = np.argmax(votes, axis=1)
        _write_predictions(os.path.join(args.output_dir, "output_softvote.csv"), total_ids, votes)


def train(args, model, tokenizer, train_loader, optimizer):
    """Run one training epoch.

    Args:
        args: configuration; reads ``device``, ``max_seq_len`` and
            ``log_steps`` (plus whatever compute_loss() and update_params()
            read).
        model: model to train; its output must expose ``'logits'``.
        tokenizer: callable that turns a batch of texts into model inputs.
        train_loader: iterable of ``(idx, text, text_kr, label)`` batches.
        optimizer: optimizer passed on to update_params().

    Returns:
        ``(accuracy, mean_loss)`` over the epoch.
    """
    model.train()

    total_preds = []
    total_targets = []
    losses = []
    for step, batch in tqdm(enumerate(train_loader), desc='Training', total=len(train_loader)):
        idx, text, text_kr, label = batch
        label = label.to(args.device)

        # Only the Korean text is fed to the model. truncation=True keeps
        # over-length inputs at max_seq_len so the batch stays rectangular.
        tokenized_examples = tokenizer(
            text_kr,
            max_length=args.max_seq_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        ).to(args.device)

        # Only input_ids and attention_mask are forwarded to the model.
        preds = model(input_ids=tokenized_examples['input_ids'],
                      attention_mask=tokenized_examples['attention_mask'])

        # The custom head emits one logit vector per token; position 0 is used.
        logits = preds['logits'][:, 0, :]
        argmax_logits = torch.argmax(logits, dim=1)

        loss = compute_loss(logits, label, args)

        update_params(loss, model, optimizer, step, len(train_loader), args)

        if step % args.log_steps == 0:
            print(f"Training steps: {step} Loss: {str(loss.item())}")

        # .cpu() is a no-op for CPU tensors, so no device branch is needed.
        total_preds.append(argmax_logits.detach().cpu().numpy())
        total_targets.append(label.detach().cpu().numpy())
        losses.append(loss.detach().cpu().numpy())

    total_preds = np.concatenate(total_preds)
    total_targets = np.concatenate(total_targets)

    # Epoch-level accuracy and mean batch loss.
    acc = accuracy_score(total_targets, total_preds)
    loss_avg = sum(losses) / len(losses)
    print(f'TRAIN ACC : {acc}, TRAIN LOSS : {loss_avg}')
    return acc, loss_avg


def validate(args, model, tokenizer, valid_loader):
    """Evaluate the model on the validation set.

    Prints a per-class classification report and the per-class recall
    (confusion-matrix diagonal divided by the row sums).

    Args:
        args: configuration; reads ``device``, ``max_seq_len`` and
            ``log_steps`` (plus whatever compute_loss() reads).
        model: model to evaluate; its output must expose ``'logits'``.
        tokenizer: callable that turns a batch of texts into model inputs.
        valid_loader: iterable of ``(idx, text, text_kr, label)`` batches.

    Returns:
        ``(accuracy, mean_loss)`` over the validation set.
    """
    model.eval()

    total_preds = []
    total_targets = []
    losses = []
    # Evaluation needs no gradients; this avoids building autograd graphs.
    with torch.no_grad():
        for step, batch in tqdm(enumerate(valid_loader), desc='Validation', total=len(valid_loader)):
            idx, text, text_kr, label = batch
            label = label.to(args.device)

            # Only the Korean text is fed to the model. truncation=True keeps
            # over-length inputs at max_seq_len so the batch stays rectangular.
            tokenized_examples = tokenizer(
                text_kr,
                max_length=args.max_seq_len,
                padding="max_length",
                truncation=True,
                return_tensors="pt"
            ).to(args.device)

            preds = model(input_ids=tokenized_examples['input_ids'],
                          attention_mask=tokenized_examples['attention_mask'])

            # The custom head emits one logit vector per token; position 0 is used.
            logits = preds['logits'][:, 0, :]
            argmax_logits = torch.argmax(logits, dim=1)

            loss = compute_loss(logits, label, args)

            if step % args.log_steps == 0:
                print(f"Validation steps: {step} Loss: {str(loss.item())}")

            # .cpu() is a no-op for CPU tensors, so no device branch is needed.
            total_preds.append(argmax_logits.cpu().numpy())
            total_targets.append(label.cpu().numpy())
            losses.append(loss.cpu().numpy())

    total_preds = np.concatenate(total_preds)
    total_targets = np.concatenate(total_targets)

    # Per-class report. Passing the label ids explicitly keeps the report and
    # the matrix 7-wide even when a class is missing from this split.
    target_names = ['IT과학', '경제', '사회', '생활문화', '세계', '스포츠', '정치']
    label_ids = list(range(len(target_names)))
    print(metrics.classification_report(total_targets, total_preds, labels=label_ids,
                                        target_names=target_names))
    matrix = metrics.confusion_matrix(total_targets, total_preds, labels=label_ids)
    print(matrix.diagonal()/matrix.sum(axis=1))

    # Overall accuracy and mean batch loss.
    acc = accuracy_score(total_targets, total_preds)
    loss_avg = sum(losses) / len(losses)
    print(f'VALID ACC : {acc}, VALID LOSS : {loss_avg}')
    return acc, loss_avg


## Train

In [7]:
import torch
from sklearn.model_selection import KFold, StratifiedKFold
from transformers import AutoConfig, AutoTokenizer, AutoModelForSequenceClassification
from datetime import datetime
from pytz import timezone


def main(args):
    """Train with K-fold cross validation and report the mean best accuracy.

    ``args.cv_strategy == 'random'`` uses ``KFold``; any other value uses
    ``StratifiedKFold`` on the last column of the data (the label). When
    ``cv_strategy`` is falsy only the first fold is trained and no average is
    printed.

    Args:
        args: configuration object; ``run_name`` (when empty) and ``device``
            are filled in here.
    """
    if not args.run_name:
        # Fall back to a Seoul-time timestamp as the run name.
        args.run_name = datetime.now(timezone("Asia/Seoul")).strftime("%Y-%m-%d-%H:%M:%S")

    set_seeds(args.seed)

    args.device = "cuda" if torch.cuda.is_available() else "cpu"

    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name or args.model_name_or_path)

    preprocess = Preprocess(args)
    preprocess.load_train_data()
    train_data_origin = preprocess.train_data

    print(f"Size of train data : {len(train_data_origin)}")

    if args.cv_strategy == 'random':
        kf = KFold(n_splits=args.fold_num, shuffle=True)
        splits = kf.split(X=train_data_origin)
    else:
        # Default: stratify on the label, which is the last column of every row.
        train_labels = [sequence[-1] for sequence in train_data_origin]
        skf = StratifiedKFold(n_splits=args.fold_num, shuffle=True)
        splits = skf.split(X=train_data_origin, y=train_labels)

    acc_avg = 0
    for fold_num, (train_index, valid_index) in enumerate(splits):
        train_data = train_data_origin[train_index]
        valid_data = train_data_origin[valid_index]
        best_acc = run(args, tokenizer, train_data, valid_data, fold_num + 1)

        if not args.cv_strategy:
            # Single hold-out run: train on the first split only.
            break

        acc_avg += best_acc

    if args.cv_strategy:
        acc_avg /= args.fold_num

        # The averaged metric is accuracy, so the banner is labelled accordingly.
        print("*" * 50, 'acc_avg', "*" * 50)
        print(acc_avg)


## Run

In [8]:
import argparse
import easydict

def parse_args():
    """Return the default experiment configuration as an ``EasyDict``.

    All paths are derived from the notebook-level ``cur_dir``.
    """
    args = easydict.EasyDict({'run_name' : 'temp',
                             # Checkpoint base name read by inference()/load_model();
                             # kept equal to run_name, which run() uses when saving.
                             'model_name' : 'temp',
                             'seed':42,
                             'device' :'cuda',
                             'data_dir': cur_dir + '/data/open/',
                             'model_dir' : cur_dir + '/models/',
                             'model_name_or_path' : 'klue/roberta-large',
                             #'model_name_or_path' : 'xlm-roberta-large',
                             'config_name' : None,
                             'tokenizer_name' : None,
                             'output_dir' : cur_dir + '/output/translation/',

                             'accum_iter' : 8,
                             'gradient_accumulation' : True,

                             'cv_strategy' : 'stratified',
                             'fold_num' : 4,

                             'num_workers' : 1,

                             # Training
                             'n_epochs' : 3,
                             'batch_size' : 16,
                             'lr' : 5e-6,
                             'clip_grad' : 10,
                             'patience' : 5,
                             'max_seq_len' : 110,

                             # Optimizer
                             'optimizer' : 'adamW',

                             # Optimizer parameters
                             'weight_decay' : 0.05,
                             'momentum' : 0.9,

                             # Scheduler
                             'scheduler' : 'step_lr',

                             # Scheduler parameters
                             # plateau
                             'plateau_patience' : 10,
                             'plateau_factor' : 0.5,

                             # cosine annealing (with and without warm restarts)
                             't_max' : 10,
                             'T_0' : 10,
                             'T_mult' : 2,
                             # Must be a plain identifier: get_scheduler() reads args.eta_min.
                             'eta_min' : 0.01,

                             # linear_warmup
                             'warmup_ratio' : 0.3,

                             # Step LR
                             'step_size' : 50,
                             'gamma' : 0.1,

                             'criterion' : 'CE',

                             'log_steps' : 100})

    return args

In [9]:
# Entry point: build the default configuration and start training via main().
if __name__ == '__main__':
    args = parse_args()
    main(args)

Size of train data : 45654


Some weights of the model checkpoint at klue/roberta-large were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifi

Start Training: Epoch 1


Training:   0%|          | 1/2140 [00:00<10:41,  3.33it/s]

Training steps: 0 Loss: 1.9433690309524536


Training:   5%|▍         | 101/2140 [00:29<10:00,  3.40it/s]

Training steps: 100 Loss: 1.9136300086975098


Training:   9%|▉         | 201/2140 [00:59<09:49,  3.29it/s]

Training steps: 200 Loss: 1.806891679763794


Training:  14%|█▍        | 301/2140 [01:29<09:02,  3.39it/s]

Training steps: 300 Loss: 1.4294134378433228


Training:  19%|█▊        | 401/2140 [01:59<08:48,  3.29it/s]

Training steps: 400 Loss: 1.0813603401184082


Training:  23%|██▎       | 501/2140 [02:29<08:04,  3.38it/s]

Training steps: 500 Loss: 1.0171688795089722


Training:  28%|██▊       | 601/2140 [02:59<07:45,  3.30it/s]

Training steps: 600 Loss: 0.7323077321052551


Training:  33%|███▎      | 701/2140 [03:29<07:03,  3.40it/s]

Training steps: 700 Loss: 0.7875252962112427


Training:  37%|███▋      | 801/2140 [03:58<06:46,  3.29it/s]

Training steps: 800 Loss: 0.592852771282196


Training:  42%|████▏     | 901/2140 [04:28<06:04,  3.39it/s]

Training steps: 900 Loss: 0.49834778904914856


Training:  47%|████▋     | 1001/2140 [04:58<05:45,  3.30it/s]

Training steps: 1000 Loss: 0.7653273940086365


Training:  51%|█████▏    | 1101/2140 [05:28<05:06,  3.39it/s]

Training steps: 1100 Loss: 0.2410917580127716


Training:  56%|█████▌    | 1201/2140 [05:58<04:46,  3.28it/s]

Training steps: 1200 Loss: 1.0456475019454956


Training:  61%|██████    | 1301/2140 [06:28<04:07,  3.39it/s]

Training steps: 1300 Loss: 0.08563227951526642


Training:  65%|██████▌   | 1401/2140 [06:57<03:44,  3.29it/s]

Training steps: 1400 Loss: 0.24742986261844635


Training:  70%|███████   | 1501/2140 [07:27<03:08,  3.40it/s]

Training steps: 1500 Loss: 0.3870387375354767


Training:  75%|███████▍  | 1601/2140 [07:57<02:43,  3.29it/s]

Training steps: 1600 Loss: 0.30874815583229065


Training:  79%|███████▉  | 1701/2140 [08:27<02:09,  3.39it/s]

Training steps: 1700 Loss: 0.3583820164203644


Training:  84%|████████▍ | 1801/2140 [08:57<01:42,  3.29it/s]

Training steps: 1800 Loss: 0.49719908833503723


Training:  89%|████████▉ | 1901/2140 [09:27<01:10,  3.38it/s]

Training steps: 1900 Loss: 0.5124335289001465


Training:  94%|█████████▎| 2001/2140 [09:56<00:42,  3.31it/s]

Training steps: 2000 Loss: 0.6722394227981567


Training:  98%|█████████▊| 2101/2140 [10:26<00:11,  3.39it/s]

Training steps: 2100 Loss: 0.6797186136245728


Training: 100%|██████████| 2140/2140 [10:38<00:00,  3.35it/s]

TRAIN ACC : 0.7533002336448598, TRAIN LOSS : 0.7996995668475316



Training:   0%|          | 1/714 [00:00<01:17,  9.18it/s]

Validation steps: 0 Loss: 0.4153074324131012


Training:  14%|█▍        | 102/714 [00:10<01:01,  9.92it/s]

Validation steps: 100 Loss: 0.41167908906936646


Training:  28%|██▊       | 201/714 [00:20<00:50, 10.06it/s]

Validation steps: 200 Loss: 0.5745862722396851


Training:  42%|████▏     | 303/714 [00:30<00:40, 10.10it/s]

Validation steps: 300 Loss: 0.33740565180778503


Training:  56%|█████▋    | 403/714 [00:40<00:30, 10.04it/s]

Validation steps: 400 Loss: 0.5405115485191345


Training:  70%|███████   | 503/714 [00:50<00:20, 10.09it/s]

Validation steps: 500 Loss: 0.1197073683142662


Training:  84%|████████▍ | 603/714 [01:00<00:10, 10.10it/s]

Validation steps: 600 Loss: 0.4015890061855316


Training:  98%|█████████▊| 703/714 [01:09<00:01, 10.05it/s]

Validation steps: 700 Loss: 0.2175680249929428


Training: 100%|██████████| 714/714 [01:11<00:00, 10.04it/s]


              precision    recall  f1-score   support

        IT과학       0.76      0.91      0.82      1206
          경제       0.83      0.80      0.82      1555
          사회       0.77      0.72      0.74      1841
        생활문화       0.88      0.84      0.86      1483
          세계       0.88      0.89      0.89      1908
         스포츠       0.94      0.97      0.95      1734
          정치       0.89      0.85      0.87      1687

    accuracy                           0.85     11414
   macro avg       0.85      0.85      0.85     11414
weighted avg       0.85      0.85      0.85     11414

[0.90713101 0.80385852 0.71700163 0.84086312 0.89465409 0.96828143
 0.85299348]
VALID ACC : 0.8529875591379008, VALID LOSS : 0.46265474338011414
{'epoch': 0, 'train_loss': 0.7996995668475316, 'train_acc': 0.7533002336448598, 'valid_acc': 0.8529875591379008, 'val_loss': 0.46265474338011414, 'learning_rate': 5e-06}
saving model ...
Start Training: Epoch 2


Training:   0%|          | 1/2140 [00:00<15:05,  2.36it/s]

Training steps: 0 Loss: 0.5124078989028931


Training:   5%|▍         | 101/2140 [00:30<10:00,  3.40it/s]

Training steps: 100 Loss: 0.20073184370994568


Training:   9%|▉         | 201/2140 [01:00<09:48,  3.30it/s]

Training steps: 200 Loss: 0.7373939752578735


Training:  14%|█▍        | 301/2140 [01:29<09:03,  3.38it/s]

Training steps: 300 Loss: 0.10243690758943558


Training:  19%|█▊        | 401/2140 [01:59<08:48,  3.29it/s]

Training steps: 400 Loss: 0.6937940716743469


Training:  23%|██▎       | 501/2140 [02:29<08:02,  3.40it/s]

Training steps: 500 Loss: 0.5122249722480774


Training:  28%|██▊       | 601/2140 [02:59<07:45,  3.31it/s]

Training steps: 600 Loss: 0.21854956448078156


Training:  33%|███▎      | 701/2140 [03:29<07:04,  3.39it/s]

Training steps: 700 Loss: 0.09707139432430267


Training:  37%|███▋      | 801/2140 [03:58<06:46,  3.29it/s]

Training steps: 800 Loss: 0.5887240767478943


Training:  42%|████▏     | 901/2140 [04:28<06:04,  3.39it/s]

Training steps: 900 Loss: 0.3460904657840729


Training:  47%|████▋     | 1001/2140 [04:58<05:45,  3.30it/s]

Training steps: 1000 Loss: 0.27482932806015015


Training:  51%|█████▏    | 1101/2140 [05:28<05:06,  3.39it/s]

Training steps: 1100 Loss: 0.9223419427871704


Training:  56%|█████▌    | 1201/2140 [05:58<04:44,  3.30it/s]

Training steps: 1200 Loss: 1.0811665058135986


Training:  61%|██████    | 1301/2140 [06:28<04:07,  3.39it/s]

Training steps: 1300 Loss: 0.08836578577756882


Training:  65%|██████▌   | 1401/2140 [06:57<03:43,  3.31it/s]

Training steps: 1400 Loss: 0.06638966500759125


Training:  70%|███████   | 1501/2140 [07:27<03:08,  3.39it/s]

Training steps: 1500 Loss: 0.13151131570339203


Training:  75%|███████▍  | 1601/2140 [07:57<02:43,  3.30it/s]

Training steps: 1600 Loss: 0.35465675592422485


Training:  79%|███████▉  | 1701/2140 [08:27<02:09,  3.39it/s]

Training steps: 1700 Loss: 0.30872491002082825


Training:  84%|████████▍ | 1801/2140 [08:57<01:42,  3.30it/s]

Training steps: 1800 Loss: 0.1310228556394577


Training:  89%|████████▉ | 1901/2140 [09:27<01:10,  3.39it/s]

Training steps: 1900 Loss: 0.16465263068675995


Training:  94%|█████████▎| 2001/2140 [09:56<00:42,  3.30it/s]

Training steps: 2000 Loss: 0.4079093039035797


Training:  98%|█████████▊| 2101/2140 [10:26<00:11,  3.39it/s]

Training steps: 2100 Loss: 0.6498906016349792


Training: 100%|██████████| 2140/2140 [10:38<00:00,  3.35it/s]

TRAIN ACC : 0.8610689252336449, TRAIN LOSS : 0.4313633055643779



Training:   0%|          | 1/714 [00:00<01:16,  9.29it/s]

Validation steps: 0 Loss: 0.4538847804069519


Training:  14%|█▍        | 103/714 [00:10<01:00, 10.09it/s]

Validation steps: 100 Loss: 0.2598813474178314


Training:  28%|██▊       | 203/714 [00:20<00:50, 10.05it/s]

Validation steps: 200 Loss: 0.5158703327178955


Training:  42%|████▏     | 303/714 [00:30<00:40, 10.11it/s]

Validation steps: 300 Loss: 0.2659655511379242


Training:  56%|█████▋    | 403/714 [00:39<00:30, 10.12it/s]

Validation steps: 400 Loss: 0.5282600522041321


Training:  70%|███████   | 503/714 [00:49<00:20, 10.12it/s]

Validation steps: 500 Loss: 0.08703335374593735


Training:  84%|████████▍ | 603/714 [00:59<00:10, 10.13it/s]

Validation steps: 600 Loss: 0.39144811034202576


Training:  98%|█████████▊| 703/714 [01:09<00:01, 10.11it/s]

Validation steps: 700 Loss: 0.17801326513290405


Training: 100%|██████████| 714/714 [01:10<00:00, 10.10it/s]


              precision    recall  f1-score   support

        IT과학       0.81      0.84      0.83      1206
          경제       0.83      0.81      0.82      1555
          사회       0.77      0.73      0.75      1841
        생활문화       0.86      0.87      0.86      1483
          세계       0.87      0.91      0.89      1908
         스포츠       0.94      0.97      0.95      1734
          정치       0.90      0.85      0.87      1687

    accuracy                           0.86     11414
   macro avg       0.85      0.85      0.85     11414
weighted avg       0.85      0.86      0.85     11414

[0.83996683 0.81414791 0.72569256 0.86581254 0.90828092 0.97289504
 0.85477178]
VALID ACC : 0.8551778517609953, VALID LOSS : 0.4374374602725651
{'epoch': 1, 'train_loss': 0.4313633055643779, 'train_acc': 0.8610689252336449, 'valid_acc': 0.8551778517609953, 'val_loss': 0.4374374602725651, 'learning_rate': 5e-06}
saving model ...
Start Training: Epoch 3


Training:   0%|          | 1/2140 [00:00<11:29,  3.10it/s]

Training steps: 0 Loss: 0.2774089276790619


Training:   5%|▍         | 101/2140 [00:30<10:00,  3.40it/s]

Training steps: 100 Loss: 0.5430668592453003


Training:   9%|▉         | 201/2140 [00:59<09:48,  3.30it/s]

Training steps: 200 Loss: 0.41121089458465576


Training:  14%|█▍        | 301/2140 [01:29<09:02,  3.39it/s]

Training steps: 300 Loss: 0.5244501829147339


Training:  19%|█▊        | 401/2140 [01:59<08:47,  3.30it/s]

Training steps: 400 Loss: 0.11308394372463226


Training:  23%|██▎       | 501/2140 [02:29<08:03,  3.39it/s]

Training steps: 500 Loss: 0.056051794439554214


Training:  28%|██▊       | 601/2140 [02:59<07:46,  3.30it/s]

Training steps: 600 Loss: 0.19462290406227112


Training:  33%|███▎      | 701/2140 [03:28<07:03,  3.40it/s]

Training steps: 700 Loss: 1.1651599407196045


Training:  37%|███▋      | 801/2140 [03:58<06:46,  3.29it/s]

Training steps: 800 Loss: 0.606045126914978


Training:  42%|████▏     | 901/2140 [04:28<06:04,  3.40it/s]

Training steps: 900 Loss: 0.5682233572006226


Training:  47%|████▋     | 1001/2140 [04:58<05:45,  3.30it/s]

Training steps: 1000 Loss: 0.42930343747138977


Training:  51%|█████▏    | 1101/2140 [05:28<05:05,  3.40it/s]

Training steps: 1100 Loss: 0.1112217977643013


Training:  56%|█████▌    | 1201/2140 [05:58<04:44,  3.30it/s]

Training steps: 1200 Loss: 0.09100695699453354


Training:  61%|██████    | 1301/2140 [06:27<04:07,  3.39it/s]

Training steps: 1300 Loss: 0.597690761089325


Training:  65%|██████▌   | 1401/2140 [06:57<03:44,  3.29it/s]

Training steps: 1400 Loss: 0.3262377679347992


Training:  70%|███████   | 1501/2140 [07:27<03:08,  3.39it/s]

Training steps: 1500 Loss: 0.28694239258766174


Training:  75%|███████▍  | 1601/2140 [07:57<02:43,  3.30it/s]

Training steps: 1600 Loss: 0.4860115051269531


Training:  79%|███████▉  | 1701/2140 [08:27<02:09,  3.39it/s]

Training steps: 1700 Loss: 0.5095551609992981


Training:  84%|████████▍ | 1801/2140 [08:57<01:42,  3.30it/s]

Training steps: 1800 Loss: 0.19981536269187927


Training:  89%|████████▉ | 1901/2140 [09:26<01:10,  3.40it/s]

Training steps: 1900 Loss: 0.6343971490859985


Training:  94%|█████████▎| 2001/2140 [09:56<00:42,  3.29it/s]

Training steps: 2000 Loss: 0.5093687176704407


Training:  98%|█████████▊| 2101/2140 [10:26<00:11,  3.39it/s]

Training steps: 2100 Loss: 0.3751487731933594


Training: 100%|██████████| 2140/2140 [10:38<00:00,  3.35it/s]

TRAIN ACC : 0.8788843457943926, TRAIN LOSS : 0.3686391635649951



Training:   0%|          | 1/714 [00:00<01:17,  9.23it/s]

Validation steps: 0 Loss: 0.4746152460575104


Training:  14%|█▍        | 103/714 [00:10<01:00, 10.13it/s]

Validation steps: 100 Loss: 0.2608073353767395


Training:  28%|██▊       | 203/714 [00:20<00:50, 10.08it/s]

Validation steps: 200 Loss: 0.5237995386123657


Training:  42%|████▏     | 303/714 [00:30<00:41, 10.01it/s]

Validation steps: 300 Loss: 0.21108528971672058


Training:  56%|█████▋    | 403/714 [00:39<00:30, 10.08it/s]

Validation steps: 400 Loss: 0.5969356894493103


Training:  70%|███████   | 503/714 [00:49<00:20, 10.11it/s]

Validation steps: 500 Loss: 0.06738651543855667


Training:  84%|████████▍ | 603/714 [00:59<00:11, 10.06it/s]

Validation steps: 600 Loss: 0.3271309733390808


Training:  98%|█████████▊| 703/714 [01:09<00:01, 10.09it/s]

Validation steps: 700 Loss: 0.08999928832054138


Training: 100%|██████████| 714/714 [01:10<00:00, 10.09it/s]


              precision    recall  f1-score   support

        IT과학       0.78      0.89      0.83      1206
          경제       0.82      0.82      0.82      1555
          사회       0.74      0.74      0.74      1841
        생활문화       0.89      0.83      0.86      1483
          세계       0.89      0.90      0.89      1908
         스포츠       0.94      0.97      0.95      1734
          정치       0.92      0.83      0.87      1687

    accuracy                           0.85     11414
   macro avg       0.85      0.85      0.85     11414
weighted avg       0.86      0.85      0.85     11414

[0.8880597  0.81672026 0.74416078 0.83412003 0.9009434  0.97174164
 0.82691168]
VALID ACC : 0.8539512878920624, VALID LOSS : 0.4393319756924814
{'epoch': 2, 'train_loss': 0.3686391635649951, 'train_acc': 0.8788843457943926, 'valid_acc': 0.8539512878920624, 'val_loss': 0.4393319756924814, 'learning_rate': 5e-06}


Some weights of the model checkpoint at klue/roberta-large were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifi

Start Training: Epoch 1


Training:   0%|          | 1/2140 [00:00<11:37,  3.07it/s]

Training steps: 0 Loss: 1.938393473625183


Training:   5%|▍         | 101/2140 [00:30<10:00,  3.39it/s]

Training steps: 100 Loss: 1.838400959968567


Training:   9%|▉         | 201/2140 [00:59<09:47,  3.30it/s]

Training steps: 200 Loss: 1.8236443996429443


Training:  14%|█▍        | 301/2140 [01:29<09:02,  3.39it/s]

Training steps: 300 Loss: 1.7307792901992798


Training:  19%|█▊        | 401/2140 [01:59<08:49,  3.29it/s]

Training steps: 400 Loss: 1.3967061042785645


Training:  23%|██▎       | 501/2140 [02:29<08:02,  3.40it/s]

Training steps: 500 Loss: 0.9632500410079956


Training:  28%|██▊       | 601/2140 [02:59<07:45,  3.31it/s]

Training steps: 600 Loss: 0.7551053166389465


Training:  33%|███▎      | 701/2140 [03:29<07:03,  3.39it/s]

Training steps: 700 Loss: 0.27627038955688477


Training:  37%|███▋      | 801/2140 [03:59<06:46,  3.30it/s]

Training steps: 800 Loss: 0.8268328309059143


Training:  42%|████▏     | 901/2140 [04:28<06:05,  3.39it/s]

Training steps: 900 Loss: 0.4033464193344116


Training:  47%|████▋     | 1001/2140 [04:58<05:45,  3.30it/s]

Training steps: 1000 Loss: 0.7108793258666992


Training:  51%|█████▏    | 1101/2140 [05:28<05:06,  3.39it/s]

Training steps: 1100 Loss: 0.5290884971618652


Training:  56%|█████▌    | 1201/2140 [05:58<04:44,  3.30it/s]

Training steps: 1200 Loss: 0.9123919606208801


Training:  60%|█████▉    | 1274/2140 [06:20<04:19,  3.34it/s]

KeyboardInterrupt: ignored

In [None]:
# Optional: uncomment to release unused cached GPU memory held by PyTorch.
# torch.cuda.empty_cache()

In [None]:
# Show the current GPU memory usage and utilisation.
!nvidia-smi

## Inference

In [None]:
def inference_main(model_name="temp"):
    """Load the test data and run inference on it.

    Builds the argument object with ``parse_args()``, stores ``model_name``
    on it, loads the test data through ``Preprocess`` and hands both to
    ``inference``.

    Args:
        model_name: Value assigned to ``args.model_name`` before the test
            data is loaded and ``inference`` is called. Defaults to
            ``"temp"``, the value that used to be hard-coded here, so
            calling ``inference_main()`` behaves exactly as before.

    Returns:
        None. Whatever ``inference`` returns is not propagated.
    """
    args = parse_args()
    args.model_name = model_name

    preprocess = Preprocess(args)
    preprocess.load_test_data()
    test_data = preprocess.test_data

    print(f"size of test data : {len(test_data)}")

    # Release unoccupied memory held by PyTorch's CUDA caching allocator
    # (e.g. left over from an earlier training run in this kernel) before
    # inference starts.
    torch.cuda.empty_cache()
    inference(args, test_data)

# Execute the inference pipeline defined above on the test data.
inference_main()