# KLUE-benchmark 논문 보고 재현해보자..

In [1]:
from omegaconf import OmegaConf
from lightning_transformers.core.nlp import HFTransformerDataConfig

# Load the task-specific YAML and layer it over the HFTransformerDataConfig
# defaults, so that any key present in the YAML overrides the default value.
args = OmegaConf.load('dm_config/ynat_base.yaml')
# vars() exposes the config object's fields as a plain dict; a single
# OmegaConf.create call is enough to turn that dict into a config node.
data_args = OmegaConf.create(vars(HFTransformerDataConfig(batch_size=args.batch_size)))
data_args = OmegaConf.merge(data_args, args)

  '"sox" backend is being deprecated. '


In [2]:
from typing import Optional
from dataclasses import dataclass, field
from transformers import TrainingArguments

@dataclass
class ModelArguments:
    """
    Selects the pretrained model, config and tokenizer to fine-tune from.

    Only ``model_name_or_path`` is required; every other field has a default.
    """

    # Required: no default is provided for the checkpoint identifier.
    model_name_or_path: str = field(
        metadata=dict(help="Path to pretrained model or model identifier from huggingface.co/models"),
    )

    # Optional overrides for the config / tokenizer source.
    config_name: Optional[str] = field(
        metadata=dict(help="Pretrained config name or path if not the same as model_name"),
        default=None,
    )
    tokenizer_name: Optional[str] = field(
        metadata=dict(help="Pretrained tokenizer name or path if not the same as model_name"),
        default=None,
    )

    # Download location and tokenizer implementation.
    cache_dir: Optional[str] = field(
        metadata=dict(help="Where do you want to store the pretrained models downloaded from huggingface.co"),
        default=None,
    )
    use_fast_tokenizer: bool = field(
        metadata=dict(
            help="Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."
        ),
        default=False,
    )

    # Hub revision and authentication.
    model_revision: str = field(
        metadata=dict(
            help="The specific model version to use (can be a branch name, tag name or commit id)."
        ),
        default="main",
    )
    use_auth_token: bool = field(
        metadata=dict(
            help=(
                "Will use the token generated when running `transformers-cli login` "
                "(necessary to use this script with private models)."
            )
        ),
        default=False,
    )
               
# Checkpoint to fine-tune; all other ModelArguments fields keep their defaults.
model_args = ModelArguments(model_name_or_path="klue/roberta-small")

training_args = TrainingArguments(
    # Reproducibility
    seed=42,
    # Checkpoint output
    output_dir="ckpt/ynat",
    overwrite_output_dir=True,
    # Which phases are enabled
    do_train=True,
    do_eval=True,
    do_predict=False,
    # Batch sizes
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # Evaluation / logging cadence
    evaluation_strategy="steps",
    logging_strategy="steps",
    report_to="none",
    # Model selection criterion (higher is better)
    metric_for_best_model="macro-f1",
    greater_is_better=True,
)

In [3]:
from transformers import AutoTokenizer
from src.datamodules.task.nlp import TextClassificationDataModule

# Load the tokenizer that belongs to the checkpoint named in model_args.
tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path)
# Project-local data module; it receives the tokenizer and the merged data config.
dm = TextClassificationDataModule(tokenizer, data_args)

In [4]:
from typing import Dict
from datetime import datetime

import numpy as np
import torch
from torch import nn

import pytorch_lightning as pl
from datasets import load_metric
from transformers import (
    AutoConfig,
    AutoModelForSequenceClassification,
    AdamW,
    get_linear_schedule_with_warmup
)
from transformers.modeling_outputs import SequenceClassifierOutput

In [5]:
class LitSequneceClassification(pl.LightningModule):
    """Lightning module that fine-tunes a Hugging Face sequence classifier.

    The checkpoint named by ``model_args.model_name_or_path`` is loaded with
    ``AutoModelForSequenceClassification``; validation predictions are scored
    with the ``datasets`` F1 metric using macro averaging.

    NOTE: the class name keeps its original spelling ("Sequnece") so existing
    callers continue to work.

    Args:
        model_args: Object exposing ``model_name_or_path``.
        training_args: Object exposing the optimisation settings read by this
            module (``per_device_train_batch_size``, ``weight_decay``,
            ``learning_rate``, ``adam_epsilon``, ``warmup_steps``).
        id2label: Mapping from class index to label name; its length sets
            the number of output labels.
        task_name: Passed to ``load_metric`` as the metric configuration name.
    """

    def __init__(
        self,
        model_args,
        training_args,
        id2label: Dict,
        task_name: str,
    ):
        super().__init__()
        # Stores all constructor arguments under self.hparams.
        self.save_hyperparameters()

        # init model
        self.config = AutoConfig.from_pretrained(
            self.hparams.model_args.model_name_or_path,
            num_labels=len(self.hparams.id2label),
            id2label=self.hparams.id2label,
            label2id={label: idx for idx, label in self.hparams.id2label.items()},
            output_hidden_states=True,  # get all hidden states
        )
        self.model = AutoModelForSequenceClassification.from_pretrained(
            self.hparams.model_args.model_name_or_path,
            config=self.config,
        )
        self.num_labels = self.config.num_labels

        # init metric; the timestamp is used as experiment_id for this metric instance
        self.metric = load_metric(
            'f1',
            self.hparams.task_name,
            experiment_id=datetime.now().strftime("%d-%m-%Y_%H-%M-%S"),
        )

    def forward(self, **inputs):
        """Forward all keyword inputs to the wrapped Hugging Face model."""
        return self.model(**inputs)

    def _step(self, batch, batch_idx):
        """Run one batch and return its loss, gold labels and argmax predictions."""
        outputs = self(**batch)
        return {
            "loss": outputs.loss,
            "y_true": batch['labels'],
            "y_pred": outputs.logits.argmax(dim=-1),
        }

    def training_step(self, batch, batch_idx):
        """Compute the training loss for one batch.

        The step output must be returned: the loss is what gets optimised, and
        ``training_step_end`` indexes into this dict.
        """
        return self._step(batch, batch_idx)

    def validation_step(self, batch, batch_idx):
        """Compute loss and predictions for one validation batch."""
        return self._step(batch, batch_idx)

    @staticmethod
    def _merge_step_outputs(batch_parts):
        """Merge step outputs into one loss / labels / predictions dict.

        When ``loss`` is already a tensor the values are passed through
        unchanged; otherwise the parts are treated as sequences of tensors,
        the losses are averaged and labels / predictions are concatenated.
        """
        loss = batch_parts['loss']
        y_true = batch_parts['y_true']
        y_pred = batch_parts['y_pred']
        if not torch.is_tensor(loss):
            loss = torch.stack(loss).mean()
            y_true = torch.cat(y_true)
            y_pred = torch.cat(y_pred)
        return {"loss": loss, "y_true": y_true, "y_pred": y_pred}

    def training_step_end(self, batch_parts):
        """Merge the training step outputs and log the step loss as ``tr_loss``."""
        merged = self._merge_step_outputs(batch_parts)
        self.log('tr_loss', merged['loss'], on_step=True, prog_bar=True)
        return merged

    def validation_step_end(self, batch_parts):
        """Merge the validation step outputs and log the step loss as ``vl_loss``."""
        merged = self._merge_step_outputs(batch_parts)
        self.log('vl_loss', merged['loss'], on_step=True, prog_bar=True)
        return merged

    def training_epoch_end(self, outputs):
        """Log the mean training loss of the epoch as ``tr_avg_loss``."""
        loss = torch.stack([x['loss'] for x in outputs]).mean()
        self.log('tr_avg_loss', loss, on_epoch=True, prog_bar=True)

    def validation_epoch_end(self, outputs):
        """Log the mean validation loss and the macro-averaged F1 of the epoch.

        Returns:
            The mean validation loss over all collected step outputs.
        """
        y_true = torch.cat([x['y_true'] for x in outputs]).detach().cpu().numpy()
        y_pred = torch.cat([x['y_pred'] for x in outputs]).detach().cpu().numpy()
        loss = torch.stack([x['loss'] for x in outputs]).mean()

        self.log('val_avg_loss', loss, on_epoch=True, prog_bar=True)
        self.log_dict(
            self.metric.compute(predictions=y_pred, references=y_true, average='macro'),
            on_epoch=True,
            prog_bar=True,
        )
        return loss

    def setup(self, stage=None) -> None:
        """Compute ``self.total_steps`` for the LR scheduler when fitting.

        ``total_steps`` is the number of optimizer steps over the whole run:
        steps per epoch multiplied by the number of epochs.
        """
        if stage != 'fit':
            return

        # Get dataloader by calling it - train_dataloader() is called after setup() by default
        train_loader = self.train_dataloader()

        # Samples consumed per optimizer step. The per-device batch size is
        # used because the GPU count is applied explicitly here;
        # `train_batch_size` would already include Hugging Face's own GPU
        # multiplier and count the devices twice.
        # NOTE(review): assumes the data module batches with the same
        # per-device batch size as training_args - confirm.
        samples_per_step = (
            self.hparams.training_args.per_device_train_batch_size
            * max(1, self.trainer.num_gpus)
            * self.trainer.accumulate_grad_batches
        )
        steps_per_epoch = len(train_loader.dataset) // samples_per_step
        # Multiply (not divide) by the epoch count: the schedule spans all epochs.
        self.total_steps = steps_per_epoch * self.trainer.max_epochs

    def configure_optimizers(self):
        """Prepare optimizer and schedule (linear warmup and decay)"""
        training_args = self.hparams.training_args

        # Parameters whose name contains one of these markers get no weight decay.
        no_decay = ["bias", "LayerNorm.weight"]
        decay_params = []
        no_decay_params = []
        for name, param in self.named_parameters():
            if any(marker in name for marker in no_decay):
                no_decay_params.append(param)
            else:
                decay_params.append(param)

        optimizer = AdamW(
            [
                {"params": decay_params, "weight_decay": training_args.weight_decay},
                {"params": no_decay_params, "weight_decay": 0.0},
            ],
            lr=training_args.learning_rate,
            eps=training_args.adam_epsilon,
        )

        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=training_args.warmup_steps,
            num_training_steps=self.total_steps,
        )
        # Step the scheduler after every optimizer step rather than once per epoch.
        scheduler = {'scheduler': scheduler, 'interval': 'step', 'frequency': 1}
        return [optimizer], [scheduler]


In [6]:
from pytorch_lightning import seed_everything
# Seed the run with the seed stored in training_args.
seed_everything(training_args.seed)
# TensorBoard logger writing under the logs/ directory.
tb_logger = pl.loggers.TensorBoardLogger('logs/')

Global seed set to 42


In [7]:
# Set up the data module first: dm.id2label is read on the next line.
dm.setup()
# presumably data_args.finetuning_task comes from the YAML config — TODO confirm
model = LitSequneceClassification(model_args, training_args, dm.id2label, data_args.finetuning_task)

Using custom data configuration default-483d06c09187902b
Reusing dataset csv (/root/.cache/huggingface/datasets/csv/default-483d06c09187902b/0.0.0/2dc6629a9ff6b5697d82c25b73731dd440507a69cbce8b425db50b751e8fcfd0)


HBox(children=(FloatProgress(value=0.0, max=46.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))




Some weights of the model checkpoint at klue/roberta-small were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.decoder.bias', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-small and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.bias', 'classif

In [8]:
from pytorch_lightning import Trainer
# Train for at most 3 epochs and log through tb_logger.
# NOTE(review): gpus is given as the string '1' — confirm how the installed Lightning version parses it.
trainer = Trainer(max_epochs=3, gpus='1', logger=tb_logger)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores


In [9]:
# Start fine-tuning; the data module is passed alongside the model.
trainer.fit(model, dm)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]

  | Name  | Type                             | Params
-----------------------------------------------------------
0 | model | RobertaForSequenceClassification | 68.1 M
-----------------------------------------------------------
68.1 M    Trainable params
0         Non-trainable params
68.1 M    Total params
272.379   Total estimated model params size (MB)


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…

Global seed set to 42


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…






In [15]:
# Debug inspection: decode and print the first example of every training batch.
# NOTE(review): this iterates the whole dataloader; interrupt or break early for a quick peek.
for batch in dm.train_dataloader():
    print(tokenizer.decode(batch['input_ids'][0]))
    

[CLS] 유튜브 내달 2일까지 크리에이터 지원 공간 운영 [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
[CLS] 日 경제산업상 경제보복 비판여론에 언론이 제대로 이해못해 [SEP] [PAD] [PAD] [PAD] [PAD] [PAD]
[CLS] 신간 책에 빠져 죽지 않기 [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
[CLS] 北 국방위 검열단 천안함사건 북소행설은 억지 주장 [SEP] [PAD] [PAD] [PAD] [PAD] [PAD]
[CLS] KT 라인프렌즈 스마트폰에 데이터 제한 기능 추가 [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
[CLS] KTB투자 증시 추가 하락 우려 … 저점 매수 시점 아냐 [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
[CLS] MWC 2018 삼성의 AR 마법 갤럭시S9 언팩 [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
[CLS] KT 양양 서피비치서 5G 액티비티 선보여 [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
[CLS] 피닉스 NBA 최초로 非 북미 출신 코코스코프 감독 선임 [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
[CLS] 한국시인협회 회장에 윤석산 시인 취임 [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
[CLS] 센터 활용도까지 높인 대한항공 1승만 더하면 첫 챔프... [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
[CLS] 홍콩에 중국경찰 투입 시 체포자는 中 서 재판받을 것 [SEP] [PAD] [PAD] [PAD] [PAD]
[CLS] 광주에서 이어진 세계기자대회 … 5 · 18묘역 참

KeyboardInterrupt: 

In [10]:
# Look up the 'text' entry of the dataset behind the training dataloader.
dm.train_dataloader().dataset['text']

Dataset({
    features: ['Id', 'attention_mask', 'input_ids', 'label', 'labels', 'text', 'token_type_ids'],
    num_rows: 45678
})