# KoBART를 이용한 한국어 감정분류
사전학습모델 : [KoBART](https://github.com/SKT-AI/KoBART) <br>
데이터 : [NAVER Sentiment Movie Corpus](https://github.com/e9t/nsmc/)

In [None]:
!pip install git+https://github.com/SKT-AI/KoBART#egg=kobart
!pip install transfomers
!pip install datasets

In [2]:
import warnings

# 경고메세지 끄기
warnings.filterwarnings(action='ignore')

**NSMC 데이터 불러오기**

In [3]:
from datasets import load_dataset

datasets = load_dataset("nsmc")

Downloading builder script:   0%|          | 0.00/3.18k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.67k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/3.35k [00:00<?, ?B/s]

Downloading and preparing dataset nsmc/default (download: 18.62 MiB, generated: 20.90 MiB, post-processed: Unknown size, total: 39.52 MiB) to /root/.cache/huggingface/datasets/nsmc/default/1.1.0/bfd4729bf1a67114e5267e6916b9e4807010aeb238e4a3c2b95fbfa3a014b5f3...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/6.33M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.12M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/150000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Dataset nsmc downloaded and prepared to /root/.cache/huggingface/datasets/nsmc/default/1.1.0/bfd4729bf1a67114e5267e6916b9e4807010aeb238e4a3c2b95fbfa3a014b5f3. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'document', 'label'],
        num_rows: 150000
    })
    test: Dataset({
        features: ['id', 'document', 'label'],
        num_rows: 50000
    })
})

In [5]:
# label 0: negative(부정) / 1: positive(긍정)
for i in range(3):
    print("train", datasets["train"][i])
    print("test", datasets["test"][i])

train {'id': '9976970', 'document': '아 더빙.. 진짜 짜증나네요 목소리', 'label': 0}
test {'id': '6270596', 'document': '굳 ㅋ', 'label': 1}
train {'id': '3819312', 'document': '흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나', 'label': 1}
test {'id': '9274899', 'document': 'GDNTOPCLASSINTHECLUB', 'label': 0}
train {'id': '10265843', 'document': '너무재밓었다그래서보는것을추천한다', 'label': 0}
test {'id': '8544678', 'document': '뭐야 이 평점들은.... 나쁘진 않지만 10점 짜리는 더더욱 아니잖아', 'label': 0}


**KoBART 모델과 토크나이저 불러오기**

In [6]:
from transformers import BartModel
from kobart import get_pytorch_kobart_model, get_kobart_tokenizer

In [7]:
kobart_tokenizer = get_kobart_tokenizer()
model = BartModel.from_pretrained(get_pytorch_kobart_model())

/content/.cache/kobart_base_tokenizer_cased_cf74400bce.zip[██████████████████████████████████████████████████]
/content/.cache/kobart_base_cased_ff4bda5738.zip[██████████████████████████████████████████████████]


In [8]:
model.config

BartConfig {
  "_name_or_path": "/content/.cache/kobart_from_pretrained",
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartModel"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classif_dropout": 0.1,
  "classifier_dropout": 0.1,
  "d_model": 768,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 3072,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 2,
  "do_blenderbot_90_layernorm": false,
  "dropout": 0.1,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 3072,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 6,
  "eos_token_id": 1,
  "extra_pos_embeddings": 2,
  "force_bos_token_to_be_generated": false,
  "gradient_checkpointing": false,
  "id2label": {
    "0": "NEGATIVE",
    "1": "POSITIVE"
  },
  "init_std": 0.02,
  "is_encoder_decoder": true,
  "label2id": {
    "NEGATIVE": 0,
    "POSITIVE": 1
  },
  "max_position_embeddings": 1

# 데이터 구축

**데이터 준비** <br>
train:validation:test = 3 : 1 : 1

In [9]:
from tqdm.auto import tqdm as tqdm_auto

In [10]:
# 150000개 처리 시간 약 8시간
ids = int((datasets['train'].num_rows)//15)
train_doc = [datasets['train']['document'][idx] for idx in tqdm_auto(range(0, ids))]
train_label = [datasets['train']['label'][idx] for idx in tqdm_auto(range(0, ids))]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

In [11]:
# NSMC는 validation 데이터가 없어 따로 만든다.
val_doc = [datasets['train']['document'][idx] for idx in tqdm_auto(range(ids, ids+3333))]
val_label = [datasets['train']['label'][idx] for idx in tqdm_auto(range(ids, ids+3333))]

  0%|          | 0/3333 [00:00<?, ?it/s]

  0%|          | 0/3333 [00:00<?, ?it/s]

In [12]:
ids = int((datasets['test'].num_rows)//15)
test_doc = [datasets['test']['document'][idx] for idx in tqdm_auto(range(0, ids))]
test_label = [datasets['test']['label'][idx] for idx in tqdm_auto(range(0, ids))]

  0%|          | 0/3333 [00:00<?, ?it/s]

  0%|          | 0/3333 [00:00<?, ?it/s]

**토크나이징**

In [13]:
# 패딩 채우기
train_input = kobart_tokenizer(train_doc, padding=True, return_tensors="pt")
val_input = kobart_tokenizer(val_doc, padding=True, return_tensors="pt")
test_input = kobart_tokenizer(test_doc, padding=True, return_tensors="pt")

**데이터셋 변환**

In [14]:
import torch

In [15]:
class NSMCDataset(torch.utils.data.Dataset):
    def __init__(self, inputs, labels):
        self.inputs = inputs
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.inputs.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [16]:
train_dataset = NSMCDataset(train_input, train_label)
val_dataset = NSMCDataset(val_input, val_label)
test_dataset = NSMCDataset(test_input, test_label)

In [17]:
print("train_dataset")
print(train_dataset[0])
print("val_dataset")
print(val_dataset[0])
print("test_dataset")
print(test_dataset[0])

train_dataset
{'input_ids': tensor([14041, 14166, 11042, 14176, 17240, 17345, 12325,  9495, 29221, 20503,
            3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
            3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
            3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
            3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
            3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
            3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
            3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
            3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
            3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
            3,     3,     3,     3]), 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

**데이터로더 정의**

In [18]:
from torch.utils.data import DataLoader

In [19]:
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=16)
val_loader = DataLoader(val_dataset, batch_size=8)
test_loader = DataLoader(test_dataset, batch_size=8)

In [20]:
# 데이터로더 확인
next(iter(train_loader))

{'input_ids': tensor([[27417, 19436, 17657,  ...,     3,     3,     3],
         [14408,  9866, 14904,  ...,     3,     3,     3],
         [20683,  8981, 14176,  ...,     3,     3,     3],
         ...,
         [25880, 14877, 12353,  ...,     3,     3,     3],
         [19151, 10564, 10746,  ...,     3,     3,     3],
         [14025, 20271, 14581,  ...,     3,     3,     3]]),
 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'labels': tensor([1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1])}

# 모델 학습

**모델 정의**

In [21]:
import torch.nn as nn

In [22]:
# ELECTRA를 포함한 신경망 모형
class BART_Model(torch.nn.Module):
    def __init__(self, pretrained_model, token_size, num_labels): 
        super(BART_Model, self).__init__()
        self.token_size = token_size
        self.num_labels = num_labels
        self.pretrained_model = pretrained_model

        # 분류기 정의
        self.classifier = torch.nn.Linear(self.token_size, self.num_labels)

    def forward(self, inputs):
        # 모형에 입력을 넣고 출력을 받음
        outputs = self.pretrained_model(**inputs)
        # 출력에서 CLS 토큰에 해당하는 부분만 가져옴
        clf_token = outputs.last_hidden_state[:,0,:]
        
        return self.classifier(clf_token)
        
bart = BART_Model(model, num_labels=2, token_size=model.config.hidden_size)

**모델 파라미터 설정**

In [23]:
from transformers import get_linear_schedule_with_warmup
# GPU 가속을 사용할 수 있으면 device를 cuda로 설정하고, 아니면 cpu로 설정
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

num_epochs = 5
learning_rate = 1e-5

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
criterion = torch.nn.CrossEntropyLoss()

# 학습 스케줄러 설정
scheduler = get_linear_schedule_with_warmup(optimizer=optimizer,
                                            num_training_steps=len(train_loader),
                                            num_warmup_steps=0)

step = 0
eval_steps = 625

In [24]:
bart.to(device)

BART_Model(
  (pretrained_model): BartModel(
    (shared): Embedding(30000, 768, padding_idx=3)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(30000, 768, padding_idx=3)
      (embed_positions): BartLearnedPositionalEmbedding(1028, 768, padding_idx=3)
      (layers): ModuleList(
        (0): BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )


**학습 진행**

In [25]:
import torch.nn.functional as F
from sklearn.metrics import accuracy_score

In [27]:
bart.train()

for epoch in range(num_epochs):
    n = 0
    train_accuracy = 0
    loss = 0
    train_loss = 0.0
    
    for batch in tqdm_auto(train_loader, mininterval=0.01, leave=True):
        optimizer.zero_grad()     # 그래디언트 초기화

        # 배치에서 label을 제외한 입력만 추출하여 GPU로 복사
        del batch['token_type_ids']     # BART는 token_type_ids가 입력에 없다
        inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
        labels = batch['labels'].to(device)     # 배치에서 라벨을 추출하여 GPU로 복사
        outputs = bart(inputs)    # 모형으로 결과 예측

        outputs.argmax(dim=1)
        loss = criterion(outputs, labels)
        train_loss += loss
        
        loss.backward()
        optimizer.step()

        step += 1
        if step % eval_steps == 0:  # eval_steps 마다 loss를 출력
            i = 0
            val_accuracy = 0
            
            with torch.no_grad():   # 학습 X (그래디언트 계산 X)
                val_loss = 0
                bart.eval()        # 평가모드로 전환

                for val_batch in tqdm_auto(val_loader, mininterval=0.01, leave=True):

                    # 배치에서 label을 제외한 입력만 추출하여 GPU로 복사
                    del val_batch['token_type_ids']
                    inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'} 
                    labels = batch['labels'].to(device)     # 배치에서 라벨을 추출하여 GPU로 복사
                    outputs = bart(inputs)     # 모형으로 결과 예측

                    outputs.argmax(dim=1)
                    loss = criterion(outputs, labels)  
                    val_loss += loss

                    val_accuracy += accuracy_score(labels.cpu(), outputs.argmax(dim=1).cpu())
                    i += 1

                avg_val_loss = val_loss / len(val_loader)

            val_accuracy /= i
            print('Step %d, validation loss: %.4f, accuracy_score: %.3f' % (step, avg_val_loss, val_accuracy))
            
        avg_train_loss = train_loss / len(train_loader)
        train_accuracy += accuracy_score(labels.cpu(), outputs.argmax(dim=1).cpu())
        n += 1

    train_accuracy /= n
    print('epoch %d, train loss: %.4f, accuracy_score: %.3f \n' % (epoch, avg_train_loss, train_accuracy))

  0%|          | 0/625 [00:00<?, ?it/s]

  0%|          | 0/417 [00:00<?, ?it/s]

Step 625, validation loss: 0.3811, accuracy_score: 0.812
epoch 0, train loss: 0.5719, accuracy_score: 0.713 



  0%|          | 0/625 [00:00<?, ?it/s]

  0%|          | 0/417 [00:00<?, ?it/s]

Step 1250, validation loss: 0.3271, accuracy_score: 0.812
epoch 1, train loss: 0.3716, accuracy_score: 0.834 



  0%|          | 0/625 [00:00<?, ?it/s]

  0%|          | 0/417 [00:00<?, ?it/s]

Step 1875, validation loss: 0.3178, accuracy_score: 0.875
epoch 2, train loss: 0.2064, accuracy_score: 0.918 



  0%|          | 0/625 [00:00<?, ?it/s]

  0%|          | 0/417 [00:00<?, ?it/s]

Step 2500, validation loss: 0.0125, accuracy_score: 1.000
epoch 3, train loss: 0.0631, accuracy_score: 0.978 



  0%|          | 0/625 [00:00<?, ?it/s]

  0%|          | 0/417 [00:00<?, ?it/s]

Step 3125, validation loss: 0.0202, accuracy_score: 1.000
epoch 4, train loss: 0.0300, accuracy_score: 0.990 



In [28]:
bart.eval()
test_accuracy = 0
n = 0

for test_batch in test_loader:
    del test_batch['token_type_ids']
    inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
    labels = batch['labels'].to(device)
    
    with torch.no_grad():   # 학습 X (그래디언트 계산 X)
        outputs = bart(inputs)

    test_accuracy += accuracy_score(labels.cpu(), outputs.argmax(dim=1).cpu())
    n += 1

test_accuracy /= n

print('test accuracy_score: %.3f \n' % (test_accuracy))

test accuracy_score: 1.000 

