# KoBART를 이용한 한국어 감정분류
사전학습모델 : [KoBART](https://github.com/SKT-AI/KoBART) <br>
데이터 : [NAVER Sentiment Movie Corpus](https://github.com/e9t/nsmc/)

In [None]:
!pip install git+https://github.com/SKT-AI/KoBART#egg=kobart
!pip install transfomers
!pip install datasets

**NSMC 데이터 불러오기**

In [2]:
from datasets import load_dataset

# Download (or reuse the cached copy of) the NSMC corpus. The result is a
# DatasetDict with a 150,000-row `train` split and a 50,000-row `test` split.
# NOTE: this variable shares its name with the `datasets` package; only
# `load_dataset` was imported, so the package itself is not shadowed here.
datasets = load_dataset("nsmc")

Downloading builder script:   0%|          | 0.00/1.36k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/807 [00:00<?, ?B/s]



Downloading and preparing dataset nsmc/default (download: 18.62 MiB, generated: 20.90 MiB, post-processed: Unknown size, total: 39.52 MiB) to /root/.cache/huggingface/datasets/nsmc/default/1.1.0/bfd4729bf1a67114e5267e6916b9e4807010aeb238e4a3c2b95fbfa3a014b5f3...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/6.33M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.12M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/150000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Dataset nsmc downloaded and prepared to /root/.cache/huggingface/datasets/nsmc/default/1.1.0/bfd4729bf1a67114e5267e6916b9e4807010aeb238e4a3c2b95fbfa3a014b5f3. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
# Display the split names, feature columns and row counts.
datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'document', 'label'],
        num_rows: 150000
    })
    test: Dataset({
        features: ['id', 'document', 'label'],
        num_rows: 50000
    })
})

In [4]:
# label 0: negative(부정) / 1: positive(긍정)
for i in range(3):
    print("train", datasets["train"][i])
    print("test", datasets["test"][i])

train {'id': '9976970', 'document': '아 더빙.. 진짜 짜증나네요 목소리', 'label': 0}
test {'id': '6270596', 'document': '굳 ㅋ', 'label': 1}
train {'id': '3819312', 'document': '흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나', 'label': 1}
test {'id': '9274899', 'document': 'GDNTOPCLASSINTHECLUB', 'label': 0}
train {'id': '10265843', 'document': '너무재밓었다그래서보는것을추천한다', 'label': 0}
test {'id': '8544678', 'document': '뭐야 이 평점들은.... 나쁘진 않지만 10점 짜리는 더더욱 아니잖아', 'label': 0}


**KoBART 모델과 토크나이저 불러오기**

In [5]:
from transformers import BartModel
from kobart import get_pytorch_kobart_model, get_kobart_tokenizer

In [6]:
# Fetch the KoBART tokenizer and the pretrained KoBART weights.
# get_pytorch_kobart_model() presumably returns the local path of the
# downloaded checkpoint, which BartModel.from_pretrained then loads -- TODO
# confirm against the kobart package.
kobart_tokenizer = get_kobart_tokenizer()
model = BartModel.from_pretrained(get_pytorch_kobart_model())

/content/.cache/kobart_base_tokenizer_cased_cf74400bce.zip[██████████████████████████████████████████████████]
/content/.cache/kobart_base_cased_ff4bda5738.zip[██████████████████████████████████████████████████]


In [7]:
# Inspect the loaded model's BartConfig (layer sizes, token ids, label map).
model.config

BartConfig {
  "_name_or_path": "/content/.cache/kobart_from_pretrained",
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartModel"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classif_dropout": 0.1,
  "classifier_dropout": 0.1,
  "d_model": 768,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 3072,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 2,
  "do_blenderbot_90_layernorm": false,
  "dropout": 0.1,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 3072,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 6,
  "eos_token_id": 1,
  "extra_pos_embeddings": 2,
  "force_bos_token_to_be_generated": false,
  "gradient_checkpointing": false,
  "id2label": {
    "0": "NEGATIVE",
    "1": "POSITIVE"
  },
  "init_std": 0.02,
  "is_encoder_decoder": true,
  "label2id": {
    "NEGATIVE": 0,
    "POSITIVE": 1
  },
  "max_position_embeddings": 1

# 데이터 구축

**데이터 준비** <br>
train:validation:test = 3 : 1 : 1

In [8]:
from tqdm.auto import tqdm as tqdm_auto

In [9]:
# Use the first 1/15 of the training split (10,000 of the 150,000 reviews).
ids = datasets['train'].num_rows // 15
# Slice the split once instead of evaluating datasets['train'][column][idx]
# for every row: that form fetches the whole column again on each iteration,
# which is why the per-row version was noted as needing about 8 hours for all
# 150,000 rows. Slicing a Dataset returns a dict of column name -> list.
train_rows = datasets['train'][:ids]
train_doc = train_rows['document']
train_label = train_rows['label']

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

In [10]:
# NSMC has no validation split, so build one from the 3,333 training rows
# that directly follow the training subset (rows ids .. ids + 3332).
# NOTE: `ids` must still hold the training-subset size here; run this cell
# before the test-subset cell, which reassigns `ids`.
# A single slice replaces the per-row column lookups, which re-fetched the
# whole column on every iteration.
val_rows = datasets['train'][ids:ids + 3333]
val_doc = val_rows['document']
val_label = val_rows['label']

  0%|          | 0/3333 [00:00<?, ?it/s]

  0%|          | 0/3333 [00:00<?, ?it/s]

In [11]:
# Use the first 1/15 of the test split (3,333 of the 50,000 reviews).
# NOTE: this reassigns `ids`, which earlier held the training-subset size.
ids = datasets['test'].num_rows // 15
# A single slice replaces the per-row column lookups, which re-fetched the
# whole column on every iteration.
test_rows = datasets['test'][:ids]
test_doc = test_rows['document']
test_label = test_rows['label']

  0%|          | 0/3333 [00:00<?, ?it/s]

  0%|          | 0/3333 [00:00<?, ?it/s]

**토크나이징**

In [13]:
# Tokenize each split; padding=True pads every sequence to the longest one
# in that split, and the encodings come back as PyTorch tensors.
train_input, val_input, test_input = (
    kobart_tokenizer(docs, padding=True, return_tensors="pt")
    for docs in (train_doc, val_doc, test_doc)
)

**데이터셋 변환**

In [14]:
import torch

In [15]:
class NSMCDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with sentiment labels.

    Args:
        inputs: Mapping from field name (e.g. ``input_ids``) to a container
            indexable by example position -- a 2-D tensor or a list of lists.
        labels: Sequence of integer labels, one per example.
    """

    def __init__(self, inputs, labels):
        self.inputs = inputs
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors plus a ``labels`` entry."""
        item = {}
        for key, values in self.inputs.items():
            row = values[idx]
            if torch.is_tensor(row):
                # torch.tensor(existing_tensor) raises a copy-construct
                # UserWarning on every access; clone().detach() makes the
                # same independent copy without the warning.
                item[key] = row.clone().detach()
            else:
                item[key] = torch.tensor(row)
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of examples (one per label)."""
        return len(self.labels)

In [16]:
# Wrap each split's encodings and labels in an NSMCDataset.
train_dataset, val_dataset, test_dataset = (
    NSMCDataset(encodings, labels)
    for encodings, labels in (
        (train_input, train_label),
        (val_input, val_label),
        (test_input, test_label),
    )
)

In [19]:
# Print a title followed by the first example of each dataset.
for title, split_dataset in (
    ("train_dataset", train_dataset),
    ("val_dataset", val_dataset),
    ("test_dataset", test_dataset),
):
    print(title)
    print(split_dataset[0])

train_dataset
{'input_ids': tensor([14041, 14166, 11042, 14176, 17240, 17345, 12325,  9495, 29221, 20503,
            3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
            3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
            3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
            3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
            3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
            3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
            3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
            3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
            3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
            3,     3,     3,     3]), 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

  import sys


**데이터로더 정의**

In [20]:
from torch.utils.data import DataLoader

In [21]:
# Only the training loader shuffles; the evaluation loaders keep dataset order.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=16)
# The validation loader must iterate val_dataset. It previously wrapped
# test_dataset, so validation would have been run on the test split.
val_loader = DataLoader(val_dataset, batch_size=8)
test_loader = DataLoader(test_dataset, batch_size=8)

In [23]:
# Sanity-check the DataLoader by pulling a single batch from train_loader.
next(iter(train_loader))

  import sys


{'input_ids': tensor([[17428, 14469, 11788,  ...,     3,     3,     3],
         [14300, 11471, 12191,  ...,     3,     3,     3],
         [14126, 15142, 19260,  ...,     3,     3,     3],
         ...,
         [14067, 12037, 11239,  ...,     3,     3,     3],
         [14075,   243, 25041,  ...,     3,     3,     3],
         [14081, 11011, 10476,  ...,     3,     3,     3]]),
 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'labels': tensor([1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1])}