## 준비

In [1]:
# koBERT tokenizer용 라이브러리
!pip install transformers
!pip install mxnet
!pip install gluonnlp==0.8.0
!pip install tqdm pandas
!pip install sentencepiece
!pip install torch
!pip install 'git+https://github.com/SKTBrain/KoBERT.git#egg=kobert_tokenizer&subdirectory=kobert_hf' # koBERT tokenizer

Collecting transformers
  Downloading transformers-4.32.1-py3-none-any.whl (7.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.5/7.5 MB[0m [31m59.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m34.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m110.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m73.0 MB/s[0m eta [36m0:00:

In [2]:
import math
import os
from tempfile import TemporaryDirectory
from typing import Tuple
import numpy as np
import logging
import json
import gluonnlp as nlp
import pandas as pd

import torch
from torch import nn,Tensor
import torch.nn.functional as F
from torch.nn import Transformer, TransformerEncoder, TransformerEncoderLayer
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from transformers.optimization import AdamW, get_cosine_schedule_with_warmup

from tqdm import tqdm, tqdm_notebook
from sklearn.model_selection import train_test_split

# for koBERT
from kobert_tokenizer import KoBERTTokenizer
from transformers import BertModel



In [3]:
# gpu 사용 확인
device = torch.device("cuda:0")

## 데이터처리

In [6]:
train_data = pd.read_excel('감성대화말뭉치(최종데이터)_Training.xlsx').fillna("")
val_data = pd.read_excel('감성대화말뭉치(최종데이터)_Validation.xlsx').fillna("")

In [7]:
train_data['사람문장'] = train_data['사람문장1'].astype(str)+train_data['사람문장2'].astype(str)+train_data['사람문장3'].astype(str)
val_data['사람문장'] = val_data['사람문장1'].astype(str)+val_data['사람문장2'].astype(str)+val_data['사람문장3'].astype(str)

In [8]:
train_data = train_data[['감정_대분류','사람문장']]
val_data = val_data[['감정_대분류','사람문장']]

In [9]:
train_data = train_data.rename({'감정_대분류':'감정'},axis=1)
val_data = val_data.rename({'감정_대분류':'감정'},axis=1)

In [10]:
train_data["감정"]=train_data["감정"].apply(lambda x:x.strip())
val_data["감정"]=val_data["감정"].apply(lambda x:x.strip())

In [11]:
train_data.columns = ['label', 'data']
val_data.columns = ['label', 'data']

In [12]:
label_mapping = {
    '기쁨': 0,
    '불안': 1,
    '당황': 2,
    '슬픔': 3,
    '분노': 4,
    '상처': 5
}

train_data['label'] = train_data['label'].str.strip().replace(label_mapping)
val_data['label'] = val_data['label'].str.strip().replace(label_mapping)

## 모델 클래스, 함수

In [13]:
'''class BERTDataset(Dataset):
    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer,vocab, max_len,
                 pad, pair):

        transform = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len,vocab=vocab, pad=pad, pair=pair)

        self.sentences = [transform([i[sent_idx]]) for i in dataset]
        self.labels = [np.int32(i[label_idx]) for i in dataset]

    def __getitem__(self, i):
        return (self.sentences[i] + (self.labels[i], ))

    def __len__(self):
        return (len(self.labels))'''

'class BERTDataset(Dataset):\n    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer,vocab, max_len,\n                 pad, pair):\n   \n        transform = nlp.data.BERTSentenceTransform(\n            bert_tokenizer, max_seq_length=max_len,vocab=vocab, pad=pad, pair=pair)\n        \n        self.sentences = [transform([i[sent_idx]]) for i in dataset]\n        self.labels = [np.int32(i[label_idx]) for i in dataset]\n\n    def __getitem__(self, i):\n        return (self.sentences[i] + (self.labels[i], ))\n         \n    def __len__(self):\n        return (len(self.labels))'

In [14]:
class BERTSentenceTransform:
    r"""BERT style data transformation.

    Parameters
    ----------
    tokenizer : BERTTokenizer.
        Tokenizer for the sentences.
    max_seq_length : int.
        Maximum sequence length of the sentences.
    pad : bool, default True
        Whether to pad the sentences to maximum length.
    pair : bool, default True
        Whether to transform sentences or sentence pairs.
    """

    def __init__(self, tokenizer, max_seq_length,vocab, pad=True, pair=True):
        self._tokenizer = tokenizer
        self._max_seq_length = max_seq_length
        self._pad = pad
        self._pair = pair
        self._vocab = vocab

    def __call__(self, line):
        """Perform transformation for sequence pairs or single sequences.

        The transformation is processed in the following steps:
        - tokenize the input sequences
        - insert [CLS], [SEP] as necessary
        - generate type ids to indicate whether a token belongs to the first
        sequence or the second sequence.
        - generate valid length

        For sequence pairs, the input is a tuple of 2 strings:
        text_a, text_b.

        Inputs:
            text_a: 'is this jacksonville ?'
            text_b: 'no it is not'
        Tokenization:
            text_a: 'is this jack ##son ##ville ?'
            text_b: 'no it is not .'
        Processed:
            tokens: '[CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]'
            type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
            valid_length: 14

        For single sequences, the input is a tuple of single string:
        text_a.

        Inputs:
            text_a: 'the dog is hairy .'
        Tokenization:
            text_a: 'the dog is hairy .'
        Processed:
            text_a: '[CLS] the dog is hairy . [SEP]'
            type_ids: 0     0   0   0  0     0 0
            valid_length: 7

        Parameters
        ----------
        line: tuple of str
            Input strings. For sequence pairs, the input is a tuple of 2 strings:
            (text_a, text_b). For single sequences, the input is a tuple of single
            string: (text_a,).

        Returns
        -------
        np.array: input token ids in 'int32', shape (batch_size, seq_length)
        np.array: valid length in 'int32', shape (batch_size,)
        np.array: input token type ids in 'int32', shape (batch_size, seq_length)

        """

        # convert to unicode
        text_a = line[0]
        if self._pair:
            assert len(line) == 2
            text_b = line[1]

        tokens_a = self._tokenizer.tokenize(text_a)
        tokens_b = None

        if self._pair:
            tokens_b = self._tokenizer(text_b)

        if tokens_b:
            # Modifies `tokens_a` and `tokens_b` in place so that the total
            # length is less than the specified length.
            # Account for [CLS], [SEP], [SEP] with "- 3"
            self._truncate_seq_pair(tokens_a, tokens_b,
                                    self._max_seq_length - 3)
        else:
            # Account for [CLS] and [SEP] with "- 2"
            if len(tokens_a) > self._max_seq_length - 2:
                tokens_a = tokens_a[0:(self._max_seq_length - 2)]

        # The embedding vectors for `type=0` and `type=1` were learned during
        # pre-training and are added to the wordpiece embedding vector
        # (and position vector). This is not *strictly* necessary since
        # the [SEP] token unambiguously separates the sequences, but it makes
        # it easier for the model to learn the concept of sequences.

        # For classification tasks, the first vector (corresponding to [CLS]) is
        # used as as the "sentence vector". Note that this only makes sense because
        # the entire model is fine-tuned.
        #vocab = self._tokenizer.vocab
        vocab = self._vocab
        tokens = []
        tokens.append(vocab.cls_token)
        tokens.extend(tokens_a)
        tokens.append(vocab.sep_token)
        segment_ids = [0] * len(tokens)

        if tokens_b:
            tokens.extend(tokens_b)
            tokens.append(vocab.sep_token)
            segment_ids.extend([1] * (len(tokens) - len(segment_ids)))

        input_ids = self._tokenizer.convert_tokens_to_ids(tokens)

        # The valid length of sentences. Only real  tokens are attended to.
        valid_length = len(input_ids)

        if self._pad:
            # Zero-pad up to the sequence length.
            padding_length = self._max_seq_length - valid_length
            # use padding tokens for the rest
            input_ids.extend([vocab[vocab.padding_token]] * padding_length)
            segment_ids.extend([0] * padding_length)

        return np.array(input_ids, dtype='int32'), np.array(valid_length, dtype='int32'),\
            np.array(segment_ids, dtype='int32')

In [15]:
'''class BERTDataset(Dataset):
    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, vocab, max_len,
                 pad, pair):
        transform = BERTSentenceTransform(bert_tokenizer, max_seq_length=max_len,vocab=vocab, pad=pad, pair=pair)
        #transform = nlp.data.BERTSentenceTransform(
        #    tokenizer, max_seq_length=max_len, pad=pad, pair=pair)
        self.sentences = [transform([i[sent_idx]]) for i in dataset]
        self.labels = [np.int32(i[label_idx]) for i in dataset]

    def __getitem__(self, i):
        return (self.sentences[i] + (self.labels[i], ))

    def __len__(self):
        return (len(self.labels))'''


'class BERTDataset(Dataset):\n    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, vocab, max_len,\n                 pad, pair):\n        transform = BERTSentenceTransform(bert_tokenizer, max_seq_length=max_len,vocab=vocab, pad=pad, pair=pair)\n        #transform = nlp.data.BERTSentenceTransform(\n        #    tokenizer, max_seq_length=max_len, pad=pad, pair=pair)\n        self.sentences = [transform([i[sent_idx]]) for i in dataset]\n        self.labels = [np.int32(i[label_idx]) for i in dataset]\n\n    def __getitem__(self, i):\n        return (self.sentences[i] + (self.labels[i], ))\n\n    def __len__(self):\n        return (len(self.labels))'

In [16]:
class BERTDataset(Dataset):
    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, vocab, max_len,
                 pad, pair):
        transform = BERTSentenceTransform(bert_tokenizer, max_seq_length=max_len, vocab=vocab, pad=pad, pair=pair)
        self.sentences = [transform([i[sent_idx]]) for i in dataset]
        self.labels = [np.int32(i[label_idx]) for i in dataset]

    def __getitem__(self, i):
        return (self.sentences[i] + (self.labels[i], ))

    def __len__(self):
        return len(self.labels)


In [17]:
class BERTClassifier(nn.Module):
    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes=6,
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        attention_mask = self.gen_attention_mask(token_ids, valid_length)

        _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device))
        if self.dr_rate:
            out = self.dropout(pooler)
        return self.classifier(out)

In [18]:
tokenizer = KoBERTTokenizer.from_pretrained('skt/kobert-base-v1')
bertmodel = BertModel.from_pretrained('skt/kobert-base-v1', return_dict=False)
vocab = nlp.vocab.BERTVocab.from_sentencepiece(tokenizer.vocab_file, padding_token='[PAD]')


# Setting parameters
max_len = 64
batch_size = 64
warmup_ratio = 0.1
num_epochs = 5
max_grad_norm = 1
log_interval = 200
learning_rate =  5e-5

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/371k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/244 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/432 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'XLNetTokenizer'. 
The class this function is called from is 'KoBERTTokenizer'.


Downloading (…)lve/main/config.json:   0%|          | 0.00/535 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/369M [00:00<?, ?B/s]

In [19]:
train_data_set = [[i, str(j)] for i, j in zip(train_data['data'], train_data['label'])]
val_data_set = [[i, str(j)] for i, j in zip(val_data['data'], val_data['label'])]

train_data_set, test_data_set = train_test_split(train_data_set, test_size = 0.2, random_state=4)

train_data_set = BERTDataset(train_data_set, 0, 1, tokenizer, vocab, max_len, True, False)
test_data_set = BERTDataset(test_data_set, 0, 1, tokenizer, vocab, max_len, True, False)
train_dataloader = torch.utils.data.DataLoader(train_data_set, batch_size=batch_size, num_workers=2)
test_dataloader = torch.utils.data.DataLoader(test_data_set, batch_size=batch_size, num_workers=2)

In [20]:
def calc_accuracy(X,Y):
    max_vals, max_indices = torch.max(X, 1)
    train_acc = (max_indices == Y).sum().data.cpu().numpy()/max_indices.size()[0]
    return train_acc

def predict(sentence):
    dataset = [[sentence, '0']]
    test = BERTDataset(dataset, 0, 1, tok, vocab, max_len, True, False)
    test_dataloader = torch.utils.data.DataLoader(test, batch_size=batch_size, num_workers=2)
    model.eval()
    answer = 0
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(test_dataloader):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        for logits in out:
            logits = logits.detach().cpu().numpy()
            answer = np.argmax(logits)
    return answer

In [21]:
model = BERTClassifier(bertmodel, dr_rate=0.5).to(device)
# Prepare optimizer and schedule (linear warmup and decay)
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()
t_total = len(train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)

for e in range(num_epochs):
    train_acc = 0.0
    test_acc = 0.0
    model.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        loss = loss_fn(out, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()  # Update learning rate schedule
        train_acc += calc_accuracy(out, label)
        if batch_id % log_interval == 0:
            print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc / (batch_id+1)))
    print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))
    model.eval()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        test_acc += calc_accuracy(out, label)
    print("epoch {} val acc {}".format(e+1, test_acc / (batch_id+1)))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):


  0%|          | 0/646 [00:00<?, ?it/s]

epoch 1 batch id 1 loss 1.7586650848388672 train acc 0.21875
epoch 1 batch id 201 loss 1.1676665544509888 train acc 0.3016946517412935
epoch 1 batch id 401 loss 1.1485873460769653 train acc 0.4491505610972569
epoch 1 batch id 601 loss 0.7779248356819153 train acc 0.5069415557404326
epoch 1 train acc 0.5166650541795665


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/162 [00:00<?, ?it/s]

epoch 1 val acc 0.6471924102132436


  0%|          | 0/646 [00:00<?, ?it/s]

epoch 2 batch id 1 loss 0.9032331705093384 train acc 0.65625
epoch 2 batch id 201 loss 0.884660542011261 train acc 0.646377487562189
epoch 2 batch id 401 loss 0.9657914042472839 train acc 0.6601075436408977
epoch 2 batch id 601 loss 0.7190017104148865 train acc 0.6684952163061564
epoch 2 train acc 0.6714880030959752


  0%|          | 0/162 [00:00<?, ?it/s]

epoch 2 val acc 0.6688061167227833


  0%|          | 0/646 [00:00<?, ?it/s]

epoch 3 batch id 1 loss 0.6743007898330688 train acc 0.796875
epoch 3 batch id 201 loss 0.7439102530479431 train acc 0.6979166666666666
epoch 3 batch id 401 loss 0.7792180776596069 train acc 0.7134507481296758
epoch 3 batch id 601 loss 0.651840090751648 train acc 0.7229877287853578
epoch 3 train acc 0.7270381836945304


  0%|          | 0/162 [00:00<?, ?it/s]

epoch 3 val acc 0.6671401515151515


  0%|          | 0/646 [00:00<?, ?it/s]

epoch 4 batch id 1 loss 0.5456327795982361 train acc 0.8125
epoch 4 batch id 201 loss 0.6076023578643799 train acc 0.761660447761194
epoch 4 batch id 401 loss 0.6308123469352722 train acc 0.7742752493765586
epoch 4 batch id 601 loss 0.4394153952598572 train acc 0.7820559484193012
epoch 4 train acc 0.7851844685242518


  0%|          | 0/162 [00:00<?, ?it/s]

epoch 4 val acc 0.670314253647587


  0%|          | 0/646 [00:00<?, ?it/s]

epoch 5 batch id 1 loss 0.514343798160553 train acc 0.78125
epoch 5 batch id 201 loss 0.5129515528678894 train acc 0.8061256218905473
epoch 5 batch id 401 loss 0.5009703040122986 train acc 0.8155003117206983
epoch 5 batch id 601 loss 0.3844325840473175 train acc 0.8188435940099834
epoch 5 train acc 0.8210219943240454


  0%|          | 0/162 [00:00<?, ?it/s]

epoch 5 val acc 0.6709017255892256


In [22]:
from google.colab import drive
drive.mount('/content/drive')


torch.save(model, f'/content/drive/MyDrive/Colab Notebooks/SentimentAnalysisKoBert.pt')
torch.save(model.state_dict(), f'/content/drive/MyDrive/Colab Notebooks/SentimentAnalysisKoBert_StateDict.pt')

Mounted at /content/drive


## Inference

In [25]:
def predict(sentence):
    dataset = [[sentence, '0']]
    test = BERTDataset(dataset, 0, 1, tokenizer, vocab, max_len, True, False)
    test_dataloader = torch.utils.data.DataLoader(test, batch_size=batch_size, num_workers=2)
    model.eval()
    answer = 0
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(test_dataloader):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        for logits in out:
            logits = logits.detach().cpu().numpy()
            answer = np.argmax(logits)

        emotion = {
            0: '기쁨',
            1: '불안',
            2: '당황',
            3: '슬픔',
            4: '분노',
            5: '상처'
        }[answer]
    return emotion

In [26]:
your_example = ["폼 미쳤다",
                "야구 드럽게 못하네",
                "오늘 우리 팀이 져서 짜증나",
                "내일은 이기면 좋겠다",
                "야구 보다가 과제 못 낼 뻔."
                "딥인투딥 고생 많으셨습니다"
                ]

for sentence in your_example:
  output = predict(sentence)
  print(f"문장 \"{sentence}\"에 대한 감성은 <{output}>입니다.")

문장 "폼 미쳤다"에 대한 감성은 <분노>입니다.
문장 "야구 드럽게 못하네"에 대한 감성은 <슬픔>입니다.
문장 "오늘 우리 팀이 져서 짜증나"에 대한 감성은 <분노>입니다.
문장 "내일은 이기면 좋겠다"에 대한 감성은 <기쁨>입니다.
문장 "야구 보다가 과제 못 낼 뻔.딥인투딥 고생 많으셨습니다"에 대한 감성은 <분노>입니다.
