In [None]:
!pip install gluonnlp pandas tqdm
!pip install mxnet
!pip install sentencepiece==0.1.91
!pip install transformers==4.8.2
!pip install torch

In [None]:
!pip install git+https://git@github.com/SKTBrain/KoBERT.git@master
!pip install 'git+https://github.com/SKTBrain/KoBERT.git#egg=kobert_tokenizer&subdirectory=kobert_hf'

In [3]:
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained('skt/kobert-base-v1')
model = AutoModel.from_pretrained('skt/kobert-base-v1')

In [28]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import gluonnlp as nlp
import numpy as np
import pandas as pd
from tqdm import tqdm, tqdm_notebook

# Transformer
from transformers import AdamW, BertModel
from transformers.optimization import get_cosine_schedule_with_warmup

# KoBERT
from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model
from kobert_tokenizer import KoBERTTokenizer

from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings(action = 'ignore')

device = torch.device('cuda:0')

In [5]:
train = pd.read_excel('/content/drive/MyDrive/Colab Notebooks/Sentiment_Analysis/감성대화말뭉치(최종데이터)_Training.xlsx')
valid = pd.read_excel('/content/drive/MyDrive/Colab Notebooks/Sentiment_Analysis/감성대화말뭉치(최종데이터)_Validation.xlsx')

In [6]:
train = train[['사람문장1', '감정_대분류']]
valid = valid[['사람문장1', '감정_대분류']]

In [7]:
valid.value_counts('감정_대분류')

감정_대분류
분노    1257
기쁨    1213
불안    1113
당황    1048
상처    1007
슬픔    1003
dtype: int64

In [8]:
## preprocessing
train.loc[(train['감정_대분류'] == '불안'), '감정_대분류'] = 0  ## 불안
train.loc[(train['감정_대분류'] == '분노'), '감정_대분류'] = 1  ## 분노
train.loc[(train['감정_대분류'] == '상처'), '감정_대분류'] = 2  ## 상처
train.loc[(train['감정_대분류'] == '슬픔'), '감정_대분류'] = 3  ## 슬픔
train.loc[(train['감정_대분류'] == '당황'), '감정_대분류'] = 4  ## 당황
train.loc[(train['감정_대분류'] == '기쁨'), '감정_대분류'] = 5  ## 기쁨

valid.loc[(valid['감정_대분류'] == '불안'), '감정_대분류'] = 0  ## 불안
valid.loc[(valid['감정_대분류'] == '분노'), '감정_대분류'] = 1  ## 분노
valid.loc[(valid['감정_대분류'] == '상처'), '감정_대분류'] = 2  ## 상처
valid.loc[(valid['감정_대분류'] == '슬픔'), '감정_대분류'] = 3  ## 슬픔
valid.loc[(valid['감정_대분류'] == '당황'), '감정_대분류'] = 4  ## 당황
valid.loc[(valid['감정_대분류'] == '기쁨'), '감정_대분류'] = 5  ## 기쁨

In [9]:
valid.head(1)

Unnamed: 0,사람문장1,감정_대분류
0,이번 프로젝트에서 발표를 하는데 내가 실수하는 바람에 우리 팀이 감점을 받았어. 너...,0


In [11]:
train_data_list = []
valid_data_list = []

for ques, label in zip(train['사람문장1'], train['감정_대분류']):
    train_data = []
    train_data.append(ques)
    train_data.append(str(label))

    train_data_list.append(train_data)

for ques, label in zip(valid['사람문장1'], valid['감정_대분류']):
    valid_data = []
    valid_data.append(ques)
    valid_data.append(str(label))

    valid_data_list.append(valid_data)

In [12]:
valid_data_list[0:2]

[['이번 프로젝트에서 발표를 하는데 내가 실수하는 바람에 우리 팀이 감점을 받았어. 너무 미안해.', '0'],
 ['회사에서 중요한 프로젝트를 혼자 하게 됐는데 솔직히 두렵고 무서워.', '0']]

In [13]:
class BERTDataset(Dataset):
    def __init__(self, dataset, sentiment_idx, label_idx, bert_tokenizer, vocab, max_len, pad, pair):
        super(BERTDataset, self).__init__()

        transform = nlp.data.BERTSentenceTransform(bert_tokenizer,
                                                   max_seq_length = max_len,
                                                   vocab = vocab,
                                                   pad = pad,
                                                   pair = pair)
        self.sentences = [transform([i[sentiment_idx]]) for i in dataset]
        self.labels = [np.int32(i[label_idx]) for i in dataset]

    def __getitem__(self, i):
        return (self.sentences[i] + (self.labels[i], ))

    def __len__(self):
        return (len(self.labels))

In [14]:
from transformers import AutoTokenizer, AutoModelForMaskedLM

In [18]:
kcbert_tokenizer = AutoTokenizer.from_pretrained("skt/kobert-base-v1")
kcbert = AutoModelForMaskedLM.from_pretrained("skt/kobert-base-v1")

result = kcbert_tokenizer.tokenize("너는 내년 대선 때 투표할 수 있어?")
print(result)
print(kcbert_tokenizer.vocab['사랑'])
print([kcbert_tokenizer.encode(token) for token in result])

Some weights of BertForMaskedLM were not initialized from the model checkpoint at skt/kobert-base-v1 and are newly initialized: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


['▁', '너는', '▁', '내년', '▁', '대선', '▁', '때', '▁', '투표', 'ᄒ', 'ᅡᆯ', '▁', '수', '▁', 'ᄋ', 'ᅵ', 'ᆻ', 'ᄋ', 'ᅥ', '?']
6499
[[517, 0, 0], [517, 0, 0, 0], [517, 0, 0], [517, 0, 0, 0], [517, 0, 0], [517, 0, 0, 0], [517, 0, 0], [517, 0, 0, 0], [517, 0, 0], [517, 0, 0, 0], [517, 493, 0, 0], [517, 0, 0, 0], [517, 0, 0], [517, 0, 0, 0], [517, 0, 0], [517, 491, 0, 0], [517, 494, 0, 0], [517, 0, 0, 0], [517, 491, 0, 0], [517, 0, 0, 0], [633, 0, 0]]


In [19]:
# Setting parameters
max_len = 64
batch_size = 64
warmup_ratio = 0.1
num_epochs = 5  
max_grad_norm = 1
log_interval = 200
learning_rate =  5e-5

In [29]:
# 모델, vocab 불러오기
bertmodel, vocab = get_pytorch_kobert_model()
# 토크나이저
tok = KoBERTTokenizer.from_pretrained('skt/kobert-base-v1')

# tokenizer = get_tokenizer()
# tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower = False)

# tok = tokenizer.tokenize
# vocab = kobert_tokenizer.get_vocab()

train_ds, test_ds = train_test_split(train_data_list, test_size = .2, random_state = 123)

train_dataset = BERTDataset(train_ds, 0, 1, tok, vocab, max_len, True, False)
test_dataset = BERTDataset(test_ds, 0, 1, tok, vocab, max_len, True, False)

using cached model. /content/.cache/kobert_v1.zip
using cached model. /content/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece


In [30]:
train_loader = DataLoader(train_dataset, batch_size = batch_size, num_workers = 5)
test_loader = DataLoader(test_dataset, batch_size = batch_size, num_workers = 5)

In [31]:
# KoBERT 모델 구현
class KoBERTClassifier(nn.Module):
    def __init__(self, BERTmodel, hidden_size = 768, num_classes = 6, dr_rate = None, params = None):
        super(KoBERTClassifier, self).__init__()
        self.BERTmodel = BERTmodel
        self.dr_rate = dr_rate

        self.classifier = nn.Linear(hidden_size, num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p = dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        attention_mask = torch.zeros_like(token_ids)

        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1

        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        attention_mask = self.gen_attention_mask(token_ids, valid_length)

        _, pooler = self.BERTmodel(input_ids = token_ids, 
                                   token_type_ids = segment_ids.long(),
                                   attention_mask = attention_mask.float().to(token_ids.device),
                                   return_dict = False)
        if self.dr_rate:
            out = self.dropout(pooler)

        return self.classifier(out)

In [32]:
model = KoBERTClassifier(bertmodel, dr_rate = .5).to(device)

# optim, scheduler
no_decay = ['bias', 'LayerNorm.weight']

optimizer_grouped_parameters = [
    {'params' : [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay' : 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters, lr = learning_rate)
loss_fn = nn.CrossEntropyLoss()

t_total = len(train_loader) * num_epochs
warmup_step = int(t_total * warmup_ratio)

scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)

def calc_accuracy(X,Y):
    max_vals, max_indices = torch.max(X, 1)
    train_acc = (max_indices == Y).sum().data.cpu().numpy()/max_indices.size()[0]
    return train_acc

In [33]:
train_history=[]
test_history=[]
loss_history=[]

for e in range(num_epochs):
    train_acc = 0.0
    test_acc = 0.0

    model.train()

    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_loader)):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
         
        #print(label.shape,out.shape)
        loss = loss_fn(out, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()  # Update learning rate schedule
        train_acc += calc_accuracy(out, label)
        if batch_id % log_interval == 0:
            print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc / (batch_id+1)))
            train_history.append(train_acc / (batch_id+1))
            loss_history.append(loss.data.cpu().numpy())
    print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))
    #train_history.append(train_acc / (batch_id+1))
    
    model.eval()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_loader)):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        test_acc += calc_accuracy(out, label)
    print("epoch {} test acc {}".format(e+1, test_acc / (batch_id+1)))
    test_history.append(test_acc / (batch_id+1))

  0%|          | 0/646 [00:00<?, ?it/s]

epoch 1 batch id 1 loss 1.8784534931182861 train acc 0.125
epoch 1 batch id 201 loss 1.790788173675537 train acc 0.17568407960199006
epoch 1 batch id 401 loss 1.864475965499878 train acc 0.17557668329177056
epoch 1 batch id 601 loss 1.7807881832122803 train acc 0.1773866472545757
epoch 1 train acc 0.1783894478844169


  0%|          | 0/162 [00:00<?, ?it/s]

epoch 1 test acc 0.1802311307519641


  0%|          | 0/646 [00:00<?, ?it/s]

epoch 2 batch id 1 loss 1.7677626609802246 train acc 0.203125
epoch 2 batch id 201 loss 1.8227430582046509 train acc 0.1756063432835821
epoch 2 batch id 401 loss 1.8421351909637451 train acc 0.17662874064837905
epoch 2 batch id 601 loss 1.78406822681427 train acc 0.17663269550748753
epoch 2 train acc 0.17674471104231165


  0%|          | 0/162 [00:00<?, ?it/s]

epoch 2 test acc 0.1802311307519641


  0%|          | 0/646 [00:00<?, ?it/s]

epoch 3 batch id 1 loss 1.7982847690582275 train acc 0.203125
epoch 3 batch id 201 loss 1.8062505722045898 train acc 0.17086442786069653
epoch 3 batch id 401 loss 1.8092000484466553 train acc 0.17230361596009974
epoch 3 batch id 601 loss 1.7769333124160767 train acc 0.1740068635607321
epoch 3 train acc 0.17447110423116613


  0%|          | 0/162 [00:00<?, ?it/s]

epoch 3 test acc 0.1802311307519641


  0%|          | 0/646 [00:00<?, ?it/s]

epoch 4 batch id 1 loss 1.7932523488998413 train acc 0.171875
epoch 4 batch id 201 loss 1.782164454460144 train acc 0.17848258706467662
epoch 4 batch id 401 loss 1.8028606176376343 train acc 0.17955112219451372
epoch 4 batch id 601 loss 1.7884706258773804 train acc 0.17892054908485858
epoch 4 train acc 0.17862325851393188


  0%|          | 0/162 [00:00<?, ?it/s]

epoch 4 test acc 0.1802311307519641


  0%|          | 0/646 [00:00<?, ?it/s]

epoch 5 batch id 1 loss 1.7794647216796875 train acc 0.234375
epoch 5 batch id 201 loss 1.7785524129867554 train acc 0.18190298507462688
epoch 5 batch id 401 loss 1.7962005138397217 train acc 0.17943422693266833
epoch 5 batch id 601 loss 1.770381212234497 train acc 0.1798564891846922
epoch 5 train acc 0.17880063209494323


  0%|          | 0/162 [00:00<?, ?it/s]

epoch 5 test acc 0.17330422278338944


In [36]:
def predict(sentence):
    data = [sentence, '0']
    dataset_another = [data]

    another_test = BERTDataset(dataset_another, 0, 1, tok, vocab, max_len, True, False)
    another_test_loader = DataLoader(another_test, batch_size, batch_size, num_workers = 2)
    model.eval()
    answer = 0

    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(another_test_loader):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)

        valid_length= valid_length
        label = label.long().to(device)

        out = model(token_ids, valid_length, segment_ids)


        test_eval=[]
        for i in out:
            logits=i
            logits = logits.detach().cpu().numpy()

            if np.argmax(logits) == 0:
                test_eval.append("공포가")
            elif np.argmax(logits) == 1:
                test_eval.append("놀람이")
            elif np.argmax(logits) == 2:
                test_eval.append("분노가")
            elif np.argmax(logits) == 3:
                test_eval.append("슬픔이")
            elif np.argmax(logits) == 4:
                test_eval.append("중립이")
            elif np.argmax(logits) == 5:
                test_eval.append("행복이")
            elif np.argmax(logits) == 6:
                test_eval.append("혐오가")

        print(">> 입력하신 내용에서 " + test_eval[0] + " 느껴집니다.")

In [38]:
#질문 무한반복하기! 0 입력시 종료
end = 1
while end == 1 :
    sentence = input("하고싶은 말을 입력해주세요 : ")
    if sentence == "0" :
        break
    predict(sentence)
    print("\n")

하고싶은 말을 입력해주세요 : 엘리베이터에 갇혔었어
>> 입력하신 내용에서 분노가 느껴집니다.


하고싶은 말을 입력해주세요 : 요즘 너무 행복해
>> 입력하신 내용에서 분노가 느껴집니다.


하고싶은 말을 입력해주세요 : 0
