In [1]:
import pandas as pd

train_set = pd.read_csv('data/병합데이터셋.csv', index_col=0) 
train_set.sample(n=5)

Unnamed: 0,sentence,emotion
10092,자업자득인거다...,놀람
4962,예언 목사님이거든!,분노
44757,친구가 당연한 걸 물어서 친구에게 넌 왜 이렇게 애가 멍청하다고 말했더니 죄책감이 ...,놀람
12238,"눈이 오려나? 어, 저거 뭐야?",놀람
2748,어딜가야될지잘모르겠어요.,불안


In [2]:
# 감정을 정수 라벨로 변경
emotions = {'행복': 0, '불안': 1, '놀람': 2, '슬픔': 3, '분노': 4, '중립': 5 }
# emotions = {'기쁨': 0, '불안': 1, '당황': 2, '슬픔': 3, '분노': 4, '상처': 5} // 감성대화 말뭉치

train_set['emotion'] = train_set.emotion.map(emotions)

train_set.sample(n=5)

Unnamed: 0,sentence,emotion
30418,존경합니다,0
48710,아직 나이가 많지도 않은데 여자라는 이유로 해고당했어. 너무 불행하다고 생각이 들어.,3
8289,손연재 갑자기 뜨더라,2
53785,내 여동생 온대.,5
27611,도서실에서 혼자 공부하니깐 좀 외롭다.,1


In [3]:
# 라이브러리 불러오기
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
import gluonnlp as nlp

#kobert
from kobert_tokenizer import KoBERTTokenizer

# transformers
from transformers import BertModel
from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup

# GPU 사용시 필요
#device = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device("cuda")
#device = torch.device('cpu')

In [4]:
tokenizer = KoBERTTokenizer.from_pretrained('skt/kobert-base-v1')
bertmodel = BertModel.from_pretrained('skt/kobert-base-v1', return_dict=False)
vocab = nlp.vocab.BERTVocab.from_sentencepiece(tokenizer.vocab_file, padding_token='[PAD]')
tok = tokenizer.tokenize

# Setting parameters(KoBERT finetuning 베에스 라인) -> 
max_len = 64    #베이스라인 64
batch_size = 32 #베이스라인 64
warmup_ratio = 0.1
num_epochs = 5  # 에포크 횟수
max_grad_norm = 1
log_interval = 200
learning_rate =  5e-5

In [5]:
# 모델에 사용되는 데이터셋 클래스 정의
class BERTDataset(Dataset):
    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer,vocab, max_len,
                 pad, pair):
   
        transform = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len,vocab=vocab, pad=pad, pair=pair)
        
        self.sentences = [transform([i[sent_idx]]) for i in dataset]
        self.labels = [np.int32(i[label_idx]) for i in dataset]

    def __getitem__(self, i):
        return (self.sentences[i] + (self.labels[i], ))
         
    def __len__(self):
        return (len(self.labels))

In [6]:
# 감성 분류 모델 정의
class BERTClassifier(nn.Module):
    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes=7,     
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate
                 
        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)
    
    def gen_attention_mask(self, token_ids, valid_length):
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        attention_mask = self.gen_attention_mask(token_ids, valid_length)
        
        _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device))
        if self.dr_rate:
            out = self.dropout(pooler)
        return self.classifier(out)

In [7]:
# 모델 학습에 사용할 데이터셋을 [data, label] 배열로 피팅
train_set_data = [[i, str(j)] for i, j in zip(train_set['sentence'], train_set['emotion'])]

# sklearn 의 train_test_split 모듈-> 4:1로 학습&검증 데이터를 분류 
train_set_data, test_set_data = train_test_split(train_set_data, test_size = 0.2, random_state=4)

# 데이터셋을 Bert모델에 입력할 수 있게 변환
train_set_data = BERTDataset(train_set_data, 0, 1, tok, vocab, max_len, True, False)
test_set_data = BERTDataset(test_set_data, 0, 1, tok, vocab, max_len, True, False)

# 배치데이터셋 생성
train_dataloader = torch.utils.data.DataLoader(train_set_data, batch_size=batch_size, num_workers=0)    # num_workers: 데이터 로딩할때 쓰는 프로세스 수(로딩속도)
test_dataloader = torch.utils.data.DataLoader(test_set_data, batch_size=batch_size, num_workers=0)

In [8]:
# 모델 선언
model = BERTClassifier(bertmodel, dr_rate=0.5).to(device)
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()
t_total = len(train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)

In [9]:
# 정확도 계산
def calc_accuracy(X,Y):
    max_vals, max_indices = torch.max(X, 1)
    train_acc = (max_indices == Y).sum().data.cpu().numpy()/max_indices.size()[0]
    return train_acc

# 예측 반환
def predict(sentence):
    dataset = [[sentence, '0']]
    test = BERTDataset(dataset, 0, 1, tok, vocab, max_len, True, False)
    test_dataloader = torch.utils.data.DataLoader(test, batch_size=batch_size, num_workers=0)  #로컬에서는 디폴트(0)으로 수정
    model.eval()
    answer = 0
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(test_dataloader):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        for logits in out:
            logits = logits.detach().cpu().numpy()
            answer = np.argmax(logits)
    return answer

In [10]:
for e in range(num_epochs):
    train_acc = 0.0
    test_acc = 0.0
    model.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm(train_dataloader)):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        loss = loss_fn(out, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()  # Update learning rate schedule
        train_acc += calc_accuracy(out, label)
        if batch_id % log_interval == 0:
            print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc / (batch_id+1)))
    print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))
    model.eval()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm(test_dataloader)):   # 아까 만든 테스트 배치 데이터 - 정확도 측정

        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        test_acc += calc_accuracy(out, label)
    print("epoch {} test acc {}".format(e+1, test_acc / (batch_id+1)))

  0%|          | 0/3559 [00:00<?, ?it/s]

epoch 1 batch id 1 loss 2.074551582336426 train acc 0.09375
epoch 1 batch id 201 loss 1.6258450746536255 train acc 0.23056592039800994
epoch 1 batch id 401 loss 1.3003720045089722 train acc 0.3376714463840399
epoch 1 batch id 601 loss 1.1646114587783813 train acc 0.40656198003327787
epoch 1 batch id 801 loss 1.1231086254119873 train acc 0.45072565543071164
epoch 1 batch id 1001 loss 1.217494249343872 train acc 0.4818306693306693
epoch 1 batch id 1201 loss 0.9632625579833984 train acc 0.5051259367194005
epoch 1 batch id 1401 loss 1.103046178817749 train acc 0.5221047466095646
epoch 1 batch id 1601 loss 0.8962392210960388 train acc 0.5346658338538414
epoch 1 batch id 1801 loss 1.1236183643341064 train acc 0.5446106329816769
epoch 1 batch id 2001 loss 1.302586555480957 train acc 0.5533951774112944
epoch 1 batch id 2201 loss 1.280333399772644 train acc 0.5618184915947296
epoch 1 batch id 2401 loss 0.9095034599304199 train acc 0.5678883798417326
epoch 1 batch id 2601 loss 0.9173089861869812

  0%|          | 0/890 [00:00<?, ?it/s]

epoch 1 test acc 0.6608700473092844


  0%|          | 0/3559 [00:00<?, ?it/s]

epoch 2 batch id 1 loss 0.859921932220459 train acc 0.71875
epoch 2 batch id 201 loss 0.9067028760910034 train acc 0.6557835820895522
epoch 2 batch id 401 loss 0.8948537111282349 train acc 0.6587437655860349
epoch 2 batch id 601 loss 0.7162104249000549 train acc 0.6612416805324459
epoch 2 batch id 801 loss 0.8603163361549377 train acc 0.6656523096129837
epoch 2 batch id 1001 loss 0.9884093403816223 train acc 0.6677697302697303
epoch 2 batch id 1201 loss 0.8871845602989197 train acc 0.6706130308076603
epoch 2 batch id 1401 loss 0.8918684124946594 train acc 0.6730906495360457
epoch 2 batch id 1601 loss 1.0160996913909912 train acc 0.6759837601499064
epoch 2 batch id 1801 loss 1.0455106496810913 train acc 0.6790671848972792
epoch 2 batch id 2001 loss 1.1696271896362305 train acc 0.6812843578210894
epoch 2 batch id 2201 loss 1.1378229856491089 train acc 0.6840782598818719
epoch 2 batch id 2401 loss 0.6163977980613708 train acc 0.6858600583090378
epoch 2 batch id 2601 loss 0.810137808322906

  0%|          | 0/890 [00:00<?, ?it/s]

epoch 2 test acc 0.6756745269071556


  0%|          | 0/3559 [00:00<?, ?it/s]

epoch 3 batch id 1 loss 0.6796753406524658 train acc 0.8125
epoch 3 batch id 201 loss 0.7971736788749695 train acc 0.7209266169154229
epoch 3 batch id 401 loss 0.7923454642295837 train acc 0.7227244389027432
epoch 3 batch id 601 loss 0.4854649603366852 train acc 0.7250935940099834
epoch 3 batch id 801 loss 0.7122911810874939 train acc 0.728854556803995
epoch 3 batch id 1001 loss 0.8103989362716675 train acc 0.7299887612387612
epoch 3 batch id 1201 loss 0.6338964700698853 train acc 0.7332431307243963
epoch 3 batch id 1401 loss 0.744411289691925 train acc 0.7365497858672377
epoch 3 batch id 1601 loss 0.7817335724830627 train acc 0.7397134603372892
epoch 3 batch id 1801 loss 0.7209338545799255 train acc 0.7418621599111604
epoch 3 batch id 2001 loss 0.8373080492019653 train acc 0.743815592203898
epoch 3 batch id 2201 loss 1.1211225986480713 train acc 0.7460813266696956
epoch 3 batch id 2401 loss 0.5144734978675842 train acc 0.7480086422324032
epoch 3 batch id 2601 loss 0.7059633731842041 t

  0%|          | 0/890 [00:00<?, ?it/s]

epoch 3 test acc 0.6831774837374335


  0%|          | 0/3559 [00:00<?, ?it/s]

epoch 4 batch id 1 loss 0.47918447852134705 train acc 0.90625
epoch 4 batch id 201 loss 0.5809556245803833 train acc 0.7831156716417911
epoch 4 batch id 401 loss 0.6121208667755127 train acc 0.7826527431421446
epoch 4 batch id 601 loss 0.25183266401290894 train acc 0.7872816139767055
epoch 4 batch id 801 loss 0.5052704215049744 train acc 0.7911985018726592
epoch 4 batch id 1001 loss 0.5975937247276306 train acc 0.7939872627372627
epoch 4 batch id 1201 loss 0.2869487404823303 train acc 0.7960553705245629
epoch 4 batch id 1401 loss 0.546409547328949 train acc 0.8002542826552462
epoch 4 batch id 1601 loss 0.482979416847229 train acc 0.8038335415365396
epoch 4 batch id 1801 loss 0.5907915830612183 train acc 0.8055420599666852
epoch 4 batch id 2001 loss 0.8337492346763611 train acc 0.8071120689655172
epoch 4 batch id 2201 loss 0.8659434914588928 train acc 0.8095184007269423
epoch 4 batch id 2401 loss 0.3670334815979004 train acc 0.8114717825905873
epoch 4 batch id 2601 loss 0.48719280958175

  0%|          | 0/890 [00:00<?, ?it/s]

epoch 4 test acc 0.676014562389119


  0%|          | 0/3559 [00:00<?, ?it/s]

epoch 5 batch id 1 loss 0.25311607122421265 train acc 0.9375
epoch 5 batch id 201 loss 0.36403998732566833 train acc 0.8426616915422885
epoch 5 batch id 401 loss 0.5801221132278442 train acc 0.8400872817955112
epoch 5 batch id 601 loss 0.13222357630729675 train acc 0.8403702163061564
epoch 5 batch id 801 loss 0.37182262539863586 train acc 0.8433988764044944
epoch 5 batch id 1001 loss 0.3651731610298157 train acc 0.8454982517482518
epoch 5 batch id 1201 loss 0.3378058969974518 train acc 0.8463780183180682
epoch 5 batch id 1401 loss 0.5292795896530151 train acc 0.8485010706638115
epoch 5 batch id 1601 loss 0.41298744082450867 train acc 0.8509134915677702
epoch 5 batch id 1801 loss 0.4454478323459625 train acc 0.8515234591893392
epoch 5 batch id 2001 loss 0.6410144567489624 train acc 0.8524487756121939
epoch 5 batch id 2201 loss 0.6913407444953918 train acc 0.853575079509314
epoch 5 batch id 2401 loss 0.30798980593681335 train acc 0.8544616826322365
epoch 5 batch id 2601 loss 0.3798934817

  0%|          | 0/890 [00:00<?, ?it/s]

epoch 5 test acc 0.673521584861029


In [11]:
torch.save(model, 'model/kobert-v3.pt')

In [12]:
# 모델 사이즈 확인(파라미터는 v1과 동일)
import os

model_path3 = 'model/kobert-v3.pt'
size2 = os.path.getsize(model_path3) / (1024*1024) # mb 단위
print(f"Model size: {size2:.2f} MB")

Model size: 351.79 MB
