In [1]:
import pandas as pd

train_set = pd.read_csv('data/병합데이터셋-v2.csv', index_col=0) 

# 감정을 정수 라벨로 변경
emotions = {'행복': 0, '불안': 1, '놀람': 2, '슬픔': 3, '분노': 4, '중립': 5 }
train_set['emotion'] = train_set.emotion.map(emotions)

train_set.sample(n=5)

Unnamed: 0,sentence,emotion
114811,옛날 재능만 믿고 게으름 피다 퇴물 되놓고... 어디다 탓을 해?!!,4
80840,꿈이 현실로,0
44819,나 지원이랑 친구 문제로 싸웠어. 요새 맨날 싸워서 지친다.,1
43240,학교에 가는 게 너무 무서워. 친구들이 또 괴롭히러 올 것 같아.,1
46105,오늘도 여자친구와 싸웠어. 그래서 너무 기분이 안 좋아.,1


In [2]:
# 라이브러리 불러오기
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
import gluonnlp as nlp

#kobert
from kobert_tokenizer import KoBERTTokenizer

# transformers
from transformers import BertModel
from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup

# GPU 사용시 필요
device = torch.device("cuda")

In [3]:
tokenizer = KoBERTTokenizer.from_pretrained('skt/kobert-base-v1')
bertmodel = BertModel.from_pretrained('skt/kobert-base-v1', return_dict=False)
vocab = nlp.vocab.BERTVocab.from_sentencepiece(tokenizer.vocab_file, padding_token='[PAD]')
tok = tokenizer.tokenize

# Setting parameters(KoBERT finetuning 베에스 라인) -> 
max_len = 64    #베이스라인 64
batch_size = 32 #베이스라인 64
warmup_ratio = 0.1
num_epochs = 5  # 에포크 횟수
max_grad_norm = 1
log_interval = 200
learning_rate =  5e-5

In [4]:
# 모델에 사용되는 데이터셋 클래스 정의
class BERTDataset(Dataset):
    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer,vocab, max_len,
                 pad, pair):
   
        transform = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len,vocab=vocab, pad=pad, pair=pair)
        
        self.sentences = [transform([i[sent_idx]]) for i in dataset]
        self.labels = [np.int32(i[label_idx]) for i in dataset]

    def __getitem__(self, i):
        return (self.sentences[i] + (self.labels[i], ))
         
    def __len__(self):
        return (len(self.labels))

In [5]:
# 감성 분류 모델 정의
class BERTClassifier(nn.Module):
    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes=7,     
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate
                 
        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)
    
    def gen_attention_mask(self, token_ids, valid_length):
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        attention_mask = self.gen_attention_mask(token_ids, valid_length)
        
        _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device))
        if self.dr_rate:
            out = self.dropout(pooler)
        return self.classifier(out)

In [6]:
# 모델 학습에 사용할 데이터셋을 [data, label] 배열로 피팅
train_set_data = [[i, str(j)] for i, j in zip(train_set['sentence'], train_set['emotion'])]

# sklearn 의 train_test_split 모듈-> 4:1로 학습&검증 데이터를 분류 
train_set_data, test_set_data = train_test_split(train_set_data, test_size = 0.2, random_state=4)

# 데이터셋을 Bert모델에 입력할 수 있게 변환
train_set_data = BERTDataset(train_set_data, 0, 1, tok, vocab, max_len, True, False)
test_set_data = BERTDataset(test_set_data, 0, 1, tok, vocab, max_len, True, False)

# 배치데이터셋 생성
train_dataloader = torch.utils.data.DataLoader(train_set_data, batch_size=batch_size, num_workers=0)    # num_workers: 데이터 로딩할때 쓰는 프로세스 수(로딩속도)
test_dataloader = torch.utils.data.DataLoader(test_set_data, batch_size=batch_size, num_workers=0)

In [7]:
# 모델 선언
model = BERTClassifier(bertmodel, dr_rate=0.5).to(device)
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()
t_total = len(train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)

In [8]:
# 정확도 계산
def calc_accuracy(X,Y):
    max_vals, max_indices = torch.max(X, 1)
    train_acc = (max_indices == Y).sum().data.cpu().numpy()/max_indices.size()[0]
    return train_acc

# 예측 반환
def predict(sentence):
    dataset = [[sentence, '0']]
    test = BERTDataset(dataset, 0, 1, tok, vocab, max_len, True, False)
    test_dataloader = torch.utils.data.DataLoader(test, batch_size=batch_size, num_workers=0)  #로컬에서는 디폴트(0)으로 수정
    model.eval()
    answer = 0
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(test_dataloader):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        for logits in out:
            logits = logits.detach().cpu().numpy()
            answer = np.argmax(logits)
    return answer

In [9]:
for e in range(num_epochs):
    train_acc = 0.0
    test_acc = 0.0
    model.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm(train_dataloader)):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        loss = loss_fn(out, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()  # Update learning rate schedule
        train_acc += calc_accuracy(out, label)
        if batch_id % log_interval == 0:
            print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc / (batch_id+1)))
    print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))
    model.eval()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm(test_dataloader)):   # 아까 만든 테스트 배치 데이터 - 정확도 측정

        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        test_acc += calc_accuracy(out, label)
    print("epoch {} test acc {}".format(e+1, test_acc / (batch_id+1)))

  0%|          | 0/3408 [00:00<?, ?it/s]

epoch 1 batch id 1 loss 2.1148581504821777 train acc 0.125
epoch 1 batch id 201 loss 1.7056244611740112 train acc 0.23491915422885573
epoch 1 batch id 401 loss 1.4151387214660645 train acc 0.3218516209476309
epoch 1 batch id 601 loss 1.277246356010437 train acc 0.3834234608985025
epoch 1 batch id 801 loss 1.174025297164917 train acc 0.43184300873907616
epoch 1 batch id 1001 loss 1.0892586708068848 train acc 0.4698114385614386
epoch 1 batch id 1201 loss 1.213165283203125 train acc 0.49594088259783514
epoch 1 batch id 1401 loss 0.9456455707550049 train acc 0.5145208779443254
epoch 1 batch id 1601 loss 1.2842438220977783 train acc 0.5286930043722673
epoch 1 batch id 1801 loss 1.0243161916732788 train acc 0.540047196002221
epoch 1 batch id 2001 loss 0.9356043338775635 train acc 0.5483195902048975
epoch 1 batch id 2201 loss 0.6708167195320129 train acc 0.5568491594729669
epoch 1 batch id 2401 loss 0.711145281791687 train acc 0.5647646813827571
epoch 1 batch id 2601 loss 0.8457648754119873 t

  0%|          | 0/852 [00:00<?, ?it/s]

epoch 1 test acc 0.6717282863849765


  0%|          | 0/3408 [00:00<?, ?it/s]

epoch 2 batch id 1 loss 0.7657519578933716 train acc 0.6875
epoch 2 batch id 201 loss 0.9162696003913879 train acc 0.6623134328358209
epoch 2 batch id 401 loss 0.7363892197608948 train acc 0.660458229426434
epoch 2 batch id 601 loss 0.8943337798118591 train acc 0.6624376039933444
epoch 2 batch id 801 loss 0.8812430500984192 train acc 0.6655742821473158
epoch 2 batch id 1001 loss 1.0940327644348145 train acc 0.6683004495504495
epoch 2 batch id 1201 loss 0.9513697624206543 train acc 0.6724604496253123
epoch 2 batch id 1401 loss 0.8898323178291321 train acc 0.6756334760885082
epoch 2 batch id 1601 loss 1.1755784749984741 train acc 0.678384603372892
epoch 2 batch id 1801 loss 0.7788414359092712 train acc 0.6802991393670184
epoch 2 batch id 2001 loss 0.7936946153640747 train acc 0.6817684907546226
epoch 2 batch id 2201 loss 0.6076501607894897 train acc 0.6839930713312131
epoch 2 batch id 2401 loss 0.6516362428665161 train acc 0.6864978134110787
epoch 2 batch id 2601 loss 0.7371137738227844 

  0%|          | 0/852 [00:00<?, ?it/s]

epoch 2 test acc 0.6839055164319249


  0%|          | 0/3408 [00:00<?, ?it/s]

epoch 3 batch id 1 loss 0.6667922735214233 train acc 0.6875
epoch 3 batch id 201 loss 0.7204795479774475 train acc 0.7288557213930348
epoch 3 batch id 401 loss 0.5676817297935486 train acc 0.7220230673316709
epoch 3 batch id 601 loss 0.6431615948677063 train acc 0.7214538269550749
epoch 3 batch id 801 loss 0.7104094624519348 train acc 0.7249141697877652
epoch 3 batch id 1001 loss 1.2185546159744263 train acc 0.7277722277722277
epoch 3 batch id 1201 loss 0.8469130992889404 train acc 0.7325145711906744
epoch 3 batch id 1401 loss 0.6633368134498596 train acc 0.7362821199143469
epoch 3 batch id 1601 loss 0.8722359538078308 train acc 0.7391083697688945
epoch 3 batch id 1801 loss 0.5395464897155762 train acc 0.7422612437534702
epoch 3 batch id 2001 loss 0.40944766998291016 train acc 0.7445183658170914
epoch 3 batch id 2201 loss 0.4978044629096985 train acc 0.7473875511131304
epoch 3 batch id 2401 loss 0.6277294158935547 train acc 0.7505987088713036
epoch 3 batch id 2601 loss 0.62730604410171

  0%|          | 0/852 [00:00<?, ?it/s]

epoch 3 test acc 0.6806778169014085


  0%|          | 0/3408 [00:00<?, ?it/s]

epoch 4 batch id 1 loss 0.6173720955848694 train acc 0.71875
epoch 4 batch id 201 loss 0.6561905145645142 train acc 0.7888681592039801
epoch 4 batch id 401 loss 0.4376693069934845 train acc 0.7874064837905237
epoch 4 batch id 601 loss 0.6127034425735474 train acc 0.7888415141430949
epoch 4 batch id 801 loss 0.6352947354316711 train acc 0.7931101747815231
epoch 4 batch id 1001 loss 0.7727664709091187 train acc 0.794080919080919
epoch 4 batch id 1201 loss 0.5640541315078735 train acc 0.7971221898417985
epoch 4 batch id 1401 loss 0.3727705776691437 train acc 0.7999643112062812
epoch 4 batch id 1601 loss 0.8904934525489807 train acc 0.803853060587133
epoch 4 batch id 1801 loss 0.3728691637516022 train acc 0.8058717379233759
epoch 4 batch id 2001 loss 0.2695982754230499 train acc 0.8085488505747126
epoch 4 batch id 2201 loss 0.4039473831653595 train acc 0.8115771240345298
epoch 4 batch id 2401 loss 0.36119940876960754 train acc 0.8143612036651395
epoch 4 batch id 2601 loss 0.464615643024444

  0%|          | 0/852 [00:00<?, ?it/s]

epoch 4 test acc 0.6826951291079812


  0%|          | 0/3408 [00:00<?, ?it/s]

epoch 5 batch id 1 loss 0.3979465067386627 train acc 0.875
epoch 5 batch id 201 loss 0.38970738649368286 train acc 0.8390858208955224
epoch 5 batch id 401 loss 0.366070032119751 train acc 0.8396197007481296
epoch 5 batch id 601 loss 0.575049638748169 train acc 0.8407861896838602
epoch 5 batch id 801 loss 0.49514150619506836 train acc 0.8416432584269663
epoch 5 batch id 1001 loss 0.6481998562812805 train acc 0.842313936063936
epoch 5 batch id 1201 loss 0.4193129539489746 train acc 0.8452851790174855
epoch 5 batch id 1401 loss 0.34907081723213196 train acc 0.8468058529621699
epoch 5 batch id 1601 loss 0.7552070617675781 train acc 0.8493714865708932
epoch 5 batch id 1801 loss 0.31389153003692627 train acc 0.8509682121043864
epoch 5 batch id 2001 loss 0.14670021831989288 train acc 0.8529953773113443
epoch 5 batch id 2201 loss 0.38508206605911255 train acc 0.8549238982280781
epoch 5 batch id 2401 loss 0.23026664555072784 train acc 0.8567003331945023
epoch 5 batch id 2601 loss 0.365781664848

  0%|          | 0/852 [00:00<?, ?it/s]

epoch 5 test acc 0.6814113849765259


In [10]:
torch.save(model, 'model/kobert-v4.pt')

In [11]:
# 모델 사이즈 확인(파라미터 동일)
import os

model_path = 'model/kobert-v4.pt'
size = os.path.getsize(model_path) / (1024*1024) # mb 단위
print(f"Model size: {size:.2f} MB")

Model size: 351.79 MB
