In [1]:
# Guard against runtime errors: run Python garbage collection and release
# memory held by PyTorch's CUDA caching allocator before the rest of the
# notebook runs.
# NOTE(review): presumably aimed at CUDA out-of-memory errors when the cells
# are re-run in the same kernel — confirm.
import gc
import torch

gc.collect()
torch.cuda.empty_cache()

In [2]:
import pandas as pd

# Load the merged dataset; the first CSV column is used as the row index.
train_set = pd.read_csv('data/병합데이터셋-v3.csv', index_col=0)

# Convert each emotion name to its integer class label.
emotions = {'행복': 0, '불안': 1, '놀람': 2, '슬픔': 3, '분노': 4, '중립': 5}
emotion_ids = train_set['emotion'].map(emotions)

# Series.map yields NaN for any value that is not a key of `emotions`
# (including missing values). Fail here with the offending labels instead of
# letting NaN labels reach the dataset-building code further down.
unmapped = train_set.loc[emotion_ids.isna(), 'emotion']
if not unmapped.empty:
    raise ValueError(
        "Unknown emotion label(s) in dataset: {}".format(
            sorted(map(str, unmapped.unique()))))
train_set['emotion'] = emotion_ids.astype('int64')

# Show a few random rows as a sanity check.
train_set.sample(n=5)

Unnamed: 0,sentence,emotion
37820,빨리 들어가서 일이나 봐아...,5
66035,김 여사가 나눠 준 음식에 땅콩이 있었어. 모르고 먹었는데 고생했어.,1
15418,딱 이거다.,2
11171,내가 돈이 없어서 취업을 해야 하는데 나이도 많고 해서 나를 원하는 직장이 없을까 ...,1
61891,은퇴할 때는 점점 다가오는데 자식들이 커서 나한테 도움을 주지 않을까 봐 무서워.,1


In [3]:
# 라이브러리 불러오기
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
import gluonnlp as nlp

#kobert
from kobert_tokenizer import KoBERTTokenizer

# transformers
from transformers import BertModel
from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup

# Use the GPU when one is available; otherwise fall back to the CPU so that
# every later `.to(device)` call still works on a machine without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
# Load the pretrained KoBERT tokenizer and encoder from the
# 'skt/kobert-base-v1' checkpoint. return_dict=False is passed so the encoder
# returns a plain tuple; BERTClassifier.forward unpacks it as two values.
tokenizer = KoBERTTokenizer.from_pretrained('skt/kobert-base-v1')
bertmodel = BertModel.from_pretrained('skt/kobert-base-v1', return_dict=False)
# Build the gluonnlp vocabulary from the tokenizer's sentencepiece vocab file.
vocab = nlp.vocab.BERTVocab.from_sentencepiece(tokenizer.vocab_file, padding_token='[PAD]')
# Tokenizing callable handed to BERTDataset.
tok = tokenizer.tokenize

# Setting parameters
max_len = 64    # passed to BERTSentenceTransform as max_seq_length
batch_size = 32 # baseline: 64
warmup_ratio = 0.1  # fraction of all training steps used for LR warmup
num_epochs = 5  # number of epochs
max_grad_norm = 1  # gradient-norm clipping threshold used in the training loop
log_interval = 200  # print training loss/accuracy every this many batches
learning_rate =  5e-5  # learning rate given to the optimizer

In [5]:
# Dataset class that prepares sentences and labels for the model
class BERTDataset(Dataset):
    """Pre-transformed sentences paired with integer labels.

    Args:
        dataset: Iterable of records; ``record[sent_idx]`` is the sentence and
            ``record[label_idx]`` is the label (anything ``np.int32`` accepts).
        sent_idx: Position of the sentence inside each record.
        label_idx: Position of the label inside each record.
        bert_tokenizer: Tokenizer forwarded to ``BERTSentenceTransform``.
        vocab: Vocabulary forwarded to ``BERTSentenceTransform``.
        max_len: Forwarded as ``max_seq_length``.
        pad: Forwarded as ``pad``.
        pair: Forwarded as ``pair``.
    """

    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, vocab, max_len,
                 pad, pair):
        to_model_input = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, vocab=vocab, pad=pad, pair=pair)

        # First pass: transform every sentence (wrapped in a one-element list).
        encoded = []
        for record in dataset:
            encoded.append(to_model_input([record[sent_idx]]))
        self.sentences = encoded

        # Second pass: convert every label to a 32-bit integer.
        targets = []
        for record in dataset:
            targets.append(np.int32(record[label_idx]))
        self.labels = targets

    def __getitem__(self, i):
        """Return the transformed sentence tuple with the label appended."""
        features = self.sentences[i]
        target = self.labels[i]
        return features + (target,)

    def __len__(self):
        """Return the number of samples."""
        return len(self.labels)

In [6]:
# Emotion classification model
class BERTClassifier(nn.Module):
    """BERT encoder followed by optional dropout and a linear classifier.

    Args:
        bert: Encoder module. It is called with ``input_ids``,
            ``token_type_ids`` and ``attention_mask`` and must return a
            2-tuple whose second element is the pooled sentence vector.
        hidden_size: Feature size of the pooled vector fed to the classifier.
        num_classes: Number of output classes (logits per sample).
        dr_rate: Dropout probability applied to the pooled vector. ``None``
            or ``0`` disables dropout.
        params: Unused; kept so existing callers remain compatible.
    """

    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes=6,
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        """Build a float mask with 1 on the first ``valid_length[i]`` positions of row i.

        Args:
            token_ids: Tensor of shape (batch, seq_len).
            valid_length: Per-row count of real (non-padding) tokens; a tensor
                or sequence with one entry per row.

        Returns:
            Float tensor shaped like ``token_ids`` on the same device.
        """
        # Compare each column index with the row's valid length in one
        # broadcasted operation instead of filling the mask row by row.
        lengths = torch.as_tensor(valid_length, device=token_ids.device).view(-1, 1)
        positions = torch.arange(token_ids.size(1), device=token_ids.device).unsqueeze(0)
        return (positions < lengths).float()

    def forward(self, token_ids, valid_length, segment_ids):
        """Return classification logits of shape (batch, num_classes)."""
        attention_mask = self.gen_attention_mask(token_ids, valid_length)

        _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask)
        # Without dropout the pooled vector goes straight to the classifier;
        # previously `out` was left unassigned in that case and forward crashed.
        out = self.dropout(pooler) if self.dr_rate else pooler
        return self.classifier(out)

In [7]:
# Arrange the data as [sentence, label] records; the label is stored as a
# string and converted back to an integer inside BERTDataset.
labeled_sentences = [[sentence, str(label)]
                     for sentence, label in zip(train_set['sentence'], train_set['emotion'])]

# Split 4:1 into training and validation records with sklearn's
# train_test_split (fixed random_state for a repeatable split).
train_records, test_records = train_test_split(labeled_sentences, test_size = 0.2, random_state=0)

# Convert the records into the form the BERT model consumes.
train_set_data = BERTDataset(train_records, 0, 1, tok, vocab, max_len, True, False)
test_set_data = BERTDataset(test_records, 0, 1, tok, vocab, max_len, True, False)

# Build the batch loaders. num_workers is the number of worker processes used
# for data loading (0 = load in the main process).
# The training loader reshuffles every epoch so the model does not see the
# same batches in the same order each time; evaluation order stays fixed.
train_dataloader = torch.utils.data.DataLoader(train_set_data, batch_size=batch_size, num_workers=0, shuffle=True)
test_dataloader = torch.utils.data.DataLoader(test_set_data, batch_size=batch_size, num_workers=0)

In [8]:
# Instantiate the classifier on the target device
model = BERTClassifier(bertmodel, dr_rate=0.5).to(device)

# Parameters whose name contains one of these markers get no weight decay;
# every other parameter uses a weight decay of 0.01.
no_decay = ['bias', 'LayerNorm.weight']
decayed_params = []
undecayed_params = []
for param_name, param in model.named_parameters():
    if any(marker in param_name for marker in no_decay):
        undecayed_params.append(param)
    else:
        decayed_params.append(param)
optimizer_grouped_parameters = [
    {'params': decayed_params, 'weight_decay': 0.01},
    {'params': undecayed_params, 'weight_decay': 0.0},
]

optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()

# One scheduler step per batch: total steps = batches per epoch * epochs,
# with the first `warmup_ratio` share of them used for warmup.
t_total = len(train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)
scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_step,
    num_training_steps=t_total,
)

In [9]:
# Accuracy computation
def calc_accuracy(X,Y):
    """Return the fraction of rows of ``X`` whose highest-scoring column equals ``Y``.

    Args:
        X: Score tensor of shape (batch, num_classes).
        Y: Tensor of target class indices, one per row of ``X``.

    Returns:
        The batch accuracy as a NumPy scalar.
    """
    _, predicted = torch.max(X, 1)
    num_correct = (predicted == Y).sum().detach().cpu().numpy()
    batch_count = predicted.shape[0]
    return num_correct / batch_count

# Return the prediction for a sentence
def predict(sentence):
    """Return the predicted class index for a single sentence.

    The sentence is wrapped in a one-item ``BERTDataset``, run through the
    global ``model`` in evaluation mode, and the index of the largest logit
    is returned.

    Args:
        sentence: Text to classify.

    Returns:
        Index of the highest-scoring class (``0`` if the loader yields no batch).
    """
    # The label '0' is a placeholder required by BERTDataset's
    # [sentence, label] layout; it is not used for the prediction.
    dataset = [[sentence, '0']]
    test = BERTDataset(dataset, 0, 1, tok, vocab, max_len, True, False)
    # num_workers changed to the default (0) for local runs.
    loader = torch.utils.data.DataLoader(test, batch_size=batch_size, num_workers=0)
    model.eval()
    answer = 0
    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        for token_ids, valid_length, segment_ids, _ in loader:
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            out = model(token_ids, valid_length, segment_ids)
            # Keep the prediction of the last row, as the row-by-row loop did.
            answer = np.argmax(out[-1].cpu().numpy())
    return answer

In [10]:
# Fine-tune for `num_epochs` epochs. Each epoch runs one training pass over
# `train_dataloader` and then measures accuracy on `test_dataloader`.
# Reported accuracies are the mean of the per-batch accuracies.
for e in range(num_epochs):
    train_acc = 0.0
    test_acc = 0.0

    # ---- training pass ----
    model.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm(train_dataloader)):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        # valid_length is passed to the model as-is (not moved to `device`).
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        loss = loss_fn(out, label)
        loss.backward()
        # Clip the gradient norm before the optimizer update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()  # Update learning rate schedule
        train_acc += calc_accuracy(out, label)
        if batch_id % log_interval == 0:
            print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc / (batch_id+1)))
    print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))

    # ---- evaluation pass on the held-out batches created earlier ----
    model.eval()
    # No gradients are needed for evaluation; disabling autograd avoids
    # keeping activations in memory for a backward pass that never happens.
    with torch.no_grad():
        for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm(test_dataloader)):
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            label = label.long().to(device)
            out = model(token_ids, valid_length, segment_ids)
            test_acc += calc_accuracy(out, label)
    print("epoch {} test acc {}".format(e+1, test_acc / (batch_id+1)))

  0%|          | 0/3395 [00:00<?, ?it/s]

epoch 1 batch id 1 loss 1.765323519706726 train acc 0.25
epoch 1 batch id 201 loss 1.5442984104156494 train acc 0.2484452736318408
epoch 1 batch id 401 loss 1.669565200805664 train acc 0.3455423940149626
epoch 1 batch id 601 loss 1.216027855873108 train acc 0.4097337770382696
epoch 1 batch id 801 loss 1.1335647106170654 train acc 0.4554463171036205
epoch 1 batch id 1001 loss 1.2969141006469727 train acc 0.4875124875124875
epoch 1 batch id 1201 loss 1.1551012992858887 train acc 0.510876353039134
epoch 1 batch id 1401 loss 1.1454170942306519 train acc 0.5270342612419701
epoch 1 batch id 1601 loss 1.197957158088684 train acc 0.5380231105559026
epoch 1 batch id 1801 loss 0.8738500475883484 train acc 0.5486361743475847
epoch 1 batch id 2001 loss 1.2041116952896118 train acc 0.5578616941529235
epoch 1 batch id 2201 loss 0.9557017087936401 train acc 0.5655383916401635
epoch 1 batch id 2401 loss 0.8528918027877808 train acc 0.5720923573511038
epoch 1 batch id 2601 loss 0.7924847602844238 train

  0%|          | 0/849 [00:00<?, ?it/s]

epoch 1 test acc 0.674612395657295


  0%|          | 0/3395 [00:00<?, ?it/s]

epoch 2 batch id 1 loss 0.9162861704826355 train acc 0.65625
epoch 2 batch id 201 loss 0.8593024015426636 train acc 0.6548507462686567
epoch 2 batch id 401 loss 1.124955654144287 train acc 0.6588216957605985
epoch 2 batch id 601 loss 0.7193484306335449 train acc 0.6603057404326124
epoch 2 batch id 801 loss 1.0079941749572754 train acc 0.6621800873907615
epoch 2 batch id 1001 loss 1.2259600162506104 train acc 0.6663648851148851
epoch 2 batch id 1201 loss 0.9761208295822144 train acc 0.6692599916736053
epoch 2 batch id 1401 loss 1.0396537780761719 train acc 0.6706816559600286
epoch 2 batch id 1601 loss 1.1270655393600464 train acc 0.6727631168019987
epoch 2 batch id 1801 loss 0.5492494106292725 train acc 0.6761347862298723
epoch 2 batch id 2001 loss 1.1417326927185059 train acc 0.6786762868565717
epoch 2 batch id 2201 loss 0.7592246532440186 train acc 0.6810824625170377
epoch 2 batch id 2401 loss 0.5758426189422607 train acc 0.6832960224906289
epoch 2 batch id 2601 loss 0.522323727607727

  0%|          | 0/849 [00:00<?, ?it/s]

epoch 2 test acc 0.6846609822297333


  0%|          | 0/3395 [00:00<?, ?it/s]

epoch 3 batch id 1 loss 0.7986209392547607 train acc 0.75
epoch 3 batch id 201 loss 0.7797208428382874 train acc 0.7184390547263682
epoch 3 batch id 401 loss 0.8271353244781494 train acc 0.7216334164588528
epoch 3 batch id 601 loss 0.4576512575149536 train acc 0.7220257903494176
epoch 3 batch id 801 loss 0.6782790422439575 train acc 0.7240168539325843
epoch 3 batch id 1001 loss 1.2290399074554443 train acc 0.7277097902097902
epoch 3 batch id 1201 loss 0.8539198040962219 train acc 0.7309533721898418
epoch 3 batch id 1401 loss 0.8623064756393433 train acc 0.7329586009992862
epoch 3 batch id 1601 loss 0.8802357316017151 train acc 0.7352631168019987
epoch 3 batch id 1801 loss 0.3300066590309143 train acc 0.7386868406440866
epoch 3 batch id 2001 loss 0.8269587159156799 train acc 0.7418790604697652
epoch 3 batch id 2201 loss 0.4913092255592346 train acc 0.7446473194002726
epoch 3 batch id 2401 loss 0.5226374864578247 train acc 0.7472797792586422
epoch 3 batch id 2601 loss 0.3548867702484131 

  0%|          | 0/849 [00:00<?, ?it/s]

epoch 3 test acc 0.6906606877656578


  0%|          | 0/3395 [00:00<?, ?it/s]

epoch 4 batch id 1 loss 0.5337974429130554 train acc 0.8125
epoch 4 batch id 201 loss 0.676946222782135 train acc 0.7782960199004975
epoch 4 batch id 401 loss 0.6255374550819397 train acc 0.782964463840399
epoch 4 batch id 601 loss 0.3529127240180969 train acc 0.7839538269550749
epoch 4 batch id 801 loss 0.5618844032287598 train acc 0.7865948813982522
epoch 4 batch id 1001 loss 0.8978216648101807 train acc 0.7913336663336663
epoch 4 batch id 1201 loss 0.5107479095458984 train acc 0.7941819317235637
epoch 4 batch id 1401 loss 0.6392253041267395 train acc 0.7959493219129193
epoch 4 batch id 1601 loss 0.7917352914810181 train acc 0.7980168644597126
epoch 4 batch id 1801 loss 0.2538401186466217 train acc 0.8008051082731815
epoch 4 batch id 2001 loss 0.5556666254997253 train acc 0.803691904047976
epoch 4 batch id 2201 loss 0.424376904964447 train acc 0.8065509995456611
epoch 4 batch id 2401 loss 0.3112984895706177 train acc 0.8088817159516868
epoch 4 batch id 2601 loss 0.20947566628456116 t

  0%|          | 0/849 [00:00<?, ?it/s]

epoch 4 test acc 0.6839248220412762


  0%|          | 0/3395 [00:00<?, ?it/s]

epoch 5 batch id 1 loss 0.5969951152801514 train acc 0.78125
epoch 5 batch id 201 loss 0.5590223073959351 train acc 0.8310012437810945
epoch 5 batch id 401 loss 0.6003820300102234 train acc 0.8341645885286783
epoch 5 batch id 601 loss 0.5485747456550598 train acc 0.8362624792013311
epoch 5 batch id 801 loss 0.262102335691452 train acc 0.8376248439450686
epoch 5 batch id 1001 loss 0.643107533454895 train acc 0.8406905594405595
epoch 5 batch id 1201 loss 0.25834202766418457 train acc 0.84375
epoch 5 batch id 1401 loss 0.5387965440750122 train acc 0.8443522483940042
epoch 5 batch id 1601 loss 0.5043561458587646 train acc 0.8455457526545909
epoch 5 batch id 1801 loss 0.1885564774274826 train acc 0.8471682398667407
epoch 5 batch id 2001 loss 0.5866328477859497 train acc 0.8491223138430785
epoch 5 batch id 2201 loss 0.3297790586948395 train acc 0.8507212630622444
epoch 5 batch id 2401 loss 0.21570153534412384 train acc 0.8518455851728447
epoch 5 batch id 2601 loss 0.2142801433801651 train ac

  0%|          | 0/849 [00:00<?, ?it/s]

epoch 5 test acc 0.6863093409125826


In [11]:
import os

# Make sure the output directory exists so the save cannot fail on a missing
# folder after the whole training run has finished.
os.makedirs('model', exist_ok=True)
# Saves the entire model object (not only its state_dict).
torch.save(model, 'model/kobert-v5.pt')

In [12]:
# Check the size of the saved model file (parameters are the same)
import os

model_path = 'model/kobert-v5.pt'
bytes_per_mb = 1024 * 1024
size = os.path.getsize(model_path) / bytes_per_mb  # size in MB
print(f"Model size: {size:.2f} MB")

Model size: 351.79 MB
