In [1]:
# Guard against runtime errors by releasing memory before anything else runs
import gc
import torch

# Collect unreachable Python objects first ...
gc.collect()
# ... then ask PyTorch to release the CUDA memory its caching allocator is holding unused.
torch.cuda.empty_cache()

In [2]:
import pandas as pd

# Load the merged dataset; the first CSV column is used as the row index.
train_set = pd.read_csv('data/병합데이터셋-v3.csv', index_col=0)

# Convert each emotion name to an integer class label
emotions = {'행복': 0, '불안': 1, '놀람': 2, '슬픔': 3, '분노': 4, '중립': 5 }
emotion_ids = train_set['emotion'].map(emotions)

# Series.map yields NaN for any value that is not a key of `emotions`
# (including missing values). Fail here with a readable message instead of
# letting a NaN label surface later as an obscure conversion error.
unmapped = train_set.loc[emotion_ids.isna(), 'emotion']
if not unmapped.empty:
    raise ValueError(
        f"Unmapped emotion labels: {sorted(unmapped.astype(str).unique())}")
train_set['emotion'] = emotion_ids.astype('int64')

# Show a few random rows as a sanity check (last expression => cell output)
train_set.sample(n=5)

Unnamed: 0,sentence,emotion
104731,중학교 때 심한 따돌림으로 트라우마가 생겨 사람들과 소통하기가 너무 힘들어.,1
77275,"어, 데리고 나올 거야?",5
96204,나는 진로는 성적이 충분히 좋게 나오면 결정하기로 했어. 그래도 늦지 않을 것 같아.,1
38365,아..누나. 좀 일어나보시지. 누나.,5
46951,안 그러던 동생이 나한테 먹을 거를 갖다 주어서 당황스러웠어.,1


In [3]:
# 라이브러리 불러오기
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
import gluonnlp as nlp

#kobert
from kobert_tokenizer import KoBERTTokenizer

# transformers
from transformers import BertModel
from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup

# Use the GPU when PyTorch can see one; otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
# Pretrained KoBERT tokenizer and encoder (both from the same checkpoint).
# return_dict=False: the classifier's forward() unpacks the encoder output as a tuple.
tokenizer = KoBERTTokenizer.from_pretrained('skt/kobert-base-v1')
bertmodel = BertModel.from_pretrained('skt/kobert-base-v1', return_dict=False)

# Vocabulary built from the tokenizer's sentencepiece file, plus a shorthand
# for the tokenize function used when building datasets.
vocab = nlp.vocab.BERTVocab.from_sentencepiece(
    tokenizer.vocab_file,
    padding_token='[PAD]',
)
tok = tokenizer.tokenize

# Hyperparameters
max_len = 64            # max_seq_length handed to the sentence transform
batch_size = 32         # the baseline used 64
num_epochs = 5          # number of training epochs
learning_rate = 5e-5
warmup_ratio = 0.1      # fraction of total steps used as scheduler warmup
max_grad_norm = 1       # gradient-norm clipping threshold
log_interval = 200      # print training progress every N batches

In [5]:
# Dataset class consumed by the model
class BERTDataset(Dataset):
    """Dataset of BERT-transformed sentences paired with integer labels.

    Every record of ``dataset`` is indexed with ``sent_idx`` to obtain the
    sentence and with ``label_idx`` to obtain the label. Sentences are run
    through ``nlp.data.BERTSentenceTransform`` once, at construction time.
    """

    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer,vocab, max_len,
                 pad, pair):
        to_bert_input = nlp.data.BERTSentenceTransform(
            bert_tokenizer,
            max_seq_length=max_len,
            vocab=vocab,
            pad=pad,
            pair=pair,
        )

        # Transform all sentences first, then convert all labels.
        self.sentences = [to_bert_input([record[sent_idx]]) for record in dataset]
        self.labels = [np.int32(record[label_idx]) for record in dataset]

    def __getitem__(self, i):
        """Return the transformed sentence tuple with the label appended."""
        features = self.sentences[i]
        label = self.labels[i]
        return features + (label, )

    def __len__(self):
        """Return the number of examples (one label per example)."""
        n_examples = len(self.labels)
        return n_examples

In [6]:
# Emotion classification model
class BERTClassifier(nn.Module):
    """BERT encoder followed by optional dropout and a linear classification head.

    Args:
        bert: Encoder module. ``forward`` unpacks its result as a 2-tuple and
            feeds the second element (the pooled output) to the head.
        hidden_size: Input width of the linear head.
        num_classes: Number of output logits. The default is 7; pass the
            actual number of labels when it differs.
        dr_rate: Dropout probability applied to the pooled output. A falsy
            value (``None`` or ``0``) disables dropout.
        params: Unused; kept so existing callers keep working.
    """

    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes=7,
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        """Build a float mask with ones on the first ``valid_length[i]`` positions of row ``i``.

        Args:
            token_ids: 2-D tensor; the mask has the same shape and device.
            valid_length: One length per row of ``token_ids``.

        Returns:
            Float tensor shaped like ``token_ids`` (1.0 = attend, 0.0 = ignore).
        """
        # Compare a row of position indices against a column of lengths
        # instead of filling the mask one row at a time in Python.
        lengths = torch.as_tensor(valid_length, device=token_ids.device).reshape(-1, 1)
        positions = torch.arange(token_ids.size(1), device=token_ids.device).unsqueeze(0)
        return (positions < lengths).float()

    def forward(self, token_ids, valid_length, segment_ids):
        """Return classification logits for a batch.

        Args:
            token_ids: Token id tensor passed to the encoder as ``input_ids``.
            valid_length: Per-row count of non-padding tokens.
            segment_ids: Tensor passed to the encoder as ``token_type_ids``.
        """
        attention_mask = self.gen_attention_mask(token_ids, valid_length)

        _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device))
        # Without dropout the pooled output goes straight to the head
        # (previously `out` was left undefined in that case).
        out = self.dropout(pooler) if self.dr_rate else pooler
        return self.classifier(out)

In [7]:
# Build [sentence, label] pairs in the layout BERTDataset is given below
train_set_data = [[i, str(j)] for i, j in zip(train_set['sentence'], train_set['emotion'])]

# Split into training and evaluation sets at 4:1 with sklearn's train_test_split
# (fixed random_state so the split is the same on every run)
train_set_data, test_set_data = train_test_split(train_set_data, test_size = 0.2, random_state=4)

# Convert both splits into the form the BERT model takes as input
train_set_data = BERTDataset(train_set_data, 0, 1, tok, vocab, max_len, True, False)
test_set_data = BERTDataset(test_set_data, 0, 1, tok, vocab, max_len, True, False)

# Batch loaders. num_workers is the number of loader worker processes.
# Training batches are reshuffled every epoch so the model does not see the
# same batches in the same order each time; evaluation order stays fixed.
train_dataloader = torch.utils.data.DataLoader(train_set_data, batch_size=batch_size, shuffle=True, num_workers=0)
test_dataloader = torch.utils.data.DataLoader(test_set_data, batch_size=batch_size, num_workers=0)

In [8]:
# Model declaration.
# Size the classification head from the label map (6 emotions) instead of
# relying on BERTClassifier's default of 7, which left one logit with no label.
model = BERTClassifier(bertmodel, num_classes=len(emotions), dr_rate=0.5).to(device)

# Parameters whose names contain one of these substrings get no weight decay
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()

# The scheduler is stepped once per batch, so the total step count is
# batches-per-epoch times epochs; the first warmup_ratio of them are warmup.
t_total = len(train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)

In [9]:
# Accuracy computation
def calc_accuracy(X,Y):
    """Return the fraction of rows in ``X`` whose arg-max column equals ``Y``.

    Args:
        X: 2-D tensor of scores, one row per example.
        Y: Tensor of target class indices, one per row of ``X``.

    Returns:
        Number of matches divided by the number of rows, as a NumPy scalar.
    """
    _, predicted = torch.max(X, 1)
    n_correct = (predicted == Y).sum().data.cpu().numpy()
    n_rows = predicted.size()[0]
    return n_correct / n_rows

# Return the prediction for one sentence
def predict(sentence):
    """Classify a single sentence with the global ``model``.

    Args:
        sentence: Raw input text.

    Returns:
        Index of the highest logit (result of ``np.argmax``).
    """
    # '0' is a placeholder label: BERTDataset needs one, but it is never used here.
    dataset = [[sentence, '0']]
    test = BERTDataset(dataset, 0, 1, tok, vocab, max_len, True, False)
    # num_workers left at the default (0) for local runs
    test_dataloader = torch.utils.data.DataLoader(test, batch_size=batch_size, num_workers=0)
    model.eval()
    answer = 0
    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        for token_ids, valid_length, segment_ids, _ in test_dataloader:
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            out = model(token_ids, valid_length, segment_ids)
            # Keep the prediction of the last row, as the original loop did.
            for logits in out:
                answer = np.argmax(logits.cpu().numpy())
    return answer

In [10]:
# Fine-tune for num_epochs epochs, evaluating on the held-out set after each one
for e in range(num_epochs):
    train_acc = 0.0
    test_acc = 0.0

    # ---- training pass ----
    model.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm(train_dataloader)):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        label = label.long().to(device)
        # valid_length is passed as-is; the model moves the derived mask to the right device.
        out = model(token_ids, valid_length, segment_ids)
        loss = loss_fn(out, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()  # Update learning rate schedule
        train_acc += calc_accuracy(out, label)
        # Running accuracy is the mean of the per-batch accuracies seen so far.
        if batch_id % log_interval == 0:
            print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc / (batch_id+1)))
    print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))

    # ---- evaluation pass on the held-out batches ----
    model.eval()
    # No gradients are needed here; without no_grad() every forward pass would
    # keep its autograd graph alive and waste GPU memory.
    with torch.no_grad():
        for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm(test_dataloader)):
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            label = label.long().to(device)
            out = model(token_ids, valid_length, segment_ids)
            test_acc += calc_accuracy(out, label)
    print("epoch {} test acc {}".format(e+1, test_acc / (batch_id+1)))

  0%|          | 0/3395 [00:00<?, ?it/s]

epoch 1 batch id 1 loss 2.03690242767334 train acc 0.03125
epoch 1 batch id 201 loss 1.6281280517578125 train acc 0.22807835820895522
epoch 1 batch id 401 loss 1.708240270614624 train acc 0.32917705735660846
epoch 1 batch id 601 loss 1.3464723825454712 train acc 0.3913789517470882
epoch 1 batch id 801 loss 1.2271583080291748 train acc 0.4364076154806492
epoch 1 batch id 1001 loss 1.3684086799621582 train acc 0.4706855644355644
epoch 1 batch id 1201 loss 1.232927918434143 train acc 0.49700770191507077
epoch 1 batch id 1401 loss 1.0647355318069458 train acc 0.5145431834403997
epoch 1 batch id 1601 loss 1.173134684562683 train acc 0.5274242660836976
epoch 1 batch id 1801 loss 0.9373440742492676 train acc 0.5393010827318157
epoch 1 batch id 2001 loss 1.3755143880844116 train acc 0.5495377311344328
epoch 1 batch id 2201 loss 0.8973793983459473 train acc 0.5579992049068605
epoch 1 batch id 2401 loss 0.9153470396995544 train acc 0.5650900666389005
epoch 1 batch id 2601 loss 0.7026615142822266

  0%|          | 0/849 [00:00<?, ?it/s]

epoch 1 test acc 0.6641589209812055


  0%|          | 0/3395 [00:00<?, ?it/s]

epoch 2 batch id 1 loss 1.045026421546936 train acc 0.59375
epoch 2 batch id 201 loss 0.8828083872795105 train acc 0.6532960199004975
epoch 2 batch id 401 loss 1.096628189086914 train acc 0.6569513715710723
epoch 2 batch id 601 loss 0.8334296941757202 train acc 0.658797836938436
epoch 2 batch id 801 loss 1.0011390447616577 train acc 0.6602684144818977
epoch 2 batch id 1001 loss 1.3065295219421387 train acc 0.6628371628371629
epoch 2 batch id 1201 loss 1.1290444135665894 train acc 0.6662156536219817
epoch 2 batch id 1401 loss 1.059396743774414 train acc 0.6686295503211992
epoch 2 batch id 1601 loss 1.136947751045227 train acc 0.6705184259837601
epoch 2 batch id 1801 loss 0.5484654903411865 train acc 0.6734279566907274
epoch 2 batch id 2001 loss 1.068651556968689 train acc 0.6752717391304348
epoch 2 batch id 2201 loss 0.7853304743766785 train acc 0.6776465243071331
epoch 2 batch id 2401 loss 0.6209391355514526 train acc 0.6801853394418992
epoch 2 batch id 2601 loss 0.4192003607749939 tra

  0%|          | 0/849 [00:00<?, ?it/s]

epoch 2 test acc 0.6833438956316895


  0%|          | 0/3395 [00:00<?, ?it/s]

epoch 3 batch id 1 loss 0.8707109689712524 train acc 0.65625
epoch 3 batch id 201 loss 0.7780370712280273 train acc 0.7199937810945274
epoch 3 batch id 401 loss 0.827009379863739 train acc 0.7213216957605985
epoch 3 batch id 601 loss 0.472354531288147 train acc 0.72363768718802
epoch 3 batch id 801 loss 0.6924821734428406 train acc 0.7258504993757803
epoch 3 batch id 1001 loss 1.0781960487365723 train acc 0.7279595404595405
epoch 3 batch id 1201 loss 0.8106368780136108 train acc 0.7311875520399667
epoch 3 batch id 1401 loss 0.8523895144462585 train acc 0.732936295503212
epoch 3 batch id 1601 loss 0.8871591091156006 train acc 0.7362195502810743
epoch 3 batch id 1801 loss 0.38620689511299133 train acc 0.7392073847862298
epoch 3 batch id 2001 loss 0.8478705286979675 train acc 0.742191404297851
epoch 3 batch id 2201 loss 0.7078331708908081 train acc 0.7447325079509314
epoch 3 batch id 2401 loss 0.5496423244476318 train acc 0.7473578717201166
epoch 3 batch id 2601 loss 0.29199060797691345 t

  0%|          | 0/849 [00:00<?, ?it/s]

epoch 3 test acc 0.6914256542223588


  0%|          | 0/3395 [00:00<?, ?it/s]

epoch 4 batch id 1 loss 0.5455279350280762 train acc 0.90625
epoch 4 batch id 201 loss 0.7927102446556091 train acc 0.7874689054726368
epoch 4 batch id 401 loss 0.6828895807266235 train acc 0.7853023690773068
epoch 4 batch id 601 loss 0.3736403286457062 train acc 0.7867096505823628
epoch 4 batch id 801 loss 0.6227283477783203 train acc 0.7883114856429463
epoch 4 batch id 1001 loss 0.922927737236023 train acc 0.7919892607392608
epoch 4 batch id 1201 loss 0.7746268510818481 train acc 0.7959252706078268
epoch 4 batch id 1401 loss 0.5599172115325928 train acc 0.7967969307637401
epoch 4 batch id 1601 loss 0.7513710260391235 train acc 0.7996759837601499
epoch 4 batch id 1801 loss 0.25761300325393677 train acc 0.8024361465852304
epoch 4 batch id 2001 loss 0.5102630257606506 train acc 0.8054254122938531
epoch 4 batch id 2201 loss 0.39396217465400696 train acc 0.8078856201726488
epoch 4 batch id 2401 loss 0.3194902539253235 train acc 0.8105737192836319
epoch 4 batch id 2601 loss 0.2435542494058

  0%|          | 0/849 [00:00<?, ?it/s]

epoch 4 test acc 0.6836223562247146


  0%|          | 0/3395 [00:00<?, ?it/s]

epoch 5 batch id 1 loss 0.46702736616134644 train acc 0.8125
epoch 5 batch id 201 loss 0.6349192261695862 train acc 0.8361318407960199
epoch 5 batch id 401 loss 0.47349485754966736 train acc 0.8362687032418953
epoch 5 batch id 601 loss 0.4137037992477417 train acc 0.8374064059900166
epoch 5 batch id 801 loss 0.5722661018371582 train acc 0.839185393258427
epoch 5 batch id 1001 loss 0.7273741364479065 train acc 0.8425012487512488
epoch 5 batch id 1201 loss 0.44373244047164917 train acc 0.8457014987510408
epoch 5 batch id 1401 loss 0.35330796241760254 train acc 0.8458690221270521
epoch 5 batch id 1601 loss 0.47852641344070435 train acc 0.847556214865709
epoch 5 batch id 1801 loss 0.17743375897407532 train acc 0.8486604664075513
epoch 5 batch id 2001 loss 0.5358647704124451 train acc 0.8506059470264867
epoch 5 batch id 2201 loss 0.30664321780204773 train acc 0.851899704679691
epoch 5 batch id 2401 loss 0.29323744773864746 train acc 0.8530690337359433
epoch 5 batch id 2601 loss 0.1255510598

  0%|          | 0/849 [00:00<?, ?it/s]

epoch 5 test acc 0.6853523326675884


In [11]:
import os

# torch.save does not create missing directories; make sure the target folder
# exists so a finished training run is not lost to a failed save.
os.makedirs('model', exist_ok=True)

# Saves the whole module object (not just a state_dict), so loading it later
# needs the BERTClassifier class definition to be available.
torch.save(model, 'model/kobert-v5.pt')

In [12]:
# Check the size of the saved model file (same parameters)
import os

model_path = 'model/kobert-v5.pt'
size = os.stat(model_path).st_size / (1024 ** 2)  # bytes -> MB
print(f"Model size: {size:.2f} MB")

Model size: 351.79 MB
