In [1]:
# Guard against runtime errors: free leftover memory before the rest of the
# notebook runs (presumably targets CUDA out-of-memory errors when the
# notebook is re-run in the same kernel — confirm).
import gc
import torch

# Run a garbage-collection pass, then ask PyTorch to release its cached GPU memory.
gc.collect()
torch.cuda.empty_cache()

In [2]:
import pandas as pd

# Load the merged emotion dataset; the first CSV column is used as the index.
train_set = pd.read_csv('data/병합데이터셋-v4.csv', index_col=0)

# Map each emotion name to its integer class id.
emotions = {'행복': 0, '불안': 1, '놀람': 2, '슬픔': 3, '분노': 4, '중립': 5 }
emotion_ids = train_set['emotion'].map(emotions)

# Series.map yields NaN for any label that is not a key of `emotions`.
# Fail here with the offending values instead of letting the NaN surface
# later as an opaque conversion error while the dataset is being built.
unmapped_labels = train_set.loc[emotion_ids.isna(), 'emotion'].unique()
if len(unmapped_labels) > 0:
    raise ValueError("Unmapped emotion labels: {}".format(list(unmapped_labels)))

train_set['emotion'] = emotion_ids.astype(int)

train_set.sample(n=5)

Unnamed: 0,sentence,emotion
24891,매점 아저씨 ㅋㅋ,0
84270,올해 양초 공장 대박 났네,2
15620,"웬 일이야, 나도 동감인데? 대학에서 그런 쪽으로 공부하려고 했을 정도야.",2
51977,"아뇨, 기분이 이상해서요.",5
91648,동생이 자꾸 외모를 가지고 나한테 장난을 쳐서 짜증나고 미워져.,4


In [3]:
# 라이브러리 불러오기
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
import gluonnlp as nlp

#kobert
from kobert_tokenizer import KoBERTTokenizer

# transformers
from transformers import BertModel
from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup

# Run on the GPU when one is available; otherwise fall back to the CPU so the
# notebook still executes on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
# Pretrained KoBERT tokenizer and encoder. return_dict=False makes the encoder
# return a plain tuple, which BERTClassifier.forward unpacks.
tokenizer = KoBERTTokenizer.from_pretrained('skt/kobert-base-v1')
bertmodel = BertModel.from_pretrained('skt/kobert-base-v1', return_dict=False)
tok = tokenizer.tokenize
vocab = nlp.vocab.BERTVocab.from_sentencepiece(tokenizer.vocab_file, padding_token='[PAD]')

# Hyperparameters
learning_rate = 5e-5   # learning rate handed to the optimizer
num_epochs = 5         # number of passes over the training set
batch_size = 32        # the baseline used 64
max_len = 64           # max_seq_length given to the sentence transform
warmup_ratio = 0.1     # fraction of all optimizer steps used for LR warmup
max_grad_norm = 1      # gradient-norm clipping threshold
log_interval = 200     # print training progress every this many batches

# Dataset class consumed by the model
class BERTDataset(Dataset):
    """Sentence/label rows pre-encoded for the BERT classifier.

    Every row of ``dataset`` is encoded once, at construction time, with
    ``nlp.data.BERTSentenceTransform``; the encoded features and the labels
    are kept in two parallel lists.

    Args:
        dataset: indexable rows; iterated twice (once for sentences, once for labels).
        sent_idx: position of the sentence inside a row.
        label_idx: position of the label inside a row; the value is converted with ``np.int32``.
        bert_tokenizer: tokenizer callable forwarded to the transform.
        vocab: vocabulary forwarded to the transform.
        max_len: forwarded to the transform as ``max_seq_length``.
        pad: forwarded to the transform.
        pair: forwarded to the transform.
    """

    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer,vocab, max_len,
                 pad, pair):
        to_bert_input = nlp.data.BERTSentenceTransform(
            bert_tokenizer,
            max_seq_length=max_len,
            vocab=vocab,
            pad=pad,
            pair=pair,
        )

        encoded = []
        for row in dataset:
            # The transform takes a list of text fields; only one is supplied here.
            encoded.append(to_bert_input([row[sent_idx]]))
        self.sentences = encoded

        targets = []
        for row in dataset:
            targets.append(np.int32(row[label_idx]))
        self.labels = targets

    def __getitem__(self, i):
        """Return the encoded features of item ``i`` with its label appended as the last element."""
        features = self.sentences[i]
        label = self.labels[i]
        return features + (label,)

    def __len__(self):
        """Number of examples."""
        return len(self.labels)
    
# Emotion classification model
class BERTClassifier(nn.Module):
    """BERT encoder followed by optional dropout and a linear classification head.

    Args:
        bert: encoder module. It is called with ``input_ids``, ``token_type_ids``
            and ``attention_mask`` keyword arguments and must return a 2-tuple
            whose second element is the pooled sentence representation.
        hidden_size: feature width of the pooled representation.
        num_classes: number of output logits.
        dr_rate: dropout probability applied to the pooled representation.
            ``None`` or ``0`` disables dropout.
        params: unused; kept so existing callers remain valid.
    """

    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes=6,
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        """Build the attention mask for a padded batch.

        Args:
            token_ids: ``(batch, seq_len)`` tensor of token ids.
            valid_length: per-row count of real (non-padding) tokens.

        Returns:
            Float tensor shaped like ``token_ids`` and on the same device, holding
            1.0 in the first ``valid_length[i]`` positions of row ``i`` and 0.0 elsewhere.
        """
        positions = torch.arange(token_ids.size(1), device=token_ids.device)
        # valid_length may arrive on the CPU; move it next to token_ids before comparing.
        lengths = torch.as_tensor(valid_length, device=token_ids.device)
        # Broadcast (1, seq_len) against (batch, 1): position j of row i is valid when j < length_i.
        return (positions.unsqueeze(0) < lengths.unsqueeze(1)).float()

    def forward(self, token_ids, valid_length, segment_ids):
        """Return the ``(batch, num_classes)`` logits for a batch of encoded sentences."""
        attention_mask = self.gen_attention_mask(token_ids, valid_length)

        _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask)
        # Without dropout the pooled output feeds the classifier directly.
        # Previously `out` was left unassigned on this path and forward() crashed.
        out = self.dropout(pooler) if self.dr_rate else pooler
        return self.classifier(out)

In [5]:
# Build [sentence, label] rows for the model. Labels are stringified here and
# converted back to integers inside BERTDataset.
labeled_pairs = [[sentence, str(label)] for sentence, label in zip(train_set['sentence'], train_set['emotion'])]

# 4:1 train/validation split with scikit-learn's train_test_split.
train_pairs, test_pairs = train_test_split(labeled_pairs, test_size = 0.2, random_state=0)

# Encode both splits into the form the BERT model consumes.
train_set_data = BERTDataset(train_pairs, 0, 1, tok, vocab, max_len, True, False)
test_set_data = BERTDataset(test_pairs, 0, 1, tok, vocab, max_len, True, False)

# Batch loaders. The training loader reshuffles every epoch so the model does
# not see identical batches in identical order each time; the evaluation
# loader keeps a fixed order.
# num_workers: number of worker processes used for data loading (0 = load in the main process).
train_dataloader = torch.utils.data.DataLoader(train_set_data, batch_size=batch_size, shuffle=True, num_workers=0)
test_dataloader = torch.utils.data.DataLoader(test_set_data, batch_size=batch_size, num_workers=0)

# Model
model = BERTClassifier(bertmodel, dr_rate=0.5).to(device)

# Parameters whose name contains one of these fragments get no weight decay.
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()

# One scheduler step per batch: total steps = batches per epoch * epochs,
# of which the first `warmup_ratio` fraction is warmup.
t_total = len(train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)

# Accuracy computation
def calc_accuracy(X,Y):
    """Return the fraction of rows of ``X`` whose highest-scoring column equals the label in ``Y``.

    Args:
        X: ``(batch, num_classes)`` score tensor.
        Y: ``(batch,)`` tensor of integer class labels.

    Returns:
        Accuracy of the batch as a NumPy floating-point scalar.
    """
    predicted = torch.max(X, dim=1).indices
    hits = (predicted == Y).sum().detach().cpu().numpy()
    batch_len = predicted.size(0)
    return hits / batch_len

# Return the prediction
def predict(sentence):
    """Classify one sentence with the trained model.

    Args:
        sentence: raw text to classify.

    Returns:
        Index of the highest-scoring class (as returned by ``np.argmax``),
        or ``0`` if the loader yields no batches.
    """
    # BERTDataset requires a label column; '0' is a placeholder that is never used.
    dataset = [[sentence, '0']]
    encoded = BERTDataset(dataset, 0, 1, tok, vocab, max_len, True, False)
    # num_workers=0: load in the main process (default; suited to local runs).
    loader = torch.utils.data.DataLoader(encoded, batch_size=batch_size, num_workers=0)

    model.eval()
    answer = 0
    # Inference only: skip autograd bookkeeping so no graph is built or kept in memory.
    with torch.no_grad():
        for token_ids, valid_length, segment_ids, _ in loader:
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            out = model(token_ids, valid_length, segment_ids)
            for logits in out:
                answer = np.argmax(logits.detach().cpu().numpy())
    return answer

In [6]:
# Per-epoch accuracy histories. The identifiers keep their existing spelling
# because a later cell displays them by name.
train_accuarcy, test_accuarcy = [], []

for e in range(num_epochs):
    train_acc = 0.0
    test_acc = 0.0

    # ---- training pass ----
    model.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm(train_dataloader)):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        loss = loss_fn(out, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()  # Update learning rate schedule
        train_acc += calc_accuracy(out, label)
        if batch_id % log_interval == 0:
            print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.item(), train_acc / (batch_id+1)))
    train_accuarcy.append(train_acc / (batch_id+1))
    print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))

    # ---- evaluation pass on the held-out split ----
    model.eval()
    # No gradients are needed for evaluation; disabling autograd avoids building
    # a graph per batch and lowers GPU memory use.
    with torch.no_grad():
        for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm(test_dataloader)):
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            label = label.long().to(device)
            out = model(token_ids, valid_length, segment_ids)
            test_acc += calc_accuracy(out, label)
    test_accuarcy.append(test_acc / (batch_id+1))
    print("epoch {} test acc {}".format(e+1, test_acc / (batch_id+1)))

  0%|          | 0/3395 [00:00<?, ?it/s]

epoch 1 batch id 1 loss 1.9056661128997803 train acc 0.125
epoch 1 batch id 201 loss 1.574646234512329 train acc 0.2605721393034826
epoch 1 batch id 401 loss 1.5566002130508423 train acc 0.3394638403990025
epoch 1 batch id 601 loss 1.1822447776794434 train acc 0.3985544925124792
epoch 1 batch id 801 loss 1.1515653133392334 train acc 0.4386313982521848
epoch 1 batch id 1001 loss 1.0051108598709106 train acc 0.4732455044955045
epoch 1 batch id 1201 loss 0.9794769883155823 train acc 0.4963051623646961
epoch 1 batch id 1401 loss 0.9584118127822876 train acc 0.5151900428265525
epoch 1 batch id 1601 loss 0.9447484612464905 train acc 0.5281855090568395
epoch 1 batch id 1801 loss 1.300019383430481 train acc 0.5392316768461966
epoch 1 batch id 2001 loss 1.0494643449783325 train acc 0.5483039730134932
epoch 1 batch id 2201 loss 0.9517358541488647 train acc 0.5568207632894139
epoch 1 batch id 2401 loss 1.1516118049621582 train acc 0.5640358184089963
epoch 1 batch id 2601 loss 1.170310139656067 tr

  0%|          | 0/849 [00:00<?, ?it/s]

epoch 1 test acc 0.6681117811235724


  0%|          | 0/3395 [00:00<?, ?it/s]

epoch 2 batch id 1 loss 0.8090456128120422 train acc 0.65625
epoch 2 batch id 201 loss 1.1069473028182983 train acc 0.6574937810945274
epoch 2 batch id 401 loss 0.8185065388679504 train acc 0.6546913965087282
epoch 2 batch id 601 loss 0.8259114623069763 train acc 0.6570299500831946
epoch 2 batch id 801 loss 0.9122297763824463 train acc 0.656289013732834
epoch 2 batch id 1001 loss 0.7737252116203308 train acc 0.6610264735264735
epoch 2 batch id 1201 loss 1.0918035507202148 train acc 0.6629891756869276
epoch 2 batch id 1401 loss 0.8525056838989258 train acc 0.6674250535331906
epoch 2 batch id 1601 loss 0.607513964176178 train acc 0.670206121174266
epoch 2 batch id 1801 loss 1.0399630069732666 train acc 0.6726471404775125
epoch 2 batch id 2001 loss 0.9276140928268433 train acc 0.6743503248375812
epoch 2 batch id 2201 loss 0.8558356165885925 train acc 0.677277373920945
epoch 2 batch id 2401 loss 1.030071496963501 train acc 0.6797298000832986
epoch 2 batch id 2601 loss 0.962837815284729 tra

  0%|          | 0/849 [00:00<?, ?it/s]

epoch 2 test acc 0.6831070440927947


  0%|          | 0/3395 [00:00<?, ?it/s]

epoch 3 batch id 1 loss 0.6896616816520691 train acc 0.6875
epoch 3 batch id 201 loss 0.868595540523529 train acc 0.7248134328358209
epoch 3 batch id 401 loss 0.590975821018219 train acc 0.717425187032419
epoch 3 batch id 601 loss 0.7524910569190979 train acc 0.7182300332778702
epoch 3 batch id 801 loss 0.7567518949508667 train acc 0.7191791510611736
epoch 3 batch id 1001 loss 0.6821896433830261 train acc 0.7234328171828172
epoch 3 batch id 1201 loss 0.7720222473144531 train acc 0.7255672356369692
epoch 3 batch id 1401 loss 0.636398196220398 train acc 0.729300499643112
epoch 3 batch id 1601 loss 0.6614918112754822 train acc 0.7330965021861336
epoch 3 batch id 1801 loss 0.6830792427062988 train acc 0.7358932537479178
epoch 3 batch id 2001 loss 0.7188674807548523 train acc 0.7382402548725637
epoch 3 batch id 2201 loss 0.7519735097885132 train acc 0.7411403907314857
epoch 3 batch id 2401 loss 0.8691238164901733 train acc 0.7437786339025406
epoch 3 batch id 2601 loss 0.7564446926116943 tra

  0%|          | 0/849 [00:00<?, ?it/s]

epoch 3 test acc 0.6790949710657039


  0%|          | 0/3395 [00:00<?, ?it/s]

epoch 4 batch id 1 loss 0.4367727041244507 train acc 0.78125


KeyboardInterrupt: 

In [None]:
# Display the per-epoch accuracy lists filled in by the training loop (train, test).
train_accuarcy, test_accuarcy

In [None]:
# Optional: uncomment to write the trained model object to disk (left disabled).
#torch.save(model, 'model/kobert-v8.pt')