In [1]:
import gc, torch

gc.collect()
torch.cuda.empty_cache()

In [2]:
import pandas as pd

train_set = pd.read_csv('data/단발성대화.csv', index_col=0) 
train_set.sample(n=5)

Unnamed: 0,sentence,emotion
3681,"웹디자인,,유통관리사쪽 자격증을 취득해서 나중에 경력으로인터파크나, cj몰 , gs...",공포
31674,앤써에서 추천받았는데목소리 진짜좋다,행복
15124,어휴.. 무슨 꼴을 볼라고 이런 나라에서 누가 귀한 자식을 낳겠냐?,분노
25221,"헬조선,,,외노자 70~80만명 더 수입한다,,..",중립
29849,자랑스러운 한국인! 당신을 존경합니다.,행복


In [3]:
train_set['emotion'].unique()

array(['공포', '놀람', '분노', '슬픔', '중립', '행복', '혐오'], dtype=object)

↳ (감성대화말뭉치와 다르게) 감정 값이 총 7개  
만약 데이터 셋을 병합 한다면 "공포", "불안", "혐오", "상처" 처리를 생각 해봐야 될 듯함
<br>

In [4]:
# 감정을 정수 라벨로 변경
emotions = {'행복': 0, '공포': 1, '놀람': 2, '슬픔': 3, '분노': 4, '혐오': 5, '중립': 6}
# emotions = {'기쁨': 0, '불안': 1, '당황': 2, '슬픔': 3, '분노': 4, '상처': 5} // 감성대화 말뭉치

train_set['emotion'] = train_set.emotion.map(emotions)

train_set.sample(n=5)

Unnamed: 0,sentence,emotion
28827,두분의 사랑이 넘 보기 좋네요~~~~^^,0
3053,브렉시트에 스코틀랜드 독립까지 이루어지면 파장이 너무 커지는데..,1
20065,제가 갈만한 다른 게시판없나요?,3
17605,여기서부터 문제가 시작되네요..,3
33117,6800일 축하해요??,0


In [5]:
# 라이브러리 불러오기
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
import gluonnlp as nlp

#kobert
from kobert_tokenizer import KoBERTTokenizer

# transformers
from transformers import BertModel
from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup

# GPU 사용시 필요
#device = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device("cuda")
#device = torch.device('cpu')

In [6]:
tokenizer = KoBERTTokenizer.from_pretrained('skt/kobert-base-v1')
bertmodel = BertModel.from_pretrained('skt/kobert-base-v1', return_dict=False)
vocab = nlp.vocab.BERTVocab.from_sentencepiece(tokenizer.vocab_file, padding_token='[PAD]')
tok = tokenizer.tokenize

# Setting parameters(KoBERT finetuning 베에스 라인) -> 
max_len = 64    #베이스라인 64
batch_size = 32 #베이스라인 64
warmup_ratio = 0.1
num_epochs = 5  # 에포크 횟수
max_grad_norm = 1
log_interval = 200
learning_rate =  5e-5

In [7]:
# 모델에 사용되는 데이터셋 클래스 정의
class BERTDataset(Dataset):
    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer,vocab, max_len,
                 pad, pair):
   
        transform = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len,vocab=vocab, pad=pad, pair=pair)
        
        self.sentences = [transform([i[sent_idx]]) for i in dataset]
        self.labels = [np.int32(i[label_idx]) for i in dataset]

    def __getitem__(self, i):
        return (self.sentences[i] + (self.labels[i], ))
         
    def __len__(self):
        return (len(self.labels))

In [8]:
# 감성 분류 모델 정의
class BERTClassifier(nn.Module):
    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes=7,     
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate
                 
        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)
    
    def gen_attention_mask(self, token_ids, valid_length):
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        attention_mask = self.gen_attention_mask(token_ids, valid_length)
        
        _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device))
        if self.dr_rate:
            out = self.dropout(pooler)
        return self.classifier(out)

In [9]:
# 모델 학습에 사용할 데이터셋을 [data, label] 배열로 피팅
train_set_data = [[i, str(j)] for i, j in zip(train_set['sentence'], train_set['emotion'])]

# sklearn 의 train_test_split 모듈-> 4:1로 학습&검증 데이터를 분류 
train_set_data, test_set_data = train_test_split(train_set_data, test_size = 0.2, random_state=4)

# 데이터셋을 Bert모델에 입력할 수 있게 변환
train_set_data = BERTDataset(train_set_data, 0, 1, tok, vocab, max_len, True, False)
test_set_data = BERTDataset(test_set_data, 0, 1, tok, vocab, max_len, True, False)

# 배치데이터셋 생성
train_dataloader = torch.utils.data.DataLoader(train_set_data, batch_size=batch_size, num_workers=0)    # num_workers: 데이터 로딩할때 쓰는 프로세스 수(로딩속도)
test_dataloader = torch.utils.data.DataLoader(test_set_data, batch_size=batch_size, num_workers=0)

In [10]:
# 모델 선언
model = BERTClassifier(bertmodel, dr_rate=0.5).to(device)
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()
t_total = len(train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)

In [11]:
# 정확도 계산
def calc_accuracy(X,Y):
    max_vals, max_indices = torch.max(X, 1)
    train_acc = (max_indices == Y).sum().data.cpu().numpy()/max_indices.size()[0]
    return train_acc

# 예측 반환
def predict(sentence):
    dataset = [[sentence, '0']]
    test = BERTDataset(dataset, 0, 1, tok, vocab, max_len, True, False)
    test_dataloader = torch.utils.data.DataLoader(test, batch_size=batch_size, num_workers=0)  #로컬에서는 디폴트(0)으로 수정
    model.eval()
    answer = 0
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(test_dataloader):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        for logits in out:
            logits = logits.detach().cpu().numpy()
            answer = np.argmax(logits)
    return answer

In [13]:
for e in range(num_epochs):
    train_acc = 0.0
    test_acc = 0.0
    model.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm(train_dataloader)):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        loss = loss_fn(out, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()  # Update learning rate schedule
        train_acc += calc_accuracy(out, label)
        if batch_id % log_interval == 0:
            print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc / (batch_id+1)))
    print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))
    model.eval()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm(test_dataloader)):   # 아까 만든 테스트 배치 데이터 - 정확도 측정

        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        test_acc += calc_accuracy(out, label)
    print("epoch {} test acc {}".format(e+1, test_acc / (batch_id+1)))

  0%|          | 0/965 [00:00<?, ?it/s]

epoch 1 batch id 1 loss 2.028825521469116 train acc 0.1875
epoch 1 batch id 201 loss 1.6425517797470093 train acc 0.20398009950248755
epoch 1 batch id 401 loss 1.1926120519638062 train acc 0.305642144638404
epoch 1 batch id 601 loss 1.2739287614822388 train acc 0.36278078202995007
epoch 1 batch id 801 loss 1.3748269081115723 train acc 0.39591136079900124
epoch 1 train acc 0.415316158127039


  0%|          | 0/242 [00:00<?, ?it/s]

epoch 1 test acc 0.5159939492325856


  0%|          | 0/965 [00:00<?, ?it/s]

epoch 2 batch id 1 loss 1.1370419263839722 train acc 0.65625
epoch 2 batch id 201 loss 0.9550532102584839 train acc 0.5130597014925373
epoch 2 batch id 401 loss 0.8422402143478394 train acc 0.5278990024937655
epoch 2 batch id 601 loss 1.1233311891555786 train acc 0.5445611480865225
epoch 2 batch id 801 loss 1.036447286605835 train acc 0.5588327091136079
epoch 2 train acc 0.56662948570332


  0%|          | 0/242 [00:00<?, ?it/s]

epoch 2 test acc 0.5358065230224321


  0%|          | 0/965 [00:00<?, ?it/s]

epoch 3 batch id 1 loss 0.9140394926071167 train acc 0.625
epoch 3 batch id 201 loss 0.8178603649139404 train acc 0.6158271144278606
epoch 3 batch id 401 loss 0.6094398498535156 train acc 0.632247506234414
epoch 3 batch id 601 loss 0.7964781522750854 train acc 0.6516222961730449
epoch 3 batch id 801 loss 0.8134177327156067 train acc 0.669085518102372
epoch 3 train acc 0.6773543945499905


  0%|          | 0/242 [00:00<?, ?it/s]

epoch 3 test acc 0.5402523612750885


  0%|          | 0/965 [00:00<?, ?it/s]

epoch 4 batch id 1 loss 0.5739497542381287 train acc 0.78125
epoch 4 batch id 201 loss 0.5277354717254639 train acc 0.7374067164179104
epoch 4 batch id 401 loss 0.405958354473114 train acc 0.743142144638404
epoch 4 batch id 601 loss 0.6067067980766296 train acc 0.7602433444259568
epoch 4 batch id 801 loss 0.6409467458724976 train acc 0.7727450062421972
epoch 4 train acc 0.7787324889656496


  0%|          | 0/242 [00:00<?, ?it/s]

epoch 4 test acc 0.5438127213695396


  0%|          | 0/965 [00:00<?, ?it/s]

epoch 5 batch id 1 loss 0.4135458171367645 train acc 0.8125
epoch 5 batch id 201 loss 0.3924628794193268 train acc 0.8142101990049752
epoch 5 batch id 401 loss 0.3788403570652008 train acc 0.8184226932668329
epoch 5 batch id 601 loss 0.5270698666572571 train acc 0.8286189683860233
epoch 5 batch id 801 loss 0.4195781648159027 train acc 0.8367665418227216
epoch 5 train acc 0.8365045096910382


  0%|          | 0/242 [00:00<?, ?it/s]

epoch 5 test acc 0.5447166469893743


In [15]:
torch.save(model, 'model/kobert-v2.pt')

In [18]:
# 모델 사이즈 확인(파라미터는 v1과 동일)
import os

model_path2 = 'model/kobert-v2.pt'
size2 = os.path.getsize(model_path2) / (1024*1024) # mb 단위
print(f"Model size: {size2:.2f} MB")

Model size: 351.79 MB
