In [3]:
import pandas as pd

train_set = pd.read_csv('data/병합데이터셋-v5-bt.csv', index_col=0)

train_set = train_set.dropna()
train_set = train_set.reset_index(drop=True)

# 감정을 정수 라벨로 변경
emotions = {'행복': 0, '불안': 1, '놀람': 2, '슬픔': 3, '분노': 4, '중립': 5 }
train_set['emotion'] = train_set.emotion.map(emotions)

# 데이터 프레임 셔플
train_set = train_set.sample(frac=1).reset_index(drop=True)
train_set.shape

(147525, 2)

In [4]:
# 라이브러리 불러오기
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
import gluonnlp as nlp

#kobert
from kobert_tokenizer import KoBERTTokenizer

# transformers
from transformers import BertModel
from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup

# GPU 사용시 필요
device = torch.device("cuda")

tokenizer = KoBERTTokenizer.from_pretrained('skt/kobert-base-v1')
bertmodel = BertModel.from_pretrained('skt/kobert-base-v1', return_dict=False)
vocab = nlp.vocab.BERTVocab.from_sentencepiece(tokenizer.vocab_file, padding_token='[PAD]')
tok = tokenizer.tokenize

# Setting parameters
max_len = 64
batch_size = 32 #베이스라인 64
warmup_ratio = 0.1
num_epochs = 7  # 에포크 횟수
max_grad_norm = 1
log_interval = 200
learning_rate = 2e-5

# 모델에 사용되는 데이터셋 클래스 정의
class BERTDataset(Dataset):
    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer,vocab, max_len,
                 pad, pair):

        transform = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len,vocab=vocab, pad=pad, pair=pair)

        self.sentences = [transform([i[sent_idx]]) for i in dataset]
        self.labels = [np.int32(float(i[label_idx])) for i in dataset]         # self.labels = [np.int32(i[label_idx]) for i in dataset] 이꺼 변경



    def __getitem__(self, i):
        return (self.sentences[i] + (self.labels[i], ))

    def __len__(self):
        return (len(self.labels))

# 감성 분류 모델 정의
class BERTClassifier(nn.Module):
    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes=6,
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        attention_mask = self.gen_attention_mask(token_ids, valid_length)

        _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device))
        if self.dr_rate:
            out = self.dropout(pooler)
        return self.classifier(out)

# 모델 학습에 사용할 데이터셋을 [data, label] 배열로 피팅
train_set_data = [[i, str(j)] for i, j in zip(train_set['sentence'], train_set['emotion'])]

# sklearn 의 train_test_split 모듈-> 4:1로 학습&검증 데이터를 분류
train_set_data, test_set_data = train_test_split(train_set_data, test_size = 0.2, random_state=0)

# 데이터셋을 Bert모델에 입력할 수 있게 변환
train_set_data = BERTDataset(train_set_data, 0, 1, tok, vocab, max_len, True, False)
test_set_data = BERTDataset(test_set_data, 0, 1, tok, vocab, max_len, True, False)

# 배치데이터셋 생성
train_dataloader = torch.utils.data.DataLoader(train_set_data, batch_size=batch_size, num_workers=0)    # num_workers: 데이터 로딩할때 쓰는 프로세스 수(로딩속도)
test_dataloader = torch.utils.data.DataLoader(test_set_data, batch_size=batch_size, num_workers=0)

# 모델 선언
model = BERTClassifier(bertmodel, dr_rate=0.5).to(device)
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()
t_total = len(train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)

# 정확도 계산
def calc_accuracy(X,Y):
    max_vals, max_indices = torch.max(X, 1)
    train_acc = (max_indices == Y).sum().data.cpu().numpy()/max_indices.size()[0]
    return train_acc

# 예측 반환
def predict(sentence):
    dataset = [[sentence, '0']]
    test = BERTDataset(dataset, 0, 1, tok, vocab, max_len, True, False)
    test_dataloader = torch.utils.data.DataLoader(test, batch_size=batch_size, num_workers=0)  #로컬에서는 디폴트(0)으로 수정
    model.eval()
    answer = 0
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(test_dataloader):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        for logits in out:
            logits = logits.detach().cpu().numpy()
            answer = np.argmax(logits)
    return answer

In [5]:
train_accuarcy, test_accuarcy = [], []

for e in range(num_epochs):
    train_acc = 0.0
    test_acc = 0.0
    model.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm(train_dataloader)):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        loss = loss_fn(out, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()  # Update learning rate schedule
        train_acc += calc_accuracy(out, label)
        if batch_id % log_interval == 0:
            print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc / (batch_id+1)))
    train_accuarcy.append(train_acc / (batch_id+1))
    print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))
    model.eval()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm(test_dataloader)):   # 아까 만든 테스트 배치 데이터 - 정확도 측정
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        test_acc += calc_accuracy(out, label)
    test_accuarcy.append(test_acc / (batch_id+1))
    print("epoch {} test acc {}".format(e+1, test_acc / (batch_id+1)))
    
print(train_accuarcy, test_accuarcy)

  0%|          | 0/3689 [00:00<?, ?it/s]

epoch 1 batch id 1 loss 1.7849496603012085 train acc 0.28125
epoch 1 batch id 201 loss 1.7854769229888916 train acc 0.22046019900497513
epoch 1 batch id 401 loss 1.5321274995803833 train acc 0.26317019950124687
epoch 1 batch id 601 loss 1.5046125650405884 train acc 0.30870424292845255
epoch 1 batch id 801 loss 1.187895655632019 train acc 0.3533863920099875
epoch 1 batch id 1001 loss 1.410932183265686 train acc 0.38920454545454547
epoch 1 batch id 1201 loss 1.1485865116119385 train acc 0.41980641132389673
epoch 1 batch id 1401 loss 1.1155807971954346 train acc 0.4452177016416845
epoch 1 batch id 1601 loss 1.1803524494171143 train acc 0.46720799500312304
epoch 1 batch id 1801 loss 1.0862274169921875 train acc 0.48481746252082175
epoch 1 batch id 2001 loss 0.9968980550765991 train acc 0.5001717891054472
epoch 1 batch id 2201 loss 1.055002212524414 train acc 0.5129060654248069
epoch 1 batch id 2401 loss 0.8070299029350281 train acc 0.524039462723865
epoch 1 batch id 2601 loss 0.92052936553

  0%|          | 0/923 [00:00<?, ?it/s]

epoch 1 test acc 0.6718242145178764


  0%|          | 0/3689 [00:00<?, ?it/s]

epoch 2 batch id 1 loss 0.6825827360153198 train acc 0.8125
epoch 2 batch id 201 loss 0.8812358379364014 train acc 0.6682213930348259
epoch 2 batch id 401 loss 0.9113104939460754 train acc 0.6691084788029925
epoch 2 batch id 601 loss 0.7585620284080505 train acc 0.6692491680532446
epoch 2 batch id 801 loss 0.6849930882453918 train acc 0.6703339575530587
epoch 2 batch id 1001 loss 0.9784746170043945 train acc 0.6697677322677322
epoch 2 batch id 1201 loss 0.9262698888778687 train acc 0.670951290591174
epoch 2 batch id 1401 loss 0.8358215689659119 train acc 0.6724884011420414
epoch 2 batch id 1601 loss 0.8659276366233826 train acc 0.6748711742660837
epoch 2 batch id 1801 loss 1.0283232927322388 train acc 0.6760827318156579
epoch 2 batch id 2001 loss 0.8918014764785767 train acc 0.6778641929035483
epoch 2 batch id 2201 loss 0.9112681746482849 train acc 0.679719445706497
epoch 2 batch id 2401 loss 0.6372177004814148 train acc 0.6822678050812162
epoch 2 batch id 2601 loss 0.9279241561889648 

  0%|          | 0/923 [00:00<?, ?it/s]

epoch 2 test acc 0.6864504333694474


  0%|          | 0/3689 [00:00<?, ?it/s]

epoch 3 batch id 1 loss 0.5779872536659241 train acc 0.8125
epoch 3 batch id 201 loss 0.6856139898300171 train acc 0.7133084577114428
epoch 3 batch id 401 loss 0.6881759166717529 train acc 0.7215554862842892
epoch 3 batch id 601 loss 0.655266284942627 train acc 0.7193219633943427
epoch 3 batch id 801 loss 0.4680611193180084 train acc 0.7208957553058677
epoch 3 batch id 1001 loss 0.759028971195221 train acc 0.7208104395604396
epoch 3 batch id 1201 loss 0.757350504398346 train acc 0.7210657785179018
epoch 3 batch id 1401 loss 0.8638201355934143 train acc 0.7228988222698073
epoch 3 batch id 1601 loss 0.7672981023788452 train acc 0.7245276389756402
epoch 3 batch id 1801 loss 1.0260647535324097 train acc 0.7257599944475291
epoch 3 batch id 2001 loss 0.5897591710090637 train acc 0.7279641429285357
epoch 3 batch id 2201 loss 0.7731530070304871 train acc 0.7298955020445252
epoch 3 batch id 2401 loss 0.48406434059143066 train acc 0.7319996876301541
epoch 3 batch id 2601 loss 0.7107743620872498 

  0%|          | 0/923 [00:00<?, ?it/s]

epoch 3 test acc 0.6829293066088841


  0%|          | 0/3689 [00:00<?, ?it/s]

epoch 4 batch id 1 loss 0.3139314651489258 train acc 0.9375
epoch 4 batch id 201 loss 0.6290171146392822 train acc 0.7671019900497512
epoch 4 batch id 401 loss 0.812953531742096 train acc 0.7699501246882793
epoch 4 batch id 601 loss 0.552045464515686 train acc 0.7666909317803661
epoch 4 batch id 801 loss 0.37679141759872437 train acc 0.7672050561797753
epoch 4 batch id 1001 loss 0.5712737441062927 train acc 0.7660152347652348
epoch 4 batch id 1201 loss 0.6766979098320007 train acc 0.7653517901748543
epoch 4 batch id 1401 loss 0.6807597875595093 train acc 0.767063704496788
epoch 4 batch id 1601 loss 0.6692736744880676 train acc 0.7688749219237976
epoch 4 batch id 1801 loss 0.8165844678878784 train acc 0.7710299833425874
epoch 4 batch id 2001 loss 0.5039604306221008 train acc 0.7728948025987007
epoch 4 batch id 2201 loss 0.6489577889442444 train acc 0.7738811903680145
epoch 4 batch id 2401 loss 0.31412971019744873 train acc 0.7761219283631821
epoch 4 batch id 2601 loss 0.7411506772041321

  0%|          | 0/923 [00:00<?, ?it/s]

epoch 4 test acc 0.6873645720476707


  0%|          | 0/3689 [00:00<?, ?it/s]

epoch 5 batch id 1 loss 0.21377032995224 train acc 0.96875
epoch 5 batch id 201 loss 0.5141083598136902 train acc 0.8173196517412935
epoch 5 batch id 401 loss 0.6125074028968811 train acc 0.8153054862842892
epoch 5 batch id 601 loss 0.3929903507232666 train acc 0.809952163061564
epoch 5 batch id 801 loss 0.3109658360481262 train acc 0.8103542446941323
epoch 5 batch id 1001 loss 0.482855886220932 train acc 0.8101273726273727
epoch 5 batch id 1201 loss 0.5403850674629211 train acc 0.8087791423813488
epoch 5 batch id 1401 loss 0.5705912709236145 train acc 0.810581727337616
epoch 5 batch id 1601 loss 0.5750877857208252 train acc 0.8117387570268582
epoch 5 batch id 1801 loss 0.7083356380462646 train acc 0.8139748750694059
epoch 5 batch id 2001 loss 0.5146958827972412 train acc 0.8151861569215393
epoch 5 batch id 2201 loss 0.7348799705505371 train acc 0.8163050885960926
epoch 5 batch id 2401 loss 0.22743292152881622 train acc 0.8178623490212411
epoch 5 batch id 2601 loss 0.5493189692497253 t

  0%|          | 0/923 [00:00<?, ?it/s]

epoch 5 test acc 0.6894637053087758


  0%|          | 0/3689 [00:00<?, ?it/s]

epoch 6 batch id 1 loss 0.14960554242134094 train acc 0.9375
epoch 6 batch id 201 loss 0.48318779468536377 train acc 0.8453047263681592
epoch 6 batch id 401 loss 0.6153068542480469 train acc 0.8456982543640897
epoch 6 batch id 601 loss 0.3146199882030487 train acc 0.8439059900166389
epoch 6 batch id 801 loss 0.24677644670009613 train acc 0.8424625468164794
epoch 6 batch id 1001 loss 0.37980782985687256 train acc 0.8417207792207793
epoch 6 batch id 1201 loss 0.5122339725494385 train acc 0.8408878018318068
epoch 6 batch id 1401 loss 0.6184111833572388 train acc 0.8423001427551748
epoch 6 batch id 1601 loss 0.557496964931488 train acc 0.8433791380387258
epoch 6 batch id 1801 loss 0.8720369338989258 train acc 0.8443920044419767
epoch 6 batch id 2001 loss 0.4535840153694153 train acc 0.8459676411794103
epoch 6 batch id 2201 loss 0.5675052404403687 train acc 0.846830985915493
epoch 6 batch id 2401 loss 0.1876327395439148 train acc 0.8478889004581425
epoch 6 batch id 2601 loss 0.4154543280601

  0%|          | 0/923 [00:00<?, ?it/s]

epoch 6 test acc 0.6869582881906826


  0%|          | 0/3689 [00:00<?, ?it/s]

epoch 7 batch id 1 loss 0.16904717683792114 train acc 0.9375
epoch 7 batch id 201 loss 0.5248503684997559 train acc 0.8535447761194029
epoch 7 batch id 401 loss 0.5631686449050903 train acc 0.8583229426433915
epoch 7 batch id 601 loss 0.23955106735229492 train acc 0.8577891014975042
epoch 7 batch id 801 loss 0.21588607132434845 train acc 0.8588483146067416
epoch 7 batch id 1001 loss 0.3903769850730896 train acc 0.8575487012987013
epoch 7 batch id 1201 loss 0.39481863379478455 train acc 0.8565258118234804
epoch 7 batch id 1401 loss 0.42969661951065063 train acc 0.8575347965738758
epoch 7 batch id 1601 loss 0.6024450659751892 train acc 0.8580379450343535
epoch 7 batch id 1801 loss 0.5603017807006836 train acc 0.8592448639644642
epoch 7 batch id 2001 loss 0.33036181330680847 train acc 0.8606009495252374
epoch 7 batch id 2201 loss 0.4992963671684265 train acc 0.8608445024988641
epoch 7 batch id 2401 loss 0.17254726588726044 train acc 0.8616201582673886
epoch 7 batch id 2601 loss 0.61826884

  0%|          | 0/923 [00:00<?, ?it/s]

epoch 7 test acc 0.6858410075839654
3190.03125 633.03125


In [1]:
import pandas as pd

train_set = pd.read_csv('data/병합데이터셋-v5-bt.csv', index_col=0)

train_set = train_set.dropna()
train_set = train_set.reset_index(drop=True)

# 감정을 정수 라벨로 변경
emotions = {'행복': 0, '불안': 1, '놀람': 2, '슬픔': 3, '분노': 4, '중립': 5 }
train_set['emotion'] = train_set.emotion.map(emotions)

# 데이터 프레임 셔플
train_set = train_set.sample(frac=1).reset_index(drop=True)
train_set.shape

(147525, 2)

In [2]:
# 라이브러리 불러오기
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
import gluonnlp as nlp

#kobert
from kobert_tokenizer import KoBERTTokenizer

# transformers
from transformers import BertModel
from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup

# GPU 사용시 필요
device = torch.device("cuda")

tokenizer = KoBERTTokenizer.from_pretrained('skt/kobert-base-v1')
bertmodel = BertModel.from_pretrained('skt/kobert-base-v1', return_dict=False)
vocab = nlp.vocab.BERTVocab.from_sentencepiece(tokenizer.vocab_file, padding_token='[PAD]')
tok = tokenizer.tokenize

# Setting parameters
max_len = 64
batch_size = 32 #베이스라인 64
warmup_ratio = 0.1
num_epochs = 7  # 에포크 횟수
max_grad_norm = 1
log_interval = 200
learning_rate = 1e-5

# 모델에 사용되는 데이터셋 클래스 정의
class BERTDataset(Dataset):
    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer,vocab, max_len,
                 pad, pair):

        transform = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len,vocab=vocab, pad=pad, pair=pair)

        self.sentences = [transform([i[sent_idx]]) for i in dataset]
        self.labels = [np.int32(float(i[label_idx])) for i in dataset]         # self.labels = [np.int32(i[label_idx]) for i in dataset] 이꺼 변경



    def __getitem__(self, i):
        return (self.sentences[i] + (self.labels[i], ))

    def __len__(self):
        return (len(self.labels))

# 감성 분류 모델 정의
class BERTClassifier(nn.Module):
    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes=6,
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        attention_mask = self.gen_attention_mask(token_ids, valid_length)

        _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device))
        if self.dr_rate:
            out = self.dropout(pooler)
        return self.classifier(out)

# 모델 학습에 사용할 데이터셋을 [data, label] 배열로 피팅
train_set_data = [[i, str(j)] for i, j in zip(train_set['sentence'], train_set['emotion'])]

# sklearn 의 train_test_split 모듈-> 4:1로 학습&검증 데이터를 분류
train_set_data, test_set_data = train_test_split(train_set_data, test_size = 0.2, random_state=0)

# 데이터셋을 Bert모델에 입력할 수 있게 변환
train_set_data = BERTDataset(train_set_data, 0, 1, tok, vocab, max_len, True, False)
test_set_data = BERTDataset(test_set_data, 0, 1, tok, vocab, max_len, True, False)

# 배치데이터셋 생성
train_dataloader = torch.utils.data.DataLoader(train_set_data, batch_size=batch_size, num_workers=0)    # num_workers: 데이터 로딩할때 쓰는 프로세스 수(로딩속도)
test_dataloader = torch.utils.data.DataLoader(test_set_data, batch_size=batch_size, num_workers=0)

# 모델 선언
model = BERTClassifier(bertmodel, dr_rate=0.5).to(device)
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()
t_total = len(train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)

# 정확도 계산
def calc_accuracy(X,Y):
    max_vals, max_indices = torch.max(X, 1)
    train_acc = (max_indices == Y).sum().data.cpu().numpy()/max_indices.size()[0]
    return train_acc

# 예측 반환
def predict(sentence):
    dataset = [[sentence, '0']]
    test = BERTDataset(dataset, 0, 1, tok, vocab, max_len, True, False)
    test_dataloader = torch.utils.data.DataLoader(test, batch_size=batch_size, num_workers=0)  #로컬에서는 디폴트(0)으로 수정
    model.eval()
    answer = 0
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(test_dataloader):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        for logits in out:
            logits = logits.detach().cpu().numpy()
            answer = np.argmax(logits)
    return answer

In [3]:
train_accuarcy, test_accuarcy = [], []

for e in range(num_epochs):
    train_acc = 0.0
    test_acc = 0.0
    model.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm(train_dataloader)):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        loss = loss_fn(out, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()  # Update learning rate schedule
        train_acc += calc_accuracy(out, label)
        if batch_id % log_interval == 0:
            print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc / (batch_id+1)))
    train_accuarcy.append(train_acc / (batch_id+1))
    print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))
    model.eval()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm(test_dataloader)):   # 아까 만든 테스트 배치 데이터 - 정확도 측정
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        test_acc += calc_accuracy(out, label)
    test_accuarcy.append(test_acc / (batch_id+1))
    print("epoch {} test acc {}".format(e+1, test_acc / (batch_id+1)))
    
print(train_accuarcy, test_accuarcy, sep='\n')

  0%|          | 0/3689 [00:00<?, ?it/s]

epoch 1 batch id 1 loss 1.8258147239685059 train acc 0.15625
epoch 1 batch id 201 loss 1.6981139183044434 train acc 0.1957400497512438
epoch 1 batch id 401 loss 1.7189544439315796 train acc 0.24251870324189526
epoch 1 batch id 601 loss 1.6414874792099 train acc 0.2849417637271215
epoch 1 batch id 801 loss 1.4733942747116089 train acc 0.325374531835206
epoch 1 batch id 1001 loss 1.1684176921844482 train acc 0.35967157842157843
epoch 1 batch id 1201 loss 1.3892159461975098 train acc 0.38782785179017487
epoch 1 batch id 1401 loss 1.0611392259597778 train acc 0.41303087080656675
epoch 1 batch id 1601 loss 1.136086106300354 train acc 0.4338694565896315
epoch 1 batch id 1801 loss 1.4218536615371704 train acc 0.45164144919489174
epoch 1 batch id 2001 loss 1.2108484506607056 train acc 0.46842203898050977
epoch 1 batch id 2201 loss 0.9725021719932556 train acc 0.48225238527941844
epoch 1 batch id 2401 loss 1.0816774368286133 train acc 0.4947808204914619
epoch 1 batch id 2601 loss 0.907434225082

  0%|          | 0/923 [00:00<?, ?it/s]

epoch 1 test acc 0.6603805525460456


  0%|          | 0/3689 [00:00<?, ?it/s]

epoch 2 batch id 1 loss 1.176632046699524 train acc 0.625
epoch 2 batch id 201 loss 1.1063756942749023 train acc 0.6707089552238806
epoch 2 batch id 401 loss 0.8295395374298096 train acc 0.6668485037406484
epoch 2 batch id 601 loss 0.9796239137649536 train acc 0.6674292845257903
epoch 2 batch id 801 loss 0.7904459834098816 train acc 0.6667837078651685
epoch 2 batch id 1001 loss 0.7670003771781921 train acc 0.6672702297702298
epoch 2 batch id 1201 loss 0.9879732728004456 train acc 0.6667360532889259
epoch 2 batch id 1401 loss 0.7992398142814636 train acc 0.669187187723055
epoch 2 batch id 1601 loss 0.9201534986495972 train acc 0.6703622735790131
epoch 2 batch id 1801 loss 1.1080957651138306 train acc 0.6709987506940589
epoch 2 batch id 2001 loss 1.105879545211792 train acc 0.6728979260369815
epoch 2 batch id 2201 loss 0.6900357007980347 train acc 0.6743383689232167
epoch 2 batch id 2401 loss 0.964436411857605 train acc 0.675747084548105
epoch 2 batch id 2601 loss 0.7826902866363525 trai

  0%|          | 0/923 [00:00<?, ?it/s]

epoch 2 test acc 0.6875338569880823


  0%|          | 0/3689 [00:00<?, ?it/s]

epoch 3 batch id 1 loss 1.070995807647705 train acc 0.59375
epoch 3 batch id 201 loss 0.9139941334724426 train acc 0.7168843283582089
epoch 3 batch id 401 loss 0.7532325387001038 train acc 0.7097100997506235
epoch 3 batch id 601 loss 0.8690445423126221 train acc 0.7094425956738769
epoch 3 batch id 801 loss 0.7896623015403748 train acc 0.7088014981273408
epoch 3 batch id 1001 loss 0.5803099870681763 train acc 0.7084165834165834
epoch 3 batch id 1201 loss 0.8147688508033752 train acc 0.7070930474604497
epoch 3 batch id 1401 loss 0.7232925295829773 train acc 0.7083110278372591
epoch 3 batch id 1601 loss 0.697036623954773 train acc 0.7098297938788257
epoch 3 batch id 1801 loss 1.0439226627349854 train acc 0.7103692393114936
epoch 3 batch id 2001 loss 0.9729917645454407 train acc 0.711987756121939
epoch 3 batch id 2201 loss 0.5928052663803101 train acc 0.7135818945933666
epoch 3 batch id 2401 loss 0.9389228820800781 train acc 0.7147802998750521
epoch 3 batch id 2601 loss 0.7642312049865723 

  0%|          | 0/923 [00:00<?, ?it/s]

epoch 3 test acc 0.6896329902491874


  0%|          | 0/3689 [00:00<?, ?it/s]

epoch 4 batch id 1 loss 0.8964531421661377 train acc 0.6875
epoch 4 batch id 201 loss 0.8608889579772949 train acc 0.7482898009950248
epoch 4 batch id 401 loss 0.5816807150840759 train acc 0.7436876558603491
epoch 4 batch id 601 loss 0.7982286214828491 train acc 0.7428244592346089
epoch 4 batch id 801 loss 0.5978367924690247 train acc 0.742626404494382
epoch 4 batch id 1001 loss 0.47371113300323486 train acc 0.7426323676323676
epoch 4 batch id 1201 loss 0.7542970180511475 train acc 0.7413093255620317
epoch 4 batch id 1401 loss 0.5904650092124939 train acc 0.7424830478229836
epoch 4 batch id 1601 loss 0.5804026126861572 train acc 0.7437343847595252
epoch 4 batch id 1801 loss 0.9444020390510559 train acc 0.7442566629650195
epoch 4 batch id 2001 loss 0.7690519690513611 train acc 0.7456896551724138
epoch 4 batch id 2201 loss 0.4693196713924408 train acc 0.7467202407996365
epoch 4 batch id 2401 loss 0.9110593795776367 train acc 0.7475661182840483
epoch 4 batch id 2601 loss 0.582166671752929

  0%|          | 0/923 [00:00<?, ?it/s]

epoch 4 test acc 0.6885495666305526


  0%|          | 0/3689 [00:00<?, ?it/s]

epoch 5 batch id 1 loss 0.8551768064498901 train acc 0.71875
epoch 5 batch id 201 loss 0.7650254964828491 train acc 0.7744092039800995
epoch 5 batch id 401 loss 0.6136534810066223 train acc 0.7707294264339152
epoch 5 batch id 601 loss 0.6888339519500732 train acc 0.7704866888519135
epoch 5 batch id 801 loss 0.5753196477890015 train acc 0.7704822097378277
epoch 5 batch id 1001 loss 0.5074880123138428 train acc 0.7700736763236763
epoch 5 batch id 1201 loss 0.6819635033607483 train acc 0.7686823480432973
epoch 5 batch id 1401 loss 0.5957227349281311 train acc 0.7699188079942898
epoch 5 batch id 1601 loss 0.5524907112121582 train acc 0.7705535602748282
epoch 5 batch id 1801 loss 0.9223369359970093 train acc 0.7710820377568017
epoch 5 batch id 2001 loss 0.7469671368598938 train acc 0.7726761619190404
epoch 5 batch id 2201 loss 0.39328014850616455 train acc 0.7734268514311676
epoch 5 batch id 2401 loss 0.811969518661499 train acc 0.7739093086214077
epoch 5 batch id 2601 loss 0.55002218484878

  0%|          | 0/923 [00:00<?, ?it/s]

epoch 5 test acc 0.6914274106175514


  0%|          | 0/3689 [00:00<?, ?it/s]

epoch 6 batch id 1 loss 0.6505228281021118 train acc 0.8125
epoch 6 batch id 201 loss 0.6533713936805725 train acc 0.8003731343283582
epoch 6 batch id 401 loss 0.5429118275642395 train acc 0.796290523690773
epoch 6 batch id 601 loss 0.5808356404304504 train acc 0.7940411813643927
epoch 6 batch id 801 loss 0.538279116153717 train acc 0.7933052434456929
epoch 6 batch id 1001 loss 0.4904448688030243 train acc 0.7917082917082917
epoch 6 batch id 1201 loss 0.668642520904541 train acc 0.7892381348875936
epoch 6 batch id 1401 loss 0.49535423517227173 train acc 0.7903729478943612
epoch 6 batch id 1601 loss 0.3651430904865265 train acc 0.790365396627108
epoch 6 batch id 1801 loss 0.862176775932312 train acc 0.7904636313159356
epoch 6 batch id 2001 loss 0.6120594143867493 train acc 0.7909638930534733
epoch 6 batch id 2201 loss 0.38821980357170105 train acc 0.7917139936392549
epoch 6 batch id 2401 loss 0.7008697986602783 train acc 0.792182944606414
epoch 6 batch id 2601 loss 0.5514791011810303 tr

  0%|          | 0/923 [00:00<?, ?it/s]

epoch 6 test acc 0.6914612676056338


  0%|          | 0/3689 [00:00<?, ?it/s]

epoch 7 batch id 1 loss 0.6601365804672241 train acc 0.75
epoch 7 batch id 201 loss 0.6161670684814453 train acc 0.8022388059701493
epoch 7 batch id 401 loss 0.5875775814056396 train acc 0.7997194513715711
epoch 7 batch id 601 loss 0.6128020286560059 train acc 0.7995528286189684
epoch 7 batch id 801 loss 0.3814089298248291 train acc 0.7992743445692884
epoch 7 batch id 1001 loss 0.3913452625274658 train acc 0.7988573926073926
epoch 7 batch id 1201 loss 0.7159830927848816 train acc 0.7969660699417153
epoch 7 batch id 1401 loss 0.459958553314209 train acc 0.7977560670949322
epoch 7 batch id 1601 loss 0.4526872932910919 train acc 0.7987781074328545
epoch 7 batch id 1801 loss 0.8124911189079285 train acc 0.7989137978900611
epoch 7 batch id 2001 loss 0.5914962291717529 train acc 0.7997876061969016
epoch 7 batch id 2201 loss 0.37870484590530396 train acc 0.8002896410722399
epoch 7 batch id 2401 loss 0.6466255784034729 train acc 0.8005648688046647
epoch 7 batch id 2601 loss 0.47367438673973083

  0%|          | 0/923 [00:00<?, ?it/s]

epoch 7 test acc 0.6932218309859155
[0.5489123068582271, 0.6846960558416915, 0.722011385199241, 0.7529903090268365, 0.7781749796692871, 0.7951511249661155, 0.8026819598807264] [0.6603805525460456, 0.6875338569880823, 0.6896329902491874, 0.6885495666305526, 0.6914274106175514, 0.6914612676056338, 0.6932218309859155]


In [3]:
import pandas as pd

train_set = pd.read_csv('data/병합데이터셋-v5.csv', index_col=0)

train_set = train_set.dropna()
train_set = train_set.reset_index(drop=True)

# 감정을 정수 라벨로 변경
emotions = {'행복': 0, '불안': 1, '놀람': 2, '슬픔': 3, '분노': 4, '중립': 5 }
train_set['emotion'] = train_set.emotion.map(emotions)

# 데이터 프레임 셔플
train_set = train_set.sample(frac=1).reset_index(drop=True)
train_set.shape

# 라이브러리 불러오기
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
import gluonnlp as nlp

#kobert
from kobert_tokenizer import KoBERTTokenizer

# transformers
from transformers import BertModel
from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup

# GPU 사용시 필요
device = torch.device("cuda")

tokenizer = KoBERTTokenizer.from_pretrained('skt/kobert-base-v1')
bertmodel = BertModel.from_pretrained('skt/kobert-base-v1', return_dict=False)
vocab = nlp.vocab.BERTVocab.from_sentencepiece(tokenizer.vocab_file, padding_token='[PAD]')
tok = tokenizer.tokenize

# Setting parameters
max_len = 64
batch_size = 32 #베이스라인 64
warmup_ratio = 0.1
num_epochs = 7  # 에포크 횟수
max_grad_norm = 1
log_interval = 200
learning_rate = 2e-5

# 모델에 사용되는 데이터셋 클래스 정의
class BERTDataset(Dataset):
    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer,vocab, max_len,
                 pad, pair):

        transform = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len,vocab=vocab, pad=pad, pair=pair)

        self.sentences = [transform([i[sent_idx]]) for i in dataset]
        self.labels = [np.int32(float(i[label_idx])) for i in dataset]         # self.labels = [np.int32(i[label_idx]) for i in dataset] 이꺼 변경



    def __getitem__(self, i):
        return (self.sentences[i] + (self.labels[i], ))

    def __len__(self):
        return (len(self.labels))

# 감성 분류 모델 정의
class BERTClassifier(nn.Module):
    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes=6,
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        attention_mask = self.gen_attention_mask(token_ids, valid_length)

        _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device))
        if self.dr_rate:
            out = self.dropout(pooler)
        return self.classifier(out)

# 모델 학습에 사용할 데이터셋을 [data, label] 배열로 피팅
train_set_data = [[i, str(j)] for i, j in zip(train_set['sentence'], train_set['emotion'])]

# sklearn 의 train_test_split 모듈-> 4:1로 학습&검증 데이터를 분류
train_set_data, test_set_data = train_test_split(train_set_data, test_size = 0.1, random_state=0)

# 데이터셋을 Bert모델에 입력할 수 있게 변환
train_set_data = BERTDataset(train_set_data, 0, 1, tok, vocab, max_len, True, False)
test_set_data = BERTDataset(test_set_data, 0, 1, tok, vocab, max_len, True, False)

# 배치데이터셋 생성
train_dataloader = torch.utils.data.DataLoader(train_set_data, batch_size=batch_size, num_workers=0)    # num_workers: 데이터 로딩할때 쓰는 프로세스 수(로딩속도)
test_dataloader = torch.utils.data.DataLoader(test_set_data, batch_size=batch_size, num_workers=0)

# 모델 선언
model = BERTClassifier(bertmodel, dr_rate=0.5).to(device)
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()
t_total = len(train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)

# 정확도 계산
def calc_accuracy(X,Y):
    max_vals, max_indices = torch.max(X, 1)
    train_acc = (max_indices == Y).sum().data.cpu().numpy()/max_indices.size()[0]
    return train_acc

# 예측 반환
def predict(sentence):
    dataset = [[sentence, '0']]
    test = BERTDataset(dataset, 0, 1, tok, vocab, max_len, True, False)
    test_dataloader = torch.utils.data.DataLoader(test, batch_size=batch_size, num_workers=0)  #로컬에서는 디폴트(0)으로 수정
    model.eval()
    answer = 0
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(test_dataloader):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        for logits in out:
            logits = logits.detach().cpu().numpy()
            answer = np.argmax(logits)
    return answer

train_accuarcy, test_accuarcy = [], []

for e in range(num_epochs):
    train_acc = 0.0
    test_acc = 0.0
    model.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm(train_dataloader)):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        loss = loss_fn(out, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()  # Update learning rate schedule
        train_acc += calc_accuracy(out, label)
        if batch_id % log_interval == 0:
            print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc / (batch_id+1)))
    train_accuarcy.append(train_acc / (batch_id+1))
    print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))
    model.eval()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm(test_dataloader)):   # 아까 만든 테스트 배치 데이터 - 정확도 측정
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        test_acc += calc_accuracy(out, label)
    test_accuarcy.append(test_acc / (batch_id+1))
    print("epoch {} test acc {}".format(e+1, test_acc / (batch_id+1)))
    
train_accuarcy, test_accuarcy

  0%|          | 0/4160 [00:00<?, ?it/s]

epoch 1 batch id 1 loss 1.9032812118530273 train acc 0.0625
epoch 1 batch id 201 loss 1.751467227935791 train acc 0.17273009950248755
epoch 1 batch id 401 loss 1.6798760890960693 train acc 0.23986907730673318
epoch 1 batch id 601 loss 1.4346672296524048 train acc 0.2873336106489185
epoch 1 batch id 801 loss 1.0671095848083496 train acc 0.33941947565543074
epoch 1 batch id 1001 loss 1.075100064277649 train acc 0.37609265734265734
epoch 1 batch id 1201 loss 1.2916501760482788 train acc 0.40825353871773523
epoch 1 batch id 1401 loss 1.2795116901397705 train acc 0.4351133119200571
epoch 1 batch id 1601 loss 0.9827401041984558 train acc 0.4555356027482823
epoch 1 batch id 1801 loss 0.8462985157966614 train acc 0.47494447529150474
epoch 1 batch id 2001 loss 0.9094249606132507 train acc 0.49094202898550726
epoch 1 batch id 2201 loss 0.8161762952804565 train acc 0.5043588141753749
epoch 1 batch id 2401 loss 0.8118577599525452 train acc 0.517219387755102
epoch 1 batch id 2601 loss 0.69574749469

  0%|          | 0/463 [00:00<?, ?it/s]

RuntimeError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 6.00 GiB total capacity; 4.22 GiB already allocated; 7.56 MiB free; 4.50 GiB reserved in total by PyTorch)

In [None]:
torch.save(model, 'model/kobert-v10.pt')

In [None]:
# 모델 사이즈 확인(파라미터는 v1과 동일)
import os

model_path3 = 'model/kobert-v3.pt'
size2 = os.path.getsize(model_path3) / (1024*1024) # mb 단위
print(f"Model size: {size2:.2f} MB")