In [1]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
import numpy as np
from tqdm.notebook import tqdm
import pandas as pd
import numpy as np

In [2]:
#kobert
from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model

#transformers
from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup

In [3]:
!pip list

Package                      Version
---------------------------- --------------------
absl-py                      1.0.0
accelerate                   0.16.0
aiohttp                      3.8.1
aiosignal                    1.2.0
alembic                      1.8.1
anaconda                     0.0.1.1
anyio                        3.5.0
appdirs                      1.4.4
argon2-cffi                  21.3.0
argon2-cffi-bindings         21.2.0
astor                        0.8.1
asttokens                    2.0.5
astunparse                   1.6.3
async-timeout                4.0.2
attrs                        21.4.0
autoviz                      0.1.58
Babel                        2.9.1
backcall                     0.2.0
bleach                       4.1.0
blinker                      1.4
blis                         0.7.9
bokeh                        2.4.3
Boruta                       0.3
boto3                        1.15.18
botocore                     1.18.18
cach

In [4]:
#GPU 사용
device = torch.device("cuda:0")

In [5]:
#BERT 모델, Vocabulary 불러오기
bertmodel, vocab = get_pytorch_kobert_model()

using cached model. /akidev/NLP/Labs/.cache/kobert_v1.zip
using cached model. /akidev/NLP/Labs/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece


In [6]:
short_sen = pd.read_csv('단발성 대화 정리.csv')
long_sen = pd.read_csv('연속적 대화 정리.csv')

In [7]:
short_sen.sample(3)

Unnamed: 0,발화,감정,감정.1
22901,포켓몬 자체가 드물게 나옴.,중립,2
20024,내일이 마지막회라니...,슬픔,3
7541,기저귀에 오줌 많이 싸면 원래 옷이 축축해 지나요?,놀람,7


In [8]:
long_sen.sample(3)

Unnamed: 0,발화,감정_str,감정_int
693,아 왜,분노,6
44194,"그 녀석 혼자서 내 아내를 온통 독차지하고 있으니까 그렇지, 이대로 일주일만 더 버...",중립,2
43905,무슨 말?,중립,2


In [9]:
short_sen = short_sen.rename(columns={"발화":'sentence','감정':'str','감정.1':'Emotion'})
long_sen = long_sen.rename(columns={"발화":'sentence','감정_str':'str','감정_int':'Emotion'})

In [10]:
total_data = pd.concat([short_sen, long_sen], axis=0)
total_data = total_data.replace({'Emotion' : 7}, 0) 

In [11]:
emotion_list = list(total_data['Emotion'].unique())

In [12]:
emotion_list

[4, 0, 6, 3, 2, 1, 5]

In [13]:
emotion_dict = {}
for i in sorted(emotion_list):
    temp = total_data[total_data['Emotion']==i].sample(1)['str'].values[0]
#     print(temp)
    emotion_dict[i] = temp

In [14]:
emotion_dict

{0: '놀람', 1: '행복', 2: '중립', 3: '슬픔', 4: '공포', 5: '혐오', 6: '분노'}

In [15]:
data_list = []
for q, label in zip(total_data['sentence'], total_data['Emotion'])  :
    data = []
    data.append(q)
    data.append(str(label))

    data_list.append(data)

In [16]:
data_list[:3]

[['언니 동생으로 부르는게 맞는 일인가요..??', '4'],
 ['그냥 내 느낌일뿐겠지?', '4'],
 ['아직너무초기라서 그런거죠?', '4']]

In [17]:
#train & test 데이터로 나누기
from sklearn.model_selection import train_test_split
                                                         
dataset_train, dataset_test = train_test_split(data_list, test_size=0.25, random_state=0)

In [18]:
print(len(dataset_train))
print(len(dataset_test))

70665
23556


In [19]:
# BERT 모델에 들어가기 위한 dataset을 만들어주는 클래스
class BERTDataset(Dataset):
    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, max_len, pad, pair):
        transform = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)

        self.sentences = [transform([i[sent_idx]]) for i in dataset]
        self.labels = [np.int32(i[label_idx]) for i in dataset]

    def __getitem__(self, i):
        return (self.sentences[i] + (self.labels[i], ))

    def __len__(self):
        return (len(self.labels))

In [20]:
# Setting parameters
max_len = 128
batch_size = 13
warmup_ratio = 0.1
num_epochs = 10
max_grad_norm = 1
log_interval = 200
learning_rate =  5e-5

In [21]:
#토큰화
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

using cached model. /akidev/NLP/Labs/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece


In [22]:
data_train = BERTDataset(dataset_train, 0, 1, tok, max_len, True, False)
data_test = BERTDataset(dataset_test, 0, 1, tok, max_len, True, False)

In [23]:
data_train[0]

(array([   2, 4209, 2707, 5512, 6999,  517,   54,    3,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1], dtype=int32),
 array(8, dtype=int32),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 

In [24]:
train_dataloader = torch.utils.data.DataLoader(data_train, batch_size=batch_size, num_workers=5)
test_dataloader = torch.utils.data.DataLoader(data_test, batch_size=batch_size, num_workers=5)

In [25]:
class BERTClassifier(nn.Module):
    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes=7,   ##클래스 수 조정##
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate
                 
        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)
    
    def gen_attention_mask(self, token_ids, valid_length):
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        attention_mask = self.gen_attention_mask(token_ids, valid_length)
        
        _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device))
        if self.dr_rate:
            out = self.dropout(pooler)
        return self.classifier(out)

In [26]:
model = BERTClassifier(bertmodel,  dr_rate=0.5).to(device)

In [27]:
# Prepare optimizer and schedule (linear warmup and decay)
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

In [28]:
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()



In [29]:
t_total = len(train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)

In [30]:
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)

In [31]:
def calc_accuracy(X,Y):
    max_vals, max_indices = torch.max(X, 1)
    train_acc = (max_indices == Y).sum().data.cpu().numpy()/max_indices.size()[0]
    return train_acc

In [32]:
train_dataloader

<torch.utils.data.dataloader.DataLoader at 0x7f839639fee0>

In [33]:
if torch.cuda.is_available():    
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print('No GPU available, using the CPU instead.')

There are 1 GPU(s) available.
We will use the GPU: NVIDIA GeForce RTX 3070


In [34]:
for e in range(num_epochs):
    train_acc = 0.0
    test_acc = 0.0
    model.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm(train_dataloader)):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        loss = loss_fn(out, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()  # Update learning rate schedule
        train_acc += calc_accuracy(out, label)
        if batch_id % log_interval == 0:
            print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc / (batch_id+1)))
    print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))
    
    model.eval()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm(test_dataloader)):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        test_acc += calc_accuracy(out, label)
    print("epoch {} test acc {}".format(e+1, test_acc / (batch_id+1)))

  0%|          | 0/5436 [00:00<?, ?it/s]

epoch 1 batch id 1 loss 2.2268226146698 train acc 0.0
epoch 1 batch id 201 loss 1.832778811454773 train acc 0.13892078071182526
epoch 1 batch id 401 loss 1.5435513257980347 train acc 0.3211202762324956
epoch 1 batch id 601 loss 1.0040916204452515 train acc 0.39075899142454823
epoch 1 batch id 801 loss 2.0267934799194336 train acc 0.4243733794295591
epoch 1 batch id 1001 loss 1.145259976387024 train acc 0.4443249058633677
epoch 1 batch id 1201 loss 1.1498080492019653 train acc 0.4639723307500171
epoch 1 batch id 1401 loss 1.7771250009536743 train acc 0.47861417668698303
epoch 1 batch id 1601 loss 1.12114417552948 train acc 0.4904146446932223
epoch 1 batch id 1801 loss 1.1212904453277588 train acc 0.5019433647973368
epoch 1 batch id 2001 loss 0.9531456828117371 train acc 0.5129358397724241
epoch 1 batch id 2201 loss 1.041943073272705 train acc 0.5214413029042727
epoch 1 batch id 2401 loss 0.8145015239715576 train acc 0.5295229551789263
epoch 1 batch id 2601 loss 0.724514901638031 train a

  0%|          | 0/1812 [00:00<?, ?it/s]

epoch 1 test acc 0.6594498217014706


  0%|          | 0/5436 [00:00<?, ?it/s]

epoch 2 batch id 1 loss 0.5572728514671326 train acc 0.8461538461538461
epoch 2 batch id 201 loss 0.8416933417320251 train acc 0.6371986222732486
epoch 2 batch id 401 loss 1.2523777484893799 train acc 0.6389794743909444
epoch 2 batch id 601 loss 0.5750845670700073 train acc 0.6434148214514284
epoch 2 batch id 801 loss 1.2003669738769531 train acc 0.6478440411024702
epoch 2 batch id 1001 loss 1.191827654838562 train acc 0.6475063398140336
epoch 2 batch id 1201 loss 1.1674364805221558 train acc 0.6493306859668242
epoch 2 batch id 1401 loss 1.4787191152572632 train acc 0.650908691593918
epoch 2 batch id 1601 loss 1.230195164680481 train acc 0.6496420506414274
epoch 2 batch id 1801 loss 1.0963774919509888 train acc 0.6495963780805488
epoch 2 batch id 2001 loss 1.136215329170227 train acc 0.6493676238803585
epoch 2 batch id 2201 loss 1.1197589635849 train acc 0.6496697305420482
epoch 2 batch id 2401 loss 0.7776736617088318 train acc 0.6503059622592985
epoch 2 batch id 2601 loss 0.5325633287

  0%|          | 0/1812 [00:00<?, ?it/s]

epoch 2 test acc 0.6722278824927759


  0%|          | 0/5436 [00:00<?, ?it/s]

epoch 3 batch id 1 loss 0.37439611554145813 train acc 0.9230769230769231
epoch 3 batch id 201 loss 0.7499032616615295 train acc 0.6930730960581698
epoch 3 batch id 401 loss 1.1269079446792603 train acc 0.6932668329177046
epoch 3 batch id 601 loss 0.5346275568008423 train acc 0.6938435940099836
epoch 3 batch id 801 loss 1.347634196281433 train acc 0.6957649092480562
epoch 3 batch id 1001 loss 0.9434661269187927 train acc 0.6979943133789288
epoch 3 batch id 1201 loss 0.9913270473480225 train acc 0.6994812015628
epoch 3 batch id 1401 loss 1.3137242794036865 train acc 0.7001592269258218
epoch 3 batch id 1601 loss 0.8159326910972595 train acc 0.699995195310618
epoch 3 batch id 1801 loss 1.2561123371124268 train acc 0.7008926664673367
epoch 3 batch id 2001 loss 0.7930331826210022 train acc 0.7025717910275483
epoch 3 batch id 2201 loss 1.0395523309707642 train acc 0.7029671827490818
epoch 3 batch id 2401 loss 0.8154348731040955 train acc 0.7031365136321192
epoch 3 batch id 2601 loss 0.3556278

  0%|          | 0/1812 [00:00<?, ?it/s]

epoch 3 test acc 0.6754542367125066


  0%|          | 0/5436 [00:00<?, ?it/s]

epoch 4 batch id 1 loss 0.14185276627540588 train acc 1.0
epoch 4 batch id 201 loss 0.6270924210548401 train acc 0.7447378492154607
epoch 4 batch id 401 loss 0.8451785445213318 train acc 0.7479378476884708
epoch 4 batch id 601 loss 0.313155859708786 train acc 0.7491360552924622
epoch 4 batch id 801 loss 0.9117092490196228 train acc 0.7479112647651975
epoch 4 batch id 1001 loss 1.1719179153442383 train acc 0.7481749020210543
epoch 4 batch id 1201 loss 0.710386335849762 train acc 0.7501441106769968
epoch 4 batch id 1401 loss 1.0648499727249146 train acc 0.7522648657552248
epoch 4 batch id 1601 loss 0.7846397757530212 train acc 0.7527987315619936
epoch 4 batch id 1801 loss 0.8981069326400757 train acc 0.7540682526801197
epoch 4 batch id 2001 loss 0.5605455636978149 train acc 0.7542767077999277
epoch 4 batch id 2201 loss 0.8911842107772827 train acc 0.7553210079334355
epoch 4 batch id 2401 loss 0.3589771091938019 train acc 0.7561272546695051
epoch 4 batch id 2601 loss 0.2687629163265228 tr

  0%|          | 0/1812 [00:00<?, ?it/s]

epoch 4 test acc 0.6673034471047646


  0%|          | 0/5436 [00:00<?, ?it/s]

epoch 5 batch id 1 loss 0.0698346421122551 train acc 1.0
epoch 5 batch id 201 loss 0.636987566947937 train acc 0.7895139686184456
epoch 5 batch id 401 loss 0.7666612267494202 train acc 0.7889890657970456
epoch 5 batch id 601 loss 0.4737984538078308 train acc 0.7944451555100484
epoch 5 batch id 801 loss 0.7997013330459595 train acc 0.7924709497743195
epoch 5 batch id 1001 loss 0.961467444896698 train acc 0.7945131791285605
epoch 5 batch id 1201 loss 0.22849451005458832 train acc 0.7987574457183061
epoch 5 batch id 1401 loss 1.7981950044631958 train acc 0.7988250150990958
epoch 5 batch id 1601 loss 0.44618895649909973 train acc 0.8001729688175526
epoch 5 batch id 1801 loss 0.5804777145385742 train acc 0.8019476359287413
epoch 5 batch id 2001 loss 0.447909951210022 train acc 0.8023296044285362
epoch 5 batch id 2201 loss 0.5471876859664917 train acc 0.8028169014084297
epoch 5 batch id 2401 loss 0.9827404618263245 train acc 0.8035433953801073
epoch 5 batch id 2601 loss 0.10814160853624344 t

  0%|          | 0/1812 [00:00<?, ?it/s]

epoch 5 test acc 0.6614026150449925


  0%|          | 0/5436 [00:00<?, ?it/s]

epoch 6 batch id 1 loss 0.034813567996025085 train acc 1.0
epoch 6 batch id 201 loss 0.20407380163669586 train acc 0.8430922311519318
epoch 6 batch id 401 loss 0.6599572896957397 train acc 0.8455783617878384
epoch 6 batch id 601 loss 0.24234384298324585 train acc 0.846665813387944
epoch 6 batch id 801 loss 0.6849849224090576 train acc 0.845673677134347
epoch 6 batch id 1001 loss 0.5142568349838257 train acc 0.8482286943825338
epoch 6 batch id 1201 loss 0.2356029599905014 train acc 0.8505732402485019
epoch 6 batch id 1401 loss 1.242975115776062 train acc 0.851809147312347
epoch 6 batch id 1601 loss 0.14306987822055817 train acc 0.8525921299187863
epoch 6 batch id 1801 loss 0.787255585193634 train acc 0.8528595224874905
epoch 6 batch id 2001 loss 0.6171503663063049 train acc 0.8534963287586796
epoch 6 batch id 2201 loss 0.49298661947250366 train acc 0.8537028623352828
epoch 6 batch id 2401 loss 0.6400877237319946 train acc 0.8544837087110939
epoch 6 batch id 2601 loss 0.1513262391090393 

  0%|          | 0/1812 [00:00<?, ?it/s]

epoch 6 test acc 0.6588554932925739


  0%|          | 0/5436 [00:00<?, ?it/s]

epoch 7 batch id 1 loss 0.015683487057685852 train acc 1.0
epoch 7 batch id 201 loss 0.2559671998023987 train acc 0.8836586299272865
epoch 7 batch id 401 loss 0.6248840093612671 train acc 0.8868214080184155
epoch 7 batch id 601 loss 0.1620505452156067 train acc 0.8895430692435672
epoch 7 batch id 801 loss 0.28130489587783813 train acc 0.889465091712277
epoch 7 batch id 1001 loss 0.42606988549232483 train acc 0.8908015061861136
epoch 7 batch id 1201 loss 0.5246597528457642 train acc 0.8923333119835933
epoch 7 batch id 1401 loss 1.114979863166809 train acc 0.8937572063910275
epoch 7 batch id 1601 loss 0.112274169921875 train acc 0.8940565992408458
epoch 7 batch id 1801 loss 1.2201327085494995 train acc 0.8946311878016343
epoch 7 batch id 2001 loss 0.3745410144329071 train acc 0.8963210702340988
epoch 7 batch id 2201 loss 0.5857125520706177 train acc 0.8962010275049646
epoch 7 batch id 2401 loss 0.22825218737125397 train acc 0.897254349149381
epoch 7 batch id 2601 loss 0.12025944888591766

  0%|          | 0/1812 [00:00<?, ?it/s]

epoch 7 test acc 0.6668789268126947


  0%|          | 0/5436 [00:00<?, ?it/s]

epoch 8 batch id 1 loss 0.008779395371675491 train acc 1.0
epoch 8 batch id 201 loss 0.023065807297825813 train acc 0.9161882893226174
epoch 8 batch id 401 loss 0.4243377149105072 train acc 0.9178975637828507
epoch 8 batch id 601 loss 0.15985694527626038 train acc 0.9200051196723389
epoch 8 batch id 801 loss 0.5021116137504578 train acc 0.9193316047248578
epoch 8 batch id 1001 loss 0.5799574255943298 train acc 0.9212326135402986
epoch 8 batch id 1201 loss 0.024830251932144165 train acc 0.9229488246973593
epoch 8 batch id 1401 loss 1.3503690958023071 train acc 0.9233514522593659
epoch 8 batch id 1601 loss 0.011378458701074123 train acc 0.9240378609522792
epoch 8 batch id 1801 loss 0.7361713647842407 train acc 0.9241447059325906
epoch 8 batch id 2001 loss 0.3519093096256256 train acc 0.9248452696728445
epoch 8 batch id 2201 loss 0.45627591013908386 train acc 0.9247544822283458
epoch 8 batch id 2401 loss 0.02483535185456276 train acc 0.9253195783807976
epoch 8 batch id 2601 loss 0.0084159

  0%|          | 0/1812 [00:00<?, ?it/s]

epoch 8 test acc 0.6657327220241062


  0%|          | 0/5436 [00:00<?, ?it/s]

epoch 9 batch id 1 loss 0.012440106831490993 train acc 1.0
epoch 9 batch id 201 loss 0.007677721790969372 train acc 0.9376195943360124
epoch 9 batch id 401 loss 0.1646171510219574 train acc 0.9401496259351619
epoch 9 batch id 601 loss 0.18084441125392914 train acc 0.941763727121462
epoch 9 batch id 801 loss 0.06429542601108551 train acc 0.9405550753865315
epoch 9 batch id 1001 loss 0.3541695475578308 train acc 0.9418274033658589
epoch 9 batch id 1201 loss 0.1947484016418457 train acc 0.9435726638057963
epoch 9 batch id 1401 loss 0.4437467157840729 train acc 0.943446988414861
epoch 9 batch id 1601 loss 0.005074365995824337 train acc 0.9444577908038166
epoch 9 batch id 1801 loss 0.6592716574668884 train acc 0.9442617349335753
epoch 9 batch id 2001 loss 0.15913823246955872 train acc 0.9449121593049541
epoch 9 batch id 2201 loss 0.4028114974498749 train acc 0.9449201411945541
epoch 9 batch id 2401 loss 0.011677619069814682 train acc 0.9451510588536809
epoch 9 batch id 2601 loss 0.003475263

  0%|          | 0/1812 [00:00<?, ?it/s]

epoch 9 test acc 0.6647138733231382


  0%|          | 0/5436 [00:00<?, ?it/s]

epoch 10 batch id 1 loss 0.0030948324128985405 train acc 1.0
epoch 10 batch id 201 loss 0.007112410850822926 train acc 0.9468044393417533
epoch 10 batch id 601 loss 0.12526190280914307 train acc 0.9495712274414415
epoch 10 batch id 801 loss 0.21432074904441833 train acc 0.9506386247959241
epoch 10 batch id 1001 loss 0.2735116183757782 train acc 0.9529701068162557
epoch 10 batch id 1201 loss 0.005040218587964773 train acc 0.9549734195862367
epoch 10 batch id 1401 loss 0.414859801530838 train acc 0.9550870257508312
epoch 10 batch id 1601 loss 0.004734240937978029 train acc 0.955796857733141
epoch 10 batch id 1801 loss 0.7886164784431458 train acc 0.9552812540041787
epoch 10 batch id 2001 loss 0.00708001758903265 train acc 0.9553684696113409
epoch 10 batch id 2201 loss 0.4687683880329132 train acc 0.9549854961031637
epoch 10 batch id 2401 loss 0.004826681688427925 train acc 0.9551148559894902
epoch 10 batch id 2601 loss 0.0023312384728342295 train acc 0.9549285777659521
epoch 10 batch id 

  0%|          | 0/1812 [00:00<?, ?it/s]

epoch 10 test acc 0.6614875191034064


In [35]:
#토큰화
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

using cached model. /akidev/NLP/Labs/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece


In [36]:
"""
def predict(predict_sentence):

    data = [predict_sentence, '0']
    dataset_another = [data]

    another_test = BERTDataset(dataset_another, 0, 1, tok, max_len, True, False)
    test_dataloader = torch.utils.data.DataLoader(another_test, batch_size=batch_size, num_workers=5)
    
    model.eval()

    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(test_dataloader):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)

        valid_length= valid_length
        label = label.long().to(device)

        out = model(token_ids, valid_length, segment_ids)


        test_eval=[]
        for i in out:
            logits=i
            logits = logits.detach().cpu().numpy()
            print(np.argmax(logits))
            if np.argmax(logits) == 1:
                test_eval.append("행복이")
            elif np.argmax(logits) == 2:
                test_eval.append("중립이")
            elif np.argmax(logits) == 3:
                test_eval.append("슬픔이")
            elif np.argmax(logits) == 4:
                test_eval.append("공포가")
            elif np.argmax(logits) == 5:
                test_eval.append("혐오가")
            elif np.argmax(logits) == 6:
                test_eval.append("분노가")
            elif np.argmax(logits) == 7:
                test_eval.append("놀람이")

        print(">> 입력하신 내용에서 " + test_eval[0] + " 느껴집니다.")
        """

'\ndef predict(predict_sentence):\n\n    data = [predict_sentence, \'0\']\n    dataset_another = [data]\n\n    another_test = BERTDataset(dataset_another, 0, 1, tok, max_len, True, False)\n    test_dataloader = torch.utils.data.DataLoader(another_test, batch_size=batch_size, num_workers=5)\n    \n    model.eval()\n\n    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(test_dataloader):\n        token_ids = token_ids.long().to(device)\n        segment_ids = segment_ids.long().to(device)\n\n        valid_length= valid_length\n        label = label.long().to(device)\n\n        out = model(token_ids, valid_length, segment_ids)\n\n\n        test_eval=[]\n        for i in out:\n            logits=i\n            logits = logits.detach().cpu().numpy()\n            print(np.argmax(logits))\n            if np.argmax(logits) == 1:\n                test_eval.append("행복이")\n            elif np.argmax(logits) == 2:\n                test_eval.append("중립이")\n            elif np

In [37]:
#질문 무한반복하기! 0 입력시 종료
end = 1
while end == 1 :
    sentence = input("하고싶은 말을 입력해주세요 : ")
    if sentence == 0:
        break
    predict(sentence)
    print("\n")

하고싶은 말을 입력해주세요 : 디자인은그럭저럭 괜찮은데 배송이좀문제네요 조립은 쉬웠는데 육각렌치가 너무 힘도없고 채결하는게좀힘들었네요


NameError: name 'predict' is not defined

In [38]:
def predict2(predict_sentence):

    data = [predict_sentence, '0']
    dataset_another = [data]

    another_test = BERTDataset(dataset_another, 0, 1, tok, max_len, True, False)
    test_dataloader = torch.utils.data.DataLoader(another_test, batch_size=batch_size, num_workers=5)
    
    model.eval()

    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(test_dataloader):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)

        valid_length= valid_length
        label = label.long().to(device)

        out = model(token_ids, valid_length, segment_ids)


#         test_eval=[]
        for i in out:
            logits=i
            logits = logits.detach().cpu().numpy()
            emotion = emotion_dict[np.argmax(logits)]
            

        print(f">> 입력하신 내용의 감정은 {emotion}입니다.")

In [40]:
while True:
    sentence = input("하고싶은 말을 입력해주세요 : ")
    if sentence.upper() == "QUIT":
        print("감정 분석을 종료합니다.")
        break
    predict2(sentence)
    print("\n")

하고싶은 말을 입력해주세요 : 디자인은그럭저럭 괜찮은데 배송이좀문제네요 조립은 쉬웠는데 육각렌치가 너무 힘도없고 채결하는게좀힘들었네요
>> 입력하신 내용의 감정은 슬픔입니다.


하고싶은 말을 입력해주세요 : 다 좋은데 생산을 잘 못 했는지, 배송 문제인지 이음부분이 찌그러져있어서 결합이 안 되길래 함마로 그냥 내리 꽂아서 폈어요 역시 맞아야 해요.
>> 입력하신 내용의 감정은 중립입니다.


하고싶은 말을 입력해주세요 : 특히 여성분 혼자라면!! 저는 전동드릴도있고 원래 조립가구를 좋아하는 편이라 총 20분 내외 정도 걸린것 같습니다. 그래도 조립하고나니 튼튼한 느낌이들어서 저는 마음에 들어욥!
>> 입력하신 내용의 감정은 행복입니다.


하고싶은 말을 입력해주세요 : 디자인이랑 튼튼하긴 튼튼한데 상판도 까져서 왔고 밑에 프레임은 페인트도 벗겨져있더라고요 교환하기엔 책상이 진짜 급해서 안하는데 배송 기사님은 말도 없이 1층 로비에 두고가시고… 디자인 빼면 최악이네요..^^
>> 입력하신 내용의 감정은 분노입니다.


하고싶은 말을 입력해주세요 : 두닷 콰트로 에어가 모서리에 곡선이 유니크하고 예뻐서 선택했어요 마감은 말할것도 없고 조립도 어렵지 않았어요! 깨끗하고 튼튼하고 아주 맘에 듭니다 군더더기 없는 디자인 👍
>> 입력하신 내용의 감정은 행복입니다.


하고싶은 말을 입력해주세요 : quit
감정 분석을 종료합니다.


In [None]:
import joblib
joblib.dump(model, './senti7model2.pkl')