In [1]:
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
import pandas as pd
import numpy as np
from WordPieceTokenizer import WordPieceTokenizer as Tokenizer

# Directory holding the vocabulary file and the sentiment CSVs (trailing slash is required,
# since file names are appended directly to this string).
dataFilePath = 'datasets/'

# Tokenizer built from the vocabulary file, with do_lower_case, strip_accents and
# clean_text all passed as False.
tokenizer = Tokenizer(
    dataFilePath + 'vocab.txt',
    do_lower_case=False,
    strip_accents=False,
    clean_text=False,
)

In [2]:
# Read the training CSV; the first CSV column is used as the DataFrame index.
train_csv_path = dataFilePath + 'sentiment_train.csv'
df = pd.read_csv(train_csv_path, index_col=0)
df.head()

Unnamed: 0,발화,감정,attention_mask,token_type_ids
0,2 19009 13925 10540 16229 5815 13244 3785 2859...,불안,1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 ...,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
1,2 11733 783 12006 5698 6039 23027 3 0 0 0 0 0 ...,불안,1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
2,2 11033 5953 5954 6094 5662 14321 10752 5842 6...,불안,1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 ...,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
3,2 19393 13465 11005 27098 5677 5905 3 0 0 0 0 ...,불안,1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
4,2 17124 12139 5706 6243 5842 6388 15275 3 0 0 ...,불안,1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...


In [3]:
# Replace each emotion label string in the '감정' column with its integer class id.
# The dict's insertion order reproduces the original assignment order (0 through 6).
emotion_to_id = {
    '불안': 0,
    '당황': 1,
    '분노': 2,
    '슬픔': 3,
    '중립': 4,
    '행복': 5,
    '혐오': 6,
}
for emotion_name, class_id in emotion_to_id.items():
    label_matches = df['감정'] == emotion_name
    df.loc[label_matches, '감정'] = class_id

In [4]:
# Use the GPU when one is available; every tensor below is created directly on this device.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Column 0 holds space-separated token ids and column 2 the space-separated attention
# mask; each row is parsed into a list of ints.
tensor_x_list = [[int(piece) for piece in row.split(" ")] for row in df.iloc[:, 0]]
attentions = [[int(piece) for piece in row.split(" ")] for row in df.iloc[:, 2]]

tensor_attention = torch.tensor(attentions, dtype=torch.long, device=device)
tensor_x = torch.tensor(tensor_x_list, dtype=torch.long, device=device)
tensor_t = torch.tensor(df["감정"].values.tolist(), dtype=torch.long, device=device)

# Sanity check: the three tensors should report the same number of rows.
print(len(tensor_attention), len(tensor_x), len(tensor_t), sep="\n")
dataset = TensorDataset(tensor_x, tensor_attention, tensor_t)

# NOTE(review): both loaders wrap the same dataset, so test_loader is not a held-out
# split, and drop_last=True makes it skip the final partial batch — confirm intent.
train_loader = DataLoader(dataset, batch_size=64, shuffle=True, drop_last=True)
test_loader = DataLoader(dataset, batch_size=1000, shuffle=False, drop_last=True)

32696
32696
32696


In [5]:
# Report the number of batches per loader, then the input and target tensor shapes,
# one value per line.
print(
    len(test_loader),
    len(train_loader),
    tensor_x.shape,
    tensor_t.shape,
    sep="\n",
)

32
510
torch.Size([32696, 64])
torch.Size([32696])


In [6]:
from Model import LSTM

# Vocabulary size as reported by the tokenizer; passed to the model as vocab_size.
vocab_size = tokenizer.get_vocab_size()
# NOTE(review): this uses the second dimension of tensor_x (64 in the printed shape,
# i.e. the number of token ids per row) as the embedding width — confirm that tying the
# embedding size to the sequence length is intentional.
embedding_dim = tensor_x.shape[1]

In [8]:
# Build the classifier (2 layers, bidirectional, 7 output classes) and move it to the
# same device as the data tensors.
NN = LSTM(vocab_size=vocab_size,embedding_dim=embedding_dim,hidden_dim=10,output_dim=7,n_layers=2,bidirectional=True,dropout_p=0.1)
NN.to(device)
loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(NN.parameters(),lr=0.001)
epoch = 100  # maximum number of epochs
acc = 0
prev_acc = 0  # best accuracy seen so far
# Epochs since the last accuracy improvement. It must be initialised here: the loop
# below increments it, which would raise NameError on a fresh kernel otherwise.
cnt = 0
patience = 5  # stop once accuracy has failed to improve this many epochs in a row

for e in range(epoch):
    # ---- training pass ----
    loss_sum = 0
    NN.train()
    for x, attention, t in train_loader:
        y = NN(x, attention)
        loss = loss_function(y, t)
        loss_sum += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    # Mean loss per batch for this epoch.
    loss_sum /= len(train_loader)

    # ---- evaluation pass ----
    # NOTE(review): accuracy is measured on train_loader, so early stopping tracks
    # training accuracy rather than a held-out set — consider a validation split.
    NN.eval()
    correct = 0
    total = 0
    # No gradients are needed for evaluation; no_grad avoids building autograd graphs.
    with torch.no_grad():
        for x, attention, t in train_loader:
            y = NN(x, attention)
            correct += (y.argmax(dim=-1) == t).sum().item()
            total += len(x)
    acc = correct / total

    # Early stopping: reset the counter on improvement, otherwise count the stall.
    if acc <= prev_acc:
        cnt += 1
    else:
        cnt = 0
        prev_acc = acc
    print(f"\tepoch   {e+1}\t\tloss {loss_sum:.12f}\tacc {acc:.4f}\tcnt {cnt}")
    if cnt >= patience:
        print("train halted")
        break
print("---------- 학습 종료 ----------")

	epoch   1		loss 1.719350016351	acc 0.3947	cnt 0
	epoch   2		loss 1.454423050086	acc 0.4671	cnt 0
	epoch   3		loss 1.331133266056	acc 0.5151	cnt 0
	epoch   4		loss 1.251220748238	acc 0.5504	cnt 0
	epoch   5		loss 1.190830593016	acc 0.5744	cnt 0
	epoch   6		loss 1.142400441918	acc 0.5969	cnt 0
	epoch   7		loss 1.094172661094	acc 0.6137	cnt 0
	epoch   8		loss 1.053157565874	acc 0.6397	cnt 0
	epoch   9		loss 1.019314868193	acc 0.6607	cnt 0
	epoch   10		loss 0.988265403112	acc 0.6756	cnt 0
	epoch   11		loss 0.952950001114	acc 0.6907	cnt 0
	epoch   12		loss 0.918875561158	acc 0.7059	cnt 0
	epoch   13		loss 0.888095259666	acc 0.7165	cnt 0
	epoch   14		loss 0.863135629425	acc 0.7347	cnt 0
	epoch   15		loss 0.843000507472	acc 0.7454	cnt 0
	epoch   16		loss 0.811209302907	acc 0.7547	cnt 0
	epoch   17		loss 0.791326062001	acc 0.7691	cnt 0
	epoch   18		loss 0.768125559886	acc 0.7692	cnt 0
	epoch   19		loss 0.744674311900	acc 0.7892	cnt 0
	epoch   20		loss 0.724641959924	acc 0.7995	cnt 0
	epoch   

In [9]:
# Put the model in evaluation mode before serialising it.
NN.eval()
# torch.save is given the whole module object here (not a state_dict), so loading
# "Sentiment.pt" later requires the model's class definition to be importable.
# NOTE(review): assuming LSTM is an nn.Module, NN.to('cpu') moves NN itself to the CPU,
# so NN no longer lives on `device` after this cell — confirm before reusing it on GPU data.
torch.save(NN.to('cpu'),"Sentiment.pt")