In [1]:
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
import pandas as pd
import numpy as np
from WordPieceTokenizer import WordPieceTokenizer as Tokenizer
from sklearn.model_selection import train_test_split
from tqdm import tqdm


dataFilePath = 'datasets/'
saveFilePath = 'saves/'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = Tokenizer(f'{dataFilePath}vocab.txt',do_lower_case=False,strip_accents=False,clean_text=False)

In [2]:
df = pd.read_csv(f'{dataFilePath}sentiment_train.csv',index_col=0)
df.head()

Unnamed: 0,발화,감정,str_len,attention_mask,token_type_ids
0,2 22 15163 5775 5784 10815 5784 15645 22483 13...,불안,24,1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ...,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
1,2 22 30858 21623 16553 5776 14727 5775 5783 13...,불안,12,1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ...,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
2,2 22 18064 21623 14268 5780 5776 14268 5780 58...,불안,14,1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ...,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
3,2 22 13605 27417 10815 21884 5780 18064 5775 5...,불안,13,1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ...,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
4,2 22 10850 14633 5883 29724 24527 14219 5775 5...,불안,11,1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ...,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...


In [3]:
df.loc[(df['감정'] == '불안'),'감정'] = 0
df.loc[(df['감정'] == '당황'),'감정'] = 1
df.loc[(df['감정'] == '분노'),'감정'] = 2
df.loc[(df['감정'] == '슬픔'),'감정'] = 3
df.loc[(df['감정'] == '중립'),'감정'] = 4
df.loc[(df['감정'] == '행복'),'감정'] = 5
df.loc[(df['감정'] == '혐오'),'감정'] = 6

In [4]:
def process_dataframe(data_frame, device,batch_size,shuffle=False):

    tensor_x_list = []
    attentions = []
    for i in tqdm(range(len(data_frame))):
        token = data_frame.iloc[i,0]
        token = token.split(" ")
        token_list = []
        for t in token:
            token_list.append(int(t))
        tensor_x_list.append(token_list)
        
        attention = data_frame.iloc[i,3]
        attention = attention.split(" ")
        attention_list = []
        for a in attention:
            attention_list.append(int(a))
        attentions.append(attention_list)

    tensor_x = torch.tensor(tensor_x_list, dtype=torch.long, device=device)
    tensor_attention = torch.tensor(attentions, dtype=torch.long, device=device)
    tensor_t = torch.tensor(data_frame["감정"].values.tolist(), dtype=torch.long, device=device)

    print(len(tensor_x))
    print(len(tensor_attention))
    print(len(tensor_t))
    
    dataset = TensorDataset(tensor_x,tensor_attention,tensor_t)

    loader = DataLoader(dataset,batch_size=batch_size,shuffle=shuffle,drop_last=True)
    
    return loader

In [9]:
train_df, test_df = train_test_split(df,train_size=0.9,test_size=0.1)

print(f"학습 세트의 크기: {len(train_df)} 행")
print(f"검증 세트의 크기: {len(test_df)} 행")

train_loader = process_dataframe(train_df,device,200,True)
test_loader = process_dataframe(test_df,device,1000,False)

학습 세트의 크기: 131713 행
검증 세트의 크기: 14635 행


100%|███████████████████████████████████████████████████████████████████████| 131713/131713 [00:08<00:00, 15944.05it/s]


131713
131713
131713


100%|█████████████████████████████████████████████████████████████████████████| 14635/14635 [00:00<00:00, 17409.66it/s]


14635
14635
14635


In [15]:
from Model import LSTM

vocab_size = tokenizer.get_vocab_size()
embedding_dim = 200

In [31]:
NN = LSTM(vocab_size=vocab_size,embedding_dim=embedding_dim,hidden_dim=10,output_dim=7,n_layers=2,bidirectional=True,dropout_p=0.1)
NN.to(device)
loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(NN.parameters(),lr=0.001)
epoch = 500
acc = 0
prev_acc = 0

for e in range(epoch):
    NN.to(device)
    loss_sum = 0
    NN.train()
    for x, attention,t in train_loader:
        y = NN(x,attention)
        loss = loss_function(y,t)
        loss_sum += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    loss_sum /= len(train_loader)

    NN.eval()
    correct = 0
    total = 0
    for x,attention,t in train_loader:
        y = NN(x,attention)
        correct += (y.argmax(dim = -1) == t).sum().item()
        total += len(x)
    acc = correct / total

    if acc <= prev_acc:
        cnt += 1
    else :
        torch.save(NN.to('cpu'),"Sentiment.pt")
        cnt = 0
        prev_acc = acc
    print(f"\tepoch   {e+1}\t\tloss {loss_sum:.12f}\tacc {acc:.4f}\tcnt {cnt}")
    if cnt >= 5:
        print("train halted")
        break
print("---------- 학습 종료 ----------")

	epoch   1		loss 1.587521802872	acc 0.4457	cnt 0
	epoch   2		loss 1.498288842320	acc 0.4563	cnt 0
	epoch   3		loss 1.472316711145	acc 0.4601	cnt 0
	epoch   4		loss 1.446897592226	acc 0.4743	cnt 0
	epoch   5		loss 1.421060779718	acc 0.4878	cnt 0
	epoch   6		loss 1.397209973502	acc 0.4729	cnt 1
	epoch   7		loss 1.378084034362	acc 0.5079	cnt 0
	epoch   8		loss 1.360398544367	acc 0.5185	cnt 0
	epoch   9		loss 1.345776856489	acc 0.5188	cnt 0
	epoch   10		loss 1.332008816189	acc 0.5272	cnt 0
	epoch   11		loss 1.318728181185	acc 0.5396	cnt 0
	epoch   12		loss 1.307750316195	acc 0.5420	cnt 0
	epoch   13		loss 1.299310188525	acc 0.5484	cnt 0
	epoch   14		loss 1.285288080075	acc 0.5472	cnt 1
	epoch   15		loss 1.278482861794	acc 0.5516	cnt 0
	epoch   16		loss 1.268268342257	acc 0.5577	cnt 0
	epoch   17		loss 1.262701432031	acc 0.5601	cnt 0
	epoch   18		loss 1.253973746372	acc 0.5649	cnt 0
	epoch   19		loss 1.247832800480	acc 0.5642	cnt 1
	epoch   20		loss 1.239673898213	acc 0.5677	cnt 0
	epoch   

In [33]:
model = torch.load("Sentiment.pt",weights_only=False)
torch.save(model,f"{saveFilePath}train_3.pt")