In [1]:
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
import pandas as pd
import numpy as np
from WordPieceTokenizer import WordPieceTokenizer as Tokenizer
from sklearn.model_selection import train_test_split
from tqdm import tqdm


# Directory prefixes (trailing slash included) for input data and for outputs.
dataFilePath = "datasets/"
saveFilePath = "saves/"

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# WordPiece tokenizer built from the sentiment vocabulary file, with
# lower-casing, accent stripping and text cleaning all switched off.
tokenizer = Tokenizer(
    dataFilePath + "sentiment_vocab/vocab.txt",
    do_lower_case=False,
    strip_accents=False,
    clean_text=False,
)

In [2]:
# Load the pre-tokenised training table; the first CSV column becomes the index.
df = pd.read_csv(dataFilePath + "sentiment_train.csv", index_col=0)
# Preview the first rows.
df.head()

Unnamed: 0,발화,감정,str_len,attention_mask,token_type_ids
0,2 2376 2347 1993 10402 1081 2699 16526 1118 3 ...,불안,24,1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 ...,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
1,2 2083 190 2413 18381 2145 3 0 0 0 0 0 0 0 0 0...,불안,12,1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
2,2 2097 4679 1383 18349 1063 5411 1135 3 0 0 0 ...,불안,14,1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
3,2 10126 21697 2664 18111 2596 3 0 0 0 0 0 0 0 ...,불안,13,1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
4,2 2246 2907 10621 1155 1461 2670 3 0 0 0 0 0 0...,불안,11,1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...


In [3]:
# Encode the emotion labels in the '감정' column as integer class ids, in place.
# Label order: anxiety, embarrassment, anger, sadness, neutral, happiness, disgust.
EMOTION_TO_ID = {
    '불안': 0,
    '당황': 1,
    '분노': 2,
    '슬픔': 3,
    '중립': 4,
    '행복': 5,
    '혐오': 6,
}
for emotion_name, class_id in EMOTION_TO_ID.items():
    df.loc[df['감정'] == emotion_name, '감정'] = class_id

# Fail fast on anything the table above does not cover; otherwise the stray
# value only surfaces later as an obscure tensor-conversion error.
# Re-running this cell is safe: already-encoded ids pass the check unchanged.
unmapped_labels = set(df['감정'].unique()) - set(EMOTION_TO_ID.values())
if unmapped_labels:
    raise ValueError(f"unmapped values in '감정': {unmapped_labels}")

In [4]:
def _parse_id_sequence(cell):
    """Turn a space-separated string of integers into a list of ints."""
    return [int(piece) for piece in cell.split(" ")]


def process_dataframe(data_frame, device, batch_size, shuffle=False, drop_last=True):
    """Convert a pre-tokenised data frame into a batched ``DataLoader``.

    Columns are read by position: column 0 must hold the token ids and
    column 3 the attention mask, each as a space-separated string of
    integers. Every row must contain the same number of ids so the rows can
    be stacked into one tensor. Class labels are read from the "감정" column
    and must already be integers.

    All three tensors are created directly on ``device``, i.e. the whole
    data set is materialised there up front.

    Args:
        data_frame: Frame laid out as described above.
        device: Device (name or ``torch.device``) on which tensors are created.
        batch_size: Number of rows per batch.
        shuffle: Whether the loader reshuffles the rows every epoch.
        drop_last: Whether to drop a trailing batch smaller than
            ``batch_size``. Defaults to ``True`` (the historical behaviour);
            pass ``False`` for evaluation so that every row is scored.

    Returns:
        A ``DataLoader`` yielding ``(token_ids, attention_mask, labels)``
        batches of ``torch.long`` tensors.
    """
    token_rows = []
    mask_rows = []
    # Walk both columns once instead of doing a per-cell ``iloc[i, j]`` lookup.
    cells = zip(data_frame.iloc[:, 0], data_frame.iloc[:, 3])
    for token_cell, mask_cell in tqdm(cells, total=len(data_frame)):
        token_rows.append(_parse_id_sequence(token_cell))
        mask_rows.append(_parse_id_sequence(mask_cell))

    tensor_x = torch.tensor(token_rows, dtype=torch.long, device=device)
    tensor_attention = torch.tensor(mask_rows, dtype=torch.long, device=device)
    tensor_t = torch.tensor(data_frame["감정"].values.tolist(), dtype=torch.long, device=device)

    dataset = TensorDataset(tensor_x, tensor_attention, tensor_t)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last)

In [5]:
# Fixed seed so the 90/10 split, and with it the reported validation accuracy,
# is the same after every kernel restart.
SPLIT_SEED = 42
train_df, val_df = train_test_split(df, train_size=0.9, test_size=0.1, random_state=SPLIT_SEED)

print(f"학습 세트의 크기: {len(train_df)} 행")
print(f"검증 세트의 크기: {len(val_df)} 행")

# Training batches (200 rows) are reshuffled every epoch; validation batches
# (1000 rows) keep a fixed order.
# NOTE: process_dataframe builds its loaders with drop_last=True by default,
# so rows that do not fill a final batch are skipped.
train_loader = process_dataframe(train_df, device, 200, True)
val_loader = process_dataframe(val_df, device, 1000, False)

학습 세트의 크기: 131713 행
검증 세트의 크기: 14635 행


100%|███████████████████████████████████████████████████████████████████████| 131713/131713 [00:04<00:00, 26852.37it/s]
100%|█████████████████████████████████████████████████████████████████████████| 14635/14635 [00:00<00:00, 29336.94it/s]


In [6]:
# Model classes come from the project-local Model module.
from Model import LSTM, PositionalEncoding, Transformer

# Vocabulary size reported by the tokenizer; passed to the models as vocab_size.
vocab_size = tokenizer.get_vocab_size()
# Embedding width; in the cells shown here it is referenced only by the
# commented-out LSTM configuration.
embedding_dim = 250

## LSTM

In [8]:
def LSTM_Train(epoch,device,train_loader,val_loader,NN,loss_function,optimizer,patience=5,save_path="Sentiment.pt"):
    """Train ``NN`` with early stopping on validation accuracy.

    Each epoch runs one pass over ``train_loader`` followed by an accuracy
    measurement on ``val_loader``. Whenever the validation accuracy strictly
    improves on the best value seen so far, the model's ``state_dict`` is
    written to ``save_path``. Training stops once ``patience`` consecutive
    epochs bring no improvement, or after ``epoch`` epochs.

    Args:
        epoch: Maximum number of epochs to run.
        device: Device the model and every batch are moved to.
        train_loader: Iterable of ``(x, attention, t)`` training batches.
        val_loader: Iterable of ``(x, attention, t)`` validation batches.
        NN: Model called as ``NN(x, attention)``; its output is passed to
            ``loss_function`` and arg-maxed over the last dimension.
        loss_function: Callable ``loss_function(y, t)`` returning a scalar loss.
        optimizer: Optimizer that updates the parameters of ``NN``.
        patience: Number of consecutive non-improving epochs before halting.
        save_path: File the best ``state_dict`` is saved to.

    Returns:
        None. Progress is printed once per epoch.
    """
    best_acc = 0
    stale_epochs = 0
    # The model only needs to be moved once, not at the start of every epoch.
    NN.to(device)
    for e in range(epoch):
        loss_sum = 0
        NN.train()
        for x, attention, t in train_loader:
            # Same device handling as the validation loop below.
            x = x.to(device)
            attention = attention.to(device)
            t = t.to(device)

            y = NN(x, attention)
            loss = loss_function(y, t)
            loss_sum += loss.item()

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        # A loader can be empty (e.g. fewer rows than one batch with
        # drop_last=True); avoid dividing by zero in that case.
        loss_sum /= max(len(train_loader), 1)

        NN.eval()
        correct = 0
        total = 0
        with torch.no_grad():
            for x, attention, t in val_loader:
                x = x.to(device)
                attention = attention.to(device)
                t = t.to(device)

                y = NN(x, attention)
                correct += (y.argmax(dim=-1) == t).sum().item()
                total += len(x)
        # Report 0 accuracy instead of crashing when nothing was validated.
        acc = correct / total if total else 0.0

        if acc <= best_acc:
            stale_epochs += 1
        else:
            # Strict improvement: checkpoint and reset the patience counter.
            torch.save(NN.state_dict(), save_path)
            stale_epochs = 0
            best_acc = acc

        print(f"epoch  {e+1}\t\tloss {loss_sum:.12f}\tacc {acc:.4f}\tcnt {stale_epochs}")

        if stale_epochs >= patience:
            print("train halted")
            break

    print("---------- 학습 종료 ----------")

In [9]:
# NN = LSTM(vocab_size=vocab_size,embedding_dim=embedding_dim,hidden_dim=64,output_dim=7,n_layers=4,bidirectional=True,dropout_p=0.1)
# NN.to(device)
# loss_function = nn.CrossEntropyLoss()
# optimizer = torch.optim.Adam(NN.parameters(),lr=0.001)
# epoch = 500
# LSTM_Train(epoch,device,train_loader,val_loader,NN,loss_function,optimizer)

## Transformer

In [11]:
def Transformer_Train(epoch, device, train_loader, val_loader, NN, loss_function, optimizer, patience=5, save_path="Sentiment.pt"):
    """Train ``NN`` with early stopping on validation accuracy, with progress bars.

    Each epoch runs one pass over ``train_loader`` followed by an accuracy
    measurement on ``val_loader``; both passes are wrapped in a ``tqdm``
    progress bar. Whenever the validation accuracy strictly improves on the
    best value seen so far, the model's ``state_dict`` is written to
    ``save_path``. Training stops once ``patience`` consecutive epochs bring
    no improvement, or after ``epoch`` epochs.

    Args:
        epoch: Maximum number of epochs to run.
        device: Device the model and every batch are moved to.
        train_loader: Iterable of ``(x, attention, t)`` training batches.
        val_loader: Iterable of ``(x, attention, t)`` validation batches.
        NN: Model called as ``NN(x, attention)``; its output is passed to
            ``loss_function`` and arg-maxed over the last dimension.
        loss_function: Callable ``loss_function(y, t)`` returning a scalar loss.
        optimizer: Optimizer that updates the parameters of ``NN``.
        patience: Number of consecutive non-improving epochs before halting.
        save_path: File the best ``state_dict`` is saved to.

    Returns:
        None. Progress is printed once per epoch.
    """
    best_acc = 0
    stale_epochs = 0
    # The model only needs to be moved once, not at the start of every epoch.
    NN.to(device)
    for e in range(epoch):
        loss_sum = 0
        NN.train()
        for x, attention, t in tqdm(train_loader, desc=f"Epoch {e+1} Training", leave=False):
            x = x.to(device)
            attention = attention.to(device)
            t = t.to(device)

            y = NN(x, attention)
            loss = loss_function(y, t)
            loss_sum += loss.item()

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        # A loader can be empty (e.g. fewer rows than one batch with
        # drop_last=True); avoid dividing by zero in that case.
        loss_sum /= max(len(train_loader), 1)

        NN.eval()
        correct = 0
        total = 0
        with torch.no_grad():
            for x, attention, t in tqdm(val_loader, desc=f"Epoch {e+1} Validation", leave=False):
                x = x.to(device)
                attention = attention.to(device)
                t = t.to(device)

                y = NN(x, attention)
                correct += (y.argmax(dim=-1) == t).sum().item()
                total += len(x)
        # Report 0 accuracy instead of crashing when nothing was validated.
        acc = correct / total if total else 0.0

        if acc <= best_acc:
            stale_epochs += 1
        else:
            # Strict improvement: checkpoint and reset the patience counter.
            torch.save(NN.state_dict(), save_path)
            stale_epochs = 0
            best_acc = acc

        print(f"epoch   {e+1}\t\tloss {loss_sum:.12f}\tacc {acc:.4f}\tcnt {stale_epochs}")

        if stale_epochs >= patience:
            print("train halted")
            break

    print("---------- 학습 종료 ----------")

In [12]:
# Transformer classifier over the 7 emotion classes.
NN = Transformer(
    vocab_size=vocab_size,
    embedding_dim=768,
    hidden_dim=64,
    output_dim=7,
    n_layers=6,
    n_heads=12,
    dropout_p=0.05,
    max_len=150,
    pad_token_id=0,
)
# Move the model to the target device before building the optimizer, as the
# torch.optim documentation recommends, so the optimizer is constructed over
# the parameters as they will be trained.
NN.to(device)
loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(NN.parameters(), lr=1e-5)
# Upper bound only: Transformer_Train halts after 5 consecutive epochs
# without a validation-accuracy improvement.
epoch = 500
Transformer_Train(epoch, device, train_loader, val_loader, NN, loss_function, optimizer)

  output = torch._nested_tensor_from_mask(
                                                                                                                       

epoch   1		loss 1.536330958630	acc 0.4764	cnt 0


                                                                                                                       

epoch   2		loss 1.365047575855	acc 0.5302	cnt 0


                                                                                                                       

epoch   3		loss 1.275952741911	acc 0.5538	cnt 0


                                                                                                                       

epoch   4		loss 1.219147753208	acc 0.5686	cnt 0


                                                                                                                       

epoch   5		loss 1.175461669823	acc 0.5750	cnt 0


                                                                                                                       

epoch   6		loss 1.138608088428	acc 0.5832	cnt 0


                                                                                                                       

epoch   7		loss 1.106401892328	acc 0.5848	cnt 0


                                                                                                                       

epoch   8		loss 1.074751300381	acc 0.5930	cnt 0


                                                                                                                       

epoch   9		loss 1.047307923029	acc 0.5996	cnt 0


                                                                                                                       

epoch   10		loss 1.021415907833	acc 0.6057	cnt 0


                                                                                                                       

epoch   11		loss 0.994525075683	acc 0.6065	cnt 0


                                                                                                                       

epoch   12		loss 0.974160185248	acc 0.6109	cnt 0


                                                                                                                       

epoch   13		loss 0.948612480359	acc 0.6136	cnt 0


                                                                                                                       

epoch   14		loss 0.926921807886	acc 0.6114	cnt 1


                                                                                                                       

epoch   15		loss 0.904746250360	acc 0.6149	cnt 0


                                                                                                                       

epoch   16		loss 0.883878176879	acc 0.6169	cnt 0


                                                                                                                       

epoch   17		loss 0.864082234428	acc 0.6203	cnt 0


                                                                                                                       

epoch   18		loss 0.843464643488	acc 0.6129	cnt 1


                                                                                                                       

epoch   19		loss 0.824579210176	acc 0.6189	cnt 2


                                                                                                                       

epoch   20		loss 0.806978945131	acc 0.6167	cnt 3


                                                                                                                       

epoch   21		loss 0.785241717444	acc 0.6149	cnt 4


                                                                                                                       

epoch   22		loss 0.766921211159	acc 0.6176	cnt 5
train halted
---------- 학습 종료 ----------




In [17]:
# model = torch.load("Sentiment.pt",weights_only=False)
# torch.save(model,f"{saveFilePath}train_15.pt")