In [1]:
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
import pandas as pd
import numpy as np
from WordPieceTokenizer import WordPieceTokenizer as Tokenizer
from sklearn.model_selection import train_test_split
from tqdm import tqdm


# Directory prefixes (each ends with a slash) used to build dataset and
# checkpoint paths throughout the notebook.
dataFilePath = 'datasets/'
saveFilePath = 'saves/'

# Run on the GPU when CUDA is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Tokenizer built from the sentiment vocabulary file; the three text
# normalisation flags are all passed as False.
tokenizer = Tokenizer(
    f'{dataFilePath}sentiment_vocab/vocab.txt',
    do_lower_case=False,
    strip_accents=False,
    clean_text=False,
)

In [2]:
# Read the pre-tokenised training set; the first CSV column becomes the index.
df = pd.read_csv(
    f'{dataFilePath}sentiment_train.csv',
    index_col=0,
)
df.head()

Unnamed: 0,발화,감정,str_len,attention_mask,token_type_ids
0,2 2376 2347 1993 10402 1051 2699 16526 1047 3 ...,불안,24,1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 ...,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
1,2 2083 189 2413 18370 2145 3 0 0 0 0 0 0 0 0 0...,불안,12,1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
2,2 2097 4680 1398 18353 1037 5411 1191 3 0 0 0 ...,불안,14,1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
3,2 10126 21684 2664 18111 2596 3 0 0 0 0 0 0 0 ...,불안,13,1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
4,2 2246 2907 10620 1011 1271 2670 3 0 0 0 0 0 0...,불안,11,1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...


In [3]:
# Integer class id for every emotion label expected in the '감정' column.
emotion_to_id = {
    '불안': 0,
    '당황': 1,
    '분노': 2,
    '슬픔': 3,
    '중립': 4,
    '행복': 5,
    '혐오': 6,
}

# Encode the whole column in one pass. Values that are not keys of the mapping
# (for example ids written by an earlier run of this cell) pass through
# unchanged, so re-running the cell is harmless.
df['감정'] = df['감정'].map(lambda label: emotion_to_id.get(label, label))

# Fail here, with a readable message, rather than later when the labels are
# converted to a tensor.
unmapped = set(df['감정'].unique()) - set(emotion_to_id.values())
if unmapped:
    raise ValueError(f"unexpected labels in '감정': {sorted(unmapped, key=str)}")

In [4]:
def process_dataframe(data_frame, device, batch_size, shuffle=False, drop_last=True):
    """Convert a tokenised sentiment frame into a ``DataLoader`` of tensors.

    Args:
        data_frame: Frame whose column at position 0 holds whitespace-separated
            token ids, whose column at position 3 holds the
            whitespace-separated attention mask, and which has a ``"감정"``
            column of integer class ids.
        device: Device on which the three tensors are created.
        batch_size: Number of rows per batch.
        shuffle: Forwarded to ``DataLoader``.
        drop_last: Forwarded to ``DataLoader``. Defaults to ``True``, which is
            what this function has always used; pass ``False`` for evaluation
            loaders so the final partial batch is not discarded.

    Returns:
        A ``DataLoader`` yielding ``(token_ids, attention_mask, label)``
        batches of ``torch.long`` tensors.

    Raises:
        ValueError: If a field contains a piece that is not an integer, or if
            the rows do not all have the same number of ids.
    """
    def _parse_ids(field):
        # split() without an argument tolerates repeated/trailing whitespace,
        # which split(" ") would turn into empty, unparsable pieces.
        return [int(piece) for piece in field.split()]

    token_rows = []
    mask_rows = []
    # Walk the two columns side by side instead of doing a scalar .iloc
    # lookup for every cell.
    row_iter = zip(data_frame.iloc[:, 0], data_frame.iloc[:, 3])
    for token_field, mask_field in tqdm(row_iter, total=len(data_frame)):
        token_rows.append(_parse_ids(token_field))
        mask_rows.append(_parse_ids(mask_field))

    tensor_x = torch.tensor(token_rows, dtype=torch.long, device=device)
    tensor_attention = torch.tensor(mask_rows, dtype=torch.long, device=device)
    tensor_t = torch.tensor(data_frame["감정"].values.tolist(), dtype=torch.long, device=device)

    dataset = TensorDataset(tensor_x, tensor_attention, tensor_t)

    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last)

In [5]:
# Fixed seed so the 90/10 split, and therefore the reported validation
# accuracy, is the same on every run of the notebook.
SPLIT_SEED = 42
train_df, val_df = train_test_split(
    df,
    train_size=0.9,
    test_size=0.1,
    random_state=SPLIT_SEED,
)

print(f"학습 세트의 크기: {len(train_df)} 행")
print(f"검증 세트의 크기: {len(val_df)} 행")

# Training batches of 100 are shuffled; validation uses batches of 1000 in a
# fixed order.
# NOTE: process_dataframe builds its DataLoader with drop_last=True, so
# validation rows beyond the last full batch of 1000 are not scored.
train_loader = process_dataframe(train_df, device, 100, True)
val_loader = process_dataframe(val_df, device, 1000, False)

학습 세트의 크기: 131713 행
검증 세트의 크기: 14635 행


100%|████████████████████████████████████████████████████████████████████████| 131713/131713 [00:16<00:00, 8091.36it/s]
100%|██████████████████████████████████████████████████████████████████████████| 14635/14635 [00:01<00:00, 8431.57it/s]


In [7]:
# Model classes come from the project-local Model module.
from Model import (
    LSTM,
    PositionalEncoding,
    Transformer,
)

# Vocabulary size as reported by the tokenizer.
vocab_size = tokenizer.get_vocab_size()
# Embedding width; in the cells shown here it is referenced only by the
# commented-out LSTM run.
embedding_dim = 250

## LSTM

In [8]:
def LSTM_Train(epoch, device, train_loader, val_loader, NN, loss_function, optimizer,
               patience=5, save_path="Sentiment.pt"):
    """Train ``NN`` with early stopping on validation accuracy.

    Each epoch runs one pass over ``train_loader``, then measures accuracy on
    ``val_loader``. When the accuracy is strictly higher than the best seen so
    far (starting from 0), the model's ``state_dict`` is written to
    ``save_path``; otherwise a counter of non-improving epochs is incremented.

    Args:
        epoch: Maximum number of epochs.
        device: Device the model and every batch are moved to.
        train_loader: Iterable of ``(x, attention, t)`` training batches.
        val_loader: Iterable of ``(x, attention, t)`` validation batches.
        NN: Model called as ``NN(x, attention)``.
        loss_function: Callable taking ``(y, t)`` and returning the loss.
        optimizer: Optimizer over ``NN``'s parameters.
        patience: Number of consecutive non-improving epochs after which
            training stops.
        save_path: Where the best ``state_dict`` is saved.

    Returns:
        None. Progress is printed once per epoch.
    """
    best_acc = 0
    stale_epochs = 0
    # One move is enough; nothing takes the model off the device between epochs.
    NN.to(device)
    for e in range(epoch):
        loss_sum = 0
        num_batches = 0
        NN.train()
        for x, attention, t in train_loader:
            # Same device handling as the validation loop, so loaders that
            # yield CPU tensors work too.
            x = x.to(device)
            attention = attention.to(device)
            t = t.to(device)

            y = NN(x, attention)
            loss = loss_function(y, t)
            loss_sum += loss.item()

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            num_batches += 1
        # An empty loader reports a loss of 0 instead of dividing by zero.
        if num_batches:
            loss_sum /= num_batches

        NN.eval()
        correct = 0
        total = 0
        with torch.no_grad():
            for x, attention, t in val_loader:
                x = x.to(device)
                attention = attention.to(device)
                t = t.to(device)

                y = NN(x, attention)
                correct += (y.argmax(dim=-1) == t).sum().item()
                total += len(x)
        # A validation loader with no batches scores 0 rather than raising
        # ZeroDivisionError.
        acc = correct / total if total else 0.0

        if acc <= best_acc:
            stale_epochs += 1
        else:
            torch.save(NN.state_dict(), save_path)
            stale_epochs = 0
            best_acc = acc

        print(f"epoch  {e+1}\t\tloss {loss_sum:.12f}\tacc {acc:.4f}\tcnt {stale_epochs}")

        if stale_epochs >= patience:
            print("train halted")
            break

    print("---------- 학습 종료 ----------")

In [9]:
# Disabled LSTM training run; uncomment every line below to train the LSTM model.
# NN = LSTM(vocab_size=vocab_size,embedding_dim=embedding_dim,hidden_dim=64,output_dim=7,n_layers=4,bidirectional=True,dropout_p=0.1)
# NN.to(device)
# loss_function = nn.CrossEntropyLoss()
# optimizer = torch.optim.Adam(NN.parameters(),lr=0.001)
# epoch = 500
# LSTM_Train(epoch,device,train_loader,val_loader,NN,loss_function,optimizer)

## Transformer

In [10]:
def Transformer_Train(epoch, device, train_loader, val_loader, NN, loss_function, optimizer,
                      patience=5, save_path="Sentiment.pt"):
    """Train ``NN`` with progress bars and early stopping on validation accuracy.

    Each epoch runs one pass over ``train_loader``, then measures accuracy on
    ``val_loader``. When the accuracy is strictly higher than the best seen so
    far (starting from 0), the model's ``state_dict`` is written to
    ``save_path``; otherwise a counter of non-improving epochs is incremented.

    Args:
        epoch: Maximum number of epochs.
        device: Device the model and every batch are moved to.
        train_loader: Iterable of ``(x, attention, t)`` training batches.
        val_loader: Iterable of ``(x, attention, t)`` validation batches.
        NN: Model called as ``NN(x, attention)``.
        loss_function: Callable taking ``(y, t)`` and returning the loss.
        optimizer: Optimizer over ``NN``'s parameters.
        patience: Number of consecutive non-improving epochs after which
            training stops.
        save_path: Where the best ``state_dict`` is saved.

    Returns:
        None. Progress is printed once per epoch.
    """
    best_acc = 0
    stale_epochs = 0
    # One move is enough; nothing takes the model off the device between epochs.
    NN.to(device)
    for e in range(epoch):
        loss_sum = 0
        num_batches = 0
        NN.train()
        for x, attention, t in tqdm(train_loader, desc=f"Epoch {e+1} Training"):
            x = x.to(device)
            attention = attention.to(device)
            t = t.to(device)

            y = NN(x, attention)
            loss = loss_function(y, t)
            loss_sum += loss.item()

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            num_batches += 1
        # An empty loader reports a loss of 0 instead of dividing by zero.
        if num_batches:
            loss_sum /= num_batches

        NN.eval()
        correct = 0
        total = 0
        with torch.no_grad():
            for x, attention, t in tqdm(val_loader, desc=f"Epoch {e+1} Validation"):
                x = x.to(device)
                attention = attention.to(device)
                t = t.to(device)

                y = NN(x, attention)
                correct += (y.argmax(dim=-1) == t).sum().item()
                total += len(x)
        # A validation loader with no batches scores 0 rather than raising
        # ZeroDivisionError.
        acc = correct / total if total else 0.0

        if acc <= best_acc:
            stale_epochs += 1
        else:
            torch.save(NN.state_dict(), save_path)
            stale_epochs = 0
            best_acc = acc

        print(f"epoch   {e+1}\t\tloss {loss_sum:.12f}\tacc {acc:.4f}\tcnt {stale_epochs}")

        if stale_epochs >= patience:
            print("train halted")
            break

    print("---------- 학습 종료 ----------")

In [16]:
# Transformer classifier with one output per emotion class (7 in total).
NN = Transformer(
    vocab_size=vocab_size,
    embedding_dim=256,
    hidden_dim=16,
    output_dim=7,
    n_layers=4,
    n_heads=16,
    dropout_p=0.1,
    max_len=250,
    pad_token_id=0,
)
loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(NN.parameters(), lr=0.001)
# Upper bound only: Transformer_Train stops after 5 consecutive epochs
# without a new best validation accuracy.
epoch = 500
Transformer_Train(epoch, device, train_loader, val_loader, NN, loss_function, optimizer)

Epoch 1 Training: 100%|████████████████████████████████████████████████████████████| 1317/1317 [03:46<00:00,  5.81it/s]
Epoch 1 Validation: 100%|██████████████████████████████████████████████████████████████| 14/14 [00:00<00:00, 21.25it/s]


epoch   1		loss 1.371431331971	acc 0.5767	cnt 0


Epoch 2 Training: 100%|████████████████████████████████████████████████████████████| 1317/1317 [03:46<00:00,  5.81it/s]
Epoch 2 Validation: 100%|██████████████████████████████████████████████████████████████| 14/14 [00:00<00:00, 20.53it/s]


epoch   2		loss 1.160224917176	acc 0.5956	cnt 0


Epoch 3 Training: 100%|████████████████████████████████████████████████████████████| 1317/1317 [03:46<00:00,  5.83it/s]
Epoch 3 Validation: 100%|██████████████████████████████████████████████████████████████| 14/14 [00:00<00:00, 20.88it/s]


epoch   3		loss 1.063455206613	acc 0.6102	cnt 0


Epoch 4 Training: 100%|████████████████████████████████████████████████████████████| 1317/1317 [03:46<00:00,  5.82it/s]
Epoch 4 Validation: 100%|██████████████████████████████████████████████████████████████| 14/14 [00:00<00:00, 21.51it/s]


epoch   4		loss 1.000909667850	acc 0.6222	cnt 0


Epoch 5 Training: 100%|████████████████████████████████████████████████████████████| 1317/1317 [03:46<00:00,  5.82it/s]
Epoch 5 Validation: 100%|██████████████████████████████████████████████████████████████| 14/14 [00:00<00:00, 20.61it/s]


epoch   5		loss 0.964767390838	acc 0.6279	cnt 0


Epoch 6 Training: 100%|████████████████████████████████████████████████████████████| 1317/1317 [03:46<00:00,  5.82it/s]
Epoch 6 Validation: 100%|██████████████████████████████████████████████████████████████| 14/14 [00:00<00:00, 18.18it/s]


epoch   6		loss 0.931987685739	acc 0.6256	cnt 1


Epoch 7 Training: 100%|████████████████████████████████████████████████████████████| 1317/1317 [03:46<00:00,  5.81it/s]
Epoch 7 Validation: 100%|██████████████████████████████████████████████████████████████| 14/14 [00:00<00:00, 21.54it/s]


epoch   7		loss 0.909927549150	acc 0.6274	cnt 2


Epoch 8 Training: 100%|████████████████████████████████████████████████████████████| 1317/1317 [03:46<00:00,  5.82it/s]
Epoch 8 Validation: 100%|██████████████████████████████████████████████████████████████| 14/14 [00:00<00:00, 20.60it/s]


epoch   8		loss 0.895576115395	acc 0.6229	cnt 3


Epoch 9 Training: 100%|████████████████████████████████████████████████████████████| 1317/1317 [03:46<00:00,  5.82it/s]
Epoch 9 Validation: 100%|██████████████████████████████████████████████████████████████| 14/14 [00:00<00:00, 21.02it/s]


epoch   9		loss 0.881080329011	acc 0.6344	cnt 0


Epoch 10 Training: 100%|███████████████████████████████████████████████████████████| 1317/1317 [03:45<00:00,  5.85it/s]
Epoch 10 Validation: 100%|█████████████████████████████████████████████████████████████| 14/14 [00:00<00:00, 21.34it/s]


epoch   10		loss 0.870104484605	acc 0.6347	cnt 0


Epoch 11 Training: 100%|███████████████████████████████████████████████████████████| 1317/1317 [03:45<00:00,  5.84it/s]
Epoch 11 Validation: 100%|█████████████████████████████████████████████████████████████| 14/14 [00:00<00:00, 21.08it/s]


epoch   11		loss 0.859852410485	acc 0.6361	cnt 0


Epoch 12 Training: 100%|███████████████████████████████████████████████████████████| 1317/1317 [03:45<00:00,  5.85it/s]
Epoch 12 Validation: 100%|█████████████████████████████████████████████████████████████| 14/14 [00:00<00:00, 21.21it/s]


epoch   12		loss 0.857323152420	acc 0.6302	cnt 1


Epoch 13 Training: 100%|███████████████████████████████████████████████████████████| 1317/1317 [03:45<00:00,  5.84it/s]
Epoch 13 Validation: 100%|█████████████████████████████████████████████████████████████| 14/14 [00:00<00:00, 18.07it/s]


epoch   13		loss 0.847469815768	acc 0.6240	cnt 2


Epoch 14 Training: 100%|███████████████████████████████████████████████████████████| 1317/1317 [03:45<00:00,  5.84it/s]
Epoch 14 Validation: 100%|█████████████████████████████████████████████████████████████| 14/14 [00:00<00:00, 21.15it/s]


epoch   14		loss 0.858280455423	acc 0.6275	cnt 3


Epoch 15 Training: 100%|███████████████████████████████████████████████████████████| 1317/1317 [03:45<00:00,  5.84it/s]
Epoch 15 Validation: 100%|█████████████████████████████████████████████████████████████| 14/14 [00:00<00:00, 20.83it/s]


epoch   15		loss 0.844552876330	acc 0.6320	cnt 4


Epoch 16 Training: 100%|███████████████████████████████████████████████████████████| 1317/1317 [03:45<00:00,  5.84it/s]
Epoch 16 Validation: 100%|█████████████████████████████████████████████████████████████| 14/14 [00:00<00:00, 21.58it/s]


epoch   16		loss 0.844311154694	acc 0.6365	cnt 0


Epoch 17 Training: 100%|███████████████████████████████████████████████████████████| 1317/1317 [03:45<00:00,  5.84it/s]
Epoch 17 Validation: 100%|█████████████████████████████████████████████████████████████| 14/14 [00:00<00:00, 21.38it/s]


epoch   17		loss 0.860271964050	acc 0.6330	cnt 1


Epoch 18 Training: 100%|███████████████████████████████████████████████████████████| 1317/1317 [03:46<00:00,  5.80it/s]
Epoch 18 Validation: 100%|█████████████████████████████████████████████████████████████| 14/14 [00:00<00:00, 21.10it/s]


epoch   18		loss 0.846316316857	acc 0.6264	cnt 2


Epoch 19 Training: 100%|███████████████████████████████████████████████████████████| 1317/1317 [03:47<00:00,  5.80it/s]
Epoch 19 Validation: 100%|█████████████████████████████████████████████████████████████| 14/14 [00:00<00:00, 18.12it/s]


epoch   19		loss 0.847210977914	acc 0.6284	cnt 3


Epoch 20 Training:   3%|██                                                           | 45/1317 [00:07<03:41,  5.74it/s]


KeyboardInterrupt: 

In [17]:
# "Sentiment.pt" holds the state_dict written by the training loop (not a full
# module), so it loads with weights_only=True, which restricts unpickling
# instead of allowing arbitrary objects.
model = torch.load("Sentiment.pt", weights_only=True)
# Keep a copy of the best weights under this run's name.
torch.save(model, f"{saveFilePath}train_14.pt")