In [1]:
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from CustomDataCollatorForSequenceClassification import CustomDataCollatorForSequenceClassification
from torch.optim import AdamW
from datasets import Dataset
import pandas as pd
import numpy as np
from WordPieceTokenizer import WordPieceTokenizer as Tokenizer
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm
from CustomBertSequenceClassification import CustomBertSequenceClassification
from CustomBert import CustomBertConfig
import CustomBert
import os
from Model import LSTM
from Model import Transformer, PositionalEncoding
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

dataFilePath = 'datasets/'
saveFilePath = 'saves/'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = Tokenizer(f'{saveFilePath}vocab.txt',do_lower_case=False,strip_accents=False,clean_text=True)
VOCAB_SIZE = tokenizer.get_vocab_size()
MAX_SEQUENCE_LENGTH = 128

In [2]:
df = pd.read_csv(f'{dataFilePath}sentiment_train.csv',index_col=0)
df.head()

Unnamed: 0,발화,감정,str_len,attention_mask,token_type_ids
0,2 10955 4065 2006 7119 1191 12454 19817 9959 3...,불안,24,1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 ...,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
1,2 14186 143 7807 1225 1576 1366 1015 3 0 0 0 0...,불안,12,1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
2,2 4127 1515 1024 1206 1062 28552 4037 1076 158...,불안,14,1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 ...,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
3,2 9388 2525 3097 1 3 0 0 0 0 0 0 0 0 0 0 0 0 0...,불안,13,1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
4,2 98 1051 3092 1033 1330 1076 1836 25640 3 0 0...,불안,11,1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 ...,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...


In [3]:
df.loc[(df['감정'] == '불안'),'감정'] = 0
df.loc[(df['감정'] == '당황'),'감정'] = 1
df.loc[(df['감정'] == '분노'),'감정'] = 2
df.loc[(df['감정'] == '슬픔'),'감정'] = 3
df.loc[(df['감정'] == '중립'),'감정'] = 4
df.loc[(df['감정'] == '행복'),'감정'] = 5
df.loc[(df['감정'] == '혐오'),'감정'] = 6

In [4]:
def prepare_classification_dataset(data_frame, tokenizer):
    processed_tokens = []
    processed_attentions = []
    processed_token_type_ids = []

    for i in tqdm(range(len(data_frame)), desc="데이터 파싱 중"):
        token_str = data_frame.iloc[i, 0]
        attention_str = data_frame.iloc[i, 3]
        token_type_ids_str = data_frame.iloc[i, 4]

        processed_tokens.append([int(t) for t in token_str.split(" ")])
        processed_attentions.append([int(a) for a in attention_str.split(" ")])
        processed_token_type_ids.append([int(t) for t in token_type_ids_str.split(" ")])

    dataset_dict = {
        "input_ids": processed_tokens,
        "attention_mask": processed_attentions,
        "token_type_ids": processed_token_type_ids,
        "labels": data_frame["감정"].values.tolist()
    }
    
    hf_dataset = Dataset.from_dict(dataset_dict)
    hf_dataset.set_format(type="torch", columns=['input_ids', 'attention_mask', 'token_type_ids', 'labels'])
    
    return hf_dataset

In [5]:
train_df, val_df = train_test_split(df,train_size=0.8,test_size=0.2)

print(f"학습 세트의 크기: {len(train_df)} 행")
print(f"검증 세트의 크기: {len(val_df)} 행")

train_datasets = prepare_classification_dataset(train_df,tokenizer)
print(len(train_datasets))
val_datasets = prepare_classification_dataset(val_df,tokenizer)
print(len(val_datasets))

학습 세트의 크기: 117078 행
검증 세트의 크기: 29270 행


데이터 파싱 중:   0%|          | 0/117078 [00:00<?, ?it/s]

117078


데이터 파싱 중:   0%|          | 0/29270 [00:00<?, ?it/s]

29270


In [6]:
data_collator = CustomDataCollatorForSequenceClassification(tokenizer=tokenizer)

train_loader = DataLoader(
    train_datasets,
    batch_size=16,
    shuffle=True,
    collate_fn=data_collator,
    num_workers=os.cpu_count()
)
print(len(train_loader))
val_loader = DataLoader(
    val_datasets,
    batch_size=1000,
    shuffle=False,
    collate_fn=data_collator,
    num_workers=os.cpu_count()
)
print(len(val_loader))

7318
30


## LSTM

In [4]:
# def process_dataframe(data_frame, device,batch_size,shuffle=False):
#     tensor_x_list = []
#     attentions = []
#     token_type_ids_ = []
#     for i in tqdm(range(len(data_frame))):
#         token = data_frame.iloc[i,0]
#         token = token.split(" ")
#         token_list = []
#         for t in token:
#             token_list.append(int(t))
#         tensor_x_list.append(token_list)
        
#         attention = data_frame.iloc[i,3]
#         attention = attention.split(" ")
#         attention_list = []
#         for a in attention:
#             attention_list.append(int(a))
#         attentions.append(attention_list)

#         token_type_ids = data_frame.iloc[i,4]
#         token_type_ids = token_type_ids.split(" ")
#         token_type_ids_list = []
#         for t in token_type_ids:
#             token_type_ids_list.append(int(t))
#         token_type_ids_.append(attention_list)
        
#     tensor_x = torch.tensor(tensor_x_list, dtype=torch.long, device=device)
#     tensor_attention = torch.tensor(attentions, dtype=torch.long, device=device)
#     tensor_token_type_ids = torch.tensor(token_type_ids_, dtype=torch.long, device=device)
#     tensor_t = torch.tensor(data_frame["감정"].values.tolist(), dtype=torch.long, device=device)

#     dataset = TensorDataset(tensor_x,tensor_attention,tensor_t,tensor_token_type_ids)
#     loader = DataLoader(dataset,batch_size=batch_size,shuffle=shuffle,drop_last=True)
#     return loader
    
#     dataset = {"input_ids" : tensor_x, "attention_mask":tensor_attention,"token_type_ids":tensor_token_type_ids,"labels":tensor_t}
    
    

#     data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [7]:
def LSTM_Train(epoch,device,train_loader,val_loader,NN,loss_function,optimizer):
    acc = 0
    prev_acc = 0
    cnt = 0
    for e in range(epoch):
        NN.to(device)
        loss_sum = 0
        NN.train()
        for x, attention,t in train_loader:
            y = NN(x,attention)
            loss = loss_function(y,t)
            loss_sum += loss.item()
    
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        loss_sum /= len(train_loader)
    
        NN.eval()
        correct = 0
        total = 0
        with torch.no_grad():
            for x, attention, t in val_loader:
                x = x.to(device)
                attention = attention.to(device)
                t = t.to(device)
    
                y = NN(x, attention)
                correct += (y.argmax(dim=-1) == t).sum().item()
                total += len(x)
        acc = correct / total
    
        if acc <= prev_acc:
            cnt += 1
        else :
            torch.save(NN.state_dict(), "Sentiment.pt")
            cnt = 0
            prev_acc = acc
        
        print(f"epoch  {e+1}\t\tloss {loss_sum:.12f}\tacc {acc:.4f}\tcnt {cnt}")
        
        if cnt >= 5:
            print("train halted")
            break
            
    print("---------- 학습 종료 ----------")

In [8]:
# NN = LSTM(vocab_size=vocab_size,embedding_dim=embedding_dim,hidden_dim=64,output_dim=7,n_layers=4,bidirectional=True,dropout_p=0.1)
# NN.to(device)
# loss_function = nn.CrossEntropyLoss()
# optimizer = torch.optim.Adam(NN.parameters(),lr=0.001)
# epoch = 500
# LSTM_Train(epoch,device,train_loader,val_loader,NN,loss_function,optimizer)

## Transformer

In [9]:
def Transformer_Train(epoch, device, train_loader, val_loader, NN, loss_function, optimizer,scheduler):
    acc = 0
    prev_acc = 0
    cnt = 0
    for e in range(epoch):
        NN.to(device)
        loss_sum = 0
        NN.train()
        for x, attention, t in tqdm(train_loader, desc=f"Epoch {e+1} Training",leave=False):
            x = x.to(device)
            attention = attention.to(device)
            t = t.to(device)

            y = NN(x, attention)
            loss = loss_function(y, t)
            loss_sum += loss.item()
        
            optimizer.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm(NN.parameters(),1.0)
            optimizer.step()
        loss_sum /= len(train_loader)
        
        NN.eval()
        correct = 0
        total = 0
        with torch.no_grad():
            for x, attention, t in tqdm(val_loader, desc=f"Epoch {e+1} Validation",leave=False):
                x = x.to(device)
                attention = attention.to(device)
                t = t.to(device)
        
                y = NN(x, attention)
                correct += (y.argmax(dim=-1) == t).sum().item()
                total += len(x)
        acc = correct / total
        
        if acc <= prev_acc:
            cnt += 1
        else :
            torch.save(NN.state_dict(), "Sentiment.pt")
            cnt = 0
            prev_acc = acc

        scheduler.step(acc)
        
        print(f"epoch   {e+1}\t\tloss {loss_sum:.12f}\tacc {acc:.4f}\tcnt {cnt}")
        
        if cnt >= 5:
            print("train halted")
            break
            
    print("---------- 학습 종료 ----------")

In [11]:
NN = Transformer(vocab_size=VOCAB_SIZE,embedding_dim=128,hidden_dim=16,output_dim=7,n_layers=2,n_heads=4,dropout_p=0.05,max_len=150,pad_token_id=0)
loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(NN.parameters(),lr=5e-5)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,mode="max",factor=0.5,patience=3,)
epoch = 500
Transformer_Train(epoch,device,train_loader,val_loader,NN,loss_function,optimizer,scheduler)

  nn.utils.clip_grad_norm(NN.parameters(),1.0)
  output = torch._nested_tensor_from_mask(
                                                                                                                       

epoch   1		loss 1.611574331308	acc 0.4478	cnt 0


                                                                                                                       

epoch   2		loss 1.521470283749	acc 0.4758	cnt 0


                                                                                                                       

epoch   3		loss 1.440676615279	acc 0.5001	cnt 0


                                                                                                                       

epoch   4		loss 1.392663119186	acc 0.5133	cnt 0


                                                                                                                       

epoch   5		loss 1.358114989191	acc 0.5215	cnt 0


                                                                                                                       

epoch   6		loss 1.329676023304	acc 0.5304	cnt 0


                                                                                                                       

epoch   7		loss 1.304244029420	acc 0.5386	cnt 0


                                                                                                                       

epoch   8		loss 1.282142842020	acc 0.5440	cnt 0


                                                                                                                       

epoch   9		loss 1.262910174254	acc 0.5495	cnt 0


                                                                                                                       

epoch   10		loss 1.244619312144	acc 0.5542	cnt 0


                                                                                                                       

epoch   11		loss 1.227673865129	acc 0.5587	cnt 0


                                                                                                                       

KeyboardInterrupt: 

In [None]:
# model = torch.load("Sentiment.pt",weights_only=False)
# torch.save(model,f"{saveFilePath}train_15.pt")

# Transfer Model

In [7]:
MODEL_SAVE_PATH = "saves/Pretrain.pt"

HIDDEN_SIZE = 768
NUM_HIDDEN_LAYERS = 12
NUM_ATTENTION_HEADS = 12
INTERMEDIATE_SIZE = 3072
TYPE_VOCAB_SIZE = 2
DROPOUT_PROB = 0.1

config = CustomBertConfig(
    VOCAB_SIZE=VOCAB_SIZE,
    HIDDEN_SIZE=HIDDEN_SIZE,
    NUM_HIDDEN_LAYERS=NUM_HIDDEN_LAYERS,
    NUM_ATTENTION_HEADS=NUM_ATTENTION_HEADS,
    INTERMEDIATE_SIZE=INTERMEDIATE_SIZE,
    MAX_SEQUENCE_LENGTH=MAX_SEQUENCE_LENGTH,
    TYPE_VOCAB_SIZE=TYPE_VOCAB_SIZE,
    DROPOUT_PROB=DROPOUT_PROB
)

model = CustomBertSequenceClassification(config,MODEL_SAVE_PATH,7)

model.to(device)

num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'Custom Bert 모델 초기화 완료. 총 학습 가능 파라미터 수 : {num_params}')
print(f'모델이 담긴 장치 : {device}')

모델 가중치를 saves/Pretrain.pt에서 불러오는 중..
모델 가중치 로드 완료
Custom Bert 모델 초기화 완료. 총 학습 가능 파라미터 수 : 110951943
모델이 담긴 장치 : cuda


In [8]:
EPOCHS = 5
LEARNING_RATE = 5e-8
WEIGHT_DECAY = 0.01
optimizer = AdamW(model.parameters(),lr=LEARNING_RATE,weight_decay=WEIGHT_DECAY)

In [9]:
train_losses = []
acc = 0
prev_acc = 0
cnt = 0

print(f"\n<--- 학습 시작 ---> ({EPOCHS} 에폭)")

for e in range(EPOCHS):
    loss_sum = 0
    progress_bar = tqdm(train_loader,desc=f"Train Epoch {e+1}")
    model.train()
    for step, batch in enumerate(progress_bar):
        batch = {k: v.to(device) for k, v in batch.items()}
        
        outputs = model.forward(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
            token_type_ids=batch["token_type_ids"],
            labels=batch["labels"]
        )
        loss = outputs["loss"]

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        loss_sum += loss.item()
        progress_bar.set_postfix({'loss':f"{(loss_sum/(step+1)):.4f}"})
        del outputs, loss
        if 'ccuda' in str(device):
            torch.cuda.empty_cache()

    avg_train_loss = loss_sum / len(train_loader)
    train_losses.append(avg_train_loss)

    print(f"Train Epoch {e+1} 완료. 평균 학습 손실 : {avg_train_loss:.4f}")

    model.eval()
    correct = 0
    total = 0
    val_progress = tqdm(val_loader, desc=f"Validation Epoch {e+1}")
    with torch.no_grad():
        for step, batch in enumerate(val_progress):
            
            y = model.forward(input_ids=batch["input_ids"],
                attention_mask=batch["attention_mask"],
                token_type_ids=batch["token_type_ids"]
            )
            t = batch["labels"]
            correct += (y.argmax(dim=-1) == t).sum().item()
            total += len(x)
    acc = correct / total
    val_progress.set_postfix({"acc" : f"{(acc*100):.2f}%"})
    if acc <= prev_acc:
        cnt += 1
    else :
        torch.save(classificationModel.state_dict(), "Sentiment.pt")
        cnt = 0
        prev_acc = acc

    if cnt >= 5:
        print("train halted")
        break
        
print("\n<--- 학습 완료 --->")


<--- 학습 시작 ---> (5 에폭)


Train Epoch 1:   0%|          | 0/7318 [00:40<?, ?it/s]

Batch input_ids shape: torch.Size([16, 150])
Max input_ids value in batch: 31103
Min input_ids value in batch: 0
Tokenizer vocab size: 32000


RuntimeError: CUDA error: device-side assert triggered
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
