## 1차 긍정/부정/중립 3진분류(Ternary_classification)

In [16]:
import random

import numpy as np
import pandas as pd
import torch
from tqdm import tqdm
from transformers import ElectraTokenizer, ElectraForSequenceClassification

from learning import CustomDataset, dataset_split, load_model
from learning import train_with_early_stopping, train_fn, eval_fn, predict_label
from learning.visualization import sent_length_vis, plot_training_progress

# Path to the KoELECTRA model directory; the tokenizer and the model are stored in the same location.
modelpath = "./koelectra-base-v3-discriminator"

# Seed every global RNG this notebook can draw from so that a
# "Restart Kernel -> Run All" run is comparable with the previous one.
# NOTE(review): the helpers in `learning` are not visible from here; if they
# create their own generators or pass an explicit random_state, seed those too.
SEED = 42
random.seed(SEED)        # Python's built-in RNG
np.random.seed(SEED)     # NumPy's legacy global RNG
torch.manual_seed(SEED)  # PyTorch's default generator

In [2]:
# Load the labelled corpus and clean it in one chain:
#   1. drop every row that contains a missing value,
#   2. drop rows that are exact duplicates of an earlier row,
#   3. renumber the index from 0 so it is contiguous again.
dataset = (
    pd.read_csv("./a.csv", encoding='UTF-8')
    .dropna(axis=0)
    .drop_duplicates()
    .reset_index(drop=True)
)

# Sanity checks: remaining nulls per column, then the class balance of the labels.
print(dataset.isnull().sum())
print(dataset['sentiment'].value_counts())
dataset

sentence     0
sentiment    0
dtype: int64
중립    4975
부정    4974
긍정    4897
Name: sentiment, dtype: int64


Unnamed: 0,sentence,sentiment
0,유재석 오라버니 해피투게더 봤어요,긍정
1,우리모두 준혁이성과 LG선수들이 선전할수 있게 기를 넣어줍시다,긍정
2,웃긴거 알죠 ㅋㅋㅇ ㅏ,긍정
3,덕분에 주방에서 요리하는게 넘 좋아졌어요,긍정
4,마지막 순간 손가락으로 총을 만들어 쏘실때에는 제가 직접 맞는 것 같습니다,긍정
...,...,...
14841,하아 저게 사실이되면 엘지로 갈아타야되겠네,중립
14842,발견했다는 뉴스 후에 잠잠하네,중립
14843,못하면 원하던 투수가 아니다 할것이요 잘하면 250이닝 덜진각오를 해야될것이다,중립
14844,사건 자세한 진술서만안봣더라도 ㅠㅠ,중립


In [3]:
# Mapping from the Korean sentiment name to its integer class id.
# Note: this is integer (class-index) encoding, not one-hot encoding.
label_map = {'부정': 0, '중립': 1,'긍정': 2}

# Pull the sentence texts and the string labels out of the DataFrame as arrays.
sentences = dataset['sentence'].values
sentiment_names = dataset['sentiment'].values

# Translate every string label to its class id; a label missing from
# `label_map` raises KeyError.
labels = np.array([label_map[name] for name in sentiment_names])

In [4]:
# Visualise the tokenized sentence lengths and get back the padding length
# that covers the requested share of the sentences.
padding_length = sent_length_vis(
    sentences,  # list of sentences
    modelpath,  # tokenizer path
    0.95,       # desired coverage ratio
)
print("최적의 padding_length는 '%s' 입니다." % padding_length)

100%|██████████| 14846/14846 [00:02<00:00, 5369.35it/s]


최적의 padding_length는 '27' 입니다.


In [5]:
# Build the training and validation dataloaders from the encoded corpus.
train_dataloader, val_dataloader = dataset_split(
    padding_length,
    sentences,
    labels,
    modelpath,
    0.2,  # presumably the validation split fraction — TODO confirm against `learning.dataset_split`
)

100%|██████████| 14846/14846 [00:02<00:00, 5345.72it/s]


In [8]:
# Required arguments: model path, number of labels.
# Optional arguments: model state-dict path, optimizer state-dict path, model checkpoint path.
model, optimizer, loss_fn, device = load_model(
    modelpath,
    3,
)

Some weights of the model checkpoint at ./koelectra-base-v3-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at ./koelectra-base-v3-discriminator and are newly initialized: ['classifier.o

In [9]:
# Fine-tune with early stopping and keep the per-epoch loss/accuracy histories
# for both the training and the validation split.
(
    train_losses_epoch,
    train_accs_epoch,
    val_losses_epoch,
    val_accs_epoch,
) = train_with_early_stopping(
    train_dataloader,
    val_dataloader,
    model,
    optimizer,
    loss_fn,
    device,
    50,  # presumably the maximum number of epochs (the log prints "Epoch N/50") — TODO confirm
    patience=3,
)

Training: 100%|██████████| 93/93 [00:56<00:00,  1.64it/s, Loss=1.09, Accuracy=0.35] 


Accuracy: 0.4653, Avg Loss: 1.0736, F1 Score: 0.4427, Precision: 0.5119, Recall: 0.4684
Epoch 1/50 - Train Loss: 1.0957 - Train Accuracy: 0.3594 - Validation Loss: 1.0736 - Validation Accuracy: 0.4653


Training: 100%|██████████| 93/93 [00:58<00:00,  1.59it/s, Loss=0.918, Accuracy=0.6]  


Accuracy: 0.7155, Avg Loss: 0.8537, F1 Score: 0.7122, Precision: 0.7118, Recall: 0.7137
Epoch 2/50 - Train Loss: 1.0236 - Train Accuracy: 0.5320 - Validation Loss: 0.8537 - Validation Accuracy: 0.7155


Training: 100%|██████████| 93/93 [00:58<00:00,  1.59it/s, Loss=0.659, Accuracy=0.75] 


Accuracy: 0.7835, Avg Loss: 0.6028, F1 Score: 0.7762, Precision: 0.7895, Recall: 0.7824
Epoch 3/50 - Train Loss: 0.7771 - Train Accuracy: 0.7205 - Validation Loss: 0.6028 - Validation Accuracy: 0.7835


Training: 100%|██████████| 93/93 [00:58<00:00,  1.59it/s, Loss=0.66, Accuracy=0.74]  


Accuracy: 0.7909, Avg Loss: 0.5575, F1 Score: 0.7829, Precision: 0.8063, Recall: 0.7902
Epoch 4/50 - Train Loss: 0.6117 - Train Accuracy: 0.7744 - Validation Loss: 0.5575 - Validation Accuracy: 0.7909


Training: 100%|██████████| 93/93 [00:58<00:00,  1.59it/s, Loss=0.571, Accuracy=0.77] 


Accuracy: 0.8114, Avg Loss: 0.5077, F1 Score: 0.8080, Precision: 0.8144, Recall: 0.8108
Epoch 5/50 - Train Loss: 0.5412 - Train Accuracy: 0.7952 - Validation Loss: 0.5077 - Validation Accuracy: 0.8114


Training: 100%|██████████| 93/93 [00:58<00:00,  1.59it/s, Loss=0.542, Accuracy=0.8]  


Accuracy: 0.7936, Avg Loss: 0.5417, F1 Score: 0.7850, Precision: 0.8064, Recall: 0.7927
Epoch 6/50 - Train Loss: 0.5035 - Train Accuracy: 0.8062 - Validation Loss: 0.5417 - Validation Accuracy: 0.7936


Training: 100%|██████████| 93/93 [00:58<00:00,  1.58it/s, Loss=0.49, Accuracy=0.82]  


Accuracy: 0.8189, Avg Loss: 0.5020, F1 Score: 0.8150, Precision: 0.8199, Recall: 0.8180
Epoch 7/50 - Train Loss: 0.4649 - Train Accuracy: 0.8264 - Validation Loss: 0.5020 - Validation Accuracy: 0.8189


Training: 100%|██████████| 93/93 [00:58<00:00,  1.59it/s, Loss=0.457, Accuracy=0.82] 


Accuracy: 0.8128, Avg Loss: 0.5218, F1 Score: 0.8097, Precision: 0.8184, Recall: 0.8125
Epoch 8/50 - Train Loss: 0.4437 - Train Accuracy: 0.8315 - Validation Loss: 0.5218 - Validation Accuracy: 0.8128


Training: 100%|██████████| 93/93 [00:58<00:00,  1.59it/s, Loss=0.412, Accuracy=0.88] 


Accuracy: 0.8192, Avg Loss: 0.5092, F1 Score: 0.8157, Precision: 0.8216, Recall: 0.8186
Epoch 9/50 - Train Loss: 0.4297 - Train Accuracy: 0.8386 - Validation Loss: 0.5092 - Validation Accuracy: 0.8192


Training: 100%|██████████| 93/93 [00:58<00:00,  1.59it/s, Loss=0.474, Accuracy=0.84] 


Accuracy: 0.8158, Avg Loss: 0.5234, F1 Score: 0.8114, Precision: 0.8198, Recall: 0.8151
Epoch 10/50 - Train Loss: 0.4079 - Train Accuracy: 0.8448 - Validation Loss: 0.5234 - Validation Accuracy: 0.8158


Training: 100%|██████████| 93/93 [00:58<00:00,  1.60it/s, Loss=0.362, Accuracy=0.88] 


Accuracy: 0.8205, Avg Loss: 0.5198, F1 Score: 0.8164, Precision: 0.8239, Recall: 0.8198
Epoch 11/50 - Train Loss: 0.3899 - Train Accuracy: 0.8509 - Validation Loss: 0.5198 - Validation Accuracy: 0.8205


Training: 100%|██████████| 93/93 [00:58<00:00,  1.60it/s, Loss=0.393, Accuracy=0.85] 


Accuracy: 0.8168, Avg Loss: 0.5137, F1 Score: 0.8130, Precision: 0.8186, Recall: 0.8160
Epoch 12/50 - Train Loss: 0.3821 - Train Accuracy: 0.8585 - Validation Loss: 0.5137 - Validation Accuracy: 0.8168


Training: 100%|██████████| 93/93 [00:58<00:00,  1.59it/s, Loss=0.331, Accuracy=0.9]  


Accuracy: 0.8236, Avg Loss: 0.5073, F1 Score: 0.8206, Precision: 0.8233, Recall: 0.8228
Epoch 13/50 - Train Loss: 0.3653 - Train Accuracy: 0.8623 - Validation Loss: 0.5073 - Validation Accuracy: 0.8236


Training: 100%|██████████| 93/93 [00:58<00:00,  1.58it/s, Loss=0.305, Accuracy=0.88] 


Accuracy: 0.8175, Avg Loss: 0.5195, F1 Score: 0.8147, Precision: 0.8190, Recall: 0.8170
Epoch 14/50 - Train Loss: 0.3541 - Train Accuracy: 0.8685 - Validation Loss: 0.5195 - Validation Accuracy: 0.8175


Training: 100%|██████████| 93/93 [00:58<00:00,  1.59it/s, Loss=0.415, Accuracy=0.85] 


Accuracy: 0.8229, Avg Loss: 0.5258, F1 Score: 0.8196, Precision: 0.8237, Recall: 0.8222
Epoch 15/50 - Train Loss: 0.3420 - Train Accuracy: 0.8709 - Validation Loss: 0.5258 - Validation Accuracy: 0.8229


Training: 100%|██████████| 93/93 [00:59<00:00,  1.57it/s, Loss=0.331, Accuracy=0.88] 


Accuracy: 0.8121, Avg Loss: 0.5622, F1 Score: 0.8080, Precision: 0.8179, Recall: 0.8117
Epoch 16/50 - Train Loss: 0.3285 - Train Accuracy: 0.8798 - Validation Loss: 0.5622 - Validation Accuracy: 0.8121
No improvement in validation accuracy for 3 epochs. Early stopping...


In [10]:
# Plot the per-epoch histories returned by the training run.
plot_training_progress(
    train_losses_epoch,
    train_accs_epoch,
    val_losses_epoch,
    val_accs_epoch,
)

## 3진 분류 학습된 모델로 직접 예측하기

In [11]:
# Load the chatbot corpus that the trained classifier will label.
chatbot_csv_path = "./사랑이별_커뮤니티_챗봇데이터.csv"
df = pd.read_csv(chatbot_csv_path, encoding='UTF-8')

# Plain Python list of the Korean sentences to classify.
korean_sentences = df['sentence'].tolist()

In [14]:
# Load a checkpoint, or a model state dict / optimizer state dict.
# This rebinds `model`, `optimizer`, `loss_fn` and `device`, replacing the
# objects produced by the training cells.
# NOTE(review): the file name says "epoch50" while the logged run early-stopped
# at epoch 16 — confirm this is the intended state file.
model, optimizer, loss_fn, device = load_model(
    modelpath,
    3,
    model_state_path="./model_state/best_model_state_epoch50.pt",
)

Some weights of the model checkpoint at ./koelectra-base-v3-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at ./koelectra-base-v3-discriminator and are newly initialized: ['classifier.o

In [17]:
# Predict a sentiment for every sentence and attach the readable label to `df`.
# Class id -> Korean sentiment name.
emotion_labels = {0:'부정', 1:'중립', 2:'긍정'}

predicted_label = []
for korean_sentence in tqdm(korean_sentences):
    # One predict_label call per sentence; its result is used as a key of emotion_labels.
    class_id = predict_label(korean_sentence, model, modelpath, device)
    predicted_label.append(emotion_labels[class_id])

df['label'] = predicted_label
df

100%|██████████| 11823/11823 [09:46<00:00, 20.15it/s]


Unnamed: 0,sentence,sentiment,label
0,12시 땡!,0,부정
1,1지망 학교 떨어졌어,0,중립
2,3박4일 놀러가고 싶다,0,중립
3,3박4일 정도 놀러가고 싶다,0,중립
4,PPL 심하네,0,부정
...,...,...,...
11818,훔쳐보는 것도 눈치 보임.,2,부정
11819,훔쳐보는 것도 눈치 보임.,2,부정
11820,흑기사 해주는 짝남.,2,긍정
11821,힘든 연애 좋은 연애라는게 무슨 차이일까?,2,부정


In [18]:
# Count how many rows of `df` fall under each value of the 'label' column.
df['label'].value_counts()

부정    6921
중립    4087
긍정     815
Name: label, dtype: int64