## 1차: 긍정/부정/중립 3진 분류 (Ternary classification)

In [1]:
import numpy as np
import pandas as pd
from learning import CustomDataset, dataset_split, load_model
from learning import train_with_early_stopping, train_fn, eval_fn, predict_label
from learning.visualization import sent_length_vis, plot_training_progress
from transformers import ElectraTokenizer, ElectraForSequenceClassification

# Path to the KoELECTRA model; the tokenizer and the model live in the same directory.
modelpath = "./koelectra-base-v3-discriminator"

In [6]:
# Load the labelled data, then drop rows with missing values, drop duplicate rows,
# and renumber the index. Built as one chain so the cell yields the same frame
# every time it is re-run.
dataset = (
    pd.read_csv("./a.csv", encoding='UTF-8')
    .dropna(axis=0)
    .drop_duplicates()
    .reset_index(drop=True)
)

# Sanity checks: remaining missing values per column, and the label distribution.
print(dataset.isnull().sum())
print(dataset['sentiment'].value_counts())
dataset

sentence     0
sentiment    0
dtype: int64
중립    4975
부정    4974
긍정    4897
Name: sentiment, dtype: int64


Unnamed: 0,sentence,sentiment
0,유재석 오라버니 해피투게더 봤어요,긍정
1,우리모두 준혁이성과 LG선수들이 선전할수 있게 기를 넣어줍시다,긍정
2,웃긴거 알죠 ㅋㅋㅇ ㅏ,긍정
3,덕분에 주방에서 요리하는게 넘 좋아졌어요,긍정
4,마지막 순간 손가락으로 총을 만들어 쏘실때에는 제가 직접 맞는 것 같습니다,긍정
...,...,...
14841,하아 저게 사실이되면 엘지로 갈아타야되겠네,중립
14842,발견했다는 뉴스 후에 잠잠하네,중립
14843,못하면 원하던 투수가 아니다 할것이요 잘하면 250이닝 덜진각오를 해야될것이다,중립
14844,사건 자세한 진술서만안봣더라도 ㅠㅠ,중립


In [8]:
# Sentiment name -> integer class id. This is plain integer label encoding
# (one id per class), not one-hot vectors.
label_map = {'부정': 0, '중립': 1,'긍정': 2}

# Model inputs: the raw sentences as an array.
sentences = dataset['sentence'].values

# Targets: look every sentiment string up in label_map; a label missing from
# the map raises KeyError.
labels = np.array(list(map(label_map.__getitem__, dataset['sentiment'].values)))

In [9]:
# Visualize the tokenized sentence lengths and get back the padding length
# that covers the requested share of sentences.
padding_length = sent_length_vis(
    sentences,  # list of sentences
    modelpath,  # tokenizer path
    0.95,       # desired coverage ratio
)
print("최적의 padding_length는 '%s' 입니다." % padding_length)

100%|██████████| 14846/14846 [00:02<00:00, 5534.60it/s]


최적의 padding_length는 '27' 입니다.


In [10]:
# Build the training and validation dataloaders from the encoded data.
# NOTE(review): 0.2 is presumably the validation fraction -- confirm against
# learning.dataset_split.
train_dataloader, val_dataloader = dataset_split(
    padding_length,
    sentences,
    labels,
    modelpath,
    0.2,
)

100%|██████████| 14846/14846 [00:02<00:00, 4981.29it/s]


In [13]:
# Create the model, optimizer, loss function and device.
# Required arguments: model path, number of labels.
# Optional arguments: model state-dict path, optimizer state-dict path,
# model checkpoint path.
# The label count is taken from label_map so that it always matches the
# class ids in `labels`; a mismatch surfaces as a CUDA device-side assert
# during training.
model, optimizer, loss_fn, device = load_model(modelpath, len(label_map))

Some weights of the model checkpoint at ./koelectra-base-v3-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at ./koelectra-base-v3-discriminator and are newly initialized: ['classifier.o

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.

In [12]:
# Fine-tune with early stopping and keep the per-epoch loss/accuracy history.
# NOTE(review): 20 is presumably the maximum number of epochs -- confirm
# against learning.train_with_early_stopping.
history = train_with_early_stopping(
    train_dataloader,
    val_dataloader,
    model,
    optimizer,
    loss_fn,
    device,
    20,
    patience=3,
)
train_losses_epoch, train_accs_epoch, val_losses_epoch, val_accs_epoch = history

Training:   0%|          | 0/93 [00:05<?, ?it/s]


RuntimeError: unique_by_key: failed to synchronize: cudaErrorAssert: device-side assert triggered

In [8]:
# Plot the training and validation loss/accuracy histories collected above.
plot_training_progress(
    train_losses_epoch,
    train_accs_epoch,
    val_losses_epoch,
    val_accs_epoch,
)

## 3진 분류 학습된 모델로 직접 예측하기

In [3]:
# Load the sentences to classify and pull the text column out as a plain list.
df = pd.read_csv("./사랑이별_커뮤니티_챗봇데이터.csv", encoding='UTF-8')
korean_sentences = df.loc[:, 'sentence'].tolist()  # list of Korean sentences

In [None]:
# Reload the trained model for inference (checkpoint, or model/optimizer state dicts).
# The label count must be 3 (부정/중립/긍정): this section uses the ternary
# classifier and decodes predictions with a three-entry label table.
# NOTE(review): checkpoint_path is not defined in any visible cell -- assign
# the path of the saved checkpoint before running this cell.
model, optimizer, loss_fn, device = load_model(modelpath, 3, checkpoint_path)

In [None]:
# tqdm draws the progress bar below but is not imported anywhere else in this
# notebook; import it here and fall back to plain iteration if it is unavailable.
try:
    from tqdm import tqdm
except ImportError:
    def tqdm(iterable, **kwargs):
        """Pass-through stand-in used when tqdm is not installed."""
        return iterable

# Class id -> sentiment name, used to decode predictions.
emotion_labels = {0:'부정', 1:'중립', 2:'긍정'}

# Predict a label for every sentence and attach the results to the frame.
# NOTE(review): assumes predict_label returns an integer class id in {0, 1, 2} -- confirm.
predicted_label = [
    emotion_labels[predict_label(korean_sentence, model, modelpath, device)]
    for korean_sentence in tqdm(korean_sentences)
]
df['label'] = predicted_label
df
df

In [None]:
# Distribution of the predicted labels.
df.loc[:, 'label'].value_counts()