In [1]:
import torch
import torch.nn as nn
from transformers import ElectraTokenizer, ElectraForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score
import pandas as pd
import numpy as np
from tqdm import tqdm

In [2]:
# 데이터 로드
dataset = pd.read_csv("../dataset/Ternary_classification_data_spellcheck.csv", encoding='UTF-8')

In [3]:
dataset.dropna(axis=0, inplace=True)
dataset.reset_index(drop=True, inplace=True)
dataset.drop_duplicates(inplace=True)
print(dataset.isnull().sum())
print(dataset['sentiment'].value_counts())
dataset

sentence     0
sentiment    0
dtype: int64
부정    323321
긍정    263521
중립      6808
Name: sentiment, dtype: int64


Unnamed: 0,sentence,sentiment
0,언니 동생으로 부르는게 맞는 일인가요,부정
1,그냥 내 느낌일뿐겠지,부정
2,아직너무초기라서 그런거죠,부정
3,유치원버스 사고 낫다던데,부정
4,근데 원래이런거맞나요,부정
...,...,...
616533,솔직히 예보 제대로 못하는 데 세금이라도 아끼게 그냥 폐지해라,부정
616534,재미가 없으니 망하지,부정
616535,공장 도시락 비우생적임 아르바이트했는데 화장실가성 손도 않씯고 재료 담고 바닥 떨어...,부정
616536,코딱지 만한 나라에서 지들끼리 피터지게 싸우는 센징 클래스 ㅉㅉㅉ,부정


In [4]:
# 데이터 프레임에서 한국어 문장과 라벨을 가져옵니다.
sentences = dataset['sentence'].values
labels = dataset['sentiment'].values

In [5]:
# 라벨을 One-Hot 인코딩으로 변환합니다.
label_map = {'부정': 0, '중립': 1, '긍정': 2}
labels = np.array([label_map[label] for label in labels])

In [6]:
# koELECTRA 토크나이저 불러오기
tokenizer = ElectraTokenizer.from_pretrained("koelectra-base-v3-discriminator")

In [None]:
# 문장을 토큰화하고 시퀀스로 변환했을 때의 길이 출력
length = [len(tokenizer.encode(sentence)) for sentence in tqdm(sentences)]
len(length)

In [None]:
import plotly.graph_objects as go

dataset['length'] = length

# 히스토그램 생성
histogram = go.Histogram(x=dataset['length'], nbinsx=20)

# 최댓값과 최솟값 계산
max_value = dataset['length'].max()
min_value = dataset['length'].min()

# 최댓값과 최솟값을 주석으로 추가하여 표시
annotations = [
    {
        'x': max_value,
        'y': 0,
        'xref': 'x',
        'yref': 'y',
        'text': f'Max: {max_value}',
        'showarrow': True,
        'arrowhead': 4,
        'ax': 0,
        'ay': -40
    },
    {
        'x': min_value,
        'y': 0,
        'xref': 'x',
        'yref': 'y',
        'text': f'Min: {min_value}',
        'showarrow': True,
        'arrowhead': 4,
        'ax': 0,
        'ay': -40
    }
]

# 그래프 생성 및 레이아웃 설정
fig = go.Figure(histogram)
fig.update_layout(annotations=annotations)

# 그래프 출력
fig.show()

In [None]:
len([i for i in length if i <=60])/len(length) # 패딩 최대길이를 산정하기 위함. 60으로 설정하면 약 96%커버가능

In [7]:
# 문장을 토큰화하고 시퀀스로 변환합니다.
sequences = [tokenizer.encode(sentence, padding='max_length', max_length=60, truncation=True) for sentence in tqdm(sentences)]

100%|██████████| 593650/593650 [02:32<00:00, 3892.81it/s]


In [8]:
# 학습 데이터와 검증 데이터로 나눕니다.
train_sequences, val_sequences, train_labels, val_labels = train_test_split(
    sequences, labels, test_size=0.2, random_state=42
)

In [9]:
class CustomDataset(Dataset):
    def __init__(self, sentences, labels, tokenizer, max_length):
        self.sentences = sentences
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        sentence = self.sentences[idx]
        label = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'labels': torch.tensor(label, dtype=torch.long),
        }

In [10]:
# 데이터셋과 데이터로더 생성
train_dataset = CustomDataset(train_sequences, train_labels, tokenizer, max_length=60)
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=128, shuffle=True, pin_memory=True)

val_dataset = CustomDataset(val_sequences, val_labels, tokenizer, max_length=60)
val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=128, shuffle=False, pin_memory=True)

In [27]:
# koELECTRA 모델 불러오기
model = ElectraForSequenceClassification.from_pretrained("koelectra-base-v3-discriminator", num_labels=3)

Some weights of the model checkpoint at koelectra-base-v3-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at koelectra-base-v3-discriminator and are newly initialized: ['classifier.dense

In [None]:
# ''' 모델가중치를 불러오거나 체크포인트를 불러올 때 실행 '''

# # 모델의 상태 딕셔너리를 로드합니다.
# model_state_dict = torch.load("Ternary_model_state_dict_learning_New_epoch_15.pt")

# # 모델을 생성하고 상태를 로드합니다.
# model = ElectraForSequenceClassification.from_pretrained("koelectra-base-v3-discriminator", num_labels=3)
# model.load_state_dict(model_state_dict)

# # 옵티마이저의 상태 딕셔너리를 로드합니다.
# optimizer_state_dict = torch.load("Ternary_optimizer_state_learning_dict_New_epoch_15.pt")

In [28]:
# 변경하고자 하는 Dropout 비율
new_dropout_rate = 0.2

# 모든 Dropout 레이어의 비율 변경
for name, module in model.named_modules():
    if isinstance(module, torch.nn.Dropout):
        module.p = new_dropout_rate


In [None]:
# 모델 구조 확인
print(model)

In [30]:
# 학습 함수 정의 (tqdm을 사용하여 진행 상황 및 지표 시각화)
def train_fn(data_loader, model, optimizer, loss_fn, device):
    model.train()
    progress_bar = tqdm(data_loader, desc="Training")

    train_losses = []  # train_loss 기록을 위한 리스트
    train_accs = []  # train_accuracy 기록을 위한 리스트

    for batch in progress_bar:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        logits = outputs.logits

        loss = loss_fn(logits, labels)  # 손실 함수 적용
        loss.backward()
        optimizer.step()

        # 정확도 계산
        predicted_labels = torch.argmax(logits, dim=1)
        accuracy = (predicted_labels == labels).float().mean().item()

        train_losses.append(loss.item())
        train_accs.append(accuracy)
        progress_bar.set_postfix({'Loss': loss.item(), 'Accuracy': accuracy})

    return train_accs, train_losses


In [31]:
# 평가 함수 정의
def eval_fn(data_loader, model, loss_fn, device):
    model.eval()
    correct_predictions = 0
    total_predictions = 0
    total_loss = 0.0

    predicted_labels_list = []  # 예측한 라벨들을 저장하기 위한 리스트
    true_labels_list = []  # 실제 라벨들을 저장하기 위한 리스트

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits

            loss = loss_fn(logits, labels)  # 손실 함수 적용
            total_loss += loss.item()

            predicted_labels = torch.argmax(logits, dim=1)
            correct_predictions += (predicted_labels == labels).sum().item()
            total_predictions += labels.size(0)

            # 예측한 라벨과 실제 라벨을 리스트에 추가
            predicted_labels_list.extend(predicted_labels.tolist())
            true_labels_list.extend(labels.tolist())

    accuracy = correct_predictions / total_predictions
    avg_loss = total_loss / len(data_loader)

    # 예측한 라벨과 실제 라벨 출력
    predicted_labels_list = np.array(predicted_labels_list)
    true_labels_list = np.array(true_labels_list)
    print("Predicted Labels:", predicted_labels_list)
    print("True Labels:", true_labels_list)

    # f1 score, precision, recall 계산
    f1 = f1_score(true_labels_list, predicted_labels_list, average='macro')
    precision = precision_score(true_labels_list, predicted_labels_list, average='macro')
    recall = recall_score(true_labels_list, predicted_labels_list, average='macro')

    print(f"Accuracy: {accuracy:.4f}, Avg Loss: {avg_loss:.4f}, F1 Score: {f1:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}")

    return accuracy, avg_loss

In [32]:
# 장치 설정 (GPU 사용을 위해)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
# 옵티마이저와 손실 함수 설정   
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-6)
loss_fn = torch.nn.CrossEntropyLoss()

In [33]:
# 학습 및 Early Stopping 정의
num_epochs = 50

# to store train results
train_losses_epoch = []  # epoch 별 train_loss 기록을 위한 리스트
train_accs_epoch = []  # epoch 별 train_accuracy 기록을 위한 리스트

# to store validation results
val_losses_epoch = []
val_accs_epoch = []

def train_with_early_stopping(train_dataloader, val_dataloader, model, optimizer, loss_fn, device, patience=3):
    model.to(device)
    best_val_accuracy = 0.0
    best_model_state_dict = None
    no_improvement = 0

    for epoch in range(num_epochs):
        train_accs, train_losses = train_fn(train_dataloader, model, optimizer, loss_fn, device)
        val_accuracy, val_loss  = eval_fn(val_dataloader, model, loss_fn, device)
        # train 결과 저장
        train_losses_epoch.extend(train_losses)
        train_accs_epoch.extend(train_accs)

        # Validation 결과 저장
        val_losses_epoch.append(val_loss)
        val_accs_epoch.append(val_accuracy)
        
        # 일정한 간격으로 체크포인트 저장
        if (epoch+1) % 5 == 0 and epoch >= 0: # 
            checkpoint = {
                'epoch': epoch+1,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                # 필요한 경우 추가 정보 저장 (예: loss, accuracy 등)
            }
            torch.save(checkpoint, f'./checkpoint/checkpoint_epoch_{epoch+1}.pth')

        print(f"Epoch {epoch+1}/{num_epochs} - Train Loss: {np.mean(train_losses):.4f} - Train Accuracy: {np.mean(train_accs):.4f} - Validation Loss: {val_loss:.4f} - Validation Accuracy: {val_accuracy:.4f}")

        # Early Stopping 체크
        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            best_model_state_dict = model.state_dict()
            no_improvement = 0
        else:
            no_improvement += 1

        if no_improvement >= patience:
            print(f"No improvement in validation accuracy for {patience} epochs. Early stopping...")
            break

    # 최적의 모델 state_dict 반환
    return best_model_state_dict


train_with_early_stopping(train_dataloader, val_dataloader, model, optimizer, loss_fn, device, patience=3)

Training: 100%|██████████| 3711/3711 [39:48<00:00,  1.55it/s, Loss=0.259, Accuracy=0.9]  
  _warn_prf(average, modifier, msg_start, len(result))


Predicted Labels: [0 0 0 ... 0 0 0]
True Labels: [0 0 0 ... 0 2 0]
Accuracy: 0.8836, Avg Loss: 0.3150, F1 Score: 0.5920, Precision: 0.5891, Recall: 0.5950
Epoch 1/50 - Train Loss: 0.3889 - Train Accuracy: 0.8462 - Validation Loss: 0.3150 - Validation Accuracy: 0.8836


Training: 100%|██████████| 3711/3711 [39:44<00:00,  1.56it/s, Loss=0.226, Accuracy=0.875]


Predicted Labels: [0 0 0 ... 0 0 0]
True Labels: [0 0 0 ... 0 2 0]
Accuracy: 0.8895, Avg Loss: 0.3034, F1 Score: 0.6403, Precision: 0.7785, Recall: 0.6233
Epoch 2/50 - Train Loss: 0.3064 - Train Accuracy: 0.8808 - Validation Loss: 0.3034 - Validation Accuracy: 0.8895


Training: 100%|██████████| 3711/3711 [39:59<00:00,  1.55it/s, Loss=0.234, Accuracy=0.85] 


Predicted Labels: [0 0 0 ... 0 0 0]
True Labels: [0 0 0 ... 0 2 0]
Accuracy: 0.8932, Avg Loss: 0.2905, F1 Score: 0.7051, Precision: 0.7388, Recall: 0.6855
Epoch 3/50 - Train Loss: 0.2851 - Train Accuracy: 0.8888 - Validation Loss: 0.2905 - Validation Accuracy: 0.8932


Training: 100%|██████████| 3711/3711 [38:53<00:00,  1.59it/s, Loss=0.495, Accuracy=0.85] 


Predicted Labels: [0 0 0 ... 0 0 0]
True Labels: [0 0 0 ... 0 2 0]
Accuracy: 0.8968, Avg Loss: 0.2796, F1 Score: 0.7217, Precision: 0.7516, Recall: 0.7021
Epoch 4/50 - Train Loss: 0.2715 - Train Accuracy: 0.8941 - Validation Loss: 0.2796 - Validation Accuracy: 0.8968


Training: 100%|██████████| 3711/3711 [38:48<00:00,  1.59it/s, Loss=0.127, Accuracy=0.95] 


Predicted Labels: [0 0 0 ... 0 0 0]
True Labels: [0 0 0 ... 0 2 0]
Accuracy: 0.8980, Avg Loss: 0.2828, F1 Score: 0.7153, Precision: 0.7683, Recall: 0.6892
Epoch 5/50 - Train Loss: 0.2601 - Train Accuracy: 0.8985 - Validation Loss: 0.2828 - Validation Accuracy: 0.8980


Training: 100%|██████████| 3711/3711 [39:37<00:00,  1.56it/s, Loss=0.163, Accuracy=0.925]


Predicted Labels: [0 0 0 ... 0 0 0]
True Labels: [0 0 0 ... 0 2 0]
Accuracy: 0.8987, Avg Loss: 0.2767, F1 Score: 0.7257, Precision: 0.7551, Recall: 0.7063
Epoch 6/50 - Train Loss: 0.2508 - Train Accuracy: 0.9024 - Validation Loss: 0.2767 - Validation Accuracy: 0.8987


Training: 100%|██████████| 3711/3711 [38:56<00:00,  1.59it/s, Loss=0.281, Accuracy=0.875] 


Predicted Labels: [0 0 0 ... 0 0 0]
True Labels: [0 0 0 ... 0 2 0]
Accuracy: 0.9007, Avg Loss: 0.2774, F1 Score: 0.7207, Precision: 0.7789, Recall: 0.6928
Epoch 7/50 - Train Loss: 0.2416 - Train Accuracy: 0.9067 - Validation Loss: 0.2774 - Validation Accuracy: 0.9007


Training: 100%|██████████| 3711/3711 [38:39<00:00,  1.60it/s, Loss=0.197, Accuracy=0.9]  


Predicted Labels: [0 0 0 ... 0 0 0]
True Labels: [0 0 0 ... 0 2 0]
Accuracy: 0.9000, Avg Loss: 0.2796, F1 Score: 0.7284, Precision: 0.7490, Recall: 0.7131
Epoch 8/50 - Train Loss: 0.2339 - Train Accuracy: 0.9094 - Validation Loss: 0.2796 - Validation Accuracy: 0.9000


Training: 100%|██████████| 3711/3711 [2:37:18<00:00,  2.54s/it, Loss=0.411, Accuracy=0.9]     


Predicted Labels: [0 0 0 ... 0 0 0]
True Labels: [0 0 0 ... 0 2 0]
Accuracy: 0.9010, Avg Loss: 0.2898, F1 Score: 0.7189, Precision: 0.7851, Recall: 0.6899
Epoch 9/50 - Train Loss: 0.2267 - Train Accuracy: 0.9126 - Validation Loss: 0.2898 - Validation Accuracy: 0.9010


Training: 100%|██████████| 3711/3711 [3:20:30<00:00,  3.24s/it, Loss=0.168, Accuracy=0.95]    


Predicted Labels: [0 0 0 ... 0 0 0]
True Labels: [0 0 0 ... 0 2 0]
Accuracy: 0.8997, Avg Loss: 0.2758, F1 Score: 0.7305, Precision: 0.7599, Recall: 0.7116
Epoch 10/50 - Train Loss: 0.2200 - Train Accuracy: 0.9153 - Validation Loss: 0.2758 - Validation Accuracy: 0.8997


Training: 100%|██████████| 3711/3711 [3:25:36<00:00,  3.32s/it, Loss=0.118, Accuracy=0.95]    


Predicted Labels: [0 0 0 ... 0 0 0]
True Labels: [0 0 0 ... 0 2 0]
Accuracy: 0.9008, Avg Loss: 0.2801, F1 Score: 0.7317, Precision: 0.7657, Recall: 0.7105
Epoch 11/50 - Train Loss: 0.2123 - Train Accuracy: 0.9183 - Validation Loss: 0.2801 - Validation Accuracy: 0.9008


Training: 100%|██████████| 3711/3711 [3:25:27<00:00,  3.32s/it, Loss=0.372, Accuracy=0.925]   


Predicted Labels: [0 0 0 ... 0 0 0]
True Labels: [0 0 0 ... 0 2 0]
Accuracy: 0.9025, Avg Loss: 0.2861, F1 Score: 0.7375, Precision: 0.7649, Recall: 0.7186
Epoch 12/50 - Train Loss: 0.2063 - Train Accuracy: 0.9213 - Validation Loss: 0.2861 - Validation Accuracy: 0.9025


Training: 100%|██████████| 3711/3711 [38:56<00:00,  1.59it/s, Loss=0.206, Accuracy=0.925] 


Predicted Labels: [0 0 0 ... 0 0 0]
True Labels: [0 0 0 ... 0 2 0]
Accuracy: 0.9008, Avg Loss: 0.2887, F1 Score: 0.7303, Precision: 0.7686, Recall: 0.7075
Epoch 13/50 - Train Loss: 0.2002 - Train Accuracy: 0.9235 - Validation Loss: 0.2887 - Validation Accuracy: 0.9008


Training: 100%|██████████| 3711/3711 [38:40<00:00,  1.60it/s, Loss=0.176, Accuracy=0.9]   


Predicted Labels: [0 0 0 ... 0 0 0]
True Labels: [0 0 0 ... 0 2 0]
Accuracy: 0.8992, Avg Loss: 0.2923, F1 Score: 0.7333, Precision: 0.7696, Recall: 0.7114
Epoch 14/50 - Train Loss: 0.1929 - Train Accuracy: 0.9265 - Validation Loss: 0.2923 - Validation Accuracy: 0.8992


Training: 100%|██████████| 3711/3711 [38:41<00:00,  1.60it/s, Loss=0.178, Accuracy=0.875] 


Predicted Labels: [0 0 0 ... 0 0 0]
True Labels: [0 0 0 ... 0 2 0]
Accuracy: 0.8992, Avg Loss: 0.2982, F1 Score: 0.7288, Precision: 0.7732, Recall: 0.7042
Epoch 15/50 - Train Loss: 0.1872 - Train Accuracy: 0.9289 - Validation Loss: 0.2982 - Validation Accuracy: 0.8992
No improvement in validation accuracy for 3 epochs. Early stopping...


OrderedDict([('electra.embeddings.position_ids',
              tensor([[  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
                        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,
                        28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,  41,
                        42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,  53,  54,  55,
                        56,  57,  58,  59,  60,  61,  62,  63,  64,  65,  66,  67,  68,  69,
                        70,  71,  72,  73,  74,  75,  76,  77,  78,  79,  80,  81,  82,  83,
                        84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,  96,  97,
                        98,  99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
                       112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125,
                       126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139,
                     

## 시각화 코드

In [34]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# 시각화
fig = make_subplots(rows=2, cols=2, subplot_titles=("Train Loss", "Train Accuracy", "Validation Loss", "Validation Accuracy"))

# Train Loss 그래프
fig.add_trace(
    go.Scatter(x=list(range(len(train_losses_epoch))), y=train_losses_epoch, mode='lines', name='Train Loss'),
    row=1, col=1
)
fig.update_xaxes(title_text="Iterations", row=1, col=1)
fig.update_yaxes(title_text="Loss", row=1, col=1)

# Train Accuracy 그래프
fig.add_trace(
    go.Scatter(x=list(range(len(train_accs_epoch))), y=train_accs_epoch, mode='lines', name='Train Accuracy'),
    row=1, col=2
)
fig.update_xaxes(title_text="Iterations", row=1, col=2)
fig.update_yaxes(title_text="Accuracy", row=1, col=2)

# Validation Loss 그래프
fig.add_trace(
    go.Scatter(x=list(range(len(val_losses_epoch))), y=val_losses_epoch, mode='lines', name='Validation Loss'),
    row=2, col=1
)
fig.update_xaxes(title_text="Iterations", row=2, col=1)
fig.update_yaxes(title_text="Loss", row=2, col=1)

# Validation Accuracy 그래프
fig.add_trace(
    go.Scatter(x=list(range(len(val_accs_epoch))), y=val_accs_epoch, mode='lines', name='Validation Accuracy'),
    row=2, col=2
)
fig.update_xaxes(title_text="Iterations", row=2, col=2)
fig.update_yaxes(title_text="Accuracy", row=2, col=2)

fig.update_layout(title="Training Progress")
fig.show()

In [35]:
# 모델의 상태 딕셔너리를 얻어옵니다.
model_state_dict = model.state_dict()

# 모델 상태 딕셔너리를 파일로 저장합니다.
torch.save(model_state_dict, "Ternary_model_state_dict_learning_New_epoch_15.pt")

# 옵티마이저의 상태 딕셔너리를 얻어옵니다.
optimizer_state_dict = optimizer.state_dict()

# 옵티마이저 상태 딕셔너리를 파일로 저장합니다.
torch.save(optimizer_state_dict, "Ternary_optimizer_state_learning_dict_New_epoch_15.pt")

In [36]:
# 검증 데이터로 평가 수행
val_accuracy = eval_fn(val_dataloader, model, loss_fn, device)
print("Validation Accuracy:", val_accuracy[0])
print("Validation Loss:", val_accuracy[1])

Predicted Labels: [0 0 0 ... 0 0 0]
True Labels: [0 0 0 ... 0 2 0]
Accuracy: 0.8992, Avg Loss: 0.2982, F1 Score: 0.7288, Precision: 0.7732, Recall: 0.7042
Validation Accuracy: 0.8992167101827676
Validation Loss: 0.2982346855890777


## 실제로 예측해보기

In [1]:
import torch
import torch.nn as nn
from transformers import ElectraTokenizer, ElectraForSequenceClassification
import pandas as pd
from tqdm import tqdm 

In [2]:
# koELECTRA 토크나이저 불러오기
tokenizer = ElectraTokenizer.from_pretrained("koelectra-base-v3-discriminator")

In [3]:
''' 모델가중치를 불러오거나 체크포인트를 불러올 때 실행 '''

# 모델의 상태 딕셔너리를 로드합니다.
model_state_dict = torch.load("Ternary_model_state_dict_learning_New_epoch_15.pt")

# 모델을 생성하고 상태를 로드합니다.
model = ElectraForSequenceClassification.from_pretrained("koelectra-base-v3-discriminator", num_labels=3)
model.load_state_dict(model_state_dict)

# 옵티마이저의 상태 딕셔너리를 로드합니다.
optimizer_state_dict = torch.load("Ternary_optimizer_state_learning_dict_New_epoch_15.pt")

Some weights of the model checkpoint at koelectra-base-v3-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at koelectra-base-v3-discriminator and are newly initialized: ['classifier.out_p

In [4]:
# 변경하고자 하는 Dropout 비율
new_dropout_rate = 0.2

# 모든 Dropout 레이어의 비율 변경
for name, module in model.named_modules():
    if isinstance(module, torch.nn.Dropout):
        module.p = new_dropout_rate


In [5]:
# 장치 설정 (GPU 사용을 위해)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

ElectraForSequenceClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(35000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.2, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0): ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.2, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm

In [6]:
# 한국어 문장을 입력으로 받아서 예측 라벨을 출력하는 함수
def predict_label(sentence, model, tokenizer, device):
    model.eval()
    with torch.no_grad():
        inputs = tokenizer(sentence, return_tensors='pt', truncation=True, padding=True)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        outputs = model(**inputs)
        logits = outputs.logits
        predicted_label = torch.argmax(logits, dim=1).item()
        return predicted_label

In [7]:
df = pd.read_csv("../dataset/사랑이별_커뮤니티_챗봇데이터.csv", encoding='UTF-8')

In [8]:
# 한국어 문장 입력 받기
korean_sentences = df['sentence'].tolist()

# 예측 라벨 출력
emotion_labels = {0:'부정', 1:'중립', 2:'긍정'}
predicted_label = [emotion_labels[predict_label(korean_sentence, model, tokenizer, device)] for korean_sentence in tqdm(korean_sentences)]
df['label'] = predicted_label

100%|██████████| 11823/11823 [02:58<00:00, 66.11it/s]


In [9]:
df.rename(columns={'label': 'predicted_sentiment'}, inplace=True)
df = df[['sentence','predicted_sentiment']]
df
df.to_csv("Ternary_classification_epoch_15_prediction_New.csv", index=False, encoding='UTF-8')

In [43]:
df['predicted_sentiment'].value_counts()

부정    5959
긍정    5741
중립     123
Name: predicted_sentiment, dtype: int64