In [None]:
# 1. 필요한 데이터 가져오기
import pandas as pd

data = pd.read_excel("/content/drive/MyDrive/Project/data.xlsx", engine="openpyxl")

data.head()

In [24]:
# 2. 데이터 분할 (train 훈련 / test 검증)
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(data, test_size = 0.2, random_state = 42)
train_data.shape, test_data.shape

((12580, 2), (3146, 2))

In [None]:
!pip install transformers

In [26]:
# 3. 전처리 : sentence의 한국어를 숫자벡터로 변환하는 토큰화 작업
# Hugging face의 BERT모델 활용

from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
import torch

# 토크나이저 설정
tokenizer = BertTokenizer.from_pretrained("klue/bert-base")

# 데이터셋 정의
class TextDataset(Dataset):
  def __init__(self, sentences, labels, tokenizer, max_len):
    self.sentences = sentences
    self.labels = labels
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    return len(self.sentences)

  def __getitem__(self, item):
    sentence = str(self.sentences[item])
    label = self.labels[item]

    encoding = self.tokenizer.encode_plus(
        sentence,
        add_special_tokens = True,
        max_length = self.max_len,
        return_token_type_ids = False,
        padding = 'max_length',
        return_attention_mask = True,
        return_tensors = 'pt'
      )

    return {
        'sentence' : sentence,
        'input_ids' : encoding['input_ids'].flatten(),
        'attention_mask' : encoding['attention_mask'].flatten(),
        'labels' : torch.tensor(label, dtype = torch.long)
      }

In [27]:
# 데이터 셋 생성
max_len = 128    # 최대 시퀀스 길이 설정
train_dataset = TextDataset(
    train_data['sentence'].to_numpy(),
    train_data['label_idx'].to_numpy(),
    tokenizer,
    max_len
)

test_dataset = TextDataset(
    train_data['sentence'].to_numpy(),
    train_data['label_idx'].to_numpy(),
    tokenizer,
    max_len
)

In [33]:
# 데이터 로더 생성
batch_size = 64
train_dataloader = DataLoader(train_dataset, batch_size = batch_size, shuffle = True, num_workers=4)
test_dataloader = DataLoader(test_dataset, batch_size = batch_size, num_workers=4)



In [None]:
# BERT 모델 로드 및 학습 설정
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BertForSequenceClassification.from_pretrained("klue/bert-base", num_labels = len(data['label_idx'].unique()))
model.to(device)

In [35]:
from torch.optim import AdamW
# 옵티마이저 설정
optimizer = torch.optim.AdamW(model.parameters(),lr=5e-5)
# 손실함수 설정
loss_fn = torch.nn.CrossEntropyLoss()

In [None]:
from tqdm import tqdm
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

epochs = 20
for epoch in range(epochs):
  # 훈련
  model.train()
  for batch in tqdm(train_dataloader, desc=f'Train epoch : {epoch + 1}'):
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)

    # 포워드 태우기
    outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
    loss = outputs.loss

    # 백워드 태우기
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

  print(f'Epoch {epoch +1}/{epochs} completed')

  # 평가
  model.eval()
  total_loss = 0
  all_preds = []
  all_labels = []
  with torch.no_grad():
    for batch in tqdm(test_dataloader, desc=f'Test epoch : {epoch +1}'):
      input_ids = batch['input_ids'].to(device)
      attention_mask = batch['attention_mask'].to(device)
      labels = batch['labels'].to(device)

      outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
      loss = outputs.loss
      total_loss += loss.item()

      # 예측
      _, preds = torch.max(outputs.logits, dim=1)
      all_preds.extend(preds.tolist())
      all_labels.extend(labels.tolist())

    avg_loss = total_loss / len(test_dataloader)
    avg_accuracy = accuracy_score(all_labels, all_preds)
    avg_precision = precision_score(all_labels, all_preds, average = 'weighted')
    avg_recall = recall_score(all_labels, all_preds, average = 'weighted')
    avg_f1 = f1_score(all_labels, all_preds, average = 'weighted')

    print(f'Test Loss : {avg_loss}')
    print(f'Test accuracy : {avg_accuracy}')
    print(f'Test f1 score : {avg_f1}')
    print(f'Test precision : {avg_precision}')
    print(f'Test recall : {avg_recall}')

Train epoch : 1: 100%|██████████| 197/197 [03:48<00:00,  1.16s/it]


Epoch 1/20 completed


Test epoch : 1: 100%|██████████| 197/197 [01:23<00:00,  2.36it/s]
  _warn_prf(average, modifier, msg_start, len(result))


Test Loss : 1.8881846273005916
Test accuracy : 0.747933227344992
Test f1 score : 0.6752091605135122
Test precision : 0.6333528737084045
Test recall : 0.747933227344992


Train epoch : 2: 100%|██████████| 197/197 [03:46<00:00,  1.15s/it]


Epoch 2/20 completed


Test epoch : 2: 100%|██████████| 197/197 [01:23<00:00,  2.36it/s]
  _warn_prf(average, modifier, msg_start, len(result))


Test Loss : 1.6176815849875437
Test accuracy : 0.7918124006359301
Test f1 score : 0.7285484259032033
Test precision : 0.6920860910987342
Test recall : 0.7918124006359301


Train epoch : 3: 100%|██████████| 197/197 [03:46<00:00,  1.15s/it]


Epoch 3/20 completed


Test epoch : 3: 100%|██████████| 197/197 [01:23<00:00,  2.35it/s]
  _warn_prf(average, modifier, msg_start, len(result))


Test Loss : 1.3841876278673937
Test accuracy : 0.8345786963434022
Test f1 score : 0.782782311015297
Test precision : 0.7513151175534762
Test recall : 0.8345786963434022


Train epoch : 4: 100%|██████████| 197/197 [03:47<00:00,  1.15s/it]


Epoch 4/20 completed


Test epoch : 4: 100%|██████████| 197/197 [01:23<00:00,  2.36it/s]
  _warn_prf(average, modifier, msg_start, len(result))


Test Loss : 1.2146165637195412
Test accuracy : 0.8505564387917329
Test f1 score : 0.8026038037993255
Test precision : 0.7727451380671806
Test recall : 0.8505564387917329


Train epoch : 5: 100%|██████████| 197/197 [03:47<00:00,  1.15s/it]


Epoch 5/20 completed


Test epoch : 5: 100%|██████████| 197/197 [01:23<00:00,  2.35it/s]
  _warn_prf(average, modifier, msg_start, len(result))


Test Loss : 1.0469325735484283
Test accuracy : 0.8737678855325914
Test f1 score : 0.8331548785199377
Test precision : 0.8081755647090134
Test recall : 0.8737678855325914


Train epoch : 6: 100%|██████████| 197/197 [03:46<00:00,  1.15s/it]


Epoch 6/20 completed


Test epoch : 6: 100%|██████████| 197/197 [01:23<00:00,  2.36it/s]
  _warn_prf(average, modifier, msg_start, len(result))


Test Loss : 0.9141797219133619
Test accuracy : 0.8944356120826709
Test f1 score : 0.8618830164803948
Test precision : 0.8412842091723042
Test recall : 0.8944356120826709


Train epoch : 7: 100%|██████████| 197/197 [03:47<00:00,  1.15s/it]


Epoch 7/20 completed


Test epoch : 7: 100%|██████████| 197/197 [01:23<00:00,  2.35it/s]
  _warn_prf(average, modifier, msg_start, len(result))


Test Loss : 0.7985444194471776
Test accuracy : 0.9088235294117647
Test f1 score : 0.8804828999835314
Test precision : 0.8630614750661683
Test recall : 0.9088235294117647


Train epoch : 8: 100%|██████████| 197/197 [03:46<00:00,  1.15s/it]


Epoch 8/20 completed


Test epoch : 8: 100%|██████████| 197/197 [01:23<00:00,  2.36it/s]
  _warn_prf(average, modifier, msg_start, len(result))


Test Loss : 0.6978110087099414
Test accuracy : 0.9261526232114468
Test f1 score : 0.9028002601980476
Test precision : 0.8883622726297091
Test recall : 0.9261526232114468


Train epoch : 9: 100%|██████████| 197/197 [03:46<00:00,  1.15s/it]


Epoch 9/20 completed


Test epoch : 9: 100%|██████████| 197/197 [01:23<00:00,  2.37it/s]
  _warn_prf(average, modifier, msg_start, len(result))


Test Loss : 0.6044794139523192
Test accuracy : 0.9329093799682034
Test f1 score : 0.9118868367415395
Test precision : 0.8988462386582283
Test recall : 0.9329093799682034


Train epoch : 10: 100%|██████████| 197/197 [03:46<00:00,  1.15s/it]


Epoch 10/20 completed


Test epoch : 10: 100%|██████████| 197/197 [01:23<00:00,  2.37it/s]
  _warn_prf(average, modifier, msg_start, len(result))


Test Loss : 0.5151945416848671
Test accuracy : 0.946899841017488
Test f1 score : 0.9300283685088294
Test precision : 0.9191750780639759
Test recall : 0.946899841017488


Train epoch : 11: 100%|██████████| 197/197 [03:46<00:00,  1.15s/it]


Epoch 11/20 completed


Test epoch : 11: 100%|██████████| 197/197 [01:23<00:00,  2.37it/s]
  _warn_prf(average, modifier, msg_start, len(result))


Test Loss : 0.44699217348837006
Test accuracy : 0.9587440381558029
Test f1 score : 0.9458781522286038
Test precision : 0.9372582293945048
Test recall : 0.9587440381558029


Train epoch : 12: 100%|██████████| 197/197 [03:46<00:00,  1.15s/it]


Epoch 12/20 completed


Test epoch : 12: 100%|██████████| 197/197 [01:23<00:00,  2.36it/s]
  _warn_prf(average, modifier, msg_start, len(result))


Test Loss : 0.3849729005304085
Test accuracy : 0.9619236883942767
Test f1 score : 0.9508553186837257
Test precision : 0.9440731103204751
Test recall : 0.9619236883942767


Train epoch : 13: 100%|██████████| 197/197 [03:46<00:00,  1.15s/it]


Epoch 13/20 completed


Test epoch : 13: 100%|██████████| 197/197 [01:23<00:00,  2.37it/s]
  _warn_prf(average, modifier, msg_start, len(result))


Test Loss : 0.3303525051489699
Test accuracy : 0.9717806041335453
Test f1 score : 0.9642589412427751
Test precision : 0.959638311406844
Test recall : 0.9717806041335453


Train epoch : 14: 100%|██████████| 197/197 [03:46<00:00,  1.15s/it]


Epoch 14/20 completed


Test epoch : 14: 100%|██████████| 197/197 [01:23<00:00,  2.37it/s]
  _warn_prf(average, modifier, msg_start, len(result))


Test Loss : 0.2838189468014664
Test accuracy : 0.9744038155802862
Test f1 score : 0.9682515010700674
Test precision : 0.9642347050998004
Test recall : 0.9744038155802862


Train epoch : 15: 100%|██████████| 197/197 [03:46<00:00,  1.15s/it]


Epoch 15/20 completed


Test epoch : 15: 100%|██████████| 197/197 [01:23<00:00,  2.37it/s]
  _warn_prf(average, modifier, msg_start, len(result))


Test Loss : 0.24217756761966017
Test accuracy : 0.978060413354531
Test f1 score : 0.9736171302902843
Test precision : 0.9710548214036997
Test recall : 0.978060413354531


Train epoch : 16:   2%|▏         | 4/197 [00:04<03:48,  1.18s/it]