In [None]:
!pip install kobert-transformers

In [9]:
!pip install torch



In [None]:
!pip install --upgrade kobert-transformers

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# 1. 필요한 데이터 가져오기
import pandas as pd

data = pd.read_excel("/content/drive/MyDrive/젯봇/data.xlsx", engine="openpyxl")

data.head()

Unnamed: 0,sentence,label_idx
0,네,0
1,네 접시 색깔은 다른데 가격은 똑같아요,1997
2,네 그럼요,0
3,오늘 회는 뭐가 좋나요,2349
4,여기 배달도 하시는가요?,1750


In [14]:
# 2. 데이터 분할 (train 훈련 / test 검증)
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(data, test_size = 0.2, random_state = 42)
train_data.shape, test_data.shape

((12580, 2), (3146, 2))

In [26]:
from kobert_transformers import get_kobert_model, get_tokenizer
from transformers import BertTokenizer
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertForSequenceClassification

# tokenizer = BertTokenizer.from_pretrained('kykim/bert-kor-base')
# model = BertModel.from_pretrained('kykim/bert-kor-base')

# 토크나이저, 모델 설정
tokenizer = BertTokenizer.from_pretrained('monologg/kobert')
model = BertForSequenceClassification.from_pretrained('monologg/kobert', num_labels=2998)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
# 3. 전처리 : sentence의 한국어를 숫자벡터로 변환하는 토큰화 작업
# Hugging face의 BERT모델 활용

# 데이터셋 정의
class TextDataset(Dataset):
  def __init__(self, sentences, labels, tokenizer, max_len):
    self.sentences = sentences
    self.labels = labels
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    return len(self.sentences)

  def __getitem__(self, item):
    sentence = str(self.sentences[item])
    label = self.labels[item]
    encoding = self.tokenizer.encode_plus(
        sentence,
        add_special_tokens = True,
        max_length = self.max_len,
        return_token_type_ids = False,
        padding = 'max_length',
        truncation = True,
        return_attention_mask = True,
        return_tensors = 'pt'
      )

    return {
        'sentence' : sentence,
        'input_ids' : encoding['input_ids'].flatten(),
        'attention_mask' : encoding['attention_mask'].flatten(),
        'labels' : torch.tensor(label, dtype = torch.long)
      }

In [28]:
# 데이터 셋 생성
max_len = 128    # 최대 시퀀스 길이 설정
train_dataset = TextDataset(
    train_data['sentence'].to_numpy(),
    train_data['label_idx'].to_numpy(),
    tokenizer,
    max_len
)

test_dataset = TextDataset(
    test_data['sentence'].to_numpy(),
    test_data['label_idx'].to_numpy(),
    tokenizer,
    max_len
)

In [29]:
# 데이터 로더 생성
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size = batch_size, shuffle = True)
test_loader = DataLoader(test_dataset, batch_size = batch_size, shuffle = False)

In [30]:
# BERT 모델 로드 및 학습 설정
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#model = BertForSequenceClassification.from_pretrained("klue/bert-base", num_labels = len(data['label_idx'].unique()))
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(8002, 768, padding_idx=1)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, 

In [31]:
from transformers import AdamW
from torch.nn import CrossEntropyLoss

# 옵티마이저 설정
optimizer = torch.optim.AdamW(model.parameters(),lr=5e-5)
# 손실함수 설정
loss_fn = CrossEntropyLoss()

In [32]:
from tqdm import tqdm
import numpy as np

# 훈련 함수
def train_epoch(model, data_loader, loss_fn, optimizer, device, n_examples):
  model = model.train()
  losses = []
  correct_predictions = 0

  for i in tqdm(data_loader, desc=f'train epoch : {epoch + 1}'):
    input_ids = i["input_ids"].to(device)
    attention_mask = i["attention_mask"].to(device)
    labels = i["labels"].to(device)

    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    _, preds = torch.max(outputs.logits, dim=1)
    loss = loss_fn(outputs.logits, labels)

    correct_predictions += torch.sum(preds == labels)
    losses.append(loss.item())

    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

  return correct_predictions.double() / n_examples, np.mean(losses)


In [33]:
# 평가 함수
def eval_model(model, data_loader, loss_fn, device, n_examples):
    model = model.eval()
    losses = []
    correct_predictions = 0

    with torch.no_grad():
        for i in tqdm(data_loader,  desc=f'test epoch : {epoch + 1}'):
            input_ids = i["input_ids"].to(device)
            attention_mask = i["attention_mask"].to(device)
            labels = i["labels"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs.logits, dim=1)
            loss = loss_fn(outputs.logits, labels)

            correct_predictions += torch.sum(preds == labels)
            losses.append(loss.item())

    return correct_predictions.double() / n_examples, np.mean(losses)


In [34]:
# 모델 훈련 및 평가 루프
epochs = 3
for epoch in range(epochs):
    print(f'Epoch {epoch + 1}/{epochs}')
    print('-' * 10)

    train_acc, train_loss = train_epoch(model, train_loader, loss_fn, optimizer, device, len(train_dataset))
    print(f'Train loss {train_loss} accuracy {train_acc}')

    val_acc, val_loss = eval_model(model, test_loader, loss_fn, device, len(test_dataset))
    print(f'Val loss {val_loss} accuracy {val_acc}')
    print()

Epoch 1/3
----------


train epoch : 1: 100%|██████████| 394/394 [01:20<00:00,  4.92it/s]


Train loss 6.061377318377422 accuracy 0.23593004769475356


train epoch : 1: 100%|██████████| 99/99 [00:07<00:00, 13.96it/s]


Val loss 5.799241865524138 accuracy 0.23935155753337573

Epoch 2/3
----------


train epoch : 2: 100%|██████████| 394/394 [01:20<00:00,  4.92it/s]


Train loss 5.6095779135747605 accuracy 0.23950715421303656


train epoch : 2:  65%|██████▍   | 64/99 [00:04<00:02, 13.69it/s]


KeyboardInterrupt: ignored

In [None]:
# from tqdm import tqdm
# from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# epochs = 20
# for epoch in range(epochs):
#   # 훈련
#   model.train()
#   for batch in tqdm(train_dataloader, desc=f'Train epoch : {epoch + 1}'):
#     input_ids = batch['input_ids'].to(device)
#     attention_mask = batch['attention_mask'].to(device)
#     labels = batch['labels'].to(device)

#     # 포워드 태우기
#     outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
#     loss = outputs.loss

#     # 백워드 태우기
#     optimizer.zero_grad()
#     loss.backward()
#     optimizer.step()

#   print(f'Epoch {epoch +1}/{epochs} completed')

#   # 평가
#   model.eval()
#   total_loss = 0
#   all_preds = []
#   all_labels = []
#   with torch.no_grad():
#     for batch in tqdm(test_dataloader, desc=f'Test epoch : {epoch +1}'):
#       input_ids = batch['input_ids'].to(device)
#       attention_mask = batch['attention_mask'].to(device)
#       labels = batch['labels'].to(device)

#       outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
#       loss = outputs.loss
#       total_loss += loss.item()

#       # 예측
#       _, preds = torch.max(outputs.logits, dim=1)
#       all_preds.extend(preds.tolist())
#       all_labels.extend(labels.tolist())

#     avg_loss = total_loss / len(test_dataloader)
#     avg_accuracy = accuracy_score(all_labels, all_preds)
#     avg_precision = precision_score(all_labels, all_preds, average = 'weighted')
#     avg_recall = recall_score(all_labels, all_preds, average = 'weighted')
#     avg_f1 = f1_score(all_labels, all_preds, average = 'weighted')

#     print(f'Test Loss : {avg_loss}')
#     print(f'Test accuracy : {avg_accuracy}')
#     print(f'Test f1 score : {avg_f1}')
#     print(f'Test precision : {avg_precision}')
#     print(f'Test recall : {avg_recall}')