In [1]:
!pip install torch
!pip install transformers
!pip install sentencepiece



In [2]:
!git clone https://github.com/SKTBrain/KoBERT.git

fatal: destination path 'KoBERT' already exists and is not an empty directory.


In [3]:
!pip install 'git+https://github.com/SKTBrain/KoBERT.git#egg=kobert_tokenizer&subdirectory=kobert_hf'

Collecting kobert_tokenizer
  Cloning https://github.com/SKTBrain/KoBERT.git to /tmp/pip-install-8fy63cco/kobert-tokenizer_60fb8b3cfdd348a2a239be06482bb252
  Running command git clone --filter=blob:none --quiet https://github.com/SKTBrain/KoBERT.git /tmp/pip-install-8fy63cco/kobert-tokenizer_60fb8b3cfdd348a2a239be06482bb252
  Resolved https://github.com/SKTBrain/KoBERT.git to commit 47a69af87928fc24e20f571fe10c3cc9dd9af9a3
  Preparing metadata (setup.py) ... [?25ldone
[?25h

In [4]:
from kobert_tokenizer import KoBERTTokenizer

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, AdamW, BertModel
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from tqdm import tqdm

In [6]:
import pandas as pd

# csv 파일을 읽어오기
data = pd.read_csv("restaurant_data2.csv")

In [7]:
data_only = data[["발화문", "인텐트"]]

In [8]:
from transformers import BertTokenizerFast

# 토크나이저 초기화
tokenizer = KoBERTTokenizer.from_pretrained('skt/kobert-base-v1')
# 데이터 임베딩
embedding = tokenizer(data_only['발화문'].tolist(), truncation=True, padding=True)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'XLNetTokenizer'. 
The class this function is called from is 'KoBERTTokenizer'.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [9]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
data_only['인텐트'] = le.fit_transform(data_only['인텐트'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_only['인텐트'] = le.fit_transform(data_only['인텐트'])


In [10]:
input_ids = embedding['input_ids']
attention_masks = embedding['attention_mask']
labels = data_only['인텐트'].to_numpy()

In [11]:
class MenuDataset(Dataset):
    def __init__(self, input_ids, attention_masks, labels):
        self.input_ids = [torch.tensor(ids, dtype=torch.long) for ids in input_ids]
        self.attention_masks = [torch.tensor(mask, dtype=torch.long) for mask in attention_masks]
        self.labels = labels

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_masks[idx],
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

In [12]:
# 데이터셋 생성
dataset = MenuDataset(input_ids=input_ids, attention_masks=attention_masks, labels=labels)

# 데이터셋을 학습용과 검증용으로 분리
train_data, val_data = train_test_split(dataset, test_size=0.2)

In [13]:
# hyper parmeter
epochs= 5
batch_size=32
lr = 2e-5

In [14]:
# DataLoader 생성
train_data_loader = DataLoader(train_data, batch_size=batch_size)
val_data_loader = DataLoader(val_data, batch_size=batch_size)

In [15]:
# PyTorch의 nn.Module 클래스를 상속받아 BertClassifier 클래스를 정의합니다.
# 이 클래스는 BERT 모델을 이용한 분류 모델을 구현한 것입니다.
class BertClassifier(nn.Module):
    # 초기화 메소드에서는 분류할 라벨의 개수(num_labels)와 드롭아웃 비율(dropout_rate)를 인자로 받습니다.
    # 'bert-base-uncased'라는 이름의 사전 훈련된 모델을 사용하여 BERT 모델을 초기화하고,
    # 드롭아웃 레이어와 선형 레이어를 추가합니다.
    def __init__(self, num_labels, dropout_rate=0.3):
        super(BertClassifier, self).__init__()
        self.bert = BertModel.from_pretrained('skt/kobert-base-v1')
        self.dropout = nn.Dropout(dropout_rate)
        self.linear = nn.Linear(768, num_labels)

    # forward 메소드는 입력 데이터(input_ids, attention_mask)를 받아 BERT 모델을 통과시키고,
    # 그 결과를 드롭아웃 레이어와 선형 레이어를 통과시켜 최종 결과를 반환합니다.
    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs['pooler_output']
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        return linear_output


In [16]:
# 모델 생성
num_labels = len(np.unique(data_only['인텐트']))
model = BertClassifier(num_labels)

# GPU 설정
device = torch.device('cuda')
model = model.to(device)

# 옵티마이저와 손실 함수 설정
optimizer = AdamW(model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss().to(device)

# 학습 및 검증
for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs}")
    model.train()
    for batch in tqdm(train_data_loader, desc="Training"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    # 검증 데이터 평가
    model.eval()
    val_losses = []
    val_predictions = []
    val_truths = []
    
    for batch in tqdm(val_data_loader, desc="Validating"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
    
        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            val_loss = criterion(outputs, labels)
    
        val_losses.append(val_loss.item())
        val_predictions.extend(torch.argmax(outputs, dim=1).cpu().detach().numpy().tolist())
        val_truths.extend(labels.cpu().detach().numpy().tolist())
    
    val_loss = sum(val_losses) / len(val_losses)
    val_acc = accuracy_score(val_truths, val_predictions)
    val_f1 = f1_score(val_truths, val_predictions, average='weighted')
    val_precision = precision_score(val_truths, val_predictions, average='weighted')
    val_recall = recall_score(val_truths, val_predictions, average='weighted')
    
    print(f"Validation Loss: {val_loss:.4f} Accuracy: {val_acc:.4f} F1-score: {val_f1:.4f} Precision: {val_precision:.4f} Recall: {val_recall:.4f}")




Epoch 1/5


Training: 100%|██████████| 7260/7260 [11:25<00:00, 10.59it/s]
Validating: 100%|██████████| 1815/1815 [00:59<00:00, 30.48it/s]
  _warn_prf(average, modifier, msg_start, len(result))


Validation Loss: 0.6446 Accuracy: 0.7885 F1-score: 0.7786 Precision: 0.7833 Recall: 0.7885
Epoch 2/5


Training: 100%|██████████| 7260/7260 [11:25<00:00, 10.59it/s]
Validating: 100%|██████████| 1815/1815 [00:59<00:00, 30.53it/s]
  _warn_prf(average, modifier, msg_start, len(result))


Validation Loss: 0.6198 Accuracy: 0.7950 F1-score: 0.7856 Precision: 0.7912 Recall: 0.7950
Epoch 3/5


Training: 100%|██████████| 7260/7260 [11:25<00:00, 10.59it/s]
Validating: 100%|██████████| 1815/1815 [00:59<00:00, 30.58it/s]
  _warn_prf(average, modifier, msg_start, len(result))


Validation Loss: 0.6169 Accuracy: 0.8001 F1-score: 0.7905 Precision: 0.7952 Recall: 0.8001
Epoch 4/5


Training: 100%|██████████| 7260/7260 [11:25<00:00, 10.59it/s]
Validating: 100%|██████████| 1815/1815 [00:59<00:00, 30.54it/s]
  _warn_prf(average, modifier, msg_start, len(result))


Validation Loss: 0.6187 Accuracy: 0.8002 F1-score: 0.7920 Precision: 0.7948 Recall: 0.8002
Epoch 5/5


Training: 100%|██████████| 7260/7260 [11:25<00:00, 10.60it/s]
Validating: 100%|██████████| 1815/1815 [00:59<00:00, 30.56it/s]
  _warn_prf(average, modifier, msg_start, len(result))


Validation Loss: 0.6293 Accuracy: 0.8024 F1-score: 0.7969 Precision: 0.7986 Recall: 0.8024


In [17]:
def predict_intent(text, model, tokenizer):
    # 텍스트를 토크나이즈하고 BERT 입력 형식에 맞게 변환
    inputs = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=128,
        padding='max_length',
        return_tensors='pt',
        return_token_type_ids=False
    )
    
    # 각 텐서를 GPU로 이동
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
    
    # 모델의 예측 생성
    with torch.no_grad():
        outputs = model(**inputs)
    
    # 가장 높은 확률을 가진 클래스의 인덱스를 가져옴
    _, predicted = torch.max(outputs, dim=1)
    
    # 예측된 인덱스를 의도로 변환
    # 이 부분은 실제 의도와 인덱스를 매핑하는 방법에 따라 다르게 작성해야 합니다.
    intent = le.inverse_transform([predicted.item()])[0]
    
    return intent


In [20]:
# 테스트
text = input()
predicted_intent = predict_intent(text, model, tokenizer)
print(predicted_intent)

 싸이 버거 얼마인가요?


제품_가격_질문


In [26]:
# 모델의 state_dict 저장
# torch.save(model.state_dict(), "/home/ubuntu/Project/HG/saved_model3/Bert_model3.pth")

In [27]:
#import pickle

# LabelEncoder 저장
# with open("/home/ubuntu/Project/HG/saved_model3/label_encoder.pkl", "wb") as f:
#    pickle.dump(le, f)

In [23]:
# 모델 저장
model_path = "/home/ubuntu/Project/HG/saved_model_BERT"  # 모델을 저장할 경로를 지정해주세요.
model.bert.save_pretrained(model_path)

# 토크나이저 저장
tokenizer_path = "/home/ubuntu/Project/HG/saved_model_BERT"  # 토크나이저를 저장할 경로를 지정해주세요.
tokenizer.save_pretrained(tokenizer_path)

# 레이블 인코더 저장
import pickle
label_encoder_file = "/home/ubuntu/Project/HG/saved_model_BERT/label_encoder.pkl"  # 레이블 인코더를 저장할 경로를 지정해주세요.
pickle.dump(le, open(label_encoder_file, 'wb'))