In [1]:
!pip install torch
!pip install transformers
!pip install sentencepiece



In [30]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, AdamW, BertModel
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from tqdm import tqdm

In [31]:
import pandas as pd

# csv 파일을 읽어오기
data = pd.read_csv("restaurant_data.csv")

In [32]:
data_only = data[["발화문", "인텐트"]]

In [33]:
from transformers import BertTokenizerFast

# 토크나이저 초기화
tokenizer = BertTokenizer.from_pretrained('beomi/kcbert-base')
# 데이터 인코딩
encodings = tokenizer(data_only['발화문'].tolist(), truncation=True, padding=True)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/250k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/619 [00:00<?, ?B/s]

In [34]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
data_only['인텐트'] = le.fit_transform(data_only['인텐트'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_only['인텐트'] = le.fit_transform(data_only['인텐트'])


In [35]:
input_ids = encodings['input_ids']
attention_masks = encodings['attention_mask']
labels = data_only['인텐트'].to_numpy()

In [36]:
class MenuDataset(Dataset):
    def __init__(self, input_ids, attention_masks, labels):
        self.input_ids = [torch.tensor(ids, dtype=torch.long) for ids in input_ids]
        self.attention_masks = [torch.tensor(mask, dtype=torch.long) for mask in attention_masks]
        self.labels = labels

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_masks[idx],
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

In [37]:
# 데이터셋 생성
dataset = MenuDataset(input_ids=input_ids, attention_masks=attention_masks, labels=labels)

# 데이터셋을 학습용과 검증용으로 분리
train_data, val_data = train_test_split(dataset, test_size=0.2)

In [38]:
# hyper parmeter
epochs= 5
batch_size=32
lr = 2e-5

In [39]:
# DataLoader 생성
train_data_loader = DataLoader(train_data, batch_size=batch_size)
val_data_loader = DataLoader(val_data, batch_size=batch_size)

In [40]:
# PyTorch의 nn.Module 클래스를 상속받아 BertClassifier 클래스를 정의합니다.
# 이 클래스는 BERT 모델을 이용한 분류 모델을 구현한 것입니다.
class BertClassifier(nn.Module):
    # 초기화 메소드에서는 분류할 라벨의 개수(num_labels)와 드롭아웃 비율(dropout_rate)를 인자로 받습니다.
    # 'bert-base-uncased'라는 이름의 사전 훈련된 모델을 사용하여 BERT 모델을 초기화하고,
    # 드롭아웃 레이어와 선형 레이어를 추가합니다.
    def __init__(self, num_labels, dropout_rate=0.3):
        super(BertClassifier, self).__init__()
        self.bert = BertModel.from_pretrained('beomi/kcbert-base')
        self.dropout = nn.Dropout(dropout_rate)
        self.linear = nn.Linear(768, num_labels)

    # forward 메소드는 입력 데이터(input_ids, attention_mask)를 받아 BERT 모델을 통과시키고,
    # 그 결과를 드롭아웃 레이어와 선형 레이어를 통과시켜 최종 결과를 반환합니다.
    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs['pooler_output']
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        return linear_output


In [41]:
# 모델 생성
num_labels = len(np.unique(data_only['인텐트']))
model = BertClassifier(num_labels)

# GPU 설정
device = torch.device('cuda')
model = model.to(device)

# 옵티마이저와 손실 함수 설정
optimizer = AdamW(model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss().to(device)

# 학습 및 검증
for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs}")
    model.train()
    for batch in tqdm(train_data_loader, desc="Training"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    # 검증 데이터 평가
    model.eval()
    val_losses = []
    val_predictions = []
    val_truths = []
    
    for batch in tqdm(val_data_loader, desc="Validating"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
    
        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            val_loss = criterion(outputs, labels)
    
        val_losses.append(val_loss.item())
        val_predictions.extend(torch.argmax(outputs, dim=1).cpu().detach().numpy().tolist())
        val_truths.extend(labels.cpu().detach().numpy().tolist())
    
    val_loss = sum(val_losses) / len(val_losses)
    val_acc = accuracy_score(val_truths, val_predictions)
    val_f1 = f1_score(val_truths, val_predictions, average='weighted')
    val_precision = precision_score(val_truths, val_predictions, average='weighted')
    val_recall = recall_score(val_truths, val_predictions, average='weighted')
    
    print(f"Validation Loss: {val_loss:.4f} Accuracy: {val_acc:.4f} F1-score: {val_f1:.4f} Precision: {val_precision:.4f} Recall: {val_recall:.4f}")


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]



Epoch 1/5


Training: 100%|██████████| 7260/7260 [08:55<00:00, 13.55it/s]
Validating: 100%|██████████| 1815/1815 [00:43<00:00, 41.52it/s]
  _warn_prf(average, modifier, msg_start, len(result))


Validation Loss: 1.5265 Accuracy: 0.5958 F1-score: 0.5707 Precision: 0.5779 Recall: 0.5958
Epoch 2/5


Training: 100%|██████████| 7260/7260 [08:55<00:00, 13.56it/s]
Validating: 100%|██████████| 1815/1815 [00:43<00:00, 41.66it/s]
  _warn_prf(average, modifier, msg_start, len(result))


Validation Loss: 1.4408 Accuracy: 0.6137 F1-score: 0.5951 Precision: 0.6069 Recall: 0.6137
Epoch 3/5


Training: 100%|██████████| 7260/7260 [08:55<00:00, 13.56it/s]
Validating: 100%|██████████| 1815/1815 [00:43<00:00, 41.69it/s]
  _warn_prf(average, modifier, msg_start, len(result))


Validation Loss: 1.4272 Accuracy: 0.6235 F1-score: 0.6086 Precision: 0.6253 Recall: 0.6235
Epoch 4/5


Training: 100%|██████████| 7260/7260 [08:56<00:00, 13.52it/s]
Validating: 100%|██████████| 1815/1815 [00:43<00:00, 41.67it/s]
  _warn_prf(average, modifier, msg_start, len(result))


Validation Loss: 1.4517 Accuracy: 0.6226 F1-score: 0.6102 Precision: 0.6224 Recall: 0.6226
Epoch 5/5


Training: 100%|██████████| 7260/7260 [08:55<00:00, 13.55it/s]
Validating: 100%|██████████| 1815/1815 [00:43<00:00, 41.59it/s]


Validation Loss: 1.4891 Accuracy: 0.6248 F1-score: 0.6138 Precision: 0.6228 Recall: 0.6248


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
def predict_intent(text, model, tokenizer):
    # 텍스트를 토크나이즈하고 BERT 입력 형식에 맞게 변환
    inputs = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=128,
        padding='max_length',
        return_tensors='pt',
        return_token_type_ids=False
    )
    
    # 각 텐서를 GPU로 이동
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
    
    # 모델의 예측 생성
    with torch.no_grad():
        outputs = model(**inputs)
    
    # 가장 높은 확률을 가진 클래스의 인덱스를 가져옴
    _, predicted = torch.max(outputs, dim=1)
    
    # 예측된 인덱스를 의도로 변환
    # 이 부분은 실제 의도와 인덱스를 매핑하는 방법에 따라 다르게 작성해야 합니다.
    intent = le.inverse_transform([predicted.item()])[0]
    
    return intent


In [73]:
# 테스트
text = input()
predicted_intent = predict_intent(text, model, tokenizer)
print(predicted_intent)

 배달료 따로 있나요?


배송_비용_질문


In [75]:
# 모델의 state_dict 저장
torch.save(model.state_dict(), "/home/ubuntu/Project/HG/saved_model2/Bert_model2.pth")

In [77]:
import pickle

# LabelEncoder 저장
with open("/home/ubuntu/Project/HG/saved_model2/label_encoder.pkl", "wb") as f:
    pickle.dump(le, f)