In [1]:
!pip install torch
!pip install transformers
!pip install sentencepiece



In [38]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, AdamW, BertModel
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from tqdm import tqdm

In [39]:
import pandas as pd

# csv 파일을 읽어오기
data = pd.read_csv("restaurant_data.csv")

In [40]:
data_only = data[["발화문", "인텐트"]]

In [41]:
from transformers import BertTokenizerFast

# 토크나이저 초기화
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# 데이터 인코딩
encodings = tokenizer(data_only['발화문'].tolist(), truncation=True, padding=True)

In [42]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
data_only['인텐트'] = le.fit_transform(data_only['인텐트'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_only['인텐트'] = le.fit_transform(data_only['인텐트'])


In [43]:
input_ids = encodings['input_ids']
attention_masks = encodings['attention_mask']
labels = data_only['인텐트'].to_numpy()

In [44]:
class MenuDataset(Dataset):
    def __init__(self, input_ids, attention_masks, labels):
        self.input_ids = [torch.tensor(ids, dtype=torch.long) for ids in input_ids]
        self.attention_masks = [torch.tensor(mask, dtype=torch.long) for mask in attention_masks]
        self.labels = labels

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_masks[idx],
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

In [45]:
# 데이터셋 생성
dataset = MenuDataset(input_ids=input_ids, attention_masks=attention_masks, labels=labels)

# 데이터셋을 학습용과 검증용으로 분리
train_data, val_data = train_test_split(dataset, test_size=0.2)

In [46]:
# hyper parmeter
epochs= 100
batch_size=32
lr = 5e-5

In [47]:
# DataLoader 생성
train_data_loader = DataLoader(train_data, batch_size=batch_size)
val_data_loader = DataLoader(val_data, batch_size=batch_size)

In [64]:
class BertClassifier(nn.Module):
    def __init__(self, num_labels, dropout_rate=0.3):
        super(BertClassifier, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(dropout_rate)
        self.linear = nn.Linear(768, num_labels)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs['pooler_output']
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        return linear_output


In [49]:
# 모델 생성
num_labels = len(np.unique(data_only['인텐트']))
model = BertClassifier(num_labels)

# GPU 설정
device = torch.device('cuda')
model = model.to(device)

# 옵티마이저와 손실 함수 설정
optimizer = AdamW(model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss().to(device)

# 학습 및 검증
for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs}")
    model.train()
    for batch in tqdm(train_data_loader, desc="Training"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    
    # 검증 데이터 평가
    model.eval()
    val_losses = []
    val_predictions = []
    val_truths = []
    
    for batch in tqdm(val_data_loader, desc="Validating"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
    
        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            val_loss = criterion(outputs, labels)
    
        val_losses.append(val_loss.item())
        val_predictions.extend(torch.argmax(outputs, dim=1).cpu().detach().numpy().tolist())
        val_truths.extend(labels.cpu().detach().numpy().tolist())
    
    val_loss = sum(val_losses) / len(val_losses)
    val_acc = accuracy_score(val_truths, val_predictions)
    val_f1 = f1_score(val_truths, val_predictions, average='weighted')
    val_precision = precision_score(val_truths, val_predictions, average='weighted')
    val_recall = recall_score(val_truths, val_predictions, average='weighted')
    
    print(f"Validation Loss: {val_loss:.4f} Accuracy: {val_acc:.4f} F1-score: {val_f1:.4f} Precision: {val_precision:.4f} Recall: {val_recall:.4f}")




Epoch 1/100


Training: 100%|██████████| 7260/7260 [28:26<00:00,  4.25it/s]
Validating: 100%|██████████| 1815/1815 [02:22<00:00, 12.73it/s]
  _warn_prf(average, modifier, msg_start, len(result))


Validation Loss: 2.0636 Accuracy: 0.4731 F1-score: 0.4493 Precision: 0.4631 Recall: 0.4731
Epoch 2/100


Training: 100%|██████████| 7260/7260 [28:26<00:00,  4.25it/s]
Validating: 100%|██████████| 1815/1815 [02:22<00:00, 12.73it/s]
  _warn_prf(average, modifier, msg_start, len(result))


Validation Loss: 1.9098 Accuracy: 0.5012 F1-score: 0.4862 Precision: 0.5067 Recall: 0.5012
Epoch 3/100


Training: 100%|██████████| 7260/7260 [28:27<00:00,  4.25it/s]
Validating: 100%|██████████| 1815/1815 [02:22<00:00, 12.74it/s]
  _warn_prf(average, modifier, msg_start, len(result))


Validation Loss: 1.8356 Accuracy: 0.5174 F1-score: 0.5046 Precision: 0.5255 Recall: 0.5174
Epoch 4/100


Training: 100%|██████████| 7260/7260 [28:27<00:00,  4.25it/s]
Validating: 100%|██████████| 1815/1815 [02:22<00:00, 12.73it/s]
  _warn_prf(average, modifier, msg_start, len(result))


Validation Loss: 1.8245 Accuracy: 0.5251 F1-score: 0.5135 Precision: 0.5337 Recall: 0.5251
Epoch 5/100


Training: 100%|██████████| 7260/7260 [28:26<00:00,  4.25it/s]
Validating: 100%|██████████| 1815/1815 [02:22<00:00, 12.76it/s]
  _warn_prf(average, modifier, msg_start, len(result))


Validation Loss: 1.8018 Accuracy: 0.5301 F1-score: 0.5193 Precision: 0.5433 Recall: 0.5301
Epoch 6/100


Training: 100%|██████████| 7260/7260 [28:27<00:00,  4.25it/s]
Validating: 100%|██████████| 1815/1815 [02:22<00:00, 12.73it/s]
  _warn_prf(average, modifier, msg_start, len(result))


Validation Loss: 1.8011 Accuracy: 0.5313 F1-score: 0.5238 Precision: 0.5478 Recall: 0.5313
Epoch 7/100


Training:  81%|████████  | 5872/7260 [23:00<05:26,  4.25it/s]


KeyboardInterrupt: 

In [65]:
def predict_intent(text, model, tokenizer):
    # 텍스트를 토크나이즈하고 BERT 입력 형식에 맞게 변환
    inputs = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=128,
        padding='max_length',
        return_tensors='pt',
        return_token_type_ids=False
    )
    
    # 각 텐서를 GPU로 이동
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
    
    # 모델의 예측 생성
    with torch.no_grad():
        outputs = model(**inputs)
    
    # 가장 높은 확률을 가진 클래스의 인덱스를 가져옴
    _, predicted = torch.max(outputs, dim=1)
    
    # 예측된 인덱스를 의도로 변환
    # 이 부분은 실제 의도와 인덱스를 매핑하는 방법에 따라 다르게 작성해야 합니다.
    intent = le.inverse_transform([predicted.item()])[0]
    
    return intent


In [73]:
# 테스트
text = input()
predicted_intent = predict_intent(text, model, tokenizer)
print(predicted_intent)

 배달료 따로 있나요?


배송_비용_질문


In [75]:
# 모델의 state_dict 저장
torch.save(model.state_dict(), "/home/ubuntu/Project/HG/saved_model1/Bert_model1.pth")

In [76]:
# LabelEncoder 저장
with open("/home/ubuntu/Project/HG/saved_model1/label_encoder.pkl", "wb") as f:
    pickle.dump(le, f)

NameError: name 'pickle' is not defined