In [1]:
!pip install torch
!pip install transformers
!pip install sentencepiece



In [128]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, AdamW, BertModel
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from tqdm import tqdm

In [129]:
import pandas as pd

# Load the Excel workbook into a DataFrame.
data = pd.read_excel("data.xlsx", engine="openpyxl")  # explicitly use the openpyxl engine

In [130]:
# Keep only the sentence text and its knowledge-base annotation, and discard
# every row whose '지식베이스' value is missing.
data_only = data[["SENTENCE", "지식베이스"]].dropna(subset=['지식베이스'])

In [131]:
# Preview the first five rows (head() defaults to n=5).
data_only.head()

Unnamed: 0,SENTENCE,지식베이스
2,짬뽕류는 어떤 게 있나요? 잘 나가는 짬뽕 있나요?,짬뽕/메뉴
3,특해물 짬뽕도 있고 전복 새우 짬뽕도 있고 해물 종류도 새우 홍합 전복 없는 게 없습니다,"특해물 짬뽕/메뉴, 전복 새우 짬뽕/메뉴, 새우/해물/재료, 홍합/해물/재료, 전복..."
4,전복 들어가는 거는 특해물 짬뽕 시켜야 돼요?,"전복/해물/재료, 특해물 짬뽕/메뉴"
5,전복 짬뽕 시키면 전복이 들어가죠,"전복 짬뽕/메뉴, 전복/해물/재료"
6,전복 들어가고 여러 가지 또 딴 것도 들어가죠?,전복/해물/재료


In [132]:
# Normalise the annotation separator: entries arrive comma-separated, so strip
# each entry and re-join them with "|" as the single delimiter token.
data_only['지식베이스'] = data_only['지식베이스'].map(
    lambda raw: "|".join(part.strip() for part in raw.split(","))
)
# Binary target: 1 when the annotation string contains '메뉴' anywhere, else 0.
data_only['메뉴정보'] = data_only['지식베이스'].map(lambda kb: int('메뉴' in kb))

In [133]:
# Preview the first five rows again, now including the derived '메뉴정보' column.
data_only.head()

Unnamed: 0,SENTENCE,지식베이스,메뉴정보
2,짬뽕류는 어떤 게 있나요? 잘 나가는 짬뽕 있나요?,짬뽕/메뉴,1
3,특해물 짬뽕도 있고 전복 새우 짬뽕도 있고 해물 종류도 새우 홍합 전복 없는 게 없습니다,특해물 짬뽕/메뉴|전복 새우 짬뽕/메뉴|새우/해물/재료|홍합/해물/재료|전복/해물/재료,1
4,전복 들어가는 거는 특해물 짬뽕 시켜야 돼요?,전복/해물/재료|특해물 짬뽕/메뉴,1
5,전복 짬뽕 시키면 전복이 들어가죠,전복 짬뽕/메뉴|전복/해물/재료,1
6,전복 들어가고 여러 가지 또 딴 것도 들어가죠?,전복/해물/재료,0


In [134]:
from transformers import BertTokenizerFast

# Initialise the tokenizer.
# NOTE(review): 'bert-base-uncased' looks like an English checkpoint while the
# SENTENCE column holds Korean text; the NER section further down loads
# 'bert-base-multilingual-cased' instead — confirm this choice is intentional.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Encode every sentence in one call. No max_length is passed here, so the
# `max_len` hyperparameter defined later is not applied; presumably
# padding=True pads to the longest sentence — TODO confirm.
encodings = tokenizer(data_only['SENTENCE'].tolist(), truncation=True, padding=True)

In [135]:
# Pull the encoded fields and the binary target out for the Dataset below.
input_ids = encodings['input_ids']
attention_masks = encodings['attention_mask']
labels = data_only['메뉴정보'].to_numpy()  # 1 if the annotation contains '메뉴', else 0

In [136]:
class MenuDataset(Dataset):
    """Sentence-level dataset pairing encoded inputs with one label per sample.

    Args:
        input_ids: Sequence of token-id sequences, one per sample.
        attention_masks: Sequence of attention-mask sequences, one per sample.
        labels: Indexable collection holding one integer label per sample.
    """

    def __init__(self, input_ids, attention_masks, labels):
        def as_long(values):
            # Each sequence is converted to an int64 tensor once, up front.
            return torch.tensor(values, dtype=torch.long)

        self.input_ids = list(map(as_long, input_ids))
        self.attention_masks = list(map(as_long, attention_masks))
        self.labels = labels

    def __len__(self):
        """Number of samples, taken from the encoded input list."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the sample at ``idx`` as a dict of tensors."""
        sample = dict(
            input_ids=self.input_ids[idx],
            attention_mask=self.attention_masks[idx],
            labels=torch.tensor(self.labels[idx], dtype=torch.long),
        )
        return sample



In [137]:
# Build the dataset from the encoded inputs and binary labels.
dataset = MenuDataset(input_ids=input_ids, attention_masks=attention_masks, labels=labels)

# Hold out 10% of the samples for validation. The seed is fixed so the split
# (and therefore every reported metric) is reproducible across kernel restarts.
train_data, val_data = train_test_split(dataset, test_size=0.1, random_state=42)

In [138]:
# Hyperparameters
epochs= 5  # number of passes over the training data
batch_size=32  # samples per DataLoader batch
lr = 1e-5  # learning rate handed to the optimizer
max_len = 128  # NOTE(review): not referenced by any visible cell (the NER section defines its own MAX_LENGTH) — confirm whether it should be passed to the tokenizer

In [139]:
# Create the DataLoaders.
# Training batches are reshuffled every epoch so the model does not see the
# samples in the same order each time; validation order is left fixed.
train_data_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
val_data_loader = DataLoader(val_data, batch_size=batch_size)

In [140]:
from transformers import BertTokenizer, AdamW, BertModel
from torch.utils.data import DataLoader
import torch.nn as nn
import torch

# BertClassifier definition
class BertClassifier(nn.Module):
    """Binary sentence classifier: BERT pooled output -> dropout -> linear -> sigmoid.

    Args:
        dropout_rate: Dropout probability applied to the pooled BERT output.
        pretrained_name: Checkpoint name or path handed to
            ``BertModel.from_pretrained``. Defaults to ``'bert-base-uncased'``
            so existing ``BertClassifier()`` calls behave as before. It must
            match the tokenizer that produced the input ids.
    """

    def __init__(self, dropout_rate=0.3, pretrained_name='bert-base-uncased'):
        super().__init__()
        self.bert = BertModel.from_pretrained(pretrained_name)
        self.dropout = nn.Dropout(dropout_rate)
        # Size the head from the loaded encoder instead of hard-coding 768, so
        # checkpoints with a different hidden size also work.
        self.linear = nn.Linear(self.bert.config.hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        """Return one probability per input sequence as a flat tensor.

        The output has already passed through a sigmoid, so callers must not
        apply another one (pair it with ``nn.BCELoss``).
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs['pooler_output']
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        # Flatten the trailing singleton dimension so the shape matches the labels.
        proba = self.sigmoid(linear_output).view(-1)
        return proba

In [None]:
# Build the model.
model = BertClassifier()

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# Optimizer and loss. BCELoss expects probabilities, which is what the model returns.
optimizer = AdamW(model.parameters(), lr=lr, correct_bias=False)
criterion = nn.BCELoss().to(device)

# Training and validation.
# Batch tensors use `batch_*` names so they do not overwrite the notebook-level
# `input_ids` / `labels` that the dataset-building cell depends on.
for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs}")
    model.train()
    for batch in tqdm(train_data_loader, desc="Training"):
        batch_input_ids = batch['input_ids'].to(device)
        batch_attention_mask = batch['attention_mask'].to(device)
        batch_labels = batch['labels'].to(device)

        outputs = model(input_ids=batch_input_ids, attention_mask=batch_attention_mask)
        loss = criterion(outputs, batch_labels.float())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Evaluate on the validation split.
    model.eval()
    val_losses = []
    val_predictions = []
    val_truths = []

    with torch.no_grad():
        for batch in tqdm(val_data_loader, desc="Validating"):
            batch_input_ids = batch['input_ids'].to(device)
            batch_attention_mask = batch['attention_mask'].to(device)
            batch_labels = batch['labels'].to(device)

            outputs = model(input_ids=batch_input_ids, attention_mask=batch_attention_mask)
            val_losses.append(criterion(outputs, batch_labels.float()).item())

            # The model output is already a probability. Applying sigmoid again
            # would squash every score into [0.5, 0.73] and force all
            # predictions to class 1.
            val_predictions.extend(outputs.cpu().numpy().tolist())
            val_truths.extend(batch_labels.cpu().numpy().tolist())

    # Guard against an empty validation loader.
    val_loss = sum(val_losses) / max(len(val_losses), 1)
    # Threshold once and reuse for every metric.
    val_pred_labels = [1 if pred >= 0.5 else 0 for pred in val_predictions]
    val_acc = accuracy_score(val_truths, val_pred_labels)
    val_f1 = f1_score(val_truths, val_pred_labels, zero_division=0)
    val_precision = precision_score(val_truths, val_pred_labels, zero_division=0)
    val_recall = recall_score(val_truths, val_pred_labels, zero_division=0)

    print(f"Validation Loss: {val_loss:.4f} Accuracy: {val_acc:.4f} F1-score: {val_f1:.4f} Precision: {val_precision:.4f} Recall: {val_recall:.4f}")



Epoch 1/5


Training:  53%|█████▎    | 118/223 [04:02<03:34,  2.04s/it]

In [None]:
def predict(model, sentence):
    """Classify one sentence with the binary menu classifier.

    Reads the notebook-level ``tokenizer`` and ``device``.

    Args:
        model: Module that takes ``input_ids`` and ``attention_mask`` and
            returns a probability per sequence.
        sentence: Raw text to classify.

    Returns:
        ``'Yes'`` when the predicted probability is at least 0.5, else ``'No'``.
    """
    model.eval()  # disable dropout for inference
    # Let the tokenizer build ids and mask together, and truncate so an
    # over-long sentence cannot exceed the encoder's position limit.
    encoded = tokenizer(sentence, truncation=True, return_tensors='pt')
    input_ids = encoded['input_ids'].to(device)
    attention_mask = encoded['attention_mask'].to(device)
    with torch.no_grad():  # no gradients needed for inference
        proba = model(input_ids=input_ids, attention_mask=attention_mask).squeeze()
    # Same >= 0.5 threshold as the validation loop.
    return 'Yes' if proba >= 0.5 else 'No'

# Try the classifier on a single example sentence and print 'Yes' / 'No'.
sentence = "짬뽕류는 어떤 게 있나요? 잘 나가는 짬뽕 있나요?"
print(predict(model, sentence))


In [None]:
from transformers import BertForTokenClassification, BertTokenizer
from transformers import BertPreTrainedModel
from torch.optim import AdamW

class BertForNER(BertPreTrainedModel):
    """BERT encoder with a per-token linear classification head.

    Args:
        config: BERT configuration passed to the base class and to ``BertModel``.
        num_labels: Number of tag classes the head emits for each token.
    """

    def __init__(self, config, num_labels):
        super().__init__(config)
        self.num_labels = num_labels
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, num_labels)
        self.init_weights()

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None):
        """Run the encoder and the token classifier.

        Returns:
            ``(logits, last_hidden_state)`` when ``labels`` is None, otherwise
            ``(loss, logits, last_hidden_state)``. The loss is cross-entropy;
            when ``attention_mask`` is given, positions whose mask is not 1 are
            excluded from it.
        """
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds
        )

        sequence_output = outputs[0]
        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)

        # Return the logits together with the encoder's (pre-dropout) hidden states.
        outputs = (logits, outputs.last_hidden_state)

        if labels is not None:
            # Use nn.CrossEntropyLoss so the class does not depend on a bare
            # `CrossEntropyLoss` name that is only imported in a later cell.
            loss_fct = nn.CrossEntropyLoss()
            if attention_mask is not None:
                # Replace labels at masked-out positions with ignore_index so
                # they contribute nothing to the loss.
                active_loss = attention_mask.view(-1) == 1
                active_logits = logits.view(-1, self.num_labels)
                active_labels = torch.where(
                    active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels)
                )
                loss = loss_fct(active_logits, active_labels)
            else:
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            outputs = (loss,) + outputs

        return outputs  # (loss), logits, last_hidden_state

In [None]:
def extract_labels(sentence, knowledge_base, tokenizer, max_length):
    """Build per-token B/I/O tag ids marking menu names inside a sentence.

    Tag ids: 0 = B (first token of a menu name), 1 = I (continuation),
    2 = O (everything else, also used as padding).

    Args:
        sentence: Raw sentence. Missing values and the literal 'nan' are
            treated as an empty string.
        knowledge_base: '|'-separated annotation entries. Only entries
            containing '/메뉴' are used; the text before '/메뉴' is the menu name.
        tokenizer: Object exposing ``tokenize(text) -> list``.
        max_length: Length of the returned list (padded with 2 or truncated).

    Returns:
        A list of ``max_length`` tag ids aligned with
        ``tokenizer.tokenize(sentence)``. No slots are reserved for special tokens.
    """
    if pd.isnull(sentence) or str(sentence).lower() == 'nan':
        sentence = ''
    else:
        sentence = str(sentence)
    knowledge_base = '' if pd.isnull(knowledge_base) else str(knowledge_base)

    tokens = tokenizer.tokenize(sentence)
    labels = [2] * len(tokens)  # start with every token tagged 'O'

    for menu in knowledge_base.split('|'):
        if '/메뉴' not in menu:
            continue
        menu_tokens = tokenizer.tokenize(menu.split('/메뉴')[0])
        span = len(menu_tokens)
        if span == 0:
            # An empty menu name would match the empty slice at every position
            # and tag the whole sentence as 'B'.
            continue
        # Only start positions that leave room for the whole name can match.
        for i in range(len(tokens) - span + 1):
            if tokens[i:i + span] == menu_tokens:
                labels[i] = 0  # 'B'
                for j in range(i + 1, i + span):
                    labels[j] = 1  # 'I'

    # Pad with 'O' or truncate to exactly max_length.
    if len(labels) < max_length:
        labels += [2] * (max_length - len(labels))
    else:
        labels = labels[:max_length]

    return labels


In [None]:
from torch.utils.data import Dataset, DataLoader

class MenuDataset(Dataset):
    """Token-tagging dataset built from a DataFrame of sentences and annotations.

    Args:
        data: DataFrame with 'SENTENCE' and '지식베이스' columns.
        tokenizer: Tokenizer exposing ``tokenize`` and ``encode_plus``.
        max_length: Fixed sequence length for both inputs and labels.
    """

    def __init__(self, data, tokenizer, max_length):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of rows in the underlying DataFrame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the encoded sentence plus a 'labels' tensor of tag ids."""
        row = self.data.iloc[idx]
        sentence = row['SENTENCE']
        knowledge_base = row['지식베이스']

        # Replace missing values or the literal 'nan' with an empty string.
        if pd.isnull(sentence) or str(sentence).lower() == 'nan':
            sentence = ''
        else:
            sentence = str(sentence)

        if pd.isnull(knowledge_base) or str(knowledge_base).lower() == 'nan':
            knowledge_base = ''
        else:
            knowledge_base = str(knowledge_base)

        token_labels = extract_labels(sentence, knowledge_base, self.tokenizer, self.max_length)
        # extract_labels tags the plain token list, but encode_plus wraps that
        # list in [CLS] ... [SEP]. Shift the tags right by one and tag both
        # special-token slots as 'O' (2) so label i lines up with input id i.
        outside = 2
        inner = token_labels[:max(self.max_length - 2, 0)]
        labels = ([outside] + inner + [outside])[:self.max_length]

        encoding = self.tokenizer.encode_plus(sentence, truncation=True, padding='max_length', max_length=self.max_length)
        items = {key: torch.tensor(val) for key, val in encoding.items()}
        items['labels'] = torch.tensor(labels, dtype=torch.long)
        return items


In [None]:
import pandas as pd

# Re-split the raw text columns for the NER task.
# The `train_data` / `val_data` left over from the classification section are
# lists of tensor dicts without 'SENTENCE' / '지식베이스' keys; wrapping them in
# a DataFrame with those column names would give all-NaN columns, i.e. empty
# sentences and no menu labels.
train_data, val_data = train_test_split(
    data_only[['SENTENCE', '지식베이스']], test_size=0.1, random_state=42
)
train_data = train_data.reset_index(drop=True)
val_data = val_data.reset_index(drop=True)

# Build the data loaders.
MAX_LENGTH = 128  # fixed sequence length; adjust as needed
# NOTE: this rebinds the notebook-level `tokenizer`, so `predict` for the
# sentence classifier should not be re-run after this cell.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=False)
train_dataset = MenuDataset(train_data, tokenizer, MAX_LENGTH)
val_dataset = MenuDataset(val_data, tokenizer, MAX_LENGTH)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)


In [None]:
from transformers import BertConfig
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from torch.utils.data import DataLoader
from torch.nn import CrossEntropyLoss

# The encoder must share the tokenizer's vocabulary; otherwise token ids can
# fall outside the embedding table.
NER_CHECKPOINT = 'bert-base-multilingual-cased'
num_labels = 3  # tag set produced by extract_labels: B (0), I (1), O (2)
config = BertConfig.from_pretrained(NER_CHECKPOINT)
config.num_labels = num_labels

# Load pretrained encoder weights instead of starting from random ones.
# `num_labels` is passed positionally so it reaches BertForNER.__init__.
model2 = BertForNER.from_pretrained(NER_CHECKPOINT, num_labels, config=config).to(device)
# Optimise the NER model's parameters, not the sentence classifier's.
optimizer = AdamW(model2.parameters(), lr=1e-5)

epochs=5

for epoch in range(epochs):
    model2.train()
    total_loss = 0
    for batch in tqdm(train_loader, desc="Training"):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model2(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]  # with labels supplied, the loss is the first element
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_loader)

    model2.eval()
    val_preds, val_labels = [], []
    for batch in tqdm(val_loader, desc="Validating"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        with torch.no_grad():
            outputs = model2(input_ids, attention_mask=attention_mask)
        logits = outputs[0]  # without labels, the logits are the first element
        preds = torch.argmax(logits, dim=2)
        # Score only real tokens; padding is always 'O' and would inflate the metrics.
        active = attention_mask.view(-1) == 1
        val_preds.extend(preds.view(-1)[active].cpu().numpy())
        val_labels.extend(labels.view(-1)[active].cpu().numpy())

    val_acc = accuracy_score(val_labels, val_preds)
    val_precision, val_recall, val_f1, _ = precision_recall_fscore_support(
        val_labels, val_preds, average='weighted', zero_division=0
    )

    print(f'Epoch {epoch+1}/{epochs}')
    print(f'Train loss  : {avg_train_loss:.4f}')
    print(f'Val accuracy: {val_acc:.4f}')
    print(f'Val precision: {val_precision:.4f}')
    print(f'Val recall   : {val_recall:.4f}')
    print(f'Val f1-score : {val_f1:.4f}\n')
