In [74]:
!pip install transformers



In [75]:
import pandas as pd
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import torch

In [76]:
data = pd.read_excel("/content/drive/MyDrive/젯봇/Data4.xlsx", engine="openpyxl")

In [77]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

In [78]:
def preprocess_text(text):
    # 필요한 경우 텍스트 정제 로직 추가
    return text

def tokenize_and_encode(text):
    # 텍스트 정제
    text = preprocess_text(text)
    # 토크나이저를 사용하여 텍스트를 토큰화 및 인코딩
    return tokenizer.encode(text, add_special_tokens=True)

In [79]:
train_data, val_data = train_test_split(data, test_size=0.3, random_state=42)
train_data = train_data.reset_index(drop=True)
val_data = val_data.reset_index(drop=True)

In [80]:
def parse_labels(label_str, num_classes):
    labels = [0] * num_classes
    for label in label_str.split(','):
        labels[int(label)] = 1
    return labels

In [81]:
class TextDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len, num_classes):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.max_len = max_len
        self.num_classes = num_classes

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        # 텍스트 추출 및 정제
        text = str(self.data.SENTENCE[index])
        text = " ".join(text.split())

        # 토크나이저를 사용하여 텍스트를 토큰화 및 인코딩
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']

        # 레이블 처리
        # 레이블을 숫자 타입으로 변환하는 로직을 여기에 추가합니다.
        # 예: '1,3,5' -> [0, 1, 0, 1, 0, 1] (원-핫 인코딩 방식)
        # 이 부분은 데이터셋에 따라 달라질 수 있으므로 적절히 조정해야 합니다.
        label_str = self.data.label_idx[index]
        labels = parse_labels(label_str, self.num_classes)

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'targets': torch.tensor(labels, dtype=torch.float)  # 레이블 데이터 타입에 따라 수정 필요
        }

In [82]:
max_len = 128  # 최대 시퀀스 길이
batch_size = 32
num_classes = data['label_idx'].nunique()


train_dataset = TextDataset(train_data, tokenizer, max_len, num_classes)
val_dataset = TextDataset(val_data, tokenizer, max_len, num_classes)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)


In [None]:
from transformers import DistilBertForSequenceClassification, AdamW

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=num_classes)
model.to(device)

In [84]:
from torch.nn import BCEWithLogitsLoss
loss_function = BCEWithLogitsLoss()
optimizer = AdamW(model.parameters(), lr=5e-5)



In [85]:
from tqdm import tqdm

epochs = 3

for epoch in range(epochs):
    model.train()
    for data in tqdm(train_loader, desc='Evaluating'):
        ids = data['ids'].to(device, dtype=torch.long)
        mask = data['mask'].to(device, dtype=torch.long)
        targets = data['targets'].to(device, dtype=torch.float)

        optimizer.zero_grad()
        outputs = model(ids, mask)

        loss = loss_function(outputs.logits, targets)
        loss.backward()
        optimizer.step()
    print(f'Epoch {epoch+1}/{epochs}, Loss: {loss.item()}')

Evaluating: 100%|██████████| 344/344 [01:53<00:00,  3.04it/s]


Epoch 1/3, Loss: 0.00227528577670455


Evaluating: 100%|██████████| 344/344 [01:52<00:00,  3.07it/s]


Epoch 2/3, Loss: 0.0016780688893049955


Evaluating: 100%|██████████| 344/344 [01:51<00:00,  3.08it/s]

Epoch 3/3, Loss: 0.0017274259589612484





In [86]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np

model.eval()
total_loss, predictions, actuals = 0, [], []

with torch.no_grad():
    for data in tqdm(val_loader, desc="Evaluating"):
        ids = data['ids'].to(device, dtype=torch.long)
        mask = data['mask'].to(device, dtype=torch.long)
        targets = data['targets'].to(device, dtype=torch.float)

        outputs = model(ids, mask)
        loss = loss_function(outputs.logits, targets)
        total_loss += loss.item()

        preds = torch.sigmoid(outputs.logits).cpu().detach().numpy() > 0.3  # 임계값 조정
        predictions.extend(preds)
        actuals.extend(targets.cpu().detach().numpy())

avg_loss = total_loss / len(val_loader)
predictions = np.array(predictions)
actuals = np.array(actuals)

accuracy = accuracy_score(actuals, predictions)
precision = precision_score(actuals, predictions, average='macro')
recall = recall_score(actuals, predictions, average='macro')
f1 = f1_score(actuals, predictions, average='macro')

# 결과 출력
print(f"Validation Loss: {avg_loss}")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

Evaluating: 100%|██████████| 148/148 [00:19<00:00,  7.70it/s]
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Validation Loss: 0.0016583042837540946
Accuracy: 0.0
Precision: 0.0
Recall: 0.0
F1 Score: 0.0


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
