In [1]:
!pip install kobert-transformers



In [2]:
!pip install sentencepiece



In [3]:
!pip install 'git+https://github.com/SKTBrain/KoBERT.git#egg=kobert_tokenizer&subdirectory=kobert_hf'

Collecting kobert_tokenizer
  Cloning https://github.com/SKTBrain/KoBERT.git to /tmp/pip-install-kq9mhfvc/kobert-tokenizer_e202de0628ca47d981c0663f446aad7a
  Running command git clone --filter=blob:none --quiet https://github.com/SKTBrain/KoBERT.git /tmp/pip-install-kq9mhfvc/kobert-tokenizer_e202de0628ca47d981c0663f446aad7a
  Resolved https://github.com/SKTBrain/KoBERT.git to commit 47a69af87928fc24e20f571fe10c3cc9dd9af9a3
  Preparing metadata (setup.py) ... [?25l[?25hdone


In [4]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AdamW
from kobert_transformers import get_kobert_model, get_tokenizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pandas as pd
import numpy as np
from kobert_tokenizer import KoBERTTokenizer
from transformers import BertModel
from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup
from transformers import BertForSequenceClassification
from sklearn.preprocessing import MultiLabelBinarizer

In [5]:
data = pd.read_excel("/content/drive/MyDrive/젯봇/Data4.xlsx", engine="openpyxl")

In [6]:
mlb = MultiLabelBinarizer()
data['label_idx'] = data['label_idx'].apply(lambda x: list(map(int, x.split(','))))
data_labels = mlb.fit_transform(data['label_idx'])
num_classes = len(mlb.classes_)

max_len = max(map(len, data_labels))
data['label_idx'] = [i + [0]*(max_len-len(i)) for i in data_labels.tolist()]

In [7]:
tokenizer = KoBERTTokenizer.from_pretrained('skt/kobert-base-v1', last_hidden_states=True)
model = BertForSequenceClassification.from_pretrained('skt/kobert-base-v1', num_labels=num_classes)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'XLNetTokenizer'. 
The class this function is called from is 'KoBERTTokenizer'.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at skt/kobert-base-v1 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
class KoBERTDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len, num_classes):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.num_classes = num_classes

    def __getitem__(self, index):
        text = self.data.SENTENCE[index]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )
        labels = self.data.label_idx[index]

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(labels, dtype=torch.float)  # 데이터 형식에 따라 수정
        }

    def __len__(self):
        return self.len

In [9]:
MAX_LEN = 128
BATCH_SIZE = 32

train_data, val_data = train_test_split(data, test_size=0.3)

train_data = train_data.reset_index(drop=True)
val_data = val_data.reset_index(drop=True)

train_dataset = KoBERTDataset(train_data, tokenizer, MAX_LEN, num_classes)
val_dataset = KoBERTDataset(val_data, tokenizer, MAX_LEN, num_classes)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [10]:
class_weights = [0] * num_classes
label_counts = pd.Series([item for sublist in mlb.inverse_transform(data_labels) for item in sublist]).value_counts()
for i in range(num_classes):
    class_weights[i] = label_counts.sum() / (label_counts[i] if i in label_counts else 1)

In [11]:
from transformers import AdamW, get_linear_schedule_with_warmup

epochs = 20

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = AdamW(model.parameters(), lr=1e-5)
total_steps = len(train_loader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

loss_function = torch.nn.BCEWithLogitsLoss(pos_weight=torch.tensor(class_weights).to(device))



In [None]:
from tqdm import tqdm

for epoch in range(epochs):
    model.train()
    for batch in tqdm(train_loader, desc='Evaluating'):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        targets = batch['targets'].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids, attention_mask=attention_mask)
        loss = loss_function(outputs.logits, targets)
        loss.backward()
        optimizer.step()
        scheduler.step()

    print(f'Epoch {epoch+1}/{epochs}, Loss: {loss.item()}')

Evaluating: 100%|██████████| 344/344 [03:22<00:00,  1.70it/s]


Epoch 1/20, Loss: 4.549602031707764


Evaluating: 100%|██████████| 344/344 [03:21<00:00,  1.70it/s]


Epoch 2/20, Loss: 3.6528267860412598


Evaluating: 100%|██████████| 344/344 [03:21<00:00,  1.70it/s]


Epoch 3/20, Loss: 3.642589569091797


Evaluating: 100%|██████████| 344/344 [03:21<00:00,  1.71it/s]


Epoch 4/20, Loss: 3.1242566108703613


Evaluating: 100%|██████████| 344/344 [03:21<00:00,  1.70it/s]


Epoch 5/20, Loss: 1.995190143585205


Evaluating: 100%|██████████| 344/344 [03:21<00:00,  1.70it/s]


Epoch 6/20, Loss: 1.9959306716918945


Evaluating:  38%|███▊      | 130/344 [01:16<02:05,  1.71it/s]

In [14]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate_model(model, val_loader):
    model.eval()
    predictions, actuals = [], []

    with torch.no_grad():
        for batch in tqdm(val_loader, desc='Evaluating'):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            targets = batch['targets'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            preds = (torch.sigmoid(outputs.logits) > 0.5).cpu().detach().numpy()
            predictions.extend(preds)
            actuals.extend(targets.cpu().detach().numpy())

    predictions = np.array(predictions)
    actuals = np.array(actuals)

    accuracy = accuracy_score(actuals.ravel(), predictions.ravel())
    precision = precision_score(actuals.ravel(), predictions.ravel(), average='micro')
    recall = recall_score(actuals.ravel(), predictions.ravel(), average='micro')
    f1 = f1_score(actuals.ravel(), predictions.ravel(), average='micro')

    return accuracy, precision, recall, f1


accuracy, precision, recall, f1 = evaluate_model(model, val_loader)
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

Evaluating: 100%|██████████| 148/148 [00:31<00:00,  4.72it/s]


Accuracy: 0.7736451270979932
Precision: 0.7736451270979932
Recall: 0.7736451270979932
F1 Score: 0.7736451270979932
