In [None]:
!pip install kobert-transformers



In [None]:
!pip install sentencepiece



In [None]:
!pip install 'git+https://github.com/SKTBrain/KoBERT.git#egg=kobert_tokenizer&subdirectory=kobert_hf'

Collecting kobert_tokenizer
  Cloning https://github.com/SKTBrain/KoBERT.git to /tmp/pip-install-kfzotmmj/kobert-tokenizer_8c6ed3f96e8547b5939ee8379beb3017
  Running command git clone --filter=blob:none --quiet https://github.com/SKTBrain/KoBERT.git /tmp/pip-install-kfzotmmj/kobert-tokenizer_8c6ed3f96e8547b5939ee8379beb3017
  Resolved https://github.com/SKTBrain/KoBERT.git to commit 47a69af87928fc24e20f571fe10c3cc9dd9af9a3
  Preparing metadata (setup.py) ... [?25l[?25hdone


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AdamW
from kobert_transformers import get_kobert_model, get_tokenizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pandas as pd
import numpy as np
from kobert_tokenizer import KoBERTTokenizer
from transformers import BertModel
from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup
from transformers import BertForSequenceClassification
from sklearn.preprocessing import LabelEncoder

In [None]:
# Read the labelled sentences from an Excel workbook.
# NOTE(review): the path looks like a Colab Google Drive mount; the mount step is not shown here.
data = pd.read_excel("/content/drive/MyDrive/젯봇/Data4.xlsx", engine="openpyxl")
# Re-encode the label column as integer class ids; the column is overwritten in place.
le = LabelEncoder()
data['label_idx'] = le.fit_transform(data['label_idx'])
# Number of distinct encoded labels.
num_classes = data['label_idx'].nunique()

In [None]:
# Load the tokenizer and a sequence-classification model from the 'skt/kobert-base-v1' checkpoint.
# NOTE(review): `last_hidden_states=True` does not look like a tokenizer option; it is
# presumably ignored. Confirm, and drop it if so.
tokenizer = KoBERTTokenizer.from_pretrained('skt/kobert-base-v1', last_hidden_states=True)
# num_labels sets the size of the classification head to the number of classes in the data.
model = BertForSequenceClassification.from_pretrained('skt/kobert-base-v1', num_labels=num_classes)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'XLNetTokenizer'. 
The class this function is called from is 'KoBERTTokenizer'.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at skt/kobert-base-v1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def convert_labels_to_numeric(label, num_classes):
    """Return a one-hot list of length ``num_classes`` with a 1 at position ``label``.

    Example: ``convert_labels_to_numeric(3, 6)`` -> ``[0, 0, 0, 1, 0, 0]``.

    Args:
        label: Integer class index, expected in ``[0, num_classes)``.
        num_classes: Total number of classes (length of the returned list).

    Returns:
        A list of ``num_classes`` ints, all 0 except a single 1 at ``label``.

    Raises:
        IndexError: If ``label`` is outside ``[0, num_classes)``. Negative
            values are rejected explicitly, because plain list indexing would
            otherwise wrap around and silently mark the wrong class.
    """
    if label < 0:
        raise IndexError(f"label {label} is out of range for {num_classes} classes")
    labels = [0] * num_classes
    # A label >= num_classes raises IndexError here through normal list indexing.
    labels[label] = 1
    return labels


In [None]:
class KoBERTDataset(Dataset):
    """Map-style dataset that tokenizes one sentence per item and attaches a one-hot target.

    Each item is a dict with:
        ``input_ids``      - 1-D tensor of token ids (the tokenizer is asked to pad and
                             truncate to ``max_len``),
        ``attention_mask`` - 1-D tensor returned by the tokenizer alongside the ids,
        ``targets``        - float tensor of length ``num_classes``, one-hot at the row's label.

    Args:
        dataframe: DataFrame with a ``SENTENCE`` column (text) and a ``label_idx``
            column (integer class index). Any index is accepted; rows are addressed
            by position.
        tokenizer: Tokenizer object exposing ``encode_plus``.
        max_len: Sequence length passed to the tokenizer as ``max_length``.
        num_classes: Length of the one-hot target vector.
    """

    def __init__(self, dataframe, tokenizer, max_len, num_classes):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.num_classes = num_classes

    def __getitem__(self, index):
        """Return the tokenized sentence and one-hot target for the row at position ``index``."""
        # Positional lookup: label-based `series[index]` only works when the frame
        # carries a fresh 0..n-1 index, and fails or picks the wrong row otherwise.
        text = self.data['SENTENCE'].iloc[index]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )
        label_idx = self.data['label_idx'].iloc[index]
        labels = convert_labels_to_numeric(label_idx, self.num_classes)

        return {
            # return_tensors='pt' adds a leading batch axis; flatten() drops it again.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(labels, dtype=torch.float)  # adjust to match the data format
        }

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.data)


In [None]:
MAX_LEN = 128
BATCH_SIZE = 32
SEED = 42  # fixes the train/validation split so re-runs evaluate on the same rows

# 70/30 hold-out split. Without random_state the split changed on every run,
# so metrics from different runs were not comparable.
# NOTE(review): the split is not stratified; consider stratify=data['label_idx']
# if every class has at least two rows.
train_data, val_data = train_test_split(data, test_size=0.3, random_state=SEED)

# Restart both indices at 0 after the shuffle performed by the split.
train_data = train_data.reset_index(drop=True)
val_data = val_data.reset_index(drop=True)

train_dataset = KoBERTDataset(train_data, tokenizer, MAX_LEN, num_classes)
val_dataset = KoBERTDataset(val_data, tokenizer, MAX_LEN, num_classes)

# Only the training batches are shuffled; validation keeps a fixed order.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Per-class weight = (labelled training rows) / (rows of that class), so rarer
# classes get larger weights. A class absent from the training split keeps 0.0.
# Values are plain Python floats so torch.tensor(class_weights) yields float32
# rather than float64 (which a list holding numpy floats would produce).
class_weights = [0.0] * num_classes
label_counts = train_data['label_idx'].value_counts()
total_count = int(label_counts.sum())
for label, count in label_counts.items():
  class_weights[label] = float(total_count / count)

In [None]:
from transformers import get_linear_schedule_with_warmup

epochs = 3

# Train on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# torch.optim.AdamW replaces the deprecated transformers.AdamW. eps and weight_decay
# are set explicitly to the values the transformers implementation used by default,
# so the optimisation behaviour stays the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)
# One scheduler step per batch: the learning rate decays linearly to 0 over the whole run, with no warm-up.
total_steps = len(train_loader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# One sigmoid/BCE term per class, with positives re-weighted by class_weights.
# dtype is pinned to float32 so the weight matches the float32 logits and targets
# instead of promoting the loss to float64.
loss_function = torch.nn.BCEWithLogitsLoss(
    pos_weight=torch.tensor(class_weights, dtype=torch.float, device=device)
)



In [None]:
from tqdm import tqdm

# Fine-tune for `epochs` passes over train_loader, stepping the optimizer and the
# learning-rate scheduler once per batch.
for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    num_batches = 0
    # The bar is labelled as training; it previously read 'Evaluating'.
    for batch in tqdm(train_loader, desc=f'Training epoch {epoch+1}/{epochs}'):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        targets = batch['targets'].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids, attention_mask=attention_mask)
        loss = loss_function(outputs.logits, targets)
        loss.backward()
        optimizer.step()
        scheduler.step()

        running_loss += loss.item()
        num_batches += 1

    # Report the mean over the epoch. The loss of the last batch alone is noisy
    # and is undefined when the loader is empty.
    mean_loss = running_loss / max(num_batches, 1)
    print(f'Epoch {epoch+1}/{epochs}, Mean loss: {mean_loss}')

Evaluating: 100%|██████████| 344/344 [03:37<00:00,  1.58it/s]


Epoch 1/3, Loss: 1.0718321800231934


Evaluating: 100%|██████████| 344/344 [03:35<00:00,  1.59it/s]


Epoch 2/3, Loss: 1.080198049545288


Evaluating: 100%|██████████| 344/344 [03:35<00:00,  1.59it/s]

Epoch 3/3, Loss: 0.9551646709442139





In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate_model(model, val_loader):
    """Evaluate ``model`` on ``val_loader`` and return (accuracy, precision, recall, f1).

    Predictions are obtained by thresholding the sigmoid of each logit at 0.5, so
    a row may contain zero, one or several predicted classes.

    The metrics are computed on the 2-D (samples x classes) indicator matrices.
    Flattening them first, as this function used to do, treats every 0/1 cell as
    its own sample; micro-averaged precision, recall and F1 then all reduce to
    plain cell-wise accuracy and the four returned numbers are always identical.

    Args:
        model: Model whose forward pass returns an object with ``.logits``.
        val_loader: Iterable of batches with ``input_ids``, ``attention_mask``
            and ``targets`` (one-hot rows).

    Returns:
        Tuple ``(accuracy, precision, recall, f1)`` where accuracy is the
        exact-match rate per sample and the other three are micro-averaged over
        the positive labels.
    """
    model.eval()
    # Run on the device the model already lives on rather than on a notebook-level global.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")
    predictions, actuals = [], []

    with torch.no_grad():
        for batch in tqdm(val_loader, desc='Evaluating'):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            preds = (torch.sigmoid(outputs.logits) > 0.5).cpu().numpy()
            predictions.extend(preds)
            # Targets are only compared on the host, so they never need to move to the device.
            actuals.extend(batch['targets'].cpu().numpy())

    # Integer 0/1 matrices so scikit-learn treats both as multilabel indicators.
    predictions = np.asarray(predictions, dtype=int)
    actuals = np.asarray(actuals, dtype=int)

    accuracy = accuracy_score(actuals, predictions)
    # zero_division=0: a run with no predicted (or no true) positives scores 0 instead of warning.
    precision = precision_score(actuals, predictions, average='micro', zero_division=0)
    recall = recall_score(actuals, predictions, average='micro', zero_division=0)
    f1 = f1_score(actuals, predictions, average='micro', zero_division=0)

    return accuracy, precision, recall, f1


# Score the fine-tuned model on the validation loader and print the four metrics.
accuracy, precision, recall, f1 = evaluate_model(model, val_loader)
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")


Evaluating: 100%|██████████| 148/148 [00:35<00:00,  4.19it/s]


Accuracy: 0.7248967837510814
Precision: 0.7248967837510814
Recall: 0.7248967837510814
F1 Score: 0.7248967837510814
