In [23]:
import pandas as pd
import urllib.request

# 데이터 로드
# 변경 예정
urllib.request.urlretrieve(
    "https://raw.githubusercontent.com/songys/Chatbot_data/master/ChatbotData.csv",
    filename="ChatBotData.csv",
)
df = pd.read_csv("ChatBotData.csv")

In [24]:
# 랜덤 라벨 데이터 생성
# 실제 데이터셋 사용 시 제거
import numpy as np

np.random.seed(42)

# 클래스 수 설정
num_classes = 5

# 데이터 샘플 수 설정
num_samples = len(df['Q'])

# 랜덤 레이블 생성 함수
def generate_random_labels(num_samples, num_classes):
    labels = []
    for _ in range(num_samples):
        num_labels = np.random.randint(1, num_classes + 1)  # 각 샘플에 최소 1개 이상의 클래스가 할당되도록
        label_indices = np.random.choice(num_classes, num_labels, replace=False)
        labels.append(label_indices.tolist())
    return labels

df['label'] = generate_random_labels(num_samples, num_classes)

In [25]:
from transformers import PreTrainedTokenizerFast

Q_TKN = "<Q>"
A_TKN = "<A>"
BOS = '</s>'
EOS = '</s>'
UNK = '<unk>'
MASK = '<unused0>'
SENT = '<unused1>'
PAD = '<pad>'

# KoGPT2 토크나이저 로드
tokenizer = PreTrainedTokenizerFast.from_pretrained("skt/kogpt2-base-v2",
            bos_token=BOS, eos_token=EOS, unk_token=UNK,
            pad_token=PAD, mask_token=MASK)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


In [26]:
from datasets import Dataset
from sklearn.preprocessing import MultiLabelBinarizer

dataset = Dataset.from_pandas(df)

def get_cls_input(examples):
    inputs = examples['Q']
    labels = examples['label']
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")

    label_classes = [0, 1, 2, 3, 4]
    mlb = MultiLabelBinarizer(classes=label_classes)
    labels_binary = mlb.fit_transform(labels)
    labels = labels_binary.astype(np.float32).tolist()
    model_inputs["labels"] = labels
    return model_inputs

# 데이터셋 전처리
tokenized_datasets = dataset.map(get_cls_input, batched=True)


Map: 100%|██████████| 11823/11823 [00:00<00:00, 23075.35 examples/s]


In [27]:
# 데이터 크기 줄이기 위함
dataset = tokenized_datasets.train_test_split(test_size=0.005)

# 검증 데이터 분할
train_test_dataset = dataset['test'].train_test_split(test_size=0.2)

train_dataset = train_test_dataset['train']
test_dataset = train_test_dataset['test']

In [28]:
import wandb

wandb.login()

True

In [29]:
# method
sweep_config = {
    'method': 'random'
}

# hyperparameters
parameters_dict = {
    'learning_rate': {
        'distribution': 'log_uniform_values',
        'min': 1e-5,
        'max': 7e-5
    },
    'weight_decay': {
        'values': [0.1, 0.3, 0.5]
    },
    'train_batch_size': {
        'values': [8, 16, 32]
    },
    'eval_batch_size': {
        'values': [8, 16, 32]
    }
}

sweep_config['parameters'] = parameters_dict

In [30]:
import evaluate

def compute_metrics(pred):
    logits, labels = pred
    predictions = (logits > 0).astype(int)  # 시그모이드 활성화 함수 적용 후 임계값 0.5 기준으로 이진화

    labels = labels.flatten()
    predictions = predictions.flatten()

    # 정확도, F1 score 계산
    acc_metrics = evaluate.load("accuracy")
    accuracy = acc_metrics.compute(predictions=predictions, references=labels)

    f1_metrics = evaluate.load("f1")
    f1 = f1_metrics.compute(predictions=predictions, references=labels)

    return {"accuracy": accuracy, "f1_metrics": f1}

In [31]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

def train():
  wandb.init()
  config = wandb.config

  # KoGPT2 모델 로드
  model = AutoModelForSequenceClassification.from_pretrained(
      "skt/kogpt2-base-v2",
      num_labels=5,
      problem_type="multi_label_classification"
  )

  # 학습 설정
  training_args = TrainingArguments(
      fp16=True,
      output_dir='./results',
      num_train_epochs=1,
      learning_rate=config.learning_rate,
      per_device_train_batch_size=1,
      per_device_eval_batch_size=1,
      warmup_steps=500,
      weight_decay=config.weight_decay,
      logging_dir='./logs',
      logging_steps=0.1,
      do_eval=True,
      evaluation_strategy="steps",
      eval_steps=0.2,
  )

  # Trainer 설정
  trainer = Trainer(
      model=model,
      args=training_args,
      train_dataset=train_dataset,
      eval_dataset=test_dataset,
      compute_metrics=compute_metrics,
  )

  trainer.train()

  model.save_pretrained('./kogpt2-classification')
  tokenizer.save_pretrained('./kogpt2-classification')

In [32]:
# 캐시 지우기
import torch, gc

gc.collect()
torch.cuda.empty_cache()

In [33]:
sweep_id = wandb.sweep(sweep_config, entity='nkim123', project='minidlthon_kogpt2_classification')

wandb.agent(sweep_id, train, count=1)

wandb.finish()

Create sweep with ID: wgw56m7q
Sweep URL: https://wandb.ai/nkim123/minidlthon_kogpt2_classification/sweeps/wgw56m7q


[34m[1mwandb[0m: Agent Starting Run: zy4utqf4 with config:
[34m[1mwandb[0m: 	eval_batch_size: 32
[34m[1mwandb[0m: 	learning_rate: 4.870156631861571e-05
[34m[1mwandb[0m: 	train_batch_size: 32
[34m[1mwandb[0m: 	weight_decay: 0.1
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at skt/kogpt2-base-v2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss,Validation Loss,Accuracy,F1 Metrics
10,0.7242,0.703103,{'accuracy': 0.5833333333333334},{'f1': 0.6835443037974683}
20,0.7341,0.681106,{'accuracy': 0.6166666666666667},{'f1': 0.735632183908046}
30,0.7518,0.666727,{'accuracy': 0.6166666666666667},{'f1': 0.7415730337078652}
40,0.9067,0.673413,{'accuracy': 0.5833333333333334},{'f1': 0.7191011235955056}


0,1
eval/loss,█▄▁▂
eval/runtime,▁▅█▄
eval/samples_per_second,█▄▁▅
eval/steps_per_second,█▄▁▅
train/epoch,▁▂▂▃▃▃▄▅▅▆▇▇██
train/global_step,▁▂▂▃▃▃▄▅▅▆▇▇██
train/grad_norm,█▁▅▄▅▃█▄▆
train/learning_rate,▁▂▃▄▅▅▆▇█
train/loss,▄▄▆▄▆▄▆█▁

0,1
eval/loss,0.67341
eval/runtime,2.4248
eval/samples_per_second,4.949
eval/steps_per_second,4.949
total_flos,3135645941760.0
train/epoch,1.0
train/global_step,48.0
train/grad_norm,27.09541
train/learning_rate,0.0
train/loss,0.6052


In [34]:
# 저장된 모델 및 토크나이저 로드
trained_model = AutoModelForSequenceClassification.from_pretrained('./kogpt2-classification')
trained_tokenizer = PreTrainedTokenizerFast.from_pretrained('./kogpt2-classification')

In [35]:
import torch

def predict(text, model, tokenizer, threshold=0.6):
    # 모델을 평가 모드로 전환
    model.eval()

    # 입력 문장 토큰화
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=128)

    # 모델에 입력을 전달하여 로짓(logits)을 얻음
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits

    # 로짓에 시그모이드 적용하여 확률로 변환
    probabilities = torch.sigmoid(logits)
    # 임계값을 기준으로 이진화
    predictions = (probabilities > threshold).int()

    # 레이블 디코딩
    label_classes = [0, 1, 2, 3, 4]
    predicted_labels = [label_classes[i] for i in range(num_classes) if predictions[0][i] == 1]

    return predicted_labels


In [36]:
# 예제 입력 문장
input_text = "안녕"

# 분류 결과 추론
# threshold 잘 설정해야
predicted_labels = predict(input_text, trained_model, trained_tokenizer)
print(f"Predicted labels: {predicted_labels}")

Predicted labels: [1, 2, 3, 4]
