In [1]:
import pandas as pd
import urllib.request

# 데이터 로드
# 변경 예정
urllib.request.urlretrieve(
    "https://raw.githubusercontent.com/songys/Chatbot_data/master/ChatbotData.csv",
    filename="ChatBotData.csv",
)
df = pd.read_csv("ChatBotData.csv")

In [2]:
# 랜덤 라벨 데이터 생성
# 실제 데이터셋 사용 시 제거
import numpy as np

np.random.seed(42)

# 클래스 수 설정
num_classes = 5

# 데이터 샘플 수 설정
num_samples = len(df['Q'])

# 랜덤 레이블 생성 함수
def generate_random_labels(num_samples, num_classes):
    labels = []
    for _ in range(num_samples):
        num_labels = np.random.randint(1, num_classes + 1)  # 각 샘플에 최소 1개 이상의 클래스가 할당되도록
        label_indices = np.random.choice(num_classes, num_labels, replace=False)
        labels.append(label_indices.tolist())
    return labels

df['label'] = generate_random_labels(num_samples, num_classes)

In [3]:
from transformers import PreTrainedTokenizerFast

Q_TKN = "<Q>"
A_TKN = "<A>"
BOS = '</s>'
EOS = '</s>'
UNK = '<unk>'
MASK = '<unused0>'
SENT = '<unused1>'
PAD = '<pad>'

# KoGPT2 토크나이저 로드
tokenizer = PreTrainedTokenizerFast.from_pretrained("skt/kogpt2-base-v2",
            bos_token=BOS, eos_token=EOS, unk_token=UNK,
            pad_token=PAD, mask_token=MASK)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


In [4]:
from datasets import Dataset
from sklearn.preprocessing import MultiLabelBinarizer

dataset = Dataset.from_pandas(df)

def get_cls_input(examples):
    inputs = examples['Q']
    labels = examples['label']
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")

    label_classes = [0, 1, 2, 3, 4]
    mlb = MultiLabelBinarizer(classes=label_classes)
    labels_binary = mlb.fit_transform(labels)
    labels = labels_binary.astype(np.float32).tolist()
    model_inputs["labels"] = labels
    return model_inputs

# 데이터셋 전처리
tokenized_datasets = dataset.map(get_cls_input, batched=True)


Map:   0%|          | 0/11823 [00:00<?, ? examples/s]

In [5]:
# 데이터 크기 줄이기 위함
dataset = tokenized_datasets.train_test_split(test_size=0.005)

# 검증 데이터 분할
train_test_dataset = dataset['test'].train_test_split(test_size=0.2)

train_dataset = train_test_dataset['train']
test_dataset = train_test_dataset['test']

In [6]:
import wandb

wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mnkim12[0m ([33mnkim123[0m). Use [1m`wandb login --relogin`[0m to force relogin


True

In [7]:
# method
sweep_config = {
    'method': 'random'
}

# hyperparameters
parameters_dict = {
    'learning_rate': {
        'distribution': 'log_uniform_values',
        'min': 1e-5,
        'max': 7e-5
    },
    'weight_decay': {
        'values': [0.1, 0.3, 0.5]
    },
    'train_batch_size': {
        'values': [8, 16, 32]
    },
    'eval_batch_size': {
        'values': [8, 16, 32]
    }
}

sweep_config['parameters'] = parameters_dict

In [8]:
from peft import LoraConfig, TaskType, get_peft_model

peft_config = LoraConfig(
    task_type="SEQ_CLS",
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none")

In [9]:
import evaluate

def compute_metrics(pred):
    logits, labels = pred
    predictions = (logits > 0).astype(int)  # 시그모이드 활성화 함수 적용 후 임계값 0.5 기준으로 이진화

    labels = labels.flatten()
    predictions = predictions.flatten()

    # 정확도, F1 score 계산
    acc_metrics = evaluate.load("accuracy")
    accuracy = acc_metrics.compute(predictions=predictions, references=labels)

    f1_metrics = evaluate.load("f1")
    f1 = f1_metrics.compute(predictions=predictions, references=labels)

    return {"accuracy": accuracy, "f1_metrics": f1}

In [10]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

def train():
  wandb.init()
  config = wandb.config

  # KoGPT2 모델 로드
  model = AutoModelForSequenceClassification.from_pretrained(
      "skt/kogpt2-base-v2",
      num_labels=5,
      problem_type="multi_label_classification"
  )
  # lora 적용
  model = get_peft_model(model, peft_config)

  # 학습 설정
  training_args = TrainingArguments(
      fp16=True,
      output_dir='./results',
      num_train_epochs=1,
      learning_rate=config.learning_rate,
      per_device_train_batch_size=1,
      per_device_eval_batch_size=1,
      warmup_steps=500,
      weight_decay=config.weight_decay,
      logging_dir='./logs',
      logging_steps=0.1,
      do_eval=True,
      evaluation_strategy="steps",
      eval_steps=0.2,
  )

  # Trainer 설정
  trainer = Trainer(
      model=model,
      args=training_args,
      train_dataset=train_dataset,
      eval_dataset=test_dataset,
      compute_metrics=compute_metrics,
  )

  trainer.train()

  model.save_pretrained('./kogpt2-classification')
  tokenizer.save_pretrained('./kogpt2-classification')

In [11]:
# 캐시 지우기
import torch, gc

gc.collect()
torch.cuda.empty_cache()

In [12]:
sweep_id = wandb.sweep(sweep_config, entity='nkim123', project='minidlthon_kogpt2_classification')

wandb.agent(sweep_id, train, count=1)

wandb.finish()

Create sweep with ID: 67wwlzrt
Sweep URL: https://wandb.ai/nkim123/minidlthon_kogpt2_classification/sweeps/67wwlzrt


[34m[1mwandb[0m: Agent Starting Run: hk9m00fy with config:
[34m[1mwandb[0m: 	eval_batch_size: 32
[34m[1mwandb[0m: 	learning_rate: 4.599576244692246e-05
[34m[1mwandb[0m: 	train_batch_size: 8
[34m[1mwandb[0m: 	weight_decay: 0.5


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at skt/kogpt2-base-v2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy,F1 Metrics
10,0.7487,0.791446,{'accuracy': 0.4666666666666667},{'f1': 0.5294117647058824}
20,0.7158,0.79062,{'accuracy': 0.4666666666666667},{'f1': 0.5294117647058824}
30,0.6901,0.789463,{'accuracy': 0.4666666666666667},{'f1': 0.5294117647058824}
40,0.8622,0.788107,{'accuracy': 0.4666666666666667},{'f1': 0.5294117647058824}


Trainer is attempting to log a value of "{'accuracy': 0.4666666666666667}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'f1': 0.5294117647058824}" of type <class 'dict'> for key "eval/f1_metrics" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.4666666666666667}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'f1': 0.5294117647058824}" of type <class 'dict'> for key "eval/f1_metrics" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.4666666666666667}" of type <

VBox(children=(Label(value='0.003 MB of 0.028 MB uploaded\r'), FloatProgress(value=0.10188613947049663, max=1.…

0,1
eval/loss,█▆▄▁
eval/runtime,▁▃█▂
eval/samples_per_second,█▅▁▆
eval/steps_per_second,█▅▁▆
train/epoch,▁▂▂▃▃▃▄▅▅▆▇▇██
train/global_step,▁▂▂▃▃▃▄▅▅▆▇▇██
train/grad_norm,▆▇▃█▆▁▄▅▄
train/learning_rate,▁▂▃▄▄▅▆▇█
train/loss,▇▃▇▂█▁▂█▃

0,1
eval/loss,0.78811
eval/runtime,6.7193
eval/samples_per_second,1.786
eval/steps_per_second,1.786
total_flos,3146659135488.0
train/epoch,1.0
train/global_step,48.0
train/grad_norm,10.17744
train/learning_rate,0.0
train/loss,0.7332


In [13]:
# 저장된 모델 및 토크나이저 로드
model = AutoModelForSequenceClassification.from_pretrained(
      './kogpt2-classification',
      num_labels=5,
      problem_type="multi_label_classification"
)
trained_model = get_peft_model(model, peft_config)
trained_tokenizer = PreTrainedTokenizerFast.from_pretrained('./kogpt2-classification')

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at skt/kogpt2-base-v2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
import torch

def predict(text, model, tokenizer, threshold=0.6):
    # 모델을 평가 모드로 전환
    model.eval()

    # 입력 문장 토큰화
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=128)

    # 모델에 입력을 전달하여 로짓(logits)을 얻음
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits

    # 로짓에 시그모이드 적용하여 확률로 변환
    probabilities = torch.sigmoid(logits)
    # 임계값을 기준으로 이진화
    predictions = (probabilities > threshold).int()

    # 레이블 디코딩
    label_classes = [0, 1, 2, 3, 4]
    predicted_labels = [label_classes[i] for i in range(num_classes) if predictions[0][i] == 1]

    return predicted_labels


In [15]:
# 예제 입력 문장
input_text = "왜?"

# 분류 결과 추론
# threshold 잘 설정해야
predicted_labels = predict(input_text, trained_model, trained_tokenizer)
print(f"Predicted labels: {predicted_labels}")

Predicted labels: [1, 4]
