In [1]:
from google.colab import drive
# Mount Google Drive at /content/drive; the CSV paths used below live under this mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd

# Load the raw train/validation CSVs from the mounted Drive.
# In the visible notebook these frames are only consumed by the (commented-out) extract_data calls.
train_df_org = pd.read_csv('/content/drive/MyDrive/dataset/train_data.csv')
valid_df_org = pd.read_csv('/content/drive/MyDrive/dataset/validation_data.csv')

In [3]:
from tqdm import tqdm

def extract_data(df):
  """Pair each speaker-0 utterance with the empathy label of the row right after it.

  The row at position ``i`` is kept when all of the following hold:
    * row ``i`` has ``speaker == 0``
    * row ``i + 1`` has ``speaker == 1``
    * row ``i + 1`` has ``empathy != 0``

  Rows are matched by position, so the result does not depend on the frame
  having a default ``0..n-1`` index, and the whole frame is compared in one
  vectorized pass instead of one ``.loc`` lookup per cell.

  Args:
      df: DataFrame with ``speaker``, ``text`` and ``empathy`` columns.

  Returns:
      A new DataFrame with columns ``text`` (taken from row ``i``) and
      ``empathy`` (taken from row ``i + 1``), one row per match, in the
      original order. Frames with fewer than two rows yield an empty result.
  """
  speaker = df['speaker'].to_numpy()
  text = df['text'].to_numpy()
  empathy = df['empathy'].to_numpy()

  # x[:-1] is "the current row", x[1:] is "the next row"; both slices have
  # length n-1, which mirrors the original range(len(df) - 1) loop.
  keep = (speaker[:-1] == 0) & (speaker[1:] == 1) & (empathy[1:] != 0)

  return pd.DataFrame({'text': text[:-1][keep], 'empathy': empathy[1:][keep]})

#train_df = extract_data(train_df_org)
#valid_df = extract_data(valid_df_org)

## 정제된 데이터 파일로 저장
#train_df.to_csv('/content/drive/MyDrive/dataset/cls_train_data.csv', index=False)
#valid_df.to_csv('/content/drive/MyDrive/dataset/cls_validation_data.csv', index=False)

In [4]:
# Load the already-extracted (text, empathy) pairs from Drive.
# These are the same paths the commented-out to_csv calls would write to,
# so extract_data does not have to be re-run in this session.
train_df = pd.read_csv('/content/drive/MyDrive/dataset/cls_train_data.csv')
valid_df = pd.read_csv('/content/drive/MyDrive/dataset/cls_validation_data.csv')

# Preview the first rows of the training split.
train_df.head()

Unnamed: 0,text,empathy
0,"엄마, 아기가 태어나니까 내가 부모로서 해야 할 게 참 많은 것 같아요.",3
1,어제 평소보다도 격하게 막 온몸을 써가면서 울더라고요. 얼마나 당황했는지 몰라요.,5
2,맞아요. 젖을 물려주니 금세 뚝 그쳤어요. 난 분명히 순했을 것 같은데.,5
3,잠도 못 자고 우는 아이 달래랴 수유하랴 머리카락이 다 빠지는 줄 알았어요.,3
4,그러면서 엄마 생각이 많이 났어요. 엄마는 직장까지 다니면서 나를 키우느라 얼마나 ...,2


In [5]:
from transformers import PreTrainedTokenizerFast

# Special-token strings. Only BOS, EOS, UNK, PAD and MASK are passed to the
# tokenizer below; Q_TKN, A_TKN and SENT are defined but not referenced in the
# visible part of this notebook.
Q_TKN = "<Q>"
A_TKN = "<A>"
# BOS and EOS are set to the same string.
BOS = '</s>'
EOS = '</s>'
UNK = '<unk>'
MASK = '<unused0>'
SENT = '<unused1>'
PAD = '<pad>'

# Load the KoGPT2 tokenizer and register the special tokens defined above.
tokenizer = PreTrainedTokenizerFast.from_pretrained("skt/kogpt2-base-v2",
            bos_token=BOS, eos_token=EOS, unk_token=UNK,
            pad_token=PAD, mask_token=MASK)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer.json:   0%|          | 0.00/2.83M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.00k [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


In [6]:
from datasets import Dataset
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np

# NOTE: the recorded output of this cell is
# "ModuleNotFoundError: No module named 'datasets'" — the `datasets` package
# has to be installed in the runtime before this cell can run.

# Wrap the pandas frames as Hugging Face `Dataset` objects so `.map` can be used on them.
train_dataset = Dataset.from_pandas(train_df)
valid_dataset = Dataset.from_pandas(valid_df)

# Fixed label vocabulary for the binarizer; passing `classes` pins the column
# order of the multi-hot output to this list.
label_classes = ['1', '2', '3', '4', '5']
mlb = MultiLabelBinarizer(classes=label_classes)

def get_cls_input(examples):
    """Tokenize a batch of texts and attach multi-hot empathy labels.

    Args:
        examples: Batch dict as supplied by ``Dataset.map(..., batched=True)``;
            the ``text`` and ``empathy`` columns are read.

    Returns:
        The tokenizer output for ``text`` (truncated / padded to 128 tokens)
        with an extra ``labels`` entry: for every example a list of float32
        0.0/1.0 values, one position per entry of the binarizer's ``classes``.
    """
    inputs = examples['text']
    # An empathy value may be a comma-separated string such as "1,3", but when
    # every row of the CSV holds a single label pandas may parse the column as
    # integers, for which `.split` does not exist. Going through str() handles
    # both, and stripping tolerates "1, 3"-style spacing.
    labels = [
        [part.strip() for part in str(label).split(',')]
        for label in examples['empathy']
    ]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")

    # `mlb` is constructed with an explicit `classes` list, so fitting again on
    # each batch cannot change the column order of the output.
    labels_binary = mlb.fit_transform(labels)

    model_inputs["labels"] = labels_binary.astype(np.float32).tolist()
    return model_inputs

# Preprocess both splits: get_cls_input is applied batch by batch (batched=True).
train_inputs = train_dataset.map(get_cls_input, batched=True)
valid_inputs = valid_dataset.map(get_cls_input, batched=True)


ModuleNotFoundError: No module named 'datasets'

In [None]:
# Reduce the data size: keep only the 5% "test" part of a random split.
# A fixed seed makes the subsample identical on every Restart & Run All, so
# sweep runs are trained and evaluated on the same examples.
# NOTE: the names are rebound in place, so re-running this cell without
# re-running the preprocessing cell subsamples the already reduced data again.
SUBSAMPLE_SEED = 42
train_inputs = train_inputs.train_test_split(test_size=0.05, seed=SUBSAMPLE_SEED)['test']
valid_inputs = valid_inputs.train_test_split(test_size=0.05, seed=SUBSAMPLE_SEED)['test']

In [None]:
# Display the training dataset object as the cell output.
train_inputs

In [None]:
# Display the validation dataset object as the cell output.
valid_inputs

In [None]:
import wandb

# Authenticate this runtime with Weights & Biases before creating the sweep.
wandb.login()

In [None]:
# Search space: one entry per hyperparameter that train() reads from wandb.config.
parameters_dict = {
    'lr_scheduler_type': {'values': ['linear', 'cosine', 'polynomial']},
    'learning_rate': {'distribution': 'log_uniform_values', 'min': 1e-5, 'max': 1e-3},
    'weight_decay': {'values': [0.1, 0.3, 0.5]},
    'train_batch_size': {'values': [8, 16, 32]},
    'eval_batch_size': {'values': [8, 16, 32]},
}

# Sweep definition: random sampling over the search space above.
sweep_config = {
    'method': 'random',
    'parameters': parameters_dict,
}

In [None]:
from peft import LoraConfig, TaskType, get_peft_model

# LoRA adapter configuration, shared by train() and the model-loading cell.
# task_type "SEQ_CLS" selects PEFT's sequence-classification wrapper;
# inference_mode=False keeps the adapter trainable.
# r is the LoRA rank, lora_alpha its scaling factor, lora_dropout the dropout
# applied inside the LoRA layers; bias="none" leaves bias terms untrained.
peft_config = LoraConfig(
    task_type="SEQ_CLS",
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none")

In [None]:
import evaluate

# Load each metric once at cell execution time; loading inside compute_metrics
# would repeat the lookup on every evaluation pass.
_accuracy_metric = evaluate.load("accuracy")
_f1_metric = evaluate.load("f1")


def compute_metrics(pred):
    """Compute accuracy and F1 for multi-label predictions.

    Every (example, label) decision is treated as one independent binary
    prediction: logits and labels are binarized, flattened, and scored together.

    Args:
        pred: A ``(logits, labels)`` pair as handed over by ``Trainer``; both
            are expected to be arrays of the same shape — TODO confirm shape
            is (num_examples, num_labels).

    Returns:
        ``{"accuracy": float, "f1_metrics": float}`` — plain scalars, so the
        Trainer logs them as ordinary metric values.
    """
    logits, labels = pred
    # logit > 0 is equivalent to sigmoid(logit) > 0.5.
    # Both arrays are cast to int because the accuracy / f1 metrics expect
    # integer class ids rather than floats.
    predictions = (logits > 0).astype(int).flatten()
    references = labels.astype(int).flatten()

    accuracy = _accuracy_metric.compute(predictions=predictions, references=references)
    f1 = _f1_metric.compute(predictions=predictions, references=references)

    # .compute() returns a one-entry dict ({"accuracy": x} / {"f1": y});
    # unwrap it so the result is flat instead of a dict of dicts.
    return {"accuracy": accuracy["accuracy"], "f1_metrics": f1["f1"]}

In [None]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

def train():
  """Run one W&B sweep trial: fine-tune KoGPT2 with LoRA for 5-label multi-label classification.

  Reads the sampled hyperparameters (``lr_scheduler_type``, ``learning_rate``,
  ``train_batch_size``, ``eval_batch_size``, ``weight_decay``) from
  ``wandb.config`` and uses the notebook-level ``peft_config``,
  ``train_inputs``, ``valid_inputs``, ``compute_metrics`` and ``tokenizer``.

  Side effects:
      Starts a W&B run, trains for one epoch, and writes the PEFT model and
      the tokenizer to ``./kogpt2-classification`` (overwritten by every call).
  """
  wandb.init()
  config = wandb.config

  # Load KoGPT2 with a 5-output multi-label classification head.
  model = AutoModelForSequenceClassification.from_pretrained(
      "skt/kogpt2-base-v2",
      num_labels=5,
      problem_type="multi_label_classification"
  )
  # GPT-2 sequence classification pools the last non-padding token and finds
  # it through config.pad_token_id. The inputs are padded to a fixed length,
  # so make sure the model uses the same pad id as the tokenizer.
  model.config.pad_token_id = tokenizer.pad_token_id

  # Wrap the model with the LoRA adapter.
  model = get_peft_model(model, peft_config)

  # Training configuration.
  # logging_steps / eval_steps are floats below 1, i.e. fractions of the total
  # number of training steps.
  # NOTE(review): warmup_steps=500 is an absolute step count; with a single
  # epoch on the subsampled data the run may end before warmup finishes —
  # confirm against the real step count or switch to warmup_ratio.
  training_args = TrainingArguments(
      fp16=True,
      output_dir='./results',
      num_train_epochs=1,
      lr_scheduler_type=config.lr_scheduler_type,
      learning_rate=config.learning_rate,
      per_device_train_batch_size=config.train_batch_size,
      per_device_eval_batch_size=config.eval_batch_size,
      warmup_steps=500,
      weight_decay=config.weight_decay,
      logging_dir='./logs',
      logging_steps=0.1,
      do_eval=True,
      eval_strategy="steps",
      eval_steps=0.2,
      remove_unused_columns=True,
  )

  # Trainer setup.
  trainer = Trainer(
      model=model,
      args=training_args,
      train_dataset=train_inputs,
      eval_dataset=valid_inputs,
      compute_metrics=compute_metrics,
  )

  trainer.train()

  # `model` is a PEFT model here, so this stores the adapter rather than a
  # full standalone checkpoint.
  model.save_pretrained('./kogpt2-classification')
  tokenizer.save_pretrained('./kogpt2-classification')

In [None]:
# Clear caches: run Python garbage collection, then release PyTorch's cached CUDA memory.
import torch, gc

gc.collect()
torch.cuda.empty_cache()

In [None]:
# Register the sweep definition with W&B under the given entity/project and keep its id.
sweep_id = wandb.sweep(sweep_config, entity='nkim123', project='minidlthon_kogpt2_classification')

# Launch an agent that calls train() for a single trial (count=1).
wandb.agent(sweep_id, train, count=1)

# Close the active W&B run.
wandb.finish()

In [None]:
# Load the saved model and tokenizer.
# train() saved a PEFT model, so './kogpt2-classification' contains the trained
# adapter, not a full checkpoint. Calling get_peft_model() here would create a
# new, untrained adapter; instead rebuild the base model from the original
# checkpoint and attach the saved adapter with PeftModel.from_pretrained().
from peft import PeftModel

model = AutoModelForSequenceClassification.from_pretrained(
      "skt/kogpt2-base-v2",
      num_labels=5,
      problem_type="multi_label_classification"
)
trained_model = PeftModel.from_pretrained(model, './kogpt2-classification')
trained_tokenizer = PreTrainedTokenizerFast.from_pretrained('./kogpt2-classification')

In [None]:
import torch

def predict(text, model, tokenizer, num_classes=5, threshold=0.6):
    """Predict the empathy labels of a single input text.

    Args:
        text: Sentence to classify. If a batch is passed, only the first
            element's prediction is returned.
        model: Classification model whose output exposes ``.logits``.
        tokenizer: Callable that turns ``text`` into the model's keyword inputs.
        num_classes: Number of output columns to decode.
        threshold: Minimum sigmoid probability for a label to be reported.

    Returns:
        List of predicted labels as integers in ``1..num_classes``. Output
        column ``i`` maps to label ``i + 1``, matching the training label
        vocabulary ``'1'..'5'``. The list is empty when no probability exceeds
        ``threshold``.
    """
    # Switch the model to evaluation mode.
    model.eval()

    # Tokenize the input sentence.
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=128)

    # Move the inputs to the device the model lives on, so inference also
    # works when the model has been moved to a GPU.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        device = first_param.device
        inputs = {
            name: value.to(device) if isinstance(value, torch.Tensor) else value
            for name, value in inputs.items()
        }

    # Forward pass without gradient tracking to obtain the logits.
    with torch.no_grad():
        logits = model(**inputs).logits

    # Convert logits to probabilities and binarize against the threshold.
    probabilities = torch.sigmoid(logits)
    predictions = (probabilities > threshold).int()

    # Decode labels: column i corresponds to label i + 1.
    return [i + 1 for i in range(num_classes) if predictions[0][i] == 1]


In [None]:
# Example input sentence.
input_text = "오늘 뭐 해?"

# Run classification inference.
# NOTE: the decision threshold needs to be chosen carefully (predict() is called with its default here).
predicted_labels = predict(input_text, trained_model, trained_tokenizer)
print(f"Predicted labels: {predicted_labels}")