### 기본 설정

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install accelerate
!pip install peft
!pip install evaluate
!pip install wandb
!pip install datasets

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl.me

In [3]:
# 수정 필요
DATA_SIZE = 0.001
USE_LORA = False

### 데이터셋

In [5]:
import pandas as pd

train_df = pd.read_csv('/content/drive/MyDrive/dataset/gen_train_data.csv')
valid_df = pd.read_csv('/content/drive/MyDrive/dataset/gen_validation_data.csv')

train_df.head()

Unnamed: 0,speaker,empathy,listener
0,"엄마, 아기가 태어나니까 내가 부모로서 해야 할 게 참 많은 것 같아요.",3,그렇지? 아기 키우는 게 여간 어려운 일이 아니야.
1,어제 평소보다도 격하게 막 온몸을 써가면서 울더라고요. 얼마나 당황했는지 몰라요.,5,배가 고파서 그랬던 것 아닐까? 아기들은 배가 고프면 몸부림을 친단다.
2,맞아요. 젖을 물려주니 금세 뚝 그쳤어요. 난 분명히 순했을 것 같은데.,5,말도 마. 네가 얼마나 까탈스러웠는데. 우리 손주가 내 딸을 빼다 박았네.
3,잠도 못 자고 우는 아이 달래랴 수유하랴 머리카락이 다 빠지는 줄 알았어요.,3,"네가 어른 노릇, 부모 노릇을 하느라 고생이 정말 많구나."
4,그러면서 엄마 생각이 많이 났어요. 엄마는 직장까지 다니면서 나를 키우느라 얼마나 ...,2,우리 딸이 엄마가 되더니 철이 들었네. 하지만 네가 웃어주면 세상 모든 시름이 사라...


In [6]:
# 데이터 크기 줄이기 위함
train_df = train_df.sample(frac=DATA_SIZE, random_state=42)
valid_df = valid_df.sample(frac=DATA_SIZE, random_state=42)

In [7]:
from transformers import AutoTokenizer

Q_TKN = "<Q>"
A_TKN = "<A>"
BOS = '</s>'
EOS = '</s>'
UNK = '<unk>'
MASK = '<unused0>'
SENT = '<sent>'
PAD = '<pad>'

# KoGPT2 토크나이저 로드
tokenizer = AutoTokenizer.from_pretrained("gogamza/kobart-base-v2",
            bos_token=BOS, eos_token=EOS, unk_token=UNK,
            pad_token=PAD, mask_token=MASK)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.


In [8]:
from datasets import Dataset

train_dataset = Dataset.from_pandas(train_df)
valid_dataset = Dataset.from_pandas(valid_df)

label_classes = ['조언', '격려', '위로', '동조', '']
def get_input(examples):
    speaker = [Q_TKN + example for example in examples['speaker']]
    listener = [A_TKN + example for example in examples['listener']]
    empathy = []
    for example in examples['empathy']:
      tmp = example.split(',')
      tmp = list(map(int, tmp))
      labels = [label_classes[idx-1] for idx in tmp]
      example = ','.join(labels)
      empathy.append(SENT + example)

    inputs = [speaker[i] + empathy[i] for i in range(len(speaker))]
    outputs = [example + tokenizer.eos_token for example in examples['listener']]

    model_inputs = tokenizer(inputs, listener, max_length=256, truncation=True, padding="max_length")
    labels = tokenizer(outputs, max_length=256, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# 데이터셋 전처리
train_inputs = train_dataset.map(get_input, batched=True)
valid_inputs = valid_dataset.map(get_input, batched=True)

Map:   0%|          | 0/182 [00:00<?, ? examples/s]

Map:   0%|          | 0/23 [00:00<?, ? examples/s]

### 모델 학습

In [9]:
import wandb

wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [10]:
# method
sweep_config = {
    'method': 'random'
}

# hyperparameters
parameters_dict = {
    'lr_scheduler_type':{
        'values': ['linear', 'cosine', 'polynomial']
    },
    'learning_rate': {
        'distribution': 'log_uniform_values',
        'min': 1e-5,
        'max': 1e-3
    },
    'weight_decay': {
        'values': [0.1, 0.3, 0.5]
    },
    'train_batch_size': {
        'values': [8, 16, 32]
    },
    'eval_batch_size': {
        'values': [8, 16, 32]
    }
}

sweep_config['parameters'] = parameters_dict

In [11]:
from peft import LoraConfig, TaskType, get_peft_model

peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none")

In [15]:
from transformers import BartForCausalLM, Trainer, TrainingArguments
from accelerate import Accelerator

def train():
  run = wandb.init()
  config = wandb.config

  # KoGPT2 모델 로드
  model = BartForCausalLM.from_pretrained("gogamza/kobart-base-v2")
  if USE_LORA:
    # lora 적용
    accelerator = Accelerator() #데이터 병렬 처리 원활
    model = accelerator.prepare(model)
    model = get_peft_model(model, peft_config)

  # 학습 설정
  training_args = TrainingArguments(
      fp16=True,
      output_dir='./results',
      num_train_epochs=1,
      lr_scheduler_type=config.lr_scheduler_type,
      learning_rate=config.learning_rate,
      per_device_train_batch_size=config.train_batch_size,
      per_device_eval_batch_size=config.eval_batch_size,
      warmup_steps=10000,
      weight_decay=config.weight_decay,
      logging_dir='./logs',
      logging_steps=0.01,
      do_eval=True,
      eval_strategy="steps",
      eval_steps=0.1,
      remove_unused_columns=True,
  )

  # Trainer 설정
  trainer = Trainer(
      model=model,
      args=training_args,
      train_dataset=train_inputs,
      eval_dataset=valid_inputs,
  )

  trainer.train()

  model_path = './kobart-chatbot'
  model.save_pretrained(model_path)
  tokenizer.save_pretrained(model_path)


In [16]:
# 캐시 지우기
import torch, gc

gc.collect()
torch.cuda.empty_cache()

In [17]:
sweep_id = wandb.sweep(sweep_config, entity='nkim123', project='miniaiffelthon-kobart')

wandb.agent(sweep_id, train, count=1)

wandb.finish()

Create sweep with ID: o6p7dm28
Sweep URL: https://wandb.ai/nkim123/miniaiffelthon-kobart/sweeps/o6p7dm28


[34m[1mwandb[0m: Agent Starting Run: 6uv51w8w with config:
[34m[1mwandb[0m: 	eval_batch_size: 32
[34m[1mwandb[0m: 	learning_rate: 7.724623279154198e-05
[34m[1mwandb[0m: 	lr_scheduler_type: linear
[34m[1mwandb[0m: 	train_batch_size: 16
[34m[1mwandb[0m: 	weight_decay: 0.3


You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.
Some weights of BartForCausalLM were not initialized from the model checkpoint at gogamza/kobart-base-v2 and are newly initialized: ['decoder.embed_tokens.weight', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss
2,11.2609,11.265977
4,11.2498,11.265977
6,11.2606,11.265777
8,11.2513,11.265223
10,11.2651,11.264255
12,11.2505,11.263075


Non-default generation parameters: {'forced_eos_token_id': 1}
Non-default generation parameters: {'forced_eos_token_id': 1}


VBox(children=(Label(value='0.026 MB of 0.026 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/loss,███▆▄▁
eval/runtime,█▁▃▃▃▂
eval/samples_per_second,▁█▆▆▆▇
eval/steps_per_second,▁█▆▆▆▇
train/epoch,▁▂▂▂▃▃▄▄▄▅▅▅▆▇▇▇███
train/global_step,▁▂▂▂▃▃▄▄▄▅▅▅▆▇▇▇███
train/grad_norm,▆▄▁▄▇█▇▃▂
train/learning_rate,▁▁▁▂▃▃▄▅▆▆▇█
train/loss,▄▅▁▂█▅▃▃▇▆▂▃

0,1
eval/loss,11.26307
eval/runtime,0.0678
eval/samples_per_second,339.457
eval/steps_per_second,14.759
total_flos,15854006697984.0
train/epoch,1.0
train/global_step,12.0
train/grad_norm,42.06907
train/learning_rate,0.0
train/loss,11.2505
