In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd

# 데이터 로드
#train_df_org = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/dataset/train_data.csv')
#valid_df_org = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/dataset/validation_data.csv')

In [3]:
from tqdm import tqdm

def extract_data(df):
  filtered_speaker = []
  filtered_empathy = []
  filtered_listener = []

  # 데이터프레임을 순회하며 조건에 맞는 데이터 추출
  for i in tqdm(range(len(df)-1)):
      if df.loc[i, 'speaker'] == 0 and df.loc[i + 1, 'speaker'] == 1 and df.loc[i + 1, 'empathy'] != 0:
          filtered_speaker.append(df.loc[i, 'text'])
          filtered_empathy.append(df.loc[i + 1, 'empathy'])
          filtered_listener.append(df.loc[i + 1, 'text'])

  # 결과를 데이터프레임으로 생성
  return pd.DataFrame({'speaker': filtered_speaker, 'empathy': filtered_empathy, 'listener': filtered_listener})

#train_df = extract_data(train_df_org)
#valid_df = extract_data(valid_df_org)

# 정제된 데이터 파일로 저장
#train_df.to_csv('/content/drive/MyDrive/Colab Notebooks/dataset/gen_train_data.csv', index=False)
#valid_df.to_csv('/content/drive/MyDrive/Colab Notebooks/dataset/gen_validation_data.csv', index=False)

In [2]:
import pandas as pd

train_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/dataset/gen_train_data.csv')
valid_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/dataset/gen_validation_data.csv')

train_df.head()

Unnamed: 0,speaker,empathy,listener
0,"엄마, 아기가 태어나니까 내가 부모로서 해야 할 게 참 많은 것 같아요.",3,그렇지? 아기 키우는 게 여간 어려운 일이 아니야.
1,어제 평소보다도 격하게 막 온몸을 써가면서 울더라고요. 얼마나 당황했는지 몰라요.,5,배가 고파서 그랬던 것 아닐까? 아기들은 배가 고프면 몸부림을 친단다.
2,맞아요. 젖을 물려주니 금세 뚝 그쳤어요. 난 분명히 순했을 것 같은데.,5,말도 마. 네가 얼마나 까탈스러웠는데. 우리 손주가 내 딸을 빼다 박았네.
3,잠도 못 자고 우는 아이 달래랴 수유하랴 머리카락이 다 빠지는 줄 알았어요.,3,"네가 어른 노릇, 부모 노릇을 하느라 고생이 정말 많구나."
4,그러면서 엄마 생각이 많이 났어요. 엄마는 직장까지 다니면서 나를 키우느라 얼마나 ...,2,우리 딸이 엄마가 되더니 철이 들었네. 하지만 네가 웃어주면 세상 모든 시름이 사라...


In [3]:
from transformers import PreTrainedTokenizerFast

Q_TKN = "<Q>"
A_TKN = "<A>"
BOS = '</s>'
EOS = '</s>'
UNK = '<unk>'
MASK = '<unused0>'
SENT = '<sent>'
PAD = '<pad>'

# KoGPT2 토크나이저 로드
tokenizer = PreTrainedTokenizerFast.from_pretrained("skt/kogpt2-base-v2",
            bos_token=BOS, eos_token=EOS, unk_token=UNK,
            pad_token=PAD, mask_token=MASK)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


In [4]:
from datasets import Dataset

train_dataset = Dataset.from_pandas(train_df)
valid_dataset = Dataset.from_pandas(valid_df)

def get_input(examples):
    speaker = [Q_TKN + example for example in examples['speaker']]
    listener = [A_TKN + example for example in examples['listener']]
    empathy = [SENT + example for example in examples['empathy']]

    inputs = [speaker[i] + empathy[i] + listener[i] for i in range(len(speaker))]
    outputs = [example + tokenizer.eos_token for example in examples['listener']]

    model_inputs = tokenizer(inputs, max_length=256, truncation=True, padding="max_length")
    labels = tokenizer(outputs, max_length=256, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# 데이터셋 전처리
train_inputs = train_dataset.map(get_input, batched=True)
valid_inputs = valid_dataset.map(get_input, batched=True)

Map:   0%|          | 0/181802 [00:00<?, ? examples/s]

Map:   0%|          | 0/22554 [00:00<?, ? examples/s]

In [5]:
# 데이터 크기 줄이기 위함
train_inputs = train_inputs.train_test_split(test_size=0.005)['test']
valid_inputs = valid_inputs.train_test_split(test_size=0.005)['test']

In [6]:
train_inputs

Dataset({
    features: ['speaker', 'empathy', 'listener', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 910
})

In [7]:
import wandb

wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [8]:
# method
sweep_config = {
    'method': 'random'
}

# hyperparameters
parameters_dict = {
    'epochs': {
        'values': [1]
        },
    'learning_rate': {
        'distribution': 'log_uniform_values',
        'min': 1e-5,
        'max': 7e-5
    },
    'weight_decay': {
        'values': [0.1, 0.3, 0.5]
    },
    'train_batch_size': {
        'values': [8, 16, 32]
    },
    'eval_batch_size': {
        'values': [8, 16, 32]
    }
}

sweep_config['parameters'] = parameters_dict

In [9]:
from peft import LoraConfig, TaskType, get_peft_model

peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none")

In [10]:
from transformers import GPT2LMHeadModel, Trainer, TrainingArguments, EarlyStoppingCallback
from accelerate import Accelerator

def train():
  wandb.init()
  config = wandb.config

  # KoGPT2 모델 로드
  model = GPT2LMHeadModel.from_pretrained("skt/kogpt2-base-v2")
  # lora 적용
  accelerator = Accelerator() #데이터 병렬 처리 원활
  model = accelerator.prepare(model)
  model = get_peft_model(model, peft_config)

  # 학습 설정
  training_args = TrainingArguments(
      output_dir='./results',
      num_train_epochs=config.epochs,
      per_device_train_batch_size=config.train_batch_size,
      per_device_eval_batch_size=config.eval_batch_size,
      warmup_steps=500,
      weight_decay=config.weight_decay,
      logging_dir='./logs',
      logging_steps=0.01,
      do_eval=True,
      evaluation_strategy="steps",
      eval_steps=0.2,
  )

  # Trainer 설정
  trainer = Trainer(
      model=model,
      args=training_args,
      train_dataset=train_inputs,
      eval_dataset=valid_inputs,
  )

  trainer.train()

  model.save_pretrained('./kogpt2-chatbot')
  tokenizer.save_pretrained('./kogpt2-chatbot')

In [11]:
# 캐시 지우기
import torch, gc

gc.collect()
torch.cuda.empty_cache()

In [12]:
sweep_id = wandb.sweep(sweep_config, entity='nkim123', project='minidlthon_kogpt2')

wandb.agent(sweep_id, train, count=1)

wandb.finish()

Create sweep with ID: 1m3tczby
Sweep URL: https://wandb.ai/nkim123/minidlthon_kogpt2/sweeps/1m3tczby


[34m[1mwandb[0m: Agent Starting Run: 7y2o6hbh with config:
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	eval_batch_size: 16
[34m[1mwandb[0m: 	learning_rate: 6.534373119446709e-05
[34m[1mwandb[0m: 	train_batch_size: 16
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: Currently logged in as: [33mnkim12[0m ([33mnkim123[0m). Use [1m`wandb login --relogin`[0m to force relogin


pytorch_model.bin:   0%|          | 0.00/513M [00:00<?, ?B/s]



Step,Training Loss,Validation Loss
12,16.6135,16.833481
24,16.5531,16.786835
36,16.519,16.705465
48,16.3516,16.584118


VBox(children=(Label(value='0.003 MB of 0.003 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/loss,█▇▄▁
eval/runtime,▂▂▁█
eval/samples_per_second,▇▇█▁
eval/steps_per_second,▇▇█▁
train/epoch,▁▂▂▃▃▃▄▅▅▆▇▇██
train/global_step,▁▂▂▃▃▃▄▅▅▆▇▇██
train/grad_norm,▃▁▁▂▁▅▇▅█
train/learning_rate,▁▂▃▄▅▅▆▇█
train/loss,██▆▆▄▅▅▁▁

0,1
eval/loss,16.58412
eval/runtime,2.3379
eval/samples_per_second,48.334
eval/steps_per_second,3.422
total_flos,119300090757120.0
train/epoch,1.0
train/global_step,57.0
train/grad_norm,12.54101
train/learning_rate,1e-05
train/loss,16.3547


In [13]:
# 저장된 모델 및 토크나이저 로드
model = GPT2LMHeadModel.from_pretrained('./kogpt2-chatbot')

trained_model = get_peft_model(model, peft_config)
trained_tokenizer = PreTrainedTokenizerFast.from_pretrained('./kogpt2-chatbot')

In [14]:
# 입력 문장 토큰화
input_text = "대답이 너무 빨라"
input_ids = trained_tokenizer.encode(trained_tokenizer.bos_token + input_text + trained_tokenizer.eos_token, return_tensors='pt')

# 모델 추론
outputs = trained_model.generate(input_ids, max_length=50, repetition_penalty=2.0, num_beams=5, early_stopping=True)
output_text = trained_tokenizer.decode(outputs[0], skip_special_tokens=True)

print(output_text)

대답이 너무 빨라,,,,,,,,,,,,,, 


In [None]:
from transformers import PreTrainedTokenizerFast, GPT2LMHeadModel, AutoModelForQuestionAnswering
# 파인튜닝 전 모델과 비교
model = GPT2LMHeadModel.from_pretrained("skt/kogpt2-base-v2")
# KoGPT2 토크나이저 로드
tokenizer = PreTrainedTokenizerFast.from_pretrained("skt/kogpt2-base-v2",
            bos_token=BOS, eos_token=EOS, unk_token=UNK,
            pad_token=PAD, mask_token=MASK)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


In [None]:
# 입력 문장 토큰화
input_text = "과제가 너무 힘들어"
input_ids = tokenizer.encode(tokenizer.bos_token + input_text + tokenizer.eos_token, return_tensors='pt')

# 모델 추론
gen_ids = model.generate(input_ids,
                           max_length=50,
                           repetition_penalty=2.0,
                           pad_token_id=tokenizer.pad_token_id,
                           eos_token_id=tokenizer.eos_token_id,
                           bos_token_id=tokenizer.bos_token_id,
                           use_cache=True)
generated = tokenizer.decode(gen_ids[0]).split(tokenizer.eos_token)[2]
print(generated)

 <unk>,,
이제부터 시작해야겠다.
그런데 이게 무슨 말인가.
아무튼 나는 그걸로 끝낼 수 없다.
나는 지금껏 내가 해온 일을 모두 다 잊고 있다.



In [5]:
!pip install accelerate
!pip install peft
!pip install evaluate
!pip install wandb
!pip install datasets

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl.me

Collecting wandb
  Downloading wandb-0.17.5-py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Collecting docker-pycreds>=0.4.0 (from wandb)
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl.metadata (1.8 kB)
Collecting gitpython!=3.1.29,>=1.0.0 (from wandb)
  Downloading GitPython-3.1.43-py3-none-any.whl.metadata (13 kB)
Collecting sentry-sdk>=1.0.0 (from wandb)
  Downloading sentry_sdk-2.12.0-py2.py3-none-any.whl.metadata (9.8 kB)
Collecting setproctitle (from wandb)
  Downloading setproctitle-1.3.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.9 kB)
Collecting gitdb<5,>=4.0.1 (from gitpython!=3.1.29,>=1.0.0->wandb)
  Downloading gitdb-4.0.11-py3-none-any.whl.metadata (1.2 kB)
Collecting smmap<6,>=3.0.1 (from gitdb<5,>=4.0.1->gitpython!=3.1.29,>=1.0.0->wandb)
  Downloading smmap-5.0.1-py3-none-any.whl.metadata (4.3 kB)
Downloading wandb-0.17.5-py3-none-ma