In [1]:
Q_TKN = "<Q>"
A_TKN = "<A>"
BOS = '</s>'
EOS = '</s>'
UNK = '<unk>'
MASK = '<unused0>'
SENT = '<unused1>'
PAD = '<pad>'

In [2]:
import pandas as pd
from datasets import Dataset
from transformers import PreTrainedTokenizerFast
import urllib.request

# KoGPT2 토크나이저 로드
tokenizer = PreTrainedTokenizerFast.from_pretrained("skt/kogpt2-base-v2",
            bos_token=BOS, eos_token=EOS, unk_token=UNK,
            pad_token=PAD, mask_token=MASK)

# 데이터 로드
urllib.request.urlretrieve(
    "https://raw.githubusercontent.com/songys/Chatbot_data/master/ChatbotData.csv",
    filename="ChatBotData.csv",
)
df = pd.read_csv("ChatBotData.csv")
dataset = Dataset.from_pandas(df)

def get_input(examples):
    inputs = [tokenizer.bos_token + example for example in examples['Q']]
    outputs = [example + tokenizer.eos_token for example in examples['A']]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")
    labels = tokenizer(outputs, max_length=128, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# 데이터셋 전처리
tokenized_datasets = dataset.map(get_input, batched=True)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


Map:   0%|          | 0/11823 [00:00<?, ? examples/s]

In [3]:
# 데이터 크기 줄이기 위함
dataset = tokenized_datasets.train_test_split(test_size=0.01)

# 검증 데이터 분할
train_test_dataset = dataset['test'].train_test_split(test_size=0.2)

train_dataset = train_test_dataset['train']
test_dataset = train_test_dataset['test']

In [10]:
import wandb

wandb.login()

True

In [11]:
# method
sweep_config = {
    'method': 'random'
}

# hyperparameters
parameters_dict = {
    'epochs': {
        'values': [1]
        },
    'learning_rate': {
        'distribution': 'log_uniform_values',
        'min': 1e-5,
        'max': 7e-5
    },
    'weight_decay': {
        'values': [0.1, 0.3, 0.5]
    },
}

sweep_config['parameters'] = parameters_dict

In [12]:
from transformers import GPT2LMHeadModel, Trainer, TrainingArguments, EarlyStoppingCallback

def train():
  wandb.init()
  config = wandb.config

  # KoGPT2 모델 로드
  model = GPT2LMHeadModel.from_pretrained("skt/kogpt2-base-v2")

  # 학습 설정
  training_args = TrainingArguments(
      output_dir='./results',
      num_train_epochs=config.epochs,
      per_device_train_batch_size=1,
      per_device_eval_batch_size=1,
      warmup_steps=500,
      weight_decay=config.weight_decay,
      logging_dir='./logs',
      logging_steps=10,
  )

  # Trainer 설정
  trainer = Trainer(
      model=model,
      args=training_args,
      train_dataset=train_dataset,
      eval_dataset=test_dataset,
  )

  trainer.train()

  model.save_pretrained('./kogpt2-chatbot')
  tokenizer.save_pretrained('./kogpt2-chatbot')

In [13]:
# 캐시 지우기
import torch, gc

gc.collect()
torch.cuda.empty_cache()

In [14]:
sweep_id = wandb.sweep(sweep_config, entity='nkim123', project='minidlthon_kogpt2')

wandb.agent(sweep_id, train, count=1)

wandb.finish()

Create sweep with ID: 9x1djz1e
Sweep URL: https://wandb.ai/nkim123/minidlthon_kogpt2/sweeps/9x1djz1e


[34m[1mwandb[0m: Agent Starting Run: br23zxns with config:
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	learning_rate: 1.0450548438722688e-05
[34m[1mwandb[0m: 	weight_decay: 0.3
[34m[1mwandb[0m: Currently logged in as: [33mnkim12[0m ([33mnkim123[0m). Use [1m`wandb login --relogin`[0m to force relogin




Step,Training Loss
10,16.1372
20,14.1547
30,9.346
40,2.7576
50,0.7993
60,0.8352
70,0.6393
80,0.5418
90,0.6032


VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
train/epoch,▁▂▃▃▄▅▆▇██
train/global_step,▁▂▃▃▄▅▆▇██
train/grad_norm,▇▇█▂▁▁▁▁▁
train/learning_rate,▁▂▃▄▄▅▆▇█
train/loss,█▇▅▂▁▁▁▁▁

0,1
total_flos,6205685760000.0
train/epoch,1.0
train/global_step,95.0
train/grad_norm,3.46982
train/learning_rate,1e-05
train/loss,0.6032
train_loss,4.84709
train_runtime,374.7708
train_samples_per_second,0.253
train_steps_per_second,0.253


In [15]:
# 저장된 모델 및 토크나이저 로드
trained_model = GPT2LMHeadModel.from_pretrained('./kogpt2-chatbot')
trained_tokenizer = PreTrainedTokenizerFast.from_pretrained('./kogpt2-chatbot')

In [16]:
# 입력 문장 토큰화
input_text = "대답이 너무 빨라"
input_ids = trained_tokenizer.encode(trained_tokenizer.bos_token + input_text + trained_tokenizer.eos_token, return_tensors='pt')

# 모델 추론
outputs = trained_model.generate(input_ids, max_length=50, repetition_penalty=2.0, num_beams=5, early_stopping=True)
output_text = trained_tokenizer.decode(outputs[0], skip_special_tokens=True)

print(output_text)

대답이 너무 빨라


In [17]:
# 파인튜닝 전 모델과 비교
model = GPT2LMHeadModel.from_pretrained("skt/kogpt2-base-v2")
tokenizer = PreTrainedTokenizerFast.from_pretrained('skt/kogpt2-base-v2')

# 모델 추론
gen_ids = model.generate(input_ids,
                           max_length=50,
                           repetition_penalty=2.0,
                           pad_token_id=tokenizer.pad_token_id,
                           eos_token_id=tokenizer.eos_token_id,
                           bos_token_id=tokenizer.bos_token_id,
                           use_cache=True)
generated = tokenizer.decode(gen_ids[0])
print(generated)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


</s> 대답이 너무 빨라</s> <unk>, "그럼요, 그건 그렇고요." 하고 말했습니다.
그러자 그는 다시 말을 했어요.
'아니, 그럼.'
그는 이렇게 말하고는 고개를 끄덕였지요.
그리고 나서 그가 말한 대로
