In [1]:
!pip install transformers



In [2]:
import tensorflow as tf
from transformers import AutoTokenizer
from transformers import TFGPT2LMHeadModel

In [3]:
tokenizer = AutoTokenizer.from_pretrained('skt/kogpt2-base-v2', bos_token='</s>', eos_token='</s>', pad_token='<pad>')
model = TFGPT2LMHeadModel.from_pretrained('skt/kogpt2-base-v2', from_pt=True)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFGPT2LMHeadModel: ['lm_head.weight', 'transformer.h.1.attn.masked_bias', 'transformer.h.4.attn.masked_bias', 'transformer.h.6.attn.masked_bias', 'transformer.h.11.attn.masked_bias', 'transformer.h.3.attn.masked_bias', 'transformer.h.2.attn.masked_bias', 'transformer.h.8.attn.masked_bias', 'transformer.h.0.attn.masked_bias', 'transformer.h.10.attn.masked_bias', 'transformer.h.5.attn.masked_bias', 'transformer.h.7.attn.masked_bias', 'transformer.h.9.attn.masked_bias']
- This IS expected if you are initializing TFGPT2LMHeadModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFGPT2LMHeadModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassifica

In [4]:
print(tokenizer.bos_token_id)
print(tokenizer.eos_token_id)
print(tokenizer.pad_token_id)
print('-' * 10)
print(tokenizer.decode(1))
print(tokenizer.decode(2))
print(tokenizer.decode(3))
print(tokenizer.decode(4))

1
1
3
----------
</s>
<usr>
<pad>
<sys>


In [5]:
import pandas as pd
import tqdm
import urllib.request

In [6]:
urllib.request.urlretrieve("https://raw.githubusercontent.com/songys/Chatbot_data/master/ChatbotData.csv", filename="ChatBotData.csv")
train_data = pd.read_csv('ChatBotData.csv')

In [7]:
len(train_data)

11823

In [8]:
batch_size = 32

In [9]:
def get_chat_data():#generator함수 사용하여 dataset 구성
  for question, answer in zip(train_data.Q.to_list(), train_data.A.to_list()):
    bos_token = [tokenizer.bos_token_id] #문장 시작 토큰
    eos_token = [tokenizer.eos_token_id] #문장 끝 토큰
    sent = tokenizer.encode('<usr>' + question + '<sys>' + answer)
    yield bos_token + sent + eos_token

In [10]:
dataset = tf.data.Dataset.from_generator(get_chat_data, output_types=tf.int32) #질문과 대답을 토큰화

In [11]:
dataset = dataset.padded_batch(batch_size=batch_size, padded_shapes=(None,), padding_values=tokenizer.pad_token_id)
#데이터셋에 패딩을 추가 -> 배치의 시퀀스 길이를 동일하게 유지

In [12]:
for batch in dataset:
    print(batch)
    break

tf.Tensor(
[[    1     2  9349  7888   739  7318   376     4 12557  6824  9108  9028
   7098 25856     1     3     3     3     3     3     3     3     3     3
      3     3     3     3     3     3]
 [    1     2  9020  8263  7497 10192 11615  8210  8006     4 12422  8711
   9535  7483 12521     1     3     3     3     3     3     3     3     3
      3     3     3     3     3     3]
 [    1     2  9085  7597   395  8149 10624  7397 24224 13358  7182     4
  12079  8135 16899  9677  8234   389     1     3     3     3     3     3
      3     3     3     3     3     3]
 [    1     2  9085  7597   395  8149  9465 10624  7397 24224 13358  7182
      4 12079  8135 16899  9677  8234   389     1     3     3     3     3
      3     3     3     3     3     3]
 [    1     2  9943   422   418  9327  8702  7098     4  9847 16912 18328
   8671  7415  8263  8234   389     1     3     3     3     3     3     3
      3     3     3     3     3     3]
 [    1     2  9815   410 21249 10174  6824  8210  800

In [13]:
tokenizer.decode(batch[0])

'</s><usr> 12시 땡!<sys> 하루가 또 가네요.</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>'

In [14]:
print(batch[0])

tf.Tensor(
[    1     2  9349  7888   739  7318   376     4 12557  6824  9108  9028
  7098 25856     1     3     3     3     3     3     3     3     3     3
     3     3     3     3     3     3], shape=(30,), dtype=int32)


In [15]:
print(tokenizer.encode('</s><usr> 12시 땡!<sys> 하루가 또 가네요.</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>'))

[1, 2, 9349, 7888, 739, 7318, 376, 4, 12557, 6824, 9108, 9028, 7098, 25856, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]


In [16]:
adam = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08)

In [17]:
steps = len(train_data) // batch_size + 1
print(steps)

370


In [18]:
EPOCHS = 1

for epoch in range(EPOCHS):
  epoch_loss = 0

  for batch in tqdm.tqdm_notebook(dataset, total=steps):
      with tf.GradientTape() as tape: #자동 미분을 위해 작업을 기록
          result = model(batch, labels=batch) #현재 배치로 모델 호출
          loss = result[0]
          batch_loss = tf.reduce_mean(loss) #배치에 대한 평균 손실 계산

      grads = tape.gradient(batch_loss, model.trainable_variables) #학습 가능한 변수에 대한 손실의 기울기 계산
      adam.apply_gradients(zip(grads, model.trainable_variables)) #adam 사용
      epoch_loss += batch_loss / steps

  print('[Epoch: {:>4}] cost = {:>.9}'.format(epoch + 1, epoch_loss))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch in tqdm.tqdm_notebook(dataset, total=steps):


  0%|          | 0/370 [00:00<?, ?it/s]



[Epoch:    1] cost = 2.12707782


In [19]:
text = '오늘도 좋은 하루!'

In [20]:
sent = '<usr>' + text + '<sys>'

In [21]:
input_ids = [tokenizer.bos_token_id] + tokenizer.encode(sent)
input_ids = tf.convert_to_tensor([input_ids])

In [22]:
output = model.generate(input_ids, max_length=50, early_stopping=True, eos_token_id=tokenizer.eos_token_id)



In [23]:
decoded_sentence = tokenizer.decode(output[0].numpy().tolist())

In [24]:
decoded_sentence.split('<sys> ')[1].replace('</s>', '')

'좋은 하루를 만들어보세요.'

In [25]:
output = model.generate(input_ids, max_length=50, do_sample=True, top_k=10)
tokenizer.decode(output[0].numpy().tolist())

'</s><usr> 오늘도 좋은 하루!<sys> 오늘도 좋은 하루를 하셨군요.</s>'

In [26]:
def return_answer_by_chatbot(user_text):
  sent = '<usr>' + user_text + '<sys>'
  input_ids = [tokenizer.bos_token_id] + tokenizer.encode(sent) #시작 토큰 추가 후 텍스트 인코딩
  input_ids = tf.convert_to_tensor([input_ids]) #텐서로 변환
  output = model.generate(input_ids, max_length=50, do_sample=True, top_k=20) #대답 생성
  sentence = tokenizer.decode(output[0].numpy().tolist()) #생성된 출력 디코딩
  chatbot_response = sentence.split('<sys> ')[1].replace('</s>', '') #텍스트만 추출
  return chatbot_response

In [27]:
return_answer_by_chatbot('안녕! 반가워~')

'감사합니다.'

In [28]:
return_answer_by_chatbot('너는 누구야?')

'친구의 첫인연일 뿐이죠.'

In [29]:
return_answer_by_chatbot('사랑해')

'사랑을 하라는 건가봐요.'

In [30]:
return_answer_by_chatbot('나랑 영화보자')

'영화보면서 영화 보고 싶은데 영화 좀 볼수 있었으면 좋겠네요.'

In [31]:
return_answer_by_chatbot('너무 심심한데 나랑 놀자')

'짝사랑으로 살 수 있어요.'

In [32]:
return_answer_by_chatbot('영화 해리포터 재밌어?')

'영화해보는 것도 좋을 것 같습니다.'

In [33]:
return_answer_by_chatbot('너 딥 러닝 잘해?')

'그립기도 해요.'

In [34]:
return_answer_by_chatbot('너 취했어?')

'취하면 취해도 될까요.'

In [35]:
return_answer_by_chatbot('커피 한 잔 할까?')

'커피 좋아하시는 분도 있을것같아요.'