In [36]:
!pip install transformers



In [37]:
import tensorflow as tf
from transformers import AutoTokenizer
from transformers import TFGPT2LMHeadModel

In [38]:
tokenizer = AutoTokenizer.from_pretrained('skt/kogpt2-base-v2', bos_token='', eos_token='', pad_token='')
model = TFGPT2LMHeadModel.from_pretrained('skt/kogpt2-base-v2', from_pt=True)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFGPT2LMHeadModel: ['transformer.h.3.attn.masked_bias', 'transformer.h.6.attn.masked_bias', 'transformer.h.5.attn.masked_bias', 'transformer.h.9.attn.masked_bias', 'transformer.h.10.attn.masked_bias', 'lm_head.weight', 'transformer.h.0.attn.masked_bias', 'transformer.h.11.attn.masked_bias', 'transformer.h.8.attn.masked_bias', 'transformer.h.4.attn.masked_bias', 'transformer.h.2.attn.masked_bias', 'transformer.h.1.attn.masked_bias', 'transformer.h.7.attn.masked_bias']
- This IS expected if you are initializing TFGPT2LMHeadModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFGPT2LMHeadModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassifica

In [39]:
print(tokenizer.bos_token_id)
print(tokenizer.eos_token_id)
print(tokenizer.pad_token_id)
print('-' * 10)
print(tokenizer.decode(1))
print(tokenizer.decode(2))
print(tokenizer.decode(3))
print(tokenizer.decode(4))

51200
51200
51200
----------
</s>
<usr>
<pad>
<sys>


In [40]:
import pandas as pd
import tqdm
import urllib.request

In [41]:
urllib.request.urlretrieve("https://raw.githubusercontent.com/songys/Chatbot_data/master/ChatbotData.csv", filename="ChatBotData.csv")
train_data = pd.read_csv('ChatBotData.csv')

In [42]:
len(train_data)

11823

In [43]:
batch_size = 32

In [44]:
def get_chat_data():
    #train_data의 각 질문과 답변을 순서대로 가져와서 처리
    for question, answer in zip(train_data.Q.to_list(), train_data.A.to_list()):
        bos_token = [tokenizer.bos_token_id]  #문장의 시작을 나타내는 토큰
        eos_token = [tokenizer.eos_token_id]  #문장의 끝을 나타내는 토큰
        sent = tokenizer.encode('' + question + '' + answer)
        #시작 토큰, 문장, 종료 토큰을 합쳐서 반환
        yield bos_token + sent + eos_token

In [45]:
dataset = tf.data.Dataset.from_generator(get_chat_data, output_types=tf.int32)

In [46]:
dataset = dataset.padded_batch(batch_size=batch_size, padded_shapes=(None,), padding_values=tokenizer.pad_token_id)

In [47]:
for batch in dataset:
    print(batch)
    break

tf.Tensor(
[[51200  9349  7888   739  7318   376 25000  6824  9108  9028  7098 25856
  51200 51200 51200 51200 51200 51200 51200 51200 51200 51200 51200 51200
  51200 51200 51200 51200]
 [51200  9020  8263  7497 10192 11615  8210  8006 11567  8711  9535  7483
  12521 51200 51200 51200 51200 51200 51200 51200 51200 51200 51200 51200
  51200 51200 51200 51200]
 [51200  9085  7597   395  8149 10624  7397 24224 13358  7182  8030 19138
  16899  9677  8234   389 51200 51200 51200 51200 51200 51200 51200 51200
  51200 51200 51200 51200]
 [51200  9085  7597   395  8149  9465 10624  7397 24224 13358  7182  8030
  19138 16899  9677  8234   389 51200 51200 51200 51200 51200 51200 51200
  51200 51200 51200 51200]
 [51200  9943   422   418  9327  8702  7098  7141 16912 18328  8671  7415
   8263  8234   389 51200 51200 51200 51200 51200 51200 51200 51200 51200
  51200 51200 51200 51200]
 [51200  9815   410 21249 10174  6824  8210  8006 16146 11056 11594 10137
  10556  9266  8711 25856 51200 51200 51

In [48]:
tokenizer.decode(batch[0])

'<|endoftext|> 12시 땡!하루가 또 가네요.<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>'

In [49]:
print(batch[0])

tf.Tensor(
[51200  9349  7888   739  7318   376 25000  6824  9108  9028  7098 25856
 51200 51200 51200 51200 51200 51200 51200 51200 51200 51200 51200 51200
 51200 51200 51200 51200], shape=(28,), dtype=int32)


In [50]:
print(tokenizer.encode(' 12시 땡! 하루가 또 가네요.'))

[9349, 7888, 739, 7318, 376, 12557, 6824, 9108, 9028, 7098, 25856]


In [51]:
adam = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08)

In [52]:
steps = len(train_data) // batch_size + 1
print(steps)

370


In [54]:
EPOCHS = 1

for epoch in range(EPOCHS):
    epoch_loss = 0  #현재 에폭에서의 총 손실값을 저장하는 변수

    for batch in tqdm.tqdm_notebook(dataset, total=steps):
        with tf.GradientTape() as tape:
            #배치를 입력으로 전달하고 손실을 계산
            result = model(batch, labels=batch)
            loss = result[0]
            batch_loss = tf.reduce_mean(loss)  #배치 내의 평균 손실 계산

        #경사 계산 및 모델의 가중치 업데이트
        grads = tape.gradient(batch_loss, model.trainable_variables)
        adam.apply_gradients(zip(grads, model.trainable_variables))

        #현재 배치의 손실을 에폭 손실에 더함
        epoch_loss += batch_loss / steps

    #현재 에폭에서의 총 손실값 출력
    print('[Epoch: {:>4}] cost = {:>.9}'.format(epoch + 1, epoch_loss))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch in tqdm.tqdm_notebook(dataset, total=steps):


  0%|          | 0/370 [00:00<?, ?it/s]

InvalidArgumentError: ignored

In [55]:
text = '오늘도 좋은 하루!'

In [56]:
sent = '' + text + ''

In [57]:
input_ids = [tokenizer.bos_token_id] + tokenizer.encode(sent)
input_ids = tf.convert_to_tensor([input_ids])

In [58]:
output = model.generate(input_ids, max_length=50, early_stopping=True, eos_token_id=tokenizer.eos_token_id)



InvalidArgumentError: ignored

In [None]:
decoded_sentence = tokenizer.decode(output[0].numpy().tolist())

In [None]:
decoded_sentence.split(' ')[1].replace('', '')

In [None]:
output = model.generate(input_ids, max_length=50, do_sample=True, top_k=10)
tokenizer.decode(output[0].numpy().tolist())

In [None]:
def return_answer_by_chatbot(user_text):
  sent = '' + user_text + ''
  input_ids = [tokenizer.bos_token_id] + tokenizer.encode(sent)
  input_ids = tf.convert_to_tensor([input_ids])
  output = model.generate(input_ids, max_length=50, do_sample=True, top_k=20)
  sentence = tokenizer.decode(output[0].numpy().tolist())
  chatbot_response = sentence.split(' ')[1].replace('', '')
  return chatbot_response

In [None]:
return_answer_by_chatbot('안녕! 반가워~')

In [None]:
return_answer_by_chatbot('너는 누구야?')

In [None]:
return_answer_by_chatbot('사랑해')

In [None]:
return_answer_by_chatbot('나랑 영화보자')

In [None]:
return_answer_by_chatbot('너무 심심한데 나랑 놀자')

In [None]:
return_answer_by_chatbot('영화 해리포터 재밌어?')

In [None]:
return_answer_by_chatbot('너 딥 러닝 잘해?')

In [None]:
return_answer_by_chatbot('너 취했어?')

In [None]:
return_answer_by_chatbot('커피 한 잔 할까?')