In [0]:
from google.colab import drive

drive.mount('/content/gdrive',force_remount=True)

Mounted at /content/gdrive


In [0]:
pip install konlpy



In [0]:
import random
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.sequence import pad_sequences
from konlpy.tag import Okt
tf.enable_eager_execution()

In [0]:
epochs = 200
num_words = 2000

In [0]:
class Encoder(tf.keras.Model):

  def __init__(self):
    super(Encoder, self).__init__()
    self.embed = layers.Embedding(num_words, 64)
    self.lstm = layers.LSTM(512, return_sequences=True, return_state=True)

  # H contain all information of input (return_sequences)
  def __call__(self, x, training=False, mask=None):
    x = self.embed(x)
    H, h, c = self.lstm(x)
    return H, h, c

In [0]:
class Decoder(tf.keras.Model):

  def __init__(self):
    super(Decoder, self).__init__()
    self.embed = layers.Embedding(num_words, 64)
    self.lstm = layers.LSTM(512, return_sequences=True, return_state=True)
    self.att = layers.Attention()
    self.dense = layers.Dense(num_words, activation='softmax')

  # S_ has information of inputs until t-1 time
  def __call__(self, inputs, training=False, mask=False):
    x, s0, c0, H = inputs
    x = self.embed(x)
    S, h, c = self.lstm(x, initial_state=[s0, c0])
    S_ = tf.concat([s0[:,tf.newaxis,:], S[:,:-1,:]], axis=1)
    # Attention()([query, key, value])
    A = self.att([S_,H])
    y = tf.concat([S,A], axis=-1)
    return self.dense(y), h, c

In [0]:
class Seq2seq(tf.keras.Model):

  def __init__(self, sos, eos):
    super(Seq2seq, self).__init__()
    self.enc = Encoder()
    self.dec = Decoder()
    self.sos = sos
    self.eos = eos

  def __call__(self, inputs, training=False,mask=None):
    if training is True:
      x, y = inputs
      H, h, c = self.enc(x)
      y, _, _ = self.dec((y,h,c,H))
      return y
    else:
      x = inputs
      H, h, c = self.enc(x)

      y = tf.convert_to_tensor(self.sos)
      y = tf.reshape(y,(1,1))
      seq = tf.TensorArray(tf.int32, 64)

      for idx in tf.range(64):
        y, h, c = self.dec([y, h, c, H])
        y = tf.cast(tf.argmax(y, axis=-1), dtype=tf.int32)
        y = tf.reshape(y,(1,1))
        seq = seq.write(idx, y)
        if y==self.eos:
          break
      return tf.reshape(seq.stack(), (1,64))

In [0]:
@tf.function
def train_step(model, inputs, labels, loss_object, optimizer, train_loss, train_accuracy):
  # labels = [sos, ..., eos], shifted = [sos, ...], output = [..., eos]
  output_labels = labels[:,1:]
  shifted_labels = labels[:,:-1]
  with tf.GradientTape() as tape:
    predictions = model([inputs, shifted_labels], training=True)
    loss =loss_object(output_labels, predictions)
  gradients = tape.gradient(loss, model.trainable_variables)
  optimizer.apply_gradients(zip(gradients, model.trainable_variables))
  train_loss(loss)
  train_accuracy(output_labels, predictions)

@tf.function
def test_step(model, inputs):
  return model(inputs, training=False)

In [0]:
file_path = 'gdrive/My Drive/dataset/chatbot_data.csv'
okt = Okt()

#open file and split by morpheme
with open(file_path, 'r') as file:
  lines = file.readlines()
  seq = [' '.join(okt.morphs(line)) for line in lines]
print(seq[0])

# seperate question, answer   answers have to contain sos(\t), eos(\n)
questions = seq[::2]
answers = ['\t' + lines for lines in seq[1::2]]
print(answers[0])

num_samples = len(questions)

# seperate train, test data (4:1)
perm = list(range(num_samples))
random.seed(0)
random.shuffle(perm)

train_q = []
train_a = []
test_q = []
test_a = []

for idx,qna in enumerate(zip(questions, answers)):
  q, a = qna
  if perm[idx] > num_samples//5:
    train_q.append(q)
    train_a.append(a)
  else:
    test_q.append(q)
    test_a.append(a)

#tokenize data
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=num_words, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~')

tokenizer.fit_on_texts(train_q + train_a)

train_q_seq = tokenizer.texts_to_sequences(train_q)
train_a_seq = tokenizer.texts_to_sequences(train_a)
test_q_seq = tokenizer.texts_to_sequences(test_q)
test_a_seq = tokenizer.texts_to_sequences(test_a)
print(train_q_seq)

# padding to match same length model input size
x_train = pad_sequences(train_q_seq, 64)
y_train = pad_sequences(train_a_seq, 65, padding='post')
x_test = pad_sequences(test_q_seq, 64)
y_test = pad_sequences(test_a_seq, 65, padding='post')
print(x_train[0])

train_ds = tf.data.Dataset.from_tensor_slices((x_train, y_train)).shuffle(10000).batch(32).prefetch(1024)
test_df = tf.data.Dataset.from_tensor_slices((x_test, y_test)).batch(1).prefetch(1024)

아이스 아메리카노 하나요 

	테이크아웃 하실 건가 요 ? 

[[8, 5, 86, 1], [122, 189, 4, 26, 25, 1], [122, 230, 60, 304, 446, 7, 305, 30, 1], [15, 87, 33, 123, 36, 95, 19, 22, 1], [447, 231, 96, 7, 190, 448, 1], [42, 5, 16, 10, 8, 306, 16, 307, 1], [11, 159, 9, 449, 38, 97, 3, 1], [189, 191, 30, 1], [8, 189, 450, 110, 7, 141, 19, 22, 1], [46, 34, 97, 451, 308, 142, 24, 1], [452, 57, 35, 39, 95, 19, 68, 1], [232, 17, 160, 7, 22, 1], [33, 69, 111, 453, 47, 23, 98, 74, 1], [309, 16, 124, 161, 8, 5, 12, 1], [57, 162, 1], [11, 310, 15, 454, 61, 36, 25, 1], [192, 193, 6, 98, 74, 1], [455, 88, 26, 75, 1], [311, 194, 1], [47, 7, 99, 27, 75, 1], [456, 46, 163, 312, 1], [112, 51, 22, 1], [112, 51, 7, 125, 164, 23, 313, 457, 164, 314, 15, 313, 1], [112, 51, 458, 100, 6, 98, 23, 68, 1], [57, 312, 1], [8, 5, 48, 15, 12, 1], [126, 459, 33, 7, 233, 1], [101, 460, 461, 195, 7, 462, 19, 143, 1], [122, 6, 315, 42, 463, 40, 47, 3, 1], [125, 308, 100, 9, 22, 1], [20, 77, 30, 1], [189, 48, 78, 5, 58, 10, 3, 1], [234, 316, 12, 1],

In [0]:
model = Seq2seq(sos=tokenizer.word_index['\t'], eos=tokenizer.word_index['\n'])

loss_object = tf.keras.losses.SparseCategoricalCrossentropy()
optimizer = tf.keras.optimizers.Adam()

train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')

In [0]:
for epoch in range(epochs):
  for seqs, labels in train_ds:
    train_step(model, seqs, labels, loss_object, optimizer, train_loss, train_accuracy)
  template = 'Epoch {}, Loss: {}, Accuracy: {}'
  print(template.format(epoch + 1,
                        train_loss.result(),
                        train_accuracy.result() * 100))

  train_loss.reset_states()
  train_accuracy.reset_states()

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Epoch 1, Loss: 2.9809491634368896, Accuracy: 84.55122375488281
Epoch 2, Loss: 0.5946705341339111, Accuracy: 91.89771270751953
Epoch 3, Loss: 0.5005366206169128, Accuracy: 91.99952697753906
Epoch 4, Loss: 0.4821603000164032, Accuracy: 92.0974349975586
Epoch 5, Loss: 0.4737153649330139, Accuracy: 92.15225219726562
Epoch 6, Loss: 0.4668738543987274, Accuracy: 92.14442443847656
Epoch 7, Loss: 0.4595836102962494, Accuracy: 92.1483383178711
Epoch 8, Loss: 0.4502396881580353, Accuracy: 92.1953353881836
Epoch 9, Loss: 0.43705856800079346, Accuracy: 92.23841094970703
Epoch 10, Loss: 0.42030662298202515, Accuracy: 92.3245620727539
Epoch 11, Loss: 0.4078884422779083, Accuracy: 92.55560302734375
Epoch 12, Loss: 0.4016413986682892, Accuracy: 92.70832824707031
Epoch 13, Loss: 0.38907620310783386, Accuracy: 92.97462463378906
Epoch 14, Loss: 0.37991511821746826, Accuracy: 93.19001007080078
Epoch 15, Loss: 0.3

In [0]:
for test_seq, test_labels in test_ds:
  prediction = test_step(model, test_seq)
  test_text = tokenizer.sequences_to_texts(test_seq.numpy())
  gt_text = tokenizer.sequences_to_texts(test_labels.numpy())
  texts = tokenizer.sequences_to_texts(prediction.numpy())
  print('_')
  print('q: ', test_text)
  print('a: ', gt_text)
  print('p: ', texts)

NameError: ignored