In [1]:
import numpy as np
import re
import pandas as pd
import csv
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [3]:
# Load the question/answer pairs (columns Q, A and label).
chatbot = pd.read_csv("./data/ChatbotData.csv")

# Replace every non-word character (punctuation etc.) with a space.
# regex=True must be explicit: from pandas 2.0 on, Series.str.replace treats the
# pattern as a literal string by default, which would leave the text uncleaned.
# The raw string avoids the invalid "\w" escape warning.
chatbot['Q'] = chatbot['Q'].str.replace(r"[^\w]", " ", regex=True)
chatbot['A'] = chatbot['A'].str.replace(r"[^\w]", " ", regex=True)

print(chatbot)

                             Q                         A  label
0                       12시 땡                 하루가 또 가네요       0
1                  1지망 학교 떨어졌어                 위로해 드립니다       0
2                 3박4일 놀러가고 싶다               여행은 언제나 좋죠       0
3              3박4일 정도 놀러가고 싶다               여행은 언제나 좋죠       0
4                      PPL 심하네                눈살이 찌푸려지죠       0
...                        ...                       ...    ...
11818           훔쳐보는 것도 눈치 보임         티가 나니까 눈치가 보이는 거죠       2
11819           훔쳐보는 것도 눈치 보임              훔쳐보는 거 티나나봐요       2
11820              흑기사 해주는 짝남                     설렜겠어요       2
11821  힘든 연애 좋은 연애라는게 무슨 차이일까   잘 헤어질 수 있는 사이 여부인 거 같아요       2
11822               힘들어서 결혼할까봐        도피성 결혼은 하지 않길 바라요       2

[11823 rows x 3 columns]


In [5]:
# Inspect the label column (integer labels; 0 at the head of the frame, 2 at the tail).
chatbot.loc[:, 'label']

0        0
1        0
2        0
3        0
4        0
        ..
11818    2
11819    2
11820    2
11821    2
11822    2
Name: label, Length: 11823, dtype: int64

In [4]:
# Whitespace-tokenise every sentence into a list of tokens.
# Questions feed the encoder as-is.
encoder_input = [question.split() for question in chatbot['Q']]

# Answers are used twice: prefixed with <start> as the decoder's input, and
# suffixed with <end> as the decoder's target.
decoder_input = [("<start> " + answer).split() for answer in chatbot['A']]
decoder_output = [(answer + " <end>").split() for answer in chatbot['A']]

In [None]:
# Question-side vocabulary: fit on the token lists, then map tokens to integer ids.
# Note: this cell rebinds encoder_input / decoder_input / decoder_output from token
# lists to integer sequences, so re-run the tokenising cell before re-running this one.
tokenizer_q = Tokenizer()
tokenizer_q.fit_on_texts(encoder_input)
encoder_input = tokenizer_q.texts_to_sequences(encoder_input)

# Answer-side vocabulary: fitted on both the <start>-prefixed and the <end>-suffixed
# variants so that both marker tokens get an id.
# NOTE(review): the inputs are already token lists; presumably that is what keeps the
# "<start>"/"<end>" markers intact despite Tokenizer's default filters - confirm.
tokenizer_a = Tokenizer()
for answer_tokens in (decoder_input, decoder_output):
    tokenizer_a.fit_on_texts(answer_tokens)

decoder_input, decoder_output = (
    tokenizer_a.texts_to_sequences(decoder_input),
    tokenizer_a.texts_to_sequences(decoder_output),
)

In [None]:
# Right-pad ("post") each set of id sequences into a rectangular array.
# No maxlen is passed, so the width of each array is derived from the data.
encoder_input, decoder_input, decoder_output = (
    pad_sequences(encoder_input, padding="post"),
    pad_sequences(decoder_input, padding="post"),
    pad_sequences(decoder_output, padding="post"),
)

In [None]:
# Sanity check: show the first three padded question sequences.
print(encoder_input[0:3])

In [None]:
# Show the padded array shapes for the encoder and decoder inputs, one per line.
print(encoder_input.shape, decoder_input.shape, sep="\n")

In [None]:
# Answer-vocabulary lookups in both directions: token -> id and id -> token.
a_to_index, index_to_a = tokenizer_a.word_index, tokenizer_a.index_word

In [None]:
test_size = 2500

# The source frame is displayed ordered by `label` (0 at the head, 2 at the tail), so
# slicing off the last rows would presumably build a validation set dominated by a
# single label. Shuffle the row order once, with a fixed seed for reproducibility,
# and split on the shuffled indices instead.
shuffled_idx = np.random.default_rng(42).permutation(len(encoder_input))
train_idx, test_idx = shuffled_idx[:-test_size], shuffled_idx[-test_size:]

# The same index arrays are applied to all three arrays so rows stay aligned.
encoder_input_train = encoder_input[train_idx]
decoder_input_train = decoder_input[train_idx]
decoder_output_train = decoder_output[train_idx]

encoder_input_test = encoder_input[test_idx]
decoder_input_test = decoder_input[test_idx]
decoder_output_test = decoder_output[test_idx]

# Training

In [None]:
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Masking
from tensorflow.keras.models import Model

In [None]:
# Encoder: token ids -> 50-d embeddings -> LSTM; only the final (h, c) states are
# used downstream to initialise the decoder.
# shape=(None,) accepts any padded length, so the graph no longer has to match a
# hard-coded sequence width.
encoder_inputs = Input(shape=(None,))

# mask_zero=True makes the Embedding layer emit a mask for padding id 0.
# A Masking(mask_value=0) layer applied to the embedding output does not do this:
# it compares the embedding vectors with 0, and the vector learned for id 0 is not
# all zeros, so padded steps were never skipped.
encoder_embed = Embedding(len(tokenizer_q.word_index)+1, 50, mask_zero=True)(encoder_inputs)

# Name kept for backward compatibility; it is now simply the masked embedding.
encoder_mask = encoder_embed
encoder_outputs, h_state, c_state = LSTM(50, return_state=True)(encoder_mask)

In [None]:
# Decoder (teacher forcing): <start>-prefixed answer ids -> embeddings -> LSTM seeded
# with the encoder states -> per-step softmax over the answer vocabulary.
# shape=(None,) because the same input tensor is fed full padded answers during
# training but one-token sequences during step-by-step inference; a fixed length of
# 22 does not match the (1, 1) arrays used there.
decoder_inputs = Input(shape=(None,))

# mask_zero=True masks padding id 0 at the embedding. Masking(mask_value=0) on the
# embedding output compared learned vectors with 0 and therefore masked nothing.
decoder_embed = Embedding(len(tokenizer_a.word_index)+1, 50, mask_zero=True)(decoder_inputs)

# Name kept because the inference decoder is built from `decoder_mask`.
decoder_mask = decoder_embed

# Layer objects are kept in variables so the inference model can reuse their weights.
decoder_lstm = LSTM(50, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_mask, initial_state=[h_state, c_state])

decoder_dense = Dense(len(tokenizer_a.word_index)+1, activation='softmax')
decoder_softmax_outputs = decoder_dense(decoder_outputs)

In [None]:
# Training model: (question ids, <start>-prefixed answer ids) -> per-step
# distribution over the answer vocabulary.
model = Model(
    inputs=[encoder_inputs, decoder_inputs],
    outputs=decoder_softmax_outputs,
)

# Targets are integer ids (not one-hot), hence the sparse cross-entropy loss.
model.compile(
    optimizer='rmsprop',
    loss='sparse_categorical_crossentropy',
    metrics=['acc'],
)

# The held-out split is used as validation data during training.
model.fit(
    x=[encoder_input_train, decoder_input_train],
    y=decoder_output_train,
    validation_data=(
        [encoder_input_test, decoder_input_test],
        decoder_output_test,
    ),
    batch_size=128,
    epochs=100,
)

# Prediction

In [None]:
# Inference-time encoder: maps a padded question to the LSTM's final [h, c] states.
encoder_model = Model(inputs=encoder_inputs, outputs=[h_state, c_state])

In [None]:
# Inference-time decoder. The LSTM states arrive as explicit inputs (size 50, matching
# the LSTM units) so decoding can be driven one step at a time.
encoder_h_state = Input(shape=(50,))
encoder_c_state = Input(shape=(50,))

# Reuse the trained decoder LSTM and Dense layers; only the initial state differs
# from the training graph.
pd_decoder_outputs, pd_h_state, pd_c_state = decoder_lstm(
    decoder_mask,
    initial_state=[encoder_h_state, encoder_c_state],
)
pd_decoder_softmax_outputs = decoder_dense(pd_decoder_outputs)

# Returns the word distribution plus the updated states to feed into the next step.
decoder_model = Model(
    inputs=[decoder_inputs, encoder_h_state, encoder_c_state],
    outputs=[pd_decoder_softmax_outputs, pd_h_state, pd_c_state],
)

In [None]:
# Interactive greedy decoding: read one question, encode it, then generate the answer
# one token at a time until <end> is produced or the step limit is reached.
input_stc = input()

# Apply the same cleanup as the training questions (non-word characters -> space);
# otherwise tokens with punctuation attached (e.g. "hello?") miss the vocabulary.
token_stc = re.sub(r"[^\w]", " ", input_stc).split()
encode_stc = tokenizer_q.texts_to_sequences([token_stc])
pad_stc = pad_sequences(encode_stc, maxlen=15, padding="post")

# Encoder states [h, c]; list() so the concatenation below works even if predict
# returns a tuple.
states_value = list(encoder_model.predict(pad_stc))

# Decoder input for the first step: a (1, 1) array holding the <start> id.
predicted_seq = np.zeros((1, 1))
predicted_seq[0, 0] = a_to_index['<start>']

decoded_stc = []

# Bound the loop by the longest padded training answer, so a model that never emits
# <end> cannot spin forever.
max_decode_len = decoder_input.shape[1]

for _step in range(max_decode_len):
    output_words, h, c = decoder_model.predict([predicted_seq] + states_value)

    predicted_index = int(np.argmax(output_words[0, 0]))
    # Id 0 is the padding slot and has no entry in index_to_a; treat it like <end>
    # instead of raising KeyError.
    predicted_word = index_to_a.get(predicted_index)

    if predicted_word is None or predicted_word == '<end>':
        break

    decoded_stc.append(predicted_word)

    # Feed the chosen token and the updated states back in for the next step.
    predicted_seq = np.zeros((1, 1))
    predicted_seq[0, 0] = predicted_index

    states_value = [h, c]

print(' '.join(decoded_stc))