In [131]:
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, TimeDistributed
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [164]:
input_data = pd.read_csv('input_data.csv')
num_obs = 20_000
input_data = input_data.sample(num_obs, random_state=10)

In [165]:
print(input_data.shape)
input_data.head()

(20000, 2)


Unnamed: 0,clean_quest,clean_ans
98754,what a dork huh huh,yeah heh heh he is a anus heh heh
20364,the safety circuit is failed,we are losing atmosphere ...
188963,that is okay i have already had lunch,you must eat lunch pretty early it is only ele...
189144,the battery is in that is done with besides yo...,... u might /u do the same i have to warn you ...
20610,are you kidding,no i distinctly remember you walked out my doo...


In [166]:
input_data = input_data.dropna(axis=0)

encoder_input = input_data['clean_quest']
decoder_input = input_data['clean_ans']
decoder_input = '<BOS> ' + decoder_input + ' <EOS>'
full_input = pd.concat([encoder_input, decoder_input], 
                       ignore_index=True)

decoder_input

98754         <BOS> yeah heh heh he is a anus heh heh <EOS>
20364              <BOS> we are losing atmosphere ... <EOS>
188963    <BOS> you must eat lunch pretty early it is on...
189144    <BOS> ... u might /u do the same i have to war...
20610     <BOS> no i distinctly remember you walked out ...
                                ...                        
77404                <BOS> then i will call you a cab <EOS>
72632          <BOS> 'just a few loose ends to tie up <EOS>
133109    <BOS> that is a pretty hairy ride full colonel...
216599    <BOS> well keep your u wits /u about you and l...
148032       <BOS> what the hey let is go for a drive <EOS>
Name: clean_ans, Length: 19972, dtype: object

In [167]:
encoder_input

98754                                   what a dork huh huh
20364                          the safety circuit is failed
188963                that is okay i have already had lunch
189144    the battery is in that is done with besides yo...
20610                                       are you kidding
                                ...                        
77404                           nope can not we are through
72632            it is clearly federation in origin captain
133109                                               i know
216599    you missed your u plane /u my u life /u is ove...
148032    now do not make me feel as if i am being too f...
Name: clean_quest, Length: 19972, dtype: object

In [168]:
NUM_WORDS = 10_000

tokenizer = Tokenizer(num_words=10_000)
tokenizer.fit_on_texts(full_input)

encoder_seq = tokenizer.texts_to_sequences(encoder_input)
decoder_seq = tokenizer.texts_to_sequences(decoder_input)

MAXLEN = 20

encoder_seq = pad_sequences(encoder_seq, maxlen=MAXLEN, 
                            padding='post', truncating='post')
        
decoder_seq = pad_sequences(decoder_seq, maxlen=MAXLEN, 
                            padding='post', truncating='post')      

In [169]:
result_output = np.zeros(shape=(len(encoder_seq), MAXLEN, NUM_WORDS), 
                         dtype='float32')

for i, sequences in enumerate(decoder_seq):
    for j, sequence in enumerate(sequences):
        if j > 0:
            result_output[i, j, sequence] = 1

In [201]:
word_index = tokenizer.word_index
word_dict = [(index, word) for word, index in word_index.items()]
word_dict = dict(word_dict)

### Modelling

In [170]:
encoder_input_layer = Input(shape=(MAXLEN,), dtype='int32')
embedding_layer = Embedding(NUM_WORDS, 50)
encoder_embedding = embedding_layer(encoder_input_layer)
encoder_lstm = LSTM(100, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)

decoder_input_layer = Input(shape=(MAXLEN,), dtype='int32')
decoder_embedding = embedding_layer(decoder_input_layer)
decoder_lstm = LSTM(100, return_state=True, return_sequences=True)
decoder_output, _, _ = decoder_lstm(decoder_embedding, 
                                    initial_state=[state_h, state_c])

dense_layer = Dense(NUM_WORDS, activation='softmax')
output = TimeDistributed(dense_layer)(decoder_output)
model = Model([encoder_input_layer, decoder_input_layer], output)

model.summary()

Model: "functional_7"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_8 (InputLayer)            [(None, 20)]         0                                            
__________________________________________________________________________________________________
input_7 (InputLayer)            [(None, 20)]         0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 20, 50)       500000      input_7[0][0]                    
                                                                 input_8[0][0]                    
__________________________________________________________________________________________________
lstm_6 (LSTM)                   [(None, 100), (None, 60400       embedding_3[0][0]     

In [171]:
BATCH_SIZE = 32
EPOCHS = 5

model.compile(optimizer='adam', 
              loss ='categorical_crossentropy', 
              metrics = ['accuracy'])

history = model.fit([encoder_seq, decoder_seq], 
                     result_output, 
                     epochs=EPOCHS, 
                     batch_size=BATCH_SIZE)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


### Results

In [199]:
test_data = encoder_seq[0]
test_data = test_data[np.newaxis,]

expected_answer = np.zeros((1, MAXLEN))
expected_answer[0, -1] = 1

In [200]:
for i in range(6):
    print(i)
    res = model.predict([test_data, expected_answer])
    ye = np.argmax(res)
    mp = np.argmax(ye)
    expected_answer[0, 0:-1] = expected_answer[0, 1:]
    expected_answer[0, -1] = mp

0
1
2
3
4
5
