In [109]:
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, TimeDistributed
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [97]:
# Read the CSV and keep a fixed-size random subset; the fixed random_state
# makes the same rows come back on every run.
num_obs = 50_000
input_data = pd.read_csv('input_data.csv').sample(n=num_obs, random_state=10)

In [98]:
# Sanity check of the sampled frame: print its dimensions, then display the
# first five rows as the cell output.
print(input_data.shape)
input_data.head(n=5)

(50000, 2)


Unnamed: 0,clean_quest,clean_ans
98754,what a dork huh huh,yeah heh heh he is a anus heh heh
20364,the safety circuit is failed,we are losing atmosphere ...
188963,that is okay i have already had lunch,you must eat lunch pretty early it is only ele...
189144,the battery is in that is done with besides yo...,... u might /u do the same i have to warn you ...
20610,are you kidding,no i distinctly remember you walked out my doo...


In [99]:
# Drop every row that contains a missing value.
input_data = input_data.dropna()

# Questions go to the encoder as-is; answers are wrapped in explicit
# start/end markers before being handed to the decoder.
encoder_input = input_data['clean_quest']
decoder_input = '<BOS> ' + input_data['clean_ans'] + ' <EOS>'

# Questions followed by wrapped answers in one re-indexed Series.
full_input = pd.concat([encoder_input, decoder_input], ignore_index=True)

decoder_input

98754         <BOS> yeah heh heh he is a anus heh heh <EOS>
20364              <BOS> we are losing atmosphere ... <EOS>
188963    <BOS> you must eat lunch pretty early it is on...
189144    <BOS> ... u might /u do the same i have to war...
20610     <BOS> no i distinctly remember you walked out ...
                                ...                        
107051    <BOS> please they show previews for foreign mo...
53397     <BOS> we are not sure she is been to all kinds...
22028                          <BOS> mama what is ... <EOS>
177844                                  <BOS> do tell <EOS>
57445                                    <BOS> nothin <EOS>
Name: clean_ans, Length: 49935, dtype: object

In [100]:
# Display the encoder-side text (the 'clean_quest' column) as the cell output.
encoder_input

98754                                   what a dork huh huh
20364                          the safety circuit is failed
188963                that is okay i have already had lunch
189144    the battery is in that is done with besides yo...
20610                                       are you kidding
                                ...                        
107051                                       beverly center
53397                                          what is that
22028     it is alright now little sister everything is ...
177844                      know what i love about dynamite
57445                                           no i ai not
Name: clean_quest, Length: 49935, dtype: object

In [101]:
# Size limits used by this cell and by the later cells that read them.
NUM_WORDS = 10_000  # vocabulary cap passed to the tokenizer
MAXLEN = 20         # every sequence is padded / truncated to this many tokens

# Fit a single vocabulary on the combined question + answer text. The cap is
# taken from NUM_WORDS rather than a repeated literal so the tokenizer cannot
# drift away from the other places that size themselves with NUM_WORDS.
tokenizer = Tokenizer(num_words=NUM_WORDS)
tokenizer.fit_on_texts(full_input)

# Convert each text into a list of integer token ids.
encoder_seq = tokenizer.texts_to_sequences(encoder_input)
decoder_seq = tokenizer.texts_to_sequences(decoder_input)

# Right-pad short sequences and cut long ones at the end so both arrays have
# MAXLEN columns.
# NOTE(review): truncating='post' removes tokens from the end, so an answer
# longer than MAXLEN tokens loses its trailing end marker -- confirm this is
# acceptable for training.
encoder_seq = pad_sequences(encoder_seq, maxlen=MAXLEN,
                            padding='post', truncating='post')

decoder_seq = pad_sequences(decoder_seq, maxlen=MAXLEN,
                            padding='post', truncating='post')

In [107]:
# One-hot training targets for the decoder, one vector of NUM_WORDS entries
# per timestep.
#
# The decoder is fed decoder_seq as its input, so the target at timestep t
# must be the token the decoder input holds at timestep t + 1. Writing the
# token of step t at step t (no shift) would only teach the model to copy its
# current input instead of predicting the next token.
#
# NOTE(review): this dense array holds len(decoder_seq) * MAXLEN * NUM_WORDS
# bytes (about 10 GB for 49,935 rows x 20 x 10,000). Integer targets with a
# sparse categorical loss would avoid it, but that changes the training call,
# so it is left as a suggestion.
decoder_output = np.zeros(shape=(len(decoder_seq), MAXLEN, NUM_WORDS),
                          dtype='uint8')

# Shift every sequence one step to the left; the final timestep has no known
# successor and keeps token id 0, the value pad_sequences pads with.
next_tokens = np.zeros_like(decoder_seq)
next_tokens[:, :-1] = decoder_seq[:, 1:]

# Set the single "hot" entry for every (row, timestep) pair in one vectorised
# assignment instead of a nested Python loop.
row_idx = np.arange(len(decoder_seq))[:, None]
step_idx = np.arange(MAXLEN)[None, :]
decoder_output[row_idx, step_idx, next_tokens] = 1

### Modelling

In [110]:
# --- Encoder -----------------------------------------------------------------
# Token ids -> embeddings -> LSTM. return_state=True makes the LSTM also return
# its final hidden and cell state, which are used to initialise the decoder.
encoder_input_layer = Input(shape=(MAXLEN,), dtype='int32')
embedding_layer = Embedding(NUM_WORDS, 50)
encoder_embedding = embedding_layer(encoder_input_layer)
encoder_lstm = LSTM(100, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)

# --- Decoder -----------------------------------------------------------------
# The decoder reuses the same embedding layer object as the encoder and starts
# from the encoder's final states. return_sequences=True yields one output
# vector per timestep.
decoder_input_layer = Input(shape=(MAXLEN,), dtype='int32')
decoder_embedding = embedding_layer(decoder_input_layer)
decoder_lstm = LSTM(100, return_state=True, return_sequences=True)
# The LSTM output gets its own name: binding it to `decoder_output` would
# overwrite the NumPy array of one-hot training targets that already uses
# that name.
decoder_lstm_output, _, _ = decoder_lstm(decoder_embedding,
                                         initial_state=[state_h, state_c])

# --- Output head -------------------------------------------------------------
# Softmax over NUM_WORDS classes, applied at every decoder timestep.
dense_layer = Dense(NUM_WORDS, activation='softmax')
output = TimeDistributed(dense_layer)(decoder_lstm_output)
model = Model([encoder_input_layer, decoder_input_layer], output)

model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 20)]         0                                            
__________________________________________________________________________________________________
input_1 (InputLayer)            [(None, 20)]         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 20, 50)       500000      input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
lstm (LSTM)                     [(None, 100), (None, 60400       embedding[0][0]       