# Generating Text Using the Chatbot Model

![Alt Text](https://cdn-images-1.medium.com/max/1200/1*CUFxTTJ4M54YLRhwrlTjpw.gif)

In [39]:
# import libs
from keras.models import Model
from keras.layers import Input, LSTM, Dense
from keras.preprocessing.sequence import pad_sequences
import numpy as np
import nltk
import os
import sys
import zipfile
import pickle
import urllib.request

### Parameters

In [40]:
# Number of units in the encoder and decoder LSTM layers
HIDDEN_UNITS = 256

# Characters checked by in_white_list() when filtering input tokens
WHITELIST = 'abcdefghijklmnopqrstuvwxyz1234567890?.,'

# Dimensionality of the GloVe word vectors (also selects which GloVe file is read)
GLOVE_EMBEDDING_SIZE = 100

# Path of the GloVe text file matching GLOVE_EMBEDDING_SIZE
GLOVE_MODEL = "data/glove-data/glove.6B." + str(GLOVE_EMBEDDING_SIZE) + "d.txt"

### Download and load GloVe files

In [42]:
def download_glove():
    '''
    Function to make sure the GloVe file at GLOVE_MODEL exists locally

    Does nothing when GLOVE_MODEL already exists. Otherwise the glove.6B.zip
    archive is downloaded into data/glove-data (unless the archive is already
    there) and extracted into that directory.
        * return: None
    '''
    if os.path.exists(GLOVE_MODEL):
        return

    glove_dir = 'data/glove-data'
    glove_zip = 'data/glove-data/glove.6B.zip'

    # exist_ok avoids an error if the directory shows up between check and creation
    os.makedirs(glove_dir, exist_ok=True)

    if not os.path.exists(glove_zip):
        print('glove file does not exist, downloading from internet')
        # Download under a temporary name and rename afterwards, so an
        # interrupted transfer never leaves a truncated archive that a later
        # run would mistake for a complete one.
        partial_zip = glove_zip + '.part'
        urllib.request.urlretrieve(url='http://nlp.stanford.edu/data/glove.6B.zip', filename=partial_zip,
                                   reporthook=reporthook)
        os.replace(partial_zip, glove_zip)

    print('unzipping glove file')
    # The context manager closes the archive even if extraction raises
    with zipfile.ZipFile(glove_zip, 'r') as zip_ref:
        zip_ref.extractall(glove_dir)

In [43]:
def reporthook(block_num, block_size, total_size):
    '''
    Progress callback for urllib.request.urlretrieve, writes progress to stderr
        *args:
            block_num: number of blocks transferred so far
            block_size: size of one block in bytes
            total_size: total size in bytes; a value <= 0 means unknown
    '''
    read_so_far = block_num * block_size
    if total_size > 0:
        # The final block is usually partial, so block_num * block_size can
        # overshoot the real size; clamp so progress never exceeds 100%.
        read_so_far = min(read_so_far, total_size)
        percent = read_so_far * 1e2 / total_size
        # \r rewrites the same terminal line on every call
        s = "\r%5.1f%% %*d / %d" % (
            percent, len(str(total_size)), read_so_far, total_size)
        sys.stderr.write(s)
        if read_so_far >= total_size:  # near the end
            sys.stderr.write("\n")
    else:  # total size is unknown
        sys.stderr.write("read %d\n" % (read_so_far,))

In [44]:
def load_glove():
    '''
    Function to read GloVe files (downloading them first when missing)
        * return: dict mapping every word to its embedding vector
                  (numpy float32 array)
    '''
    download_glove()
    word2em = {}
    # 'with' guarantees the file is closed even if a line fails to parse
    with open(GLOVE_MODEL, mode='rt', encoding='utf8') as glove_file:
        for line in glove_file:
            words = line.strip().split()
            if not words:
                # skip blank lines instead of failing on words[0]
                continue
            # first field is the word, the remaining fields are its vector
            word2em[words[0]] = np.array(words[1:], dtype=np.float32)
    return word2em

### Retrieve Parameters

In [41]:
def in_white_list(_word):
    '''
    Function to check a word against WHITELIST
        *args:
            _word: word (iterable of characters) to check
        *return:
            True when at least one character of _word is in WHITELIST,
            False otherwise (including for an empty word)
    '''
    return any(symbol in WHITELIST for symbol in _word)

In [45]:
# read file data
def load_data(path):
    ''' Function to read a text file in one go
            *args:
                path: location of the file to read
            *return:
                the complete file content, decoded as UTF-8
    '''
    source = os.path.join(path)
    with open(source, 'r', encoding='utf-8') as handle:
        return handle.read()

In [46]:
def load_preprocess():
    """
    Read the pickled preprocessing output from models/preprocess.p and return
    the unpickled object unchanged
    """
    # NOTE: pickle can execute arbitrary code while loading; only use a
    # preprocess.p file that comes from a trusted source.
    with open('models/preprocess.p', mode='rb') as pickled:
        payload = pickle.load(pickled)
    return payload

### Prediction class

In [47]:

class CornellWordGloveChatBot(object):
    '''
    Word-level encoder/decoder (LSTM) chatbot that feeds GloVe vectors to a
    trained Keras model and generates a reply one word at a time.
    '''
    # full training-time network, used only to load the saved weights
    model = None
    # inference model: input sequence -> [state_h, state_c]
    encoder_model = None
    # inference model: previous word vector + states -> word probabilities + new states
    decoder_model = None
    # vocabulary lookups for the output words
    target_word2idx = None
    target_idx2word = None
    # sequence length limits taken from the preprocessing context
    max_decoder_seq_length = None
    max_encoder_seq_length = None
    # size of the output vocabulary (width of the softmax layer)
    num_decoder_tokens = None
    # word -> GloVe vector
    word2em = None

    def __init__(self):
        '''
        Load GloVe vectors and preprocessing output, rebuild the network,
        load the saved weights and derive the inference encoder/decoder
        '''

        # load glove
        self.word2em = load_glove()

        # load parameters; only the context dict and the target vocabulary
        # lookups are needed here, the other entries are ignored
        context, _, _, _, (self.target_word2idx, self.target_idx2word) = load_preprocess()

        self.max_encoder_seq_length = context['encoder_max_seq_length']
        self.max_decoder_seq_length = context['decoder_max_seq_length']
        self.num_decoder_tokens = context['num_decoder_tokens']

        # encoder
        encoder_inputs = Input(shape=(None, GLOVE_EMBEDDING_SIZE), name='encoder_inputs')
        encoder_lstm = LSTM(units=HIDDEN_UNITS, return_state=True, name="encoder_lstm")
        encoder_outputs, encoder_state_h, encoder_state_c = encoder_lstm(encoder_inputs)
        encoder_states = [encoder_state_h, encoder_state_c]

        # decoder
        decoder_inputs = Input(shape=(None, GLOVE_EMBEDDING_SIZE), name='decoder_inputs')
        decoder_lstm = LSTM(units=HIDDEN_UNITS, return_sequences=True, return_state=True, name='decoder_lstm')
        decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
        decoder_dense = Dense(self.num_decoder_tokens, activation='softmax', name='decoder_dense')
        decoder_outputs = decoder_dense(decoder_outputs)

        # model inputs
        self.model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

        # load saved model weights
        self.model.load_weights('models/keras-glove-weights.h5')
        self.model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

        # inference encoder: shares the layers (and weights) defined above
        self.encoder_model = Model(encoder_inputs, encoder_states)

        # inference decoder: same LSTM/Dense layers, but the states are fed in
        # explicitly so decoding can proceed one step at a time
        decoder_state_inputs = [Input(shape=(HIDDEN_UNITS,)), Input(shape=(HIDDEN_UNITS,))]
        decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs, initial_state=decoder_state_inputs)
        decoder_states = [state_h, state_c]
        decoder_outputs = decoder_dense(decoder_outputs)
        self.decoder_model = Model([decoder_inputs] + decoder_state_inputs, [decoder_outputs] + decoder_states)

    def reply(self, input_text):
        '''
        Function to generate a text response
            *args:
                input_text: message
            *return:
                generated text
        '''

        input_emb = []
        # split sentence
        for word in nltk.word_tokenize(input_text.lower()):
            # check sentence characters
            if not in_white_list(word):
                continue
            # create embedding vectors for input sentence;
            # words without a GloVe vector are represented by zeros
            if word in self.word2em:
                emb = self.word2em[word]
            else:
                emb = np.zeros(shape=GLOVE_EMBEDDING_SIZE)
            input_emb.append(emb)

        if not input_emb:
            # No usable token (e.g. empty message): an empty sequence gives
            # pad_sequences no per-step shape to infer, while the encoder
            # expects (batch, steps, GLOVE_EMBEDDING_SIZE). Feed one zero
            # vector so the padded batch keeps that shape.
            input_emb.append(np.zeros(shape=GLOVE_EMBEDDING_SIZE))

        # padding sentence
        # NOTE(review): pad_sequences defaults to dtype='int32', which
        # truncates the float embeddings. Left unchanged because inference
        # must match whatever the training pipeline did -- confirm there
        # before passing dtype='float32'.
        input_seq = pad_sequences([input_emb], self.max_encoder_seq_length)

        # predict
        states_value = self.encoder_model.predict(input_seq)
        # decoding starts from the vector of the 'start' marker word
        target_seq = np.zeros((1, 1, GLOVE_EMBEDDING_SIZE))
        target_seq[0, 0, :] = self.word2em['start']
        reply_words = []
        target_text_len = 0
        terminated = False
        while not terminated:
            output_tokens, h, c = self.decoder_model.predict([target_seq] + states_value)

            # greedy decoding: take the most probable word
            sample_token_idx = np.argmax(output_tokens[0, -1, :])
            sample_word = self.target_idx2word[sample_token_idx]
            target_text_len += 1

            # marker words are not part of the visible reply
            if sample_word != 'start' and sample_word != 'end':
                reply_words.append(sample_word)

            if sample_word == 'end' or target_text_len >= self.max_decoder_seq_length:
                terminated = True

            # the chosen word becomes the next decoder input (zeros if it has
            # no GloVe vector)
            target_seq = np.zeros((1, 1, GLOVE_EMBEDDING_SIZE))
            if sample_word in self.word2em:
                target_seq[0, 0, :] = self.word2em[sample_word]

            states_value = [h, c]
        return ' '.join(reply_words).strip()

    def test_run(self, sentance_lst):
        '''
        function to generate a response for a list of input text
            *args:
                sentance_lst: iterable of input messages
        Prints every message together with the generated response.
        '''
        for sentance in sentance_lst:
            print('  Input Message   : {}'.format(sentance))
            print('  Response        : {}'.format(self.reply(sentance)))
            print('\n')

## Predict testset

In [48]:
# Text file with the input messages, one per line
# NOTE(review): the section is titled "testset" but this points at train.from -- confirm intended file
source_path = 'data/datasets/train.from'

In [49]:
# Read the whole file into a single string
source_test = load_data(source_path)

In [50]:
# Build the chatbot; the constructor loads GloVe, the preprocessing pickle and the saved weights
model = CornellWordGloveChatBot()

In [None]:
# Generate and print replies for five sample lines (indices 98..102) of the file
model.test_run(source_test.split("\n")[98:103])