            GENERATIVE CHATBOT MADE w/ KERAS, RNN (LSTM) AND TRAINED w/ SARC DS.

In [None]:
#Basic libraries to import:
import numpy as np  #used for scientific computing
import pandas as pd #for data manipulation and analysis - used to upload de DS we are working with.
import pickle

#NLP
import nltk # Natural Language Toolkit, platform for building Python programs to work with human language data.

#nltk.download('punkt') # tokenizer that divides a text into a list of sentences

from collections import Counter 

from keras.models import Model
from keras.models import load_model 

from keras.layers import Dense, Input, Embedding

from keras.layers.recurrent import LSTM

from keras.preprocessing.sequence import pad_sequences

from keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint

from sklearn.model_selection import train_test_split

In [None]:
def generate_batch(input_data, output_text_data, BATCH_SIZE):
    '''
    Create an endless generator of training batches for the seq2seq model.

    The generator relies on these module-level names being defined before it
    is iterated: encoder_max_seq_length, decoder_max_seq_length,
    num_decoder_tokens and target_word2idx.

    input:
        - input_data: list of encoder sequences (lists of word ids)
        - output_text_data: list of decoder sequences (lists of words),
          aligned with input_data
        - BATCH_SIZE: number of samples per batch (must be positive)

    output:
        - generator object yielding
          ([encoder_input_batch, decoder_input_batch], decoder_target_batch).
          Only whole batches are produced; samples past the last full batch
          are never yielded.

    raises:
        - ValueError if BATCH_SIZE is not positive or if there are fewer
          samples than BATCH_SIZE (previously this made the generator loop
          forever without yielding anything).
    '''
    if BATCH_SIZE <= 0:
        raise ValueError('BATCH_SIZE must be a positive integer, got %r' % (BATCH_SIZE,))

    num_batches = len(input_data) // BATCH_SIZE

    if num_batches == 0:
        raise ValueError(
            'Not enough samples (%d) to build a single batch of size %d'
            % (len(input_data), BATCH_SIZE))

    return _iterate_batches(input_data, output_text_data, BATCH_SIZE, num_batches)


def _iterate_batches(input_data, output_text_data, BATCH_SIZE, num_batches):
    '''Yield the batches described in generate_batch, cycling over the data forever.'''
    while True:
        for batchIdx in range(num_batches):
            start = batchIdx * BATCH_SIZE
            end = start + BATCH_SIZE

            # Encoder side: word-id sequences padded to a common length.
            encoder_input_data_batch = pad_sequences(input_data[start:end], encoder_max_seq_length)

            # Decoder side: one-hot tensors (sample, time step, token id).
            batch_shape = (BATCH_SIZE, decoder_max_seq_length, num_decoder_tokens)
            decoder_input_data_batch = np.zeros(shape=batch_shape)
            decoder_target_data_batch = np.zeros(shape=batch_shape)

            for lineIdx, target_words in enumerate(output_text_data[start:end]):
                for idx, w in enumerate(target_words):
                    # Words missing from the target vocabulary use index 0.
                    w2idx = target_word2idx.get(w, 0)
                    decoder_input_data_batch[lineIdx, idx, w2idx] = 1

                    # The target is the decoder input shifted one step to the
                    # left, so the model learns to predict the next word.
                    if idx > 0:
                        decoder_target_data_batch[lineIdx, idx - 1, w2idx] = 1

            yield [encoder_input_data_batch, decoder_input_data_batch], decoder_target_data_batch

In [None]:
np.random.seed(2018)  # seed NumPy's random number generator

# Default hyper-parameters for data preparation and training.
BATCH_SIZE = 128 # number of samples processed before the model is updated.
NUM_EPOCHS = 500 # number of complete passes through the training dataset.
HIDDEN_UNITS = 100 # width of the model: used below as the embedding output size and as the number of units of both LSTM layers (not a number of layers).
MAX_INPUT_SEQ_LENGTH = 20 # max. number of words the chatbot will consider as input
MAX_TARGET_SEQ_LENGTH = 20 # max. number of words the chatbot will reply with
MAX_VOCAB_SIZE = 20000 # vocabulary cap (10-20k words); rationale: https://coursefinders.com/blog/es/5669/espanol-cuantas-palabras-se-necesitan-para-hablar-con-fluidez-un-idioma

# read the data
df = pd.read_csv('SARC_DS.csv')
lines = df['all']

# Word-frequency counters; they are used afterwards to keep only the most
# common words of each vocabulary.
input_counter = Counter()
target_counter = Counter()

# Build the training pairs: every row is the reply (target) to the row that
# precedes it (input).
input_texts = []
target_texts = []
prev_words = []

for line in lines:

    # Keep only purely alphabetic tokens, lower-cased.
    line_words = [w.lower() for w in nltk.word_tokenize(line) if w.isalpha()]

    if prev_words:
        # Each side is truncated with its own limit. Previously the input was
        # cut with MAX_TARGET_SEQ_LENGTH and MAX_INPUT_SEQ_LENGTH was ignored
        # (same result while both limits are equal).
        source_words = prev_words[:MAX_INPUT_SEQ_LENGTH]
        input_texts.append(source_words)
        input_counter.update(source_words)

        # Wrap the reply in the markers that delimit decoding.
        target_words = ['START'] + line_words[:MAX_TARGET_SEQ_LENGTH] + ['END']
        target_counter.update(target_words)
        target_texts.append(target_words)

    # A row with no alphabetic token leaves prev_words empty, so the row after
    # it produces no pair.
    prev_words = line_words

# encode the data

# Rank words by frequency. Input ids start at 2 and target ids at 1 because
# the lower ids are reserved for the special tokens added right below.
input_word2idx = {
    token: rank + 2
    for rank, (token, _count) in enumerate(input_counter.most_common(MAX_VOCAB_SIZE))
}
target_word2idx = {
    token: rank + 1
    for rank, (token, _count) in enumerate(target_counter.most_common(MAX_VOCAB_SIZE))
}

input_word2idx['PAD'] = 0
input_word2idx['UNK'] = 1
target_word2idx['UNK'] = 0

# Reverse lookups (id -> word).
input_idx2word = {wid: token for token, wid in input_word2idx.items()}
target_idx2word = {wid: token for token, wid in target_word2idx.items()}

num_encoder_tokens = len(input_idx2word)
num_decoder_tokens = len(target_idx2word)

# Turn every input sentence into a list of word ids (1 for unknown words) and
# record the longest encoder / decoder sequence among the paired samples.
sentence_pairs = list(zip(input_texts, target_texts))

encoder_input_data = [
    [input_word2idx.get(token, 1) for token in source_sentence]
    for source_sentence, _reply in sentence_pairs
]

encoder_max_seq_length = max((len(wids) for wids in encoder_input_data), default=0)
decoder_max_seq_length = max((len(reply) for _source, reply in sentence_pairs), default=0)

# Summary of the sizes that describe the encoded dataset.
context = {
    'num_encoder_tokens': num_encoder_tokens,
    'num_decoder_tokens': num_decoder_tokens,
    'encoder_max_seq_length': encoder_max_seq_length,
    'decoder_max_seq_length': decoder_max_seq_length,
}


# Model inputs: word-id sequences for the encoder, one-hot token sequences
# for the decoder.
encoder_inputs = Input(name='encoder_inputs', shape=(None,))
decoder_inputs = Input(name='decoder_inputs', shape=(None, num_decoder_tokens))

# Encoder: embedding followed by an LSTM of which only the final states are used.
encoder_embedding = Embedding(
    input_dim=num_encoder_tokens,
    output_dim=HIDDEN_UNITS,
    input_length=encoder_max_seq_length,
    name='encoder_embedding',
)
encoder_lstm = LSTM(units=HIDDEN_UNITS, return_state=True, name='encoder_lstm')

embedded_encoder_inputs = encoder_embedding(encoder_inputs)
encoder_outputs, encoder_state_h, encoder_state_c = encoder_lstm(embedded_encoder_inputs)
encoder_states = [encoder_state_h, encoder_state_c]

# Decoder: LSTM initialised with the encoder states, returning the full
# output sequence.
decoder_lstm = LSTM(
    units=HIDDEN_UNITS,
    return_state=True,
    return_sequences=True,
    name='decoder_lstm',
)
decoder_outputs, decoder_state_h, decoder_state_c = decoder_lstm(
    decoder_inputs, initial_state=encoder_states)

# Softmax over the target vocabulary at every decoder time step.
decoder_dense = Dense(units=num_decoder_tokens, activation='softmax', name='decoder_dense')
decoder_outputs = decoder_dense(decoder_outputs)

# Training model (encoder + decoder) and a stand-alone encoder that outputs
# its final states.
model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_outputs)
encoder_model = Model(inputs=encoder_inputs, outputs=encoder_states)

model.compile(optimizer='adam', loss='categorical_crossentropy')

# Hold out 20% of the samples for validation; the fixed random_state makes
# the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    encoder_input_data, target_texts, test_size=0.2, random_state=42)

# Number of whole batches available in each subset.
train_num_batches, test_num_batches = (
    len(subset) // BATCH_SIZE for subset in (X_train, X_test)
)

# Endless batch generators, one for training and one for validation.
train_gen = generate_batch(X_train, y_train, BATCH_SIZE)
test_gen = generate_batch(X_test, y_test, BATCH_SIZE)

In [None]:
# Checkpoint callback: evaluated every epoch (period=1) and written to
# 'model_best_weights' only when the monitored 'loss' reaches a new minimum.
checkpoint = ModelCheckpoint(
    filepath='model_best_weights',
    monitor='loss',
    mode='min',
    save_best_only=True,
    period=1,
    verbose=1,
)

my_callbacks = [checkpoint]

# Print the layers and parameter counts of the training model.
model.summary()

In [None]:
# To resume from a previously saved model instead of the freshly built one:
#model = load_model("name_of_file")

# Train from the batch generators; the validation generator is evaluated at
# the end of every epoch.
fitted_model = model.fit_generator(
    train_gen,
    steps_per_epoch=train_num_batches,
    epochs=NUM_EPOCHS,
    validation_data=test_gen,
    validation_steps=test_num_batches,
    callbacks=my_callbacks,
    verbose=1,
)


In [None]:
#model = load_model('name_of_file2') # to use a saved model load it.

# Inference-time encoder: maps an input sequence to its final LSTM states.
encoder_model = Model(encoder_inputs, encoder_states)

# Inference-time decoder: reuses the trained decoder layers, but receives the
# LSTM states as explicit inputs and returns the updated states so that the
# reply can be generated one token at a time.
decoder_state_input_h = Input(shape=(HIDDEN_UNITS,))
decoder_state_input_c = Input(shape=(HIDDEN_UNITS,))
decoder_state_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_inputs, initial_state=decoder_state_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)

decoder_model = Model(
    [decoder_inputs, decoder_state_input_h, decoder_state_input_c],
    [decoder_outputs, state_h, state_c],
)

In [None]:
#### We test the models to see if they work properly

In [None]:
# Read one utterance from the user and greedily decode the chatbot's reply.
input_text = input()

# Pad the input to the same length used by generate_batch during training
# (it was a hard-coded 10 before, which truncated longer inputs and fed the
# encoder a sequence length it never saw while training).
max_encoder_seq_length = encoder_max_seq_length
max_decoder_seq_length = 10  # cap on the number of decoding steps

# Same pre-processing as the training data: keep alphabetic tokens only,
# lower-cased; words outside the vocabulary map to id 1.
input_wids = [
    input_word2idx.get(w.lower(), 1)
    for w in nltk.word_tokenize(input_text)
    if w.isalpha()
]
input_seq = pad_sequences([input_wids], max_encoder_seq_length)

# Encode the utterance and start decoding from the 'START' token.
states_value = encoder_model.predict(input_seq)
target_seq = np.zeros((1, 1, num_decoder_tokens))
target_seq[0, 0, target_word2idx['START']] = 1

reply_words = []
target_text_len = 0
terminated = False

while not terminated:
    output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

    # Greedy choice: take the most probable token of the last time step.
    sample_token_idx = np.argmax(output_tokens[0, -1, :])
    sample_word = target_idx2word[sample_token_idx]
    target_text_len += 1

    if sample_word not in ('START', 'END'):
        reply_words.append(sample_word)

    if sample_word == 'END' or target_text_len >= max_decoder_seq_length:
        terminated = True

    # Feed the chosen token and the updated states back into the decoder.
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    target_seq[0, 0, sample_token_idx] = 1
    states_value = [h, c]

target_text = ' '.join(reply_words)

# Bare expression on purpose: as the last statement of a notebook cell its
# value (the reply without 'UNK' markers) is shown as the cell output.
target_text.strip().replace('UNK', '')

In [None]:
# Persist the training model and the two inference models.
# NOTE(review): the original note says the file is stored as .h5 with no need
# to add the extension; the on-disk format may depend on the Keras version, confirm.
model.save("name_of_file2")
encoder_model.save('encoder_model_500e_2kl')
decoder_model.save('decoder_model_500e_2kl')

# Persist both vocabularies so that text can be encoded/decoded later
# without rebuilding them from the dataset.
for pickle_path, vocabulary in (
    ('target_word2idx_500e_2kl.pickle', target_word2idx),
    ('input_word2idx_500e_2kl.pickle', input_word2idx),
):
    with open(pickle_path, 'wb') as handle:
        pickle.dump(vocabulary, handle, protocol=pickle.HIGHEST_PROTOCOL)