In [19]:
import numpy as np
import pandas as pd
import tensorflow as tf
import re, time
import itertools
import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('vader_lexicon')
from collections import defaultdict
tf.compat.v1.disable_eager_execution()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/zekinchangmail.com/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


# Data Pre-Processing

In [49]:
def clean_text(text):
    '''Text Cleaning'''
    text = text.lower()
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"can\'t", "can not", text)
    text = re.sub(r"n\'t", " not", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'s", " is", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'t", " not", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'m", " am", text)
    text = re.sub(r"[-()\"#/@;:<>{}+=~|.?,]", "", text)
    text = re.sub(' +', ' ', text) 
    return text

In [3]:
lines = open ('./data/movie_lines.txt', encoding='utf-8', errors='ignore').read().split('\n')[:-1]
conversations = open ('./data/movie_conversations.txt', encoding='utf-8', errors='ignore').read().split('\n')[:-1]
id2line = {line.split(' +++$+++ ')[0] : line.split(' +++$+++ ')[-1] for line in lines}
conversations_ids = [con.split(' +++$+++ ')[-1][1:-1].replace("'", "").replace(" ", "").split(',') 
                     for con in conversations]

In [38]:
foo = list(itertools.chain(*[_q[ : -1] for _q in conversations_ids]))
bar = list(itertools.chain(*[_q[1 : ] for _q in conversations_ids]))
questions = [clean_text(id2line.get(x)) for x in foo]
answers = [clean_text(id2line.get(x)) + ' <EOS>' for x in bar]

Unnamed: 0,Questions,Answers
0,can we make this quick roxanne korrine and and...,well i thought we would start with pronunciati...
1,well i thought we would start with pronunciati...,not the hacking and gagging and spitting part ...
2,not the hacking and gagging and spitting part ...,okay then how 'bout we try out some french cui...
3,you are asking me out that is so cute what is ...,forget it <EOS>
4,no no it is my fault we did not have a proper ...,cameron <EOS>


In [39]:
# create word count dictionaries
word2count = defaultdict(int)
for _q in questions + answers:
    for _word in _q.split(" "):
        word2count[_word] += 1
print(len(word2count))

73062


In [40]:
threshold = 20
foo = {k: v for k, v in word2count.items() if v >=  threshold}
tokens = ['<PAD>', '<EOS>', '<OUT>', '<SOS>']
words2int = {k : v for v, k in enumerate(list(foo.keys()) + tokens)}
int2words = {v : k for v, k in enumerate(list(foo.keys()) + tokens)}

In [41]:
foo = (_quest.split() for _quest in questions)
questions_into_int = [[words2int.get(x, words2int['<OUT>']) for x in y] for y in foo]

foo = (_quest.split() for _quest in answers)
answers_into_int = [[words2int.get(x, words2int['<OUT>']) for x in y] for y in foo]

In [42]:
df = pd.DataFrame({"Questions" : questions_into_int, "Answers" : answers_into_int})
df = df[(df['Questions'].str.len() > 0) & (df['Questions'].str.len() < 25)]
df['len'] = df['Questions'].str.len()
df = df.sort_values('len')
sorted_clean_questions = df['Questions']
sorted_clean_answers = df['Answers']
df.head()

Unnamed: 0,Questions,Answers,len
110830,[660],"[19, 51, 4035, 8792, 8792, 54, 16, 1103, 8792,...",1
85263,[4144],"[16, 987, 2914, 180, 8792, 3, 1344, 44, 35, 83...",1
85259,[8792],"[8792, 8791]",1
85247,[37],"[75, 46, 8792, 5201, 123, 2344, 368, 1269, 785...",1
85233,[5550],"[284, 26, 465, 179, 8791]",1


# Model Layers Construction

In [57]:
# deep learning layers

def model_inputs():
    inputs = tf.compat.v1.placeholder(tf.int32, shape=[None, None], name='input')
    targets = tf.compat.v1.placeholder(tf.int32, shape=[None, None], name='target')
    lr = tf.compat.v1.placeholder(tf.float32, shape=[None, None], name='learning_rate')
    keep_prob = tf.compat.v1.placeholder(tf.float32, shape=[None, None], name='keep_prob')
    return inputs, targets, lr, keep_prob

def preprocess_targets(targets, word2int, batch_size):
    '''Add SOS in left side and remove last token in the right'''
    left_side = tf.fill([batch_size, 1], word2int['<SOS>'])
    right_side = tf.strided_slice(targets, [0,0], [batch_size, -1], [1, 1])
    preprocessed_targets = tf.concat([left_side, right_side], axis=1)
    return preprocessed_targets

def encoder_rnn(rnn_inputs, rnn_size, num_layers, keep_prob, sequence_length):
    lstm = tf.contrib.rnn.BasicLSTMCell(rnn_size)
    lstm_dropout = tf.contrib.rnn.DropoutWrapper(lstm, input_keep_prob = keep_prob)
    encoder_cell = tf.contrib.rnn.MultiRNNCell([lstm_dropout] * num_layers)
    encoder_output, encoder_state = tf.nn.bidirectional_dynamic_rnn(cell_fw = encoder_cell,
                                                                    cell_bw = encoder_cell,
                                                                    sequence_length = sequence_length,
                                                                    inputs = rnn_inputs,
                                                                    dtype = tf.float32)
    return encoder_state

def decode_training_set(encoder_state, 
                        decoder_cell, 
                        decoder_embedded_input, 
                        sequence_length, 
                        decoding_scope, 
                        output_function, 
                        keep_prob, 
                        batch_size):
    
    att_states = tf.zeros([batch_size, 1, decoder_cell.output_size])
    att_keys, att_val, att_score_func, att_construct_func = tf.contrib.seq2seq.prepare_attention(att_states,                                                                                                                   
                                                                                                 attention_option = "bahdanau", 
                                                                                                 num_units = decoder_cell.output_size)
    train_dc_func = tf.contrib.seq2seq.attention_decoder_fn_train(encoder_state[0],
                                                                  att_keys,
                                                                  att_val,
                                                                  att_score_func,
                                                                  att_construct_func,
                                                                  name = "attn_dec_train")
    
    dc_outp, dc_final_state, dc_final_context_state = tf.contrib.seq2seq.dynamic_rnn_decoder(decoder_cell,
                                                                                             train_dc_func,
                                                                                             decoder_embedded_input,
                                                                                             sequence_length,
                                                                                             scope = decoding_scope)
    dc_outp_dropout = tf.nn.dropout(dc_outp, keep_prob)
    return output_function(dc_outp_dropout)

def decode_test_set(encoder_state, 
                    decoder_cell, 
                    decoder_embeddings_matrix, 
                    sos_id, 
                    eos_id, 
                    maximum_length, 
                    num_words, 
                    decoding_scope, 
                    output_function, 
                    keep_prob, 
                    batch_size):
    
    att_states = tf.zeros([batch_size, 1, decoder_cell.output_size])
    att_keys, att_val, att_score_func, att_construct_func = tf.contrib.seq2seq.prepare_attention(att_states,                                                                                                                   
                                                                                                 attention_option = "bahdanau", 
                                                                                                 num_units = decoder_cell.output_size)

    test_dc_func = tf.contrib.seq2seq.attention_decoder_fn_inference(output_function,
                                                                      encoder_state[0],
                                                                      att_keys,
                                                                      att_val,
                                                                      att_score_func,
                                                                      att_construct_func,
                                                                      decoder_embeddings_matrix,
                                                                      sos_id,
                                                                      eos_id,
                                                                      maximum_length,
                                                                      num_words,
                                                                      name = "attn_dec_inf")
    
    test_pred, dc_final_state, dc_final_context_state = tf.contrib.seq2seq.dynamic_rnn_decoder(decoder_cell,
                                                                                               test_dc_func,
                                                                                               scope = decoding_scope)
    return test_pred

def seq2seq_model(inputs, 
                  targets, 
                  keep_prob, 
                  batch_size, 
                  sequence_length, 
                  answers_num_words, 
                  questions_num_words, 
                  encoder_embedding_size, 
                  decoder_embedding_size, 
                  rnn_size, 
                  num_layers, 
                  questionswords2int):
    
    encoder_embedded_input = tf.contrib.layers.embed_sequence(inputs,
                                                              answers_num_words + 1,
                                                              encoder_embedding_size,
                                                              initializer = tf.random_uniform_initializer(0, 1))
    
    encoder_state = encoder_rnn(encoder_embedded_input, rnn_size, num_layers, keep_prob, sequence_length)
    preprocessed_targets = preprocess_targets(targets, questionswords2int, batch_size)
    decoder_embeddings_matrix = tf.Variable(tf.random_uniform([questions_num_words + 1, decoder_embedding_size], 0, 1))
    
    decoder_embedded_input = tf.nn.embedding_lookup(decoder_embeddings_matrix, preprocessed_targets)
    training_predictions, test_predictions = decoder_rnn(decoder_embedded_input,
                                                         decoder_embeddings_matrix,
                                                         encoder_state,
                                                         questions_num_words,
                                                         sequence_length,
                                                         rnn_size,
                                                         num_layers,
                                                         questionswords2int,
                                                         keep_prob,
                                                         batch_size)
    return training_predictions, test_predictions

# Model Training

In [69]:
# configuration
epochs, batch_size, rnn_size, num_layers = 100, 64, 512, 3
encoding_embedding_size, decoding_embedding_size, learning_rate, learning_rate_decay = 512, 512, 0.01, 0.9

In [55]:
# defining session
tf.compat.v1.reset_default_graph()
session = tf.compat.v1.InteractiveSession()

In [62]:
# loading model inputs
inputs, targets, lr, keep_prob = model_inputs()

# Setting the sequence length, 25 as maximum length
sequence_length = tf.compat.v1.placeholder_with_default(25, None, name = 'sequence_length')

# Getting the shape of the inputs tensor
input_shape = tf.shape(inputs)

In [76]:
training_predictions, test_predictions = seq2seq_model(tf.reverse(inputs, [-1]),
                                                       targets,
                                                       keep_prob,
                                                       batch_size,
                                                       sequence_length,
                                                       len(words2int),
                                                       len(words2int),
                                                       encoding_embedding_size,
                                                       decoding_embedding_size,
                                                       rnn_size,
                                                       num_layers,
                                                       words2int)

In [None]:
# Setting up the Loss Error, the Optimizer and Gradient Clipping
with tf.name_scope("optimization"):
    loss_error = tf.contrib.seq2seq.sequence_loss(training_predictions,
                                                  targets,
                                                  tf.ones([input_shape[0], sequence_length]))
    
    optimizer = tf.train.AdamOptimizer(learning_rate)
    gradients = optimizer.compute_gradients(loss_error)
    clipped_gradients = [(tf.clip_by_value(grad_tensor, -5., 5.), grad_variable) 
                         for grad_tensor, grad_variable in gradients if grad_tensor is not None]
    
    optimizer_gradient_clipping = optimizer.apply_gradients(clipped_gradients)

In [78]:
# Padding the sequences with the <PAD> token
def apply_padding(batch_of_sequences, word2int):
    max_sequence_length = max([len(sequence) for sequence in batch_of_sequences])
    return [sequence + [word2int['<PAD>']] * (max_sequence_length - len(sequence)) 
            for sequence in batch_of_sequences]
 
# Splitting the data into batches of questions and answers
def split_into_batches(questions, answers, batch_size):
    for batch_index in range(0, len(questions) // batch_size):
        start_index = batch_index * batch_size
        questions_in_batch = questions[start_index : start_index + batch_size]
        answers_in_batch = answers[start_index : start_index + batch_size]
        padded_questions_in_batch = np.array(apply_padding(questions_in_batch, questionswords2int))
        padded_answers_in_batch = np.array(apply_padding(answers_in_batch, answerswords2int))
        yield padded_questions_in_batch, padded_answers_in_batch

In [None]:
# Splitting the questions and answers into training and validation sets
training_validation_split = int(len(sorted_clean_questions) * 0.15)
training_questions = sorted_clean_questions[training_validation_split:]
training_answers = sorted_clean_answers[training_validation_split:]
validation_questions = sorted_clean_questions[:training_validation_split]
validation_answers = sorted_clean_answers[:training_validation_split]

In [None]:
# Training
batch_index_check_training_loss = 100
batch_index_check_validation_loss = ((len(training_questions)) // batch_size // 2) - 1
total_training_loss_error = 0
list_validation_loss_error = []
early_stopping_check = 0
early_stopping_stop = 1000
checkpoint = "./model/chatbot_weights.ckpt"
session.run(tf.global_variables_initializer())

In [None]:
for epoch in range(1, epochs + 1):
    for batch_index, (padded_questions_in_batch, padded_answers_in_batch) in enumerate(split_into_batches(training_questions, 
                                                                                                          training_answers, 
                                                                                                          batch_size)):
        starting_time = time.time()
        _, batch_training_loss_error = session.run([optimizer_gradient_clipping, loss_error], {inputs: padded_questions_in_batch,
                                                                                               targets: padded_answers_in_batch,
                                                                                               lr: learning_rate,
                                                                                               sequence_length: padded_answers_in_batch.shape[1],
                                                                                               keep_prob: keep_probability})
        total_training_loss_error += batch_training_loss_error
        ending_time = time.time()
        batch_time = ending_time - starting_time
        if batch_index % batch_index_check_training_loss == 0:
            print('Epoch: {:>3}/{}, Batch: {:>4}/{}, Training Loss Error: {:>6.3f}, \
            Training Time on 100 Batches: {:d} seconds'.format(epoch,
                                                               epochs,
                                                               batch_index,
                                                               len(training_questions) // batch_size,
                                                               total_training_loss_error / batch_index_check_training_loss,
                                                               int(batch_time * batch_index_check_training_loss)))
            total_training_loss_error = 0
        if batch_index % batch_index_check_validation_loss == 0 and batch_index > 0:
            total_validation_loss_error = 0
            starting_time = time.time()
            for batch_index_validation, (padded_questions_in_batch, padded_answers_in_batch) in enumerate(split_into_batches(validation_questions, 
                                                                                                                             validation_answers, 
                                                                                                                             batch_size)):
                batch_validation_loss_error = session.run(loss_error, {inputs: padded_questions_in_batch,
                                                                       targets: padded_answers_in_batch,
                                                                       lr: learning_rate,
                                                                       sequence_length: padded_answers_in_batch.shape[1],
                                                                       keep_prob: 1})
                total_validation_loss_error += batch_validation_loss_error
            ending_time = time.time()
            batch_time = ending_time - starting_time
            average_validation_loss_error = total_validation_loss_error / (len(validation_questions) / batch_size)
            print('Validation Loss Error: {:>6.3f}, Batch Validation Time: {:d} seconds'.format(average_validation_loss_error, int(batch_time)))
            learning_rate *= learning_rate_decay
            if learning_rate < min_learning_rate:
                learning_rate = min_learning_rate
            list_validation_loss_error.append(average_validation_loss_error)
            if average_validation_loss_error <= min(list_validation_loss_error):
                print('I speak better now!!')
                early_stopping_check = 0
                saver = tf.train.Saver()
                saver.save(session, checkpoint)
            else:
                print("Sorry I do not speak better, I need to practice more.")
                early_stopping_check += 1
                if early_stopping_check == early_stopping_stop:
                    break
    if early_stopping_check == early_stopping_stop:
        print("My apologies, I cannot speak better anymore. This is the best I can do.")
        break
print("Game Over")

# Application

In [None]:
# Loading the weights and Running the session
checkpoint = "./model/chatbot_weights.ckpt"
session = tf.InteractiveSession()
session.run(tf.global_variables_initializer())
saver = tf.train.Saver()
saver.restore(session, checkpoint)

In [79]:
# Converting the questions from strings to lists of encoding integers
def convert_string2int(question, word2int):
    question = clean_text(question)
    return [word2int.get(word, word2int['<OUT>']) for word in question.split()]

In [None]:
# Setting up the chat
while(True):
    question = input("You: ")
    if question == 'Goodbye':
        break
    question = convert_string2int(question, questionswords2int)
    question = question + [questionswords2int['<PAD>']] * (25 - len(question))
    fake_batch = np.zeros((batch_size, 25))
    fake_batch[0] = question
    predicted_answer = session.run(test_predictions, {inputs: fake_batch, keep_prob: 0.5})[0]
    answer = ''
    for i in np.argmax(predicted_answer, 1):
        if answersints2word[i] == 'i':
            token = ' I'
        elif answersints2word[i] == '<EOS>':
            token = '.'
        elif answersints2word[i] == '<OUT>':
            token = 'out'
        else:
            token = ' ' + answersints2word[i]
        answer += token
        if token == '.':
            break
    print('ChatBot: ' + answer)