In [1]:
# Importing the libraries
import os
import sys
import argparse
import time
import numpy as np
!pip3 install 'tensorflow==1.13.0rc1'
import tensorflow as tf
from tqdm import tqdm

#data_processing
import re

#seq2seq
from pprint import pprint



In [2]:
tf.__version__

'1.13.0-rc1'

##### data_processing

In [3]:
## Main Data Pre-Processing File
def create_data_from_files(lines_file, conversations_file, verbose=False):
    ### Importing Dataset
    with open(lines_file, 'r', encoding='utf-8', errors='ignore') as f:
        lines = f.read().split('\n')
    with open(conversations_file, 'r', encoding='utf-8', errors='ignore') as f:
        conversations = f.read().split('\n')
    if(verbose == True):
        print('{} lines and {} conversations have been extracted.'.format(len(lines), len(conversations)))
    return (lines, conversations)

def create_lines_dictionary(lines, verbose=False):
    '''
    Creating a dictionary that maps each line and its id
    '''
    id2line = {}
    for line in lines:
        _line = line.split(' +++$+++ ')
        if len(_line) == 5:
            id2line[_line[0]] = _line[4]
    return id2line

def create_conversations_ids(conversations, verbose=False):
    '''
    To get a list of conversation ids for each conversation by removing brackets, quotes and spaces.
    '''
    conversations_ids = []
    for conversation in conversations[:-1]:
        _conversation = conversation.split(' +++$+++ ')[-1][1:-1].replace("'", "").replace(" ", "")
        conversations_ids.append(_conversation.split(','))
    if(verbose == True):
        print('{} conversations ids have been created.'.format(len(conversations_ids)))
    return conversations_ids

#porque hace esto?
def clean_text(text):
    text = text.lower()
    text = re.sub(r"i'm", "I am", text)
    text = re.sub(r"he's", "he is", text)
    text = re.sub(r"she's", "she is", text)
    text = re.sub(r"that's", "that is", text)
    text = re.sub(r"what's", "what is", text)
    text = re.sub(r"where's", "where is", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"can't", "cannot", text)
    text = re.sub(r"[-()\"#/@;:<>{}+=~|.?,]", "", text)
    return text

def create_questions_and_answers(id2line, conversations_ids, verbose=False):
    questions = []
    answers = []
    for conversation in conversations_ids:
        for i in range(len(conversation) - 1):
            questions.append(clean_text(id2line[conversation[i]]))
            answers.append(clean_text(id2line[conversation[i+1]]))
    if(verbose == True):
        print('{} quesions and {} answers have been created.'.format(len(questions), len(answers)))
    return (questions, answers)

def get_words_and_occurences(questions, answers, verbose=False):
    word2count = {}
    for sentence in questions + answers:
        for word in sentence.split():
            if word in word2count:
                word2count[word] += 1
            else:
                word2count[word] = 1
    if(verbose == True):
        print('{} words have been found.'.format(len(word2count)))
    return word2count

def remove_less_frequent_words(word2count, threshold, verbose=False):
    questionswords2int = {}
    answerswords2int = {}
    word_number = 0
    for word, count in word2count.items():
        if count >= threshold:
            questionswords2int[word] = word_number
            answerswords2int[word] = word_number
            word_number += 1
    if(verbose == True):
        print('Total tokens after removing less frequent words: ', len(answerswords2int))
    return (questionswords2int, answerswords2int)

def add_tokens_to_words(questionswords2int, answerswords2int, verbose=False):
    tokens = ['<PAD>', '<EOS>', '<OUT>', '<SOS>']
    for token in tokens:
        questionswords2int[token] = len(questionswords2int) + 1
        answerswords2int[token] = len(answerswords2int) + 1
    if(verbose == True):
        print('Total Tokens', len(answerswords2int))
    return (questionswords2int, answerswords2int)

def get_inverse_dictionary(answerswords2int, verbose=False):
    answersints2word = {w_i:w for w, w_i in answerswords2int.items()}
    if(verbose == True):
        print('Words inverse dictionary of answers has been created!')
    return answersints2word

def add_eos_to_sentences(answers, verbose=False):
    for i in range(len(answers)):
        answers[i] += ' <EOS>'
    if(verbose == True):
        print('<EOS> has been added to answers')
    return answers

def words_to_tokens(questions, answers, questionswords2int, answerswords2int, verbose=False):
    questions_to_int = []
    for question in questions:
        ints = []
        for word in question.split():
            if word not in questionswords2int:
                ints.append(questionswords2int['<OUT>'])
            else:
                ints.append(questionswords2int[word])
        questions_to_int.append(ints)
    answers_to_int = []
    for answer in answers:
        ints = []
        for word in answer.split():
            if word not in answerswords2int:
                ints.append(answerswords2int['<OUT>'])
            else:
                ints.append(answerswords2int[word])
        answers_to_int.append(ints)
    return (questions_to_int, answers_to_int)

def sort_questions_and_answers(questions, answers, sequence_length, verbose=False):
    sorted_questions = []
    sorted_answers = []
    for length in range(1, sequence_length + 1):
        for i in enumerate(questions):
            if(len(i[1]) == length):
                sorted_questions.append(questions[i[0]])
                sorted_answers.append(answers[i[0]])
    if(verbose == True):
        print('Questions and answers have been sorted according to the length of questions.')
    return (sorted_questions, sorted_answers)

def get_processed_questions_and_answers(lines_file, conversations_file, threshold, sequence_length, verbose=False):
    lines, conversations = create_data_from_files(lines_file, conversations_file, verbose=verbose)
    id2line = create_lines_dictionary(lines, verbose=verbose)
    conversations_ids = create_conversations_ids(conversations, verbose=verbose)
    questions, answers = create_questions_and_answers(id2line, conversations_ids, verbose=verbose)
    word2count = get_words_and_occurences(questions, answers, verbose=verbose)
    questionswords2int, answerswords2int = remove_less_frequent_words(word2count, threshold, verbose=verbose)
    questionswords2int, answerswords2int = add_tokens_to_words(questionswords2int, answerswords2int, verbose=verbose)
    answersints2word = get_inverse_dictionary(answerswords2int, verbose=verbose)
    answers = add_eos_to_sentences(answers, verbose=verbose)
    questions, answers = words_to_tokens(questions, answers, questionswords2int, answerswords2int, verbose=verbose)
    questions, answers = sort_questions_and_answers(questions, answers, sequence_length, verbose=verbose)
    return (questions, answers, questionswords2int, answerswords2int, answersints2word)

##### nlp_utils

In [4]:
## Main utility functions
def preprocess_targets(targets, words2int, batch_size):
    left_side = tf.fill([batch_size, 1], words2int['<SOS>'])
    right_side = tf.strided_slice(targets, [0, 0], [batch_size, -1], [1, 1])
    return tf.concat([left_side, right_side], 1)

def apply_padding(batch_of_sequences, words2int):
    max_sequence_length = max([len(sequence) for sequence in batch_of_sequences])
    return [sequence + [words2int['<PAD>']] * (max_sequence_length - len(sequence)) for sequence in batch_of_sequences]

def split_into_batches(questions, answers, questionswords2int, answerswords2int, batch_size):
    for batch_index in range(0, len(questions) // batch_size):
        start_index = batch_index * batch_size
        questions_in_batch = questions[start_index : start_index + batch_size]
        answers_in_batch = answers[start_index : start_index + batch_size]
        padded_questions_in_batch = np.array(apply_padding(questions_in_batch, questionswords2int))
        padded_answers_in_batch = np.array(apply_padding(answers_in_batch, answerswords2int))
        yield padded_questions_in_batch, padded_answers_in_batch

def get_training_validation_data(questions, answers, validation_set_ratio):
    training_validation_split = int(len(questions) * validation_set_ratio)
    training_questions = questions[training_validation_split:]
    training_answers = answers[training_validation_split:]
    validation_questions = questions[:training_validation_split]
    validation_answers = answers[:training_validation_split]
    return (training_questions, training_answers, validation_questions, validation_answers)

##### seq2seq

In [5]:
## Main encoding-decoding functions

def encoder_rnn(rnn_inputs, rnn_size, num_layers, keep_prob, sequence_length):
    lstm = tf.contrib.rnn.BasicLSTMCell(rnn_size)
    lstm_dropout = tf.contrib.rnn.DropoutWrapper(lstm, input_keep_prob = keep_prob)
    encoder_cell = tf.contrib.rnn.MultiRNNCell([lstm_dropout] * num_layers)
    encoder_output, encoder_state = tf.nn.bidirectional_dynamic_rnn(cell_fw = encoder_cell,
                                                       cell_bw = encoder_cell,
                                                       sequence_length = sequence_length,
                                                       inputs = rnn_inputs,
                                                       dtype = tf.float32)
    return encoder_state

def decode_training_set(encoder_state, decoder_cell, decoder_embedded_input, sequence_length,
                                      decoding_scope, output_function, keep_prob, batch_size):
    attention_states = tf.zeros([batch_size, 1, decoder_cell.output_size])
    attention_keys, attention_values, attention_score_function, attention_construct_function = \
                        tf.contrib.seq2seq.prepare_attention(attention_states,
                                                             attention_option = 'bahdanau',
                                                             num_units = decoder_cell.output_size)
    training_decoder_function = tf.contrib.seq2seq.attention_decoder_fn_train(encoder_state[0],
                                                                              attention_keys,
                                                                              attention_values,
                                                                              attention_score_function,
                                                                              attention_construct_function,
                                                                              name = 'attn_dec_train')
    decoder_output, decoder_final_state, decoder_final_context_state = \
                                        tf.contrib.seq2seq.dynamic_rnn_decoder(decoder_cell,
                                                                  training_decoder_function,
                                                                  decoder_embedded_input,
                                                                  sequence_length,
                                                                  scope = decoding_scope)
    decoder_output_dropout = tf.nn.dropout(decoder_output, keep_prob)
    return output_function(decoder_output_dropout)

def decode_test_set(encoder_state, decoder_cell, decoder_embeddings_matrix, sos_id, eos_id, maximum_length,
                                        num_words, decoding_scope, output_function, keep_prob, batch_size):
    '''
    For decoding the test/validation set
    '''
    attention_states = tf.zeros([batch_size, 1, decoder_cell.output_size])
    attention_keys, attention_values, attention_score_function, attention_construct_function = \
                                            tf.contrib.seq2seq.prepare_attention(attention_states,
                                                                    attention_option = 'bahdanau',
                                                                    num_units = decoder_cell.output_size)
    test_decoder_function = tf.contrib.seq2seq.attention_decoder_fn_inference(output_function,
                                                                              encoder_state[0],
                                                                              attention_keys,
                                                                              attention_values,
                                                                              attention_score_function,
                                                                              attention_construct_function,
                                                                              decoder_embeddings_matrix,
                                                                              sos_id,
                                                                              eos_id,
                                                                              maximum_length,
                                                                              num_words,
                                                                              name = 'attn_dec_inf')
    test_predictions, decoder_final_state, decoder_final_context_state = \
                                        tf.contrib.seq2seq.dynamic_rnn_decoder(decoder_cell,
                                                                    test_decoder_function,
                                                                    scope = decoding_scope)
    return test_predictions

def decoder_rnn(decoder_embedded_input, decoder_embeddings_matrix, encoder_state, num_words,
                    sequence_length, rnn_size, num_layers, words2int, keep_prob, batch_size):
    '''
    Decoder RNN
    '''
    with tf.variable_scope('decoding') as decoding_scope:
        lstm = tf.contrib.rnn.BasicLSTMCell(rnn_size)
        lstm_dropout = tf.contrib.rnn.DropoutWrapper(lstm, input_keep_prob = keep_prob)
        decoder_cell = tf.contrib.rnn.MultiRNNCell([lstm_dropout] * num_layers)
        weights = tf.truncated_normal_initializer(stddev = 0.1)
        biases = tf.zeros_initializer()
        output_function = lambda x: tf.contrib.layers.fully_connected(x,
                                                                      num_words,
                                                                      None,
                                                                      scope = decoding_scope,
                                                                      weights_initializer = weights,
                                                                      biases_initializer = biases)
        training_predictions = decode_training_set(encoder_state,
                                                   decoder_cell,
                                                   decoder_embedded_input,
                                                   sequence_length,
                                                   decoding_scope,
                                                   output_function,
                                                   keep_prob,
                                                   batch_size)
        decoding_scope.reuse_variables()
        test_predictions = decode_test_set(encoder_state,
                                           decoder_cell,
                                           decoder_embeddings_matrix,
                                           words2int['<SOS>'],
                                           words2int['<EOS>'],
                                           sequence_length - 1,
                                           num_words,
                                           decoding_scope,
                                           output_function,
                                           keep_prob,
                                           batch_size)
    return training_predictions, test_predictions

def seq2seq_model(inputs, targets, keep_prob, batch_size, sequence_length, answers_num_words, questions_num_words,
                        encoder_embedding_size, decoder_embeddings_size, rnn_size, num_layers, questionswords2int):
    encoder_embedding_input = tf.contrib.layers.embed_sequence(inputs,
                                                               answers_num_words + 1,
                                                               encoder_embedding_size,
                                                               initializer = tf.random_uniform_initializer(0, 1))
    encoder_state = encoder_rnn(encoder_embedding_input, rnn_size, num_layers, keep_prob, sequence_length)
    preprocessed_targets = preprocess_targets(targets, questionswords2int, batch_size)
    decoder_embeddings_matrix = tf.Variable(tf.random_uniform([questions_num_words + 1, decoder_embeddings_size], 0, 1))
    decoder_embedded_input = tf.nn.embedding_lookup(decoder_embeddings_matrix, preprocessed_targets)
    training_predictions, test_predictions = decoder_rnn(decoder_embedded_input,
                                                         decoder_embeddings_matrix,
                                                         encoder_state,
                                                         questions_num_words,
                                                         sequence_length,
                                                         rnn_size,
                                                         num_layers,
                                                         questionswords2int,
                                                         keep_prob,
                                                         batch_size)
    return training_predictions, test_predictions

##### chatbot

In [24]:
def parse_arguments(args=None):
    parser = argparse.ArgumentParser()
    parser.add_argument('-v', '-V', '--verbose', action='store_true')
    # Hyper Parameters
    parser.add_argument('-bs', '--batch_size', type=int, default=32)
    parser.add_argument('-th', '--threshold', type=int, default=20)
    parser.add_argument('-sl', '--sequence_length', type=int, default=25)
    parser.add_argument('-ep', '--epochs', type=int, default=100)
    parser.add_argument('-rs', '--rnn_size', type=int, default=1024)
    parser.add_argument('-nl', '--num_layers', type=int, default=3)
    parser.add_argument('-ee', '--encoding_embedding_size', type=int, default=1024)
    parser.add_argument('-de', '--decoding_embedding_size', type=int, default=1024)
    parser.add_argument('-lr', '--learning_rate', type=float, default=0.001)
    parser.add_argument('-lrd', '--learning_rate_decay', type=float, default=0.9)
    parser.add_argument('-mlr', '--minimum_learning_rate', type=float, default=0.0001)
    parser.add_argument('-kp', '--keep_probability', type=float, default=0.5)
    parser.add_argument('-vr', '--validation_set_ratio', type=float, default=0.15)
    # File locations
    parser.add_argument('-lf', '--lines_file', required=False, default='./data/movie_lines.txt')
    parser.add_argument('-cf', '--conversations_file', required=False, default='./data/movie_conversations.txt')
    try:
        arguments = parser.parse_args(args=args)
    except:
        parser.print_help()
        sys.exit(0)
    arguments = vars(arguments)
    return arguments

In [7]:
def model_inputs(verbose = False):
    inputs = tf.placeholder(tf.int32, [None, None], name='input')
    targets = tf.placeholder(tf.int32, [None, None], name='target')
    lr = tf.placeholder(tf.float32, name='learning_rate')
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')
    if(verbose == True):
        print('model inputs placeholders have been created!')
    return (inputs, targets, lr, keep_prob)

In [8]:
def build_model(batch_size = None,
                threshold = None,
                sequence_length = None,
                epochs = None,
                rnn_size = None,
                num_layers = None,
                encoding_embedding_size = None,
                decoding_embedding_size = None,
                learning_rate = None,
                learning_rate_decay = None,
                minimum_learning_rate = None,
                keep_probability = None,
                validation_set_ratio = None,
                lines_file = None,
                conversations_file = None,
                verbose = None):
    tf.reset_default_graph()
    session = tf.InteractiveSession()
    inputs, targets, lr, keep_prob = model_inputs(verbose)
    questions, answers, questionswords2int, answerswords2int, answersints2word = \
                                                    get_processed_questions_and_answers(lines_file,
                                                    conversations_file,
                                                    threshold,
                                                    sequence_length,
                                                    verbose)
    sequence_length = tf.placeholder_with_default(sequence_length, None, name = 'sequence_length')
    input_shape = tf.shape(inputs)
    print('Shape of inputs, targets after preprocess_targets: ', np.shape(inputs), np.shape(targets))
    print('Shape of inputs, targets after preprocess_targets: ', np.shape(tf.reverse(inputs, [-1])), np.shape(targets))
    training_predictions, test_predictions = seq2seq_model(tf.reverse(inputs, [-1]),
                                                           targets,
                                                           keep_prob,
                                                           batch_size,
                                                           sequence_length,
                                                           len(answerswords2int),
                                                           len(questionswords2int),
                                                           encoding_embedding_size,
                                                           decoding_embedding_size,
                                                           rnn_size,
                                                           num_layers,
                                                           questionswords2int)
    print('Shape of training_questions, test_predictions: ', np.shape(training_predictions), np.shape(test_predictions))
    with tf.name_scope('optimization'):
        print('Shape of training predictions, targets: ', np.shape(training_predictions), np.shape(targets))
        loss_error = tf.contrib.seq2seq.sequence_loss(training_predictions,
                                                      targets,
                                                      tf.ones([input_shape[0], sequence_length]))
        optimizer = tf.train.AdamOptimizer(learning_rate)
        gradients = optimizer.compute_gradients(loss_error)
        print('Shape of gradients: ', np.shape(gradients))
        clipped_gradients = [(tf.clip_by_value(grad_tensor, -5., 5.), grad_variable) for grad_tensor, grad_variable in gradients if grad_tensor is not None]
        optimizer_gradient_clipping = optimizer.apply_gradients(clipped_gradients)

    # Training and Validation Split
    training_questions, training_answers, validation_questions, validation_answers = \
                                get_training_validation_data(questions,
                                                             answers,
                                                             validation_set_ratio)
    print('Shape of training: questions-> {}, answers->{}'.format(np.shape(training_questions), np.shape(training_answers)))
    print('Shape of validation: questions-> {}, answers->{}'.format(np.shape(validation_questions), np.shape(validation_answers)))

    # Training
    batch_index_check_training_loss = 100
    batch_index_check_validation_loss = ((len(training_questions)) // batch_size // 2) - 1
    total_training_loss_error = 0
    list_validation_loss_error = []
    early_stopping_check = 0
    early_stopping_stop = 1000
    checkpoint = "chatbot_weight.ckpt"
    session.run(tf.global_variables_initializer())
    print('Varibles have been initialized!')
    for epoch in tqdm(range(1, epochs + 1)):
        for batch_index, (padded_questions_in_batch, padded_answers_in_batch) in \
                                enumerate(split_into_batches(training_questions,
                                                             training_answers,
                                                             questionswords2int,
                                                             answerswords2int,
                                                             batch_size)):
            print('Shape of padded_question_in_batch, padded_answers_in_batch: ', np.shape(padded_questions_in_batch), np.shape(padded_answers_in_batch))
            starting_time = time.time()
            print('inputs: ', type(padded_questions_in_batch))
            print('targets: ', type(padded_answers_in_batch))
            print('learning_rate: ', type(learning_rate))
            print('sequence_length: ', type(padded_questions_in_batch.shape[1]))
            _, batch_training_loss_error = session.run([optimizer_gradient_clipping, loss_error],
                                            feed_dict = {
                                            inputs: padded_questions_in_batch,
                                            targets: padded_answers_in_batch,
                                            lr: learning_rate,
                                            sequence_length: padded_questions_in_batch.shape[1],
                                            keep_prob: keep_probability,
                                            })
            total_training_loss_error += batch_training_loss_error
            ending_time  = time.time()
            batch_time = ending_time - starting_time
            if(batch_index % batch_index_check_training_loss == 0):
                print('Epoch: {:>3}/{}, Batch: {:>4}/{}, Training Loss Error: {:>6.3f}, Training Time on {} Batches: {:d} seconds'.format(
                                            epoch,
                                            epochs,
                                            batch_index,
                                            len(training_questions) // batch_size,
                                            batch_index_check_training_loss,
                                            total_training_loss_error // batch_index_check_training_loss,
                                            int(batch_index * batch_index_check_training_loss)
                                            ))
                total_training_loss_error = 0
            if(batch_index % batch_index_check_validation_loss == 0 and batch_index > 0):
                total_validation_loss_error = 0
                starting_validation_time = time.time()
                for batch_index_validation, (padded_questions_in_batch, padded_answers_in_batch) in \
                                                enumerate(split_into_batches(validation_questions,
                                                                             validation_answers,
                                                                             questionswords2int,
                                                                             answerswords2int,
                                                                             batch_size)):
                    batch_validation_loss_error = session.run(loss_error,
                                                    {
                                                    inputs: padded_questions_in_batch,
                                                    targets: padded_answers_in_batch,
                                                    lr: learning_rate,
                                                    sequence_length: padded_questions_in_batch.shape[1],
                                                    keep_prob: 1
                                                    })
                    total_validation_loss_error += batch_validation_loss_error
                    ending_validation_time  = time.time()
                    batch_validation_time = endingvalidation__time - startingvalidation__time
                    average_validation_loss_error = total_validation_loss_error / (len(validation_questions) / batch_size)
                    print('Validation Loss Error: {:>6.3f}, Batch Validation Time: {:d} seconds'.format(average_validation_loss_error, int(batch_validation_time), ))
                    learning_rate *= learning_rate_decay
                    if learning_rate < minimum_learning_rate:
                        learning_rate = minimum_learning_rate
                    list_validation_loss_error.append(average_validation_loss_error)
                    if average_validation_loss_error <= min(list_validation_loss_error):
                        print('I speak better now')
                        early_stopping_check = 0
                        saver = tf.train.Saver()
                        saver.save(session, checkpoint)
                    else:
                        print('Sorry, I do not speak better, I need to practice more!')
                        early_stopping_check += 1
                        if early_stopping_check >= early_stopping_stop:
                            break
        if early_stopping_check >= early_stopping_stop:
            print("My apologies, I cannot speak better anymore, This is the best I can do!")
            break
    print('Game over!')


In [9]:
def convert_string2int(question, words2int):
    question = clean_text(question)
    return [words2int.get(word, words2int['<OUT>']) for word in question.split()]

In [None]:
def chat_with_bot():
    checkpoint = './checkpoint.ckpt'
    session = tf.InteractiveSession()
    session.run(tf.global_variables_initializer())
    
    while True:
        question = input("You: ")
        if(question == 'Goodbye'):
            break
        else:
            question = convert_string2int(question, questionswords2int)
            question = question + [questionswords2int['<PAD>']] * (20 - len(question))
            fake_batch = np.zeros((batch_size, 20))
            fake_batch[0] = question
            predicted_answer = session.run(test_predictions, {input: fake_batch, keep_prob: 0.5})[0]
            answer = ''
            for i in np.argmax(predicted_answer, 1):
                if answersints2word[i] == 'i':
                    token = 'I'
                elif answersints2word[i] == '<EOS>':
                    token = '.'
                elif answersints2word[i] == '<OUT>':
                    token = 'out'
                else:
                    token = ' ' + answersints2word[i]
                answer += token
                if token == '.':
                    break
            print('Chatbot:', answer)
        
    saver = tf.train.Saver()
    saver.restore(session, checkpoint)

In [39]:
def main(argv=sys.argv):
    '''
    The main implementation of the movie conversations chatbot
    '''
    arguments = parse_arguments(argv[1:])
    build_model(**arguments)
    chat_with_bot()
    if __name__ == '__main__':
        main()
        

In [40]:
chat_with_bot()



You: I


NameError: name 'questionswords2int' is not defined