# Training Chatbot Model
In this project, I build a neural network chatbot using a machine-translation approach: I train a sequence-to-sequence model on a dataset of movie conversations.

In [1]:
# import libs
import pickle
import warnings
import numpy as np
import tensorflow as tf
from tensorflow.python.layers.core import Dense
from distutils.version import LooseVersion

  from ._conv import register_converters as _register_converters


### Retrieve Preprocessed Data and Parameters

In [2]:
def load_preprocess():
    """
    Read the pickled preprocessing output from 'models/preprocess.p'.

    Returns:
        The object stored in that file, exactly as it was pickled.
    """
    # NOTE: unpickling can execute arbitrary code; only load files you produced yourself.
    with open('models/preprocess.p', mode='rb') as pickled_file:
        preprocessed = pickle.load(pickled_file)
    return preprocessed

In [3]:
# Unpack the object returned by load_preprocess() into:
#     1. the training data as lists of word indexes (source = comments, target = replies)
#     2. two pairs of dicts, one pair for comments and one for replies
#        (used to convert words to indexes and vice versa)
((source_int_text, target_int_text),
(source_vocab_to_int, source_int_to_vocab),
(target_vocab_to_int, target_int_to_vocab)) = load_preprocess()

In [4]:
# Number of entries in the source (comments) index-to-word dictionary
print('Vocabulary size of comments:', len( source_int_to_vocab))

Vocabulary size of comments: 13929


In [5]:
# Number of entries in the target (replies) word-to-index dictionary
print('Vocabulary size of replays:', len(target_vocab_to_int))

Vocabulary size of replays: 13406


### Check the Version of TensorFlow and Access to GPU

In [6]:
# Check TensorFlow Version
# The assertion pins the exact release; the error message now names that same release
# (it previously said "1.0" while the check required 1.10.1).
assert LooseVersion(tf.__version__) == LooseVersion('1.10.1'), \
    'This project requires TensorFlow version 1.10.1. You are using {}'.format(tf.__version__)
print('TensorFlow Version: {}'.format(tf.__version__))

# Check for a GPU (query the device name once and reuse the result)
gpu_device = tf.test.gpu_device_name()
if not gpu_device:
    warnings.warn('No GPU found. Please use a GPU to train your neural network.')
else:
    print('Default GPU Device: {}'.format(gpu_device))

TensorFlow Version: 1.10.1
Default GPU Device: /device:GPU:0


## Build the Neural Network

### Inputs

In [7]:
#just tensor for saving
def model_inputs():
    '''
    Create the placeholders (and one derived tensor) that feed the model.

        *return: a 7-tuple, in this order:
            inputs: int32 placeholder named 'input', shape [None, None] (encoder input ids)
            targets: int32 placeholder named 'targets', shape [None, None] (decoder target ids)
            learningrate: scalar float32 placeholder named 'learningrate'
            keep_prob: scalar float32 placeholder named 'keep_prob' (dropout keep probability)
            target_sequence_length: int32 placeholder, shape [None], length of each target sentence
            max_target_length: tf.reduce_max over target_sequence_length
            source_sequence_length: int32 placeholder, shape [None], length of each source sentence
    '''
    # Both dimensions are left open: [batch size, sentence length]
    source_ids = tf.placeholder(tf.int32, shape=[None, None], name='input')
    target_ids = tf.placeholder(tf.int32, shape=[None, None], name='targets')

    # Scalars fed at run time
    lr_input = tf.placeholder(tf.float32, shape=[], name='learningrate')
    dropout_keep = tf.placeholder(tf.float32, shape=[], name='keep_prob')

    # Sentence lengths vary from batch to batch, so they are fed rather than fixed.
    target_lengths = tf.placeholder(tf.int32, shape=[None], name='target_sequence_length')

    # Largest value among the fed target lengths
    longest_target = tf.reduce_max(target_lengths)

    source_lengths = tf.placeholder(tf.int32, shape=[None], name='source_sequence_length')

    return (source_ids, target_ids, lr_input, dropout_keep,
            target_lengths, longest_target, source_lengths)


### Process Decoder Input
1. Remove the last word id from each sequence in the `target_data` batch.
2. Concatenate the `<GO>` id to the beginning of each sequence in the batch.

In [8]:
def process_decoder_input(target_data, target_vocab_to_int, batch_size):
    """
    Build the decoder input from the target batch.
        *args:
            target_data: tensor, current target batch (target placeholder)
            target_vocab_to_int: dictionary from target words to ids; must contain '<GO>'
            batch_size: number of rows in the batch
        *return:
            tensor built in two steps:
                1- the last column of target_data is sliced off
                2- a column filled with the '<GO>' id is concatenated in front
    """
    # Rows 0..batch_size, columns 0..-1 (exclusive), stride 1: everything except the last column
    without_last_column = tf.strided_slice(target_data, [0, 0], [batch_size, -1], [1, 1])

    # [batch_size, 1] tensor holding the '<GO>' id in every row
    go_column = tf.fill([batch_size, 1], target_vocab_to_int['<GO>'])

    # Join along axis 1: <GO> column first, then the trimmed targets
    return tf.concat([go_column, without_last_column], 1)

### Encoding Model

In [9]:
def create_cell(rnn_size, keep_prob=None):
    '''
    Function to create one LSTM layer wrapped with dropout
        *args:
            rnn_size: number, how many hidden units the LSTM cell has
            keep_prob: optional dropout keep probability applied to the cell output.
                       When omitted, the module-level name `keep_prob` is used, which
                       is what this function always relied on before the parameter existed.
        *return: DropoutWrapper around an LSTMCell
        *raises:
            NameError: if keep_prob is omitted and no module-level `keep_prob` exists
    '''
    if keep_prob is None:
        # Backward-compatible fallback for callers that pass only rnn_size.
        try:
            keep_prob = globals()['keep_prob']
        except KeyError:
            raise NameError("name 'keep_prob' is not defined; pass keep_prob explicitly "
                            "or create the model inputs first")

    # LSTMCell: rnn_size internal units, weights drawn uniformly from [-0.1, 0.1] with a fixed seed
    lstm_cell = tf.contrib.rnn.LSTMCell(rnn_size,
                                        initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))

    # DropoutWrapper: applies dropout to the cell output with the given keep probability
    return tf.contrib.rnn.DropoutWrapper(lstm_cell, output_keep_prob=keep_prob)

In [10]:
def encoding_layer(rnn_inputs, rnn_size, num_layers, keep_prob, source_sequence_length, source_vocab_size,
                   encoding_embedding_size):
    """
    Function: build the encoder, which has two parts:
        1- an embedding layer mapping every input id to encoding_embedding_size features
        2- a stack of num_layers RNN cells run over the embedded sequence
        *args:
            rnn_inputs: tensor of input ids for the current batch
            rnn_size: number, how many hidden units each layer has
            num_layers: number, how many stacked layers
            keep_prob: dropout keep probability. NOTE: not referenced in this body;
                       create_cell is called with rnn_size only.
            source_sequence_length: passed to dynamic_rnn as sequence_length
            source_vocab_size: number, vocabulary size for the embedding
            encoding_embedding_size: number, embedding dimension
        *return:
            (outputs, state) as returned by tf.nn.dynamic_rnn
    """
    # Embed the input ids
    embedded_inputs = tf.contrib.layers.embed_sequence(rnn_inputs, source_vocab_size, encoding_embedding_size)

    # One cell per layer, stacked into a single multi-layer cell
    layer_cells = []
    for _ in range(num_layers):
        layer_cells.append(create_cell(rnn_size))
    stacked_cell = tf.contrib.rnn.MultiRNNCell(layer_cells)

    # Run the stacked cell over the embedded inputs
    outputs, final_state = tf.nn.dynamic_rnn(stacked_cell,
                                             embedded_inputs,
                                             sequence_length=source_sequence_length,
                                             dtype=tf.float32)

    return outputs, final_state

### Decoding Model
The decoding model can be thought of as two separate processes: training and inference. They share the same parameters but use different strategies to feed the shared model.

#### Decoding - Training

In [11]:
def decoding_layer_train(encoder_state, dec_cell, dec_embed_input,
                         target_sequence_length, max_summary_length,
                         output_layer, keep_prob):
    '''
    Function to build the training branch of the decoder
        *args:
            encoder_state: encoder state used as the decoder's initial state
            dec_cell: decoder RNN cell
            dec_embed_input: embedded decoder input for the current batch
            target_sequence_length: lengths given to the TrainingHelper
            max_summary_length: upper bound on decoding iterations
            output_layer: layer applied to the decoder outputs
            keep_prob: dropout keep probability (not referenced in this body)
        *return:
            first element of the tuple returned by dynamic_decode (the decoder output)
    '''
    # Feeds the embedded target inputs step by step (batch-major layout)
    helper = tf.contrib.seq2seq.TrainingHelper(inputs=dec_embed_input,
                                               sequence_length=target_sequence_length,
                                               time_major=False)

    decoder = tf.contrib.seq2seq.BasicDecoder(cell=dec_cell,
                                              helper=helper,
                                              initial_state=encoder_state,
                                              output_layer=output_layer)

    decode_result = tf.contrib.seq2seq.dynamic_decode(decoder,
                                                      impute_finished=True,
                                                      maximum_iterations=max_summary_length)
    # Keep only the decoder outputs; the remaining tuple entries are discarded
    return decode_result[0]

#### Decoding - Inference

In [12]:
def decoding_layer_infer(encoder_state, dec_cell, dec_embeddings, start_of_sequence_id,
                         end_of_sequence_id, max_target_sequence_length,
                         vocab_size, output_layer, batch_size, keep_prob):
    '''
    Function to build the inference branch of the decoder
        *args:
            encoder_state: encoder state used as the decoder's initial state
            dec_cell: decoder RNN cell
            dec_embeddings: embedding weights handed to the GreedyEmbeddingHelper
            start_of_sequence_id: id used as the first decoder input for every row
            end_of_sequence_id: id that ends decoding for a row
            max_target_sequence_length: upper bound on decoding iterations
            vocab_size: number (not referenced in this body)
            output_layer: layer applied to the decoder outputs
            batch_size: number of start tokens to create
            keep_prob: dropout keep probability (not referenced in this body)
        *return:
            first element of the tuple returned by dynamic_decode (the decoder output)
    '''
    # One start id per batch row
    start_id = tf.constant([start_of_sequence_id], dtype=tf.int32)
    start_tokens = tf.tile(start_id, [batch_size], name='start_tokens')

    helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(dec_embeddings,
                                                      start_tokens,
                                                      end_of_sequence_id)

    decoder = tf.contrib.seq2seq.BasicDecoder(cell=dec_cell,
                                              helper=helper,
                                              initial_state=encoder_state,
                                              output_layer=output_layer)

    decode_result = tf.contrib.seq2seq.dynamic_decode(decoder,
                                                      impute_finished=True,
                                                      maximum_iterations=max_target_sequence_length)
    # Keep only the decoder outputs; the remaining tuple entries are discarded
    return decode_result[0]

#### Decoding Layer

In [13]:
def decoding_layer(dec_input, encoder_state, target_sequence_length,
                   max_target_sequence_length, rnn_size, num_layers,
                   target_vocab_to_int, target_vocab_size,
                   batch_size, keep_prob, decoding_embedding_size):
    '''
    Function to build the decoder with its training and inference branches
        *args:
            dec_input: decoder input ids (target batch after process_decoder_input)
            encoder_state: encoder state used as the decoder's initial state
            target_sequence_length: lengths of the target sentences in the current batch
            max_target_sequence_length: upper bound on decoding iterations
            rnn_size: number, how many hidden units each layer has
            num_layers: number, how many stacked layers
            target_vocab_to_int: dictionary from target words to ids; must contain '<GO>' and '<EOS>'
            target_vocab_size: size of the target vocabulary
            batch_size: number
            keep_prob: dropout keep probability (forwarded to both branches)
            decoding_embedding_size: decoder embedding dimension
        *return:
            (training decoder output, inference decoder output)
    '''
    # 1. Decoder embedding.
    # The embedding matrix is created by hand so that the same weights can be used twice:
    # looked up directly for the training inputs, and handed to the GreedyEmbeddingHelper,
    # which embeds each predicted id to produce the next step's input during inference.
    embedding_shape = [target_vocab_size, decoding_embedding_size]
    dec_embeddings = tf.Variable(tf.random_uniform(embedding_shape))
    dec_embed_input = tf.nn.embedding_lookup(dec_embeddings, dec_input)

    # 2. Stacked decoder cell
    layer_cells = [create_cell(rnn_size) for _ in range(num_layers)]
    dec_cell = tf.contrib.rnn.MultiRNNCell(layer_cells)

    # 3. Fully connected layer mapping decoder outputs onto the target vocabulary
    projection_init = tf.truncated_normal_initializer(mean=0.0, stddev=0.1)
    output_layer = Dense(target_vocab_size, kernel_initializer=projection_init)

    # 4. Training branch
    with tf.variable_scope("decode"):
        train_output = decoding_layer_train(encoder_state, dec_cell, dec_embed_input,
                                            target_sequence_length, max_target_sequence_length,
                                            output_layer, keep_prob)

    # 5. Inference branch, reusing the variables of the "decode" scope
    with tf.variable_scope("decode", reuse=True):
        infer_output = decoding_layer_infer(encoder_state, dec_cell, dec_embeddings,
                                            target_vocab_to_int['<GO>'],
                                            target_vocab_to_int['<EOS>'],
                                            max_target_sequence_length,
                                            target_vocab_size, output_layer,
                                            batch_size, keep_prob)

    return (train_output, infer_output)

### Build the Neural Network

In [14]:
def seq2seq_model(input_data, target_data, keep_prob, batch_size,
                  source_sequence_length, target_sequence_length,
                  max_target_sentence_length,
                  source_vocab_size, target_vocab_size,
                  enc_embedding_size, dec_embedding_size,
                  rnn_size, num_layers, target_vocab_to_int):
    '''
    Function to build the Sequence-to-Sequence model: encoder -> decoder input -> decoder
        *args:
            input_data: tensor of source ids for the current batch
            target_data: tensor of target ids for the current batch
            keep_prob: dropout keep probability
            batch_size: number
            source_sequence_length: lengths of the source sentences in the current batch
            target_sequence_length: lengths of the target sentences in the current batch
            max_target_sentence_length: upper bound on decoding iterations
            source_vocab_size: number, source vocabulary size
            target_vocab_size: number, target vocabulary size
            enc_embedding_size: number, encoder embedding size
            dec_embedding_size: number, decoder embedding size
            rnn_size: number, how many hidden units each layer has
            num_layers: number, how many stacked layers
            target_vocab_to_int: dictionary from target words to ids
        *return:
            (training decoder output, inference decoder output)
    '''
    # Encoder: only the final state is needed; the per-step outputs are dropped
    encoder_result = encoding_layer(input_data, rnn_size, num_layers, keep_prob,
                                    source_sequence_length, source_vocab_size,
                                    enc_embedding_size)
    enc_state = encoder_result[1]

    # Decoder input: targets without their last column, prefixed with '<GO>'
    dec_input = process_decoder_input(target_data, target_vocab_to_int, batch_size)

    # Decoder: training and inference branches
    train_output, infer_output = decoding_layer(dec_input, enc_state,
                                                target_sequence_length,
                                                max_target_sentence_length,
                                                rnn_size, num_layers,
                                                target_vocab_to_int, target_vocab_size,
                                                batch_size, keep_prob, dec_embedding_size)

    return train_output, infer_output

## Neural Network Training
### Hyperparameters

In [15]:
# Number of Epochs (full passes over the training batches)
epochs = 10
# Batch Size (also the size of the held-out validation batch)
batch_size = 256
# RNN Size (hidden units per LSTM layer)
rnn_size = 512
# Number of Layers (stacked LSTM layers in both encoder and decoder)
num_layers = 2
# Embedding Size
encoding_embedding_size = 256
decoding_embedding_size = 256
# Learning Rate (fed to the optimizer's learning-rate placeholder)
learning_rate = 0.001
# Dropout Keep Probability (fed during training; evaluation runs feed 1.0 instead)
keep_probability = 0.75
# Accuracy and loss are printed every `display_step` batches (batch 0 is skipped)
display_step = 10

### Build the Graph

In [16]:
# Checkpoint path prefix passed to saver.save(...) after each training epoch
save_path = 'checkpoints/dev'

In [17]:


# Reload the preprocessed data so this cell does not depend on earlier cell state
((source_int_text, target_int_text),
(source_vocab_to_int, source_int_to_vocab),
(target_vocab_to_int, target_int_to_vocab)) = load_preprocess()

# Longest target (reply) sentence, in tokens.
# This was previously measured on source_int_text, which contradicted the variable's name.
max_target_sentence_length = max(len(sentence) for sentence in target_int_text)


In [18]:
# A Graph contains a set of tf.Operation objects, which represent units of computation.
# `as_default()` makes train_graph the default graph inside the `with` block, so every
# op created below is added to it.
train_graph = tf.Graph()
with train_graph.as_default():

    # Placeholders for encoder inputs, decoder targets, learning rate, dropout keep
    # probability and sequence lengths (see model_inputs).
    # NOTE: `keep_prob` is bound at module level here; create_cell refers to that name
    # when it builds its dropout wrappers.
    input_data, targets, lr, keep_prob, target_sequence_length, max_target_sequence_length, source_sequence_length = model_inputs()
    # NOTE(review): input_shape is not referenced again in this cell.
    input_shape = tf.shape(input_data)

    # The source batch is reversed along its last axis before it is handed to the model.
    train_logits, inference_logits = seq2seq_model(tf.reverse(input_data, [-1]),
                                                   targets,
                                                   keep_prob,
                                                   batch_size,
                                                   source_sequence_length,
                                                   target_sequence_length,
                                                   max_target_sequence_length,
                                                   len(source_vocab_to_int),
                                                   len(target_vocab_to_int),
                                                   encoding_embedding_size,
                                                   decoding_embedding_size,
                                                   rnn_size,
                                                   num_layers,
                                                   target_vocab_to_int)


    # tf.identity returns a tensor with the same shape and contents as its input; it is
    # used here to give the two outputs the fixed names 'logits' and 'predictions'.
    # NOTE: `inference_logits` is rebound on the second line - from here on it is the
    # inference branch's sample_id tensor, not the decoder output object.
    training_logits = tf.identity(train_logits.rnn_output, name='logits')
    inference_logits = tf.identity(inference_logits.sample_id, name='predictions')

    # https://www.tensorflow.org/api_docs/python/tf/sequence_mask
    # - Returns a mask tensor representing the first N positions of each cell.
    '''creates [batch_size, max_target_sequence_length] size of variable, 
    then maks only the first target_sequence_length number of elements to 1.
    It means parts will have less weight than others.'''
    masks = tf.sequence_mask(target_sequence_length, max_target_sequence_length, dtype=tf.float32, name='masks')

    # name_scope groups the ops created below under the "optimization" prefix
    with tf.name_scope("optimization"):
        # Loss function - weighted softmax cross entropy, with `masks` as the per-position weights
        cost = tf.contrib.seq2seq.sequence_loss(
            training_logits,
            targets,
            masks)

        # Optimizer (learning rate comes from the `lr` placeholder)
        optimizer = tf.train.AdamOptimizer(lr)

        # Gradient Clipping: every gradient is clipped element-wise to [-1, 1];
        # variables whose gradient is None are skipped.
        gradients = optimizer.compute_gradients(cost)
        capped_gradients = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in gradients if grad is not None]
        train_op = optimizer.apply_gradients(capped_gradients)


Batch and pad the source and target sequences

In [19]:
def pad_sentence_batch(sentence_batch, pad_int):
    """
    Right-pad every sentence of a batch with pad_int so all have the same length.
        *args:
            sentence_batch: list of lists (one list of ids per sentence)
            pad_int: id appended as padding
        *return:
            new list of lists, each as long as the longest sentence in the batch
    """
    # Length of the longest sentence in the batch
    longest = max(map(len, sentence_batch))

    padded_batch = []
    for sentence in sentence_batch:
        filler = [pad_int] * (longest - len(sentence))
        # Concatenation builds a new list, so the input sentences are left untouched
        padded_batch.append(sentence + filler)
    return padded_batch

In [20]:
def get_batches(sources, targets, batch_size, source_pad_int, target_pad_int):
    '''
    Generator that yields padded source/target batches together with their lengths.
    Only full batches are produced: len(sources) // batch_size of them.
        *args:
            sources: list of lists of ids, one per source sentence
            targets: list of lists of ids, one per target sentence
            batch_size: number, how many sentences go into each batch
            source_pad_int: id used to pad source sentences
            target_pad_int: id used to pad target sentences
        *yield:
            pad_sources_batch: np array of the padded source batch
            pad_targets_batch: np array of the padded target batch
            pad_source_lengths: list with len() of every row of the padded source batch
            pad_targets_lengths: list with len() of every row of the padded target batch
    '''
    full_batches = len(sources) // batch_size

    for batch_i in range(full_batches):
        # Window of examples belonging to this batch
        window = slice(batch_i * batch_size, batch_i * batch_size + batch_size)

        # Pad both sides and convert to arrays
        pad_sources_batch = np.array(pad_sentence_batch(sources[window], source_pad_int))
        pad_targets_batch = np.array(pad_sentence_batch(targets[window], target_pad_int))

        # Lengths are measured on the padded rows
        pad_targets_lengths = [len(row) for row in pad_targets_batch]
        pad_source_lengths = [len(row) for row in pad_sources_batch]

        yield pad_sources_batch, pad_targets_batch, pad_source_lengths, pad_targets_lengths


### Start Training
Train the neural network on the preprocessed data.

In [21]:
def get_accuracy(target, logits):
    """
    Fraction of positions at which `target` and `logits` hold the same value.

    Both arguments are 2-D arrays. The narrower one is zero-padded on the right
    of axis 1 so that the two can be compared element-wise.
    """
    target_width = target.shape[1]
    logits_width = logits.shape[1]
    common_width = max(target_width, logits_width)

    if target_width != common_width:
        missing = common_width - target_width
        target = np.pad(target, [(0, 0), (0, missing)], 'constant')

    if logits_width != common_width:
        missing = common_width - logits_width
        logits = np.pad(logits, [(0, 0), (0, missing)], 'constant')

    matches = np.equal(target, logits)
    return np.mean(matches)

In [22]:
# Split the data: the first `batch_size` examples form the single validation batch,
# everything after them is used for training.
valid_source, train_source = source_int_text[:batch_size], source_int_text[batch_size:]
valid_target, train_target = target_int_text[:batch_size], target_int_text[batch_size:]

# Pad the validation examples by taking the first batch the generator produces
(valid_sources_batch,
 valid_targets_batch,
 valid_sources_lengths,
 valid_targets_lengths) = next(
    get_batches(valid_source,
                valid_target,
                batch_size,
                source_vocab_to_int['<PAD>'],
                target_vocab_to_int['<PAD>']))
# create a TensorFlow session to run parts of the graph
with tf.Session(graph=train_graph) as sess:
    sess.run(tf.global_variables_initializer())

    # Build the Saver once. Constructing it inside the epoch loop would add a new set of
    # save/restore ops to the graph on every epoch.
    saver = tf.train.Saver()

    # Number of batches get_batches yields per epoch for the training split
    # (the validation batch is excluded, so this is what batch_i counts against).
    batches_per_epoch = len(train_source) // batch_size

    for epoch_i in range(epochs):
        # for each padded batch in training data
        for batch_i, (source_batch, target_batch, sources_lengths, targets_lengths) in enumerate(
                get_batches(train_source, train_target, batch_size,
                            source_vocab_to_int['<PAD>'],
                            target_vocab_to_int['<PAD>'])):

            # One optimisation step with dropout enabled
            _, loss = sess.run(
                [train_op, cost],
                {input_data: source_batch,
                 targets: target_batch,
                 lr: learning_rate,
                 target_sequence_length: targets_lengths,
                 source_sequence_length: sources_lengths,
                 keep_prob: keep_probability})

            # Periodic progress report (batch 0 is skipped)
            if batch_i % display_step == 0 and batch_i > 0:

                # Predictions for the current training batch, dropout disabled
                batch_train_logits = sess.run(
                    inference_logits,
                    {input_data: source_batch,
                     source_sequence_length: sources_lengths,
                     target_sequence_length: targets_lengths,
                     keep_prob: 1.0})

                # Predictions for the held-out validation batch, dropout disabled
                batch_valid_logits = sess.run(
                    inference_logits,
                    {input_data: valid_sources_batch,
                     source_sequence_length: valid_sources_lengths,
                     target_sequence_length: valid_targets_lengths,
                     keep_prob: 1.0})

                train_acc = get_accuracy(target_batch, batch_train_logits)

                valid_acc = get_accuracy(valid_targets_batch, batch_valid_logits)

                print('Epoch {:>3} Batch {:>4}/{} - Train Accuracy: {:>6.4f}, Validation Accuracy: {:>6.4f}, Loss: {:>6.4f}'
                      .format(epoch_i, batch_i, batches_per_epoch, train_acc, valid_acc, loss))

        # Save a checkpoint at the end of every epoch
        saver.save(sess, save_path)
        print('Model Trained and Saved')

Epoch   0 Batch   10/355 - Train Accuracy: 0.7112, Validation Accuracy: 0.7306, Loss: 2.0971
Epoch   0 Batch   20/355 - Train Accuracy: 0.1532, Validation Accuracy: 0.1328, Loss: 1.8223
Epoch   0 Batch   30/355 - Train Accuracy: 0.5167, Validation Accuracy: 0.5251, Loss: 1.5579
Epoch   0 Batch   40/355 - Train Accuracy: 0.6510, Validation Accuracy: 0.6618, Loss: 1.4929
Epoch   0 Batch   50/355 - Train Accuracy: 0.7016, Validation Accuracy: 0.7219, Loss: 1.6353
Epoch   0 Batch   60/355 - Train Accuracy: 0.6633, Validation Accuracy: 0.7136, Loss: 1.6643
Epoch   0 Batch   70/355 - Train Accuracy: 0.6766, Validation Accuracy: 0.6994, Loss: 1.5168
Epoch   0 Batch   80/355 - Train Accuracy: 0.7136, Validation Accuracy: 0.7252, Loss: 1.4154
Epoch   0 Batch   90/355 - Train Accuracy: 0.6790, Validation Accuracy: 0.7136, Loss: 1.4779
Epoch   0 Batch  100/355 - Train Accuracy: 0.6812, Validation Accuracy: 0.6836, Loss: 1.3092
Epoch   0 Batch  110/355 - Train Accuracy: 0.7444, Validation Accuracy

Epoch   2 Batch  190/355 - Train Accuracy: 0.6801, Validation Accuracy: 0.7252, Loss: 1.2360
Epoch   2 Batch  200/355 - Train Accuracy: 0.6789, Validation Accuracy: 0.7267, Loss: 1.3287
Epoch   2 Batch  210/355 - Train Accuracy: 0.7111, Validation Accuracy: 0.7252, Loss: 1.2028
Epoch   2 Batch  220/355 - Train Accuracy: 0.6490, Validation Accuracy: 0.7267, Loss: 1.5573
Epoch   2 Batch  230/355 - Train Accuracy: 0.7434, Validation Accuracy: 0.7252, Loss: 1.0210
Epoch   2 Batch  240/355 - Train Accuracy: 0.6825, Validation Accuracy: 0.7144, Loss: 1.3361
Epoch   2 Batch  250/355 - Train Accuracy: 0.7172, Validation Accuracy: 0.7267, Loss: 1.0853
Epoch   2 Batch  260/355 - Train Accuracy: 0.7025, Validation Accuracy: 0.7252, Loss: 1.1689
Epoch   2 Batch  270/355 - Train Accuracy: 0.6821, Validation Accuracy: 0.7267, Loss: 1.3412
Epoch   2 Batch  280/355 - Train Accuracy: 0.7073, Validation Accuracy: 0.7252, Loss: 1.2463
Epoch   2 Batch  290/355 - Train Accuracy: 0.6720, Validation Accuracy

Epoch   5 Batch   20/355 - Train Accuracy: 0.6873, Validation Accuracy: 0.7171, Loss: 1.2559
Epoch   5 Batch   30/355 - Train Accuracy: 0.7147, Validation Accuracy: 0.7258, Loss: 1.1151
Epoch   5 Batch   40/355 - Train Accuracy: 0.7046, Validation Accuracy: 0.7200, Loss: 1.1171
Epoch   5 Batch   50/355 - Train Accuracy: 0.7072, Validation Accuracy: 0.7239, Loss: 1.2742
Epoch   5 Batch   60/355 - Train Accuracy: 0.6705, Validation Accuracy: 0.7183, Loss: 1.2707
Epoch   5 Batch   70/355 - Train Accuracy: 0.7030, Validation Accuracy: 0.7252, Loss: 1.2047
Epoch   5 Batch   80/355 - Train Accuracy: 0.7116, Validation Accuracy: 0.7264, Loss: 1.1575
Epoch   5 Batch   90/355 - Train Accuracy: 0.3856, Validation Accuracy: 0.4075, Loss: 1.1792
Epoch   5 Batch  100/355 - Train Accuracy: 0.7132, Validation Accuracy: 0.7203, Loss: 1.0461
Epoch   5 Batch  110/355 - Train Accuracy: 0.7503, Validation Accuracy: 0.7239, Loss: 0.8747
Epoch   5 Batch  120/355 - Train Accuracy: 0.6986, Validation Accuracy

Epoch   7 Batch  200/355 - Train Accuracy: 0.3269, Validation Accuracy: 0.3367, Loss: 1.2124
Epoch   7 Batch  210/355 - Train Accuracy: 0.7156, Validation Accuracy: 0.7228, Loss: 1.0897
Epoch   7 Batch  220/355 - Train Accuracy: 0.6474, Validation Accuracy: 0.7215, Loss: 1.4233
Epoch   7 Batch  230/355 - Train Accuracy: 0.7414, Validation Accuracy: 0.7230, Loss: 0.9414
Epoch   7 Batch  240/355 - Train Accuracy: 0.6860, Validation Accuracy: 0.7172, Loss: 1.2081
Epoch   7 Batch  250/355 - Train Accuracy: 0.7128, Validation Accuracy: 0.7227, Loss: 0.9867
Epoch   7 Batch  260/355 - Train Accuracy: 0.6183, Validation Accuracy: 0.6259, Loss: 1.0789
Epoch   7 Batch  270/355 - Train Accuracy: 0.6801, Validation Accuracy: 0.7234, Loss: 1.2192
Epoch   7 Batch  280/355 - Train Accuracy: 0.5789, Validation Accuracy: 0.5702, Loss: 1.1395
Epoch   7 Batch  290/355 - Train Accuracy: 0.3654, Validation Accuracy: 0.3083, Loss: 1.1620
Epoch   7 Batch  300/355 - Train Accuracy: 0.7184, Validation Accuracy

### Save Parameters
Save the `save_path` parameter for inference.

In [23]:
def save_params(params):
    """
    Pickle `params` into 'models/params.p'.

    The file is opened in 'wb' mode, so existing content is replaced.
    """
    with open('models/params.p', 'wb') as params_file:
        pickle.dump(params, params_file)
        
    
# Persist the checkpoint path prefix to 'models/params.p' so it can be loaded back later
save_params(save_path)