Deep Learning
=============

Assignment 3
------------

Previously in `2_fullyconnected.ipynb`, you trained a logistic regression and a neural network model.

The goal of this assignment is to explore regularization techniques.

In [1]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import numpy as np
import tensorflow as tf
from six.moves import cPickle as pickle

print("TensorFlow Version:",tf.__version__)

TensorFlow Version: 1.13.1


First reload the data we generated in `1_notmnist.ipynb`.

In [2]:
pickle_file = 'notMNIST.pickle'

with open(pickle_file, 'rb') as f:
  save = pickle.load(f)
  train_dataset = save['train_dataset']
  train_labels = save['train_labels']
  valid_dataset = save['valid_dataset']
  valid_labels = save['valid_labels']
  test_dataset = save['test_dataset']
  test_labels = save['test_labels']
  del save  # hint to help gc free up memory
  print('Training set', train_dataset.shape, train_labels.shape)
  print('Validation set', valid_dataset.shape, valid_labels.shape)
  print('Test set', test_dataset.shape, test_labels.shape)

Training set (200000, 28, 28) (200000,)
Validation set (10000, 28, 28) (10000,)
Test set (10000, 28, 28) (10000,)


Reformat into a shape that's more adapted to the models we're going to train:
- data as a flat matrix,
- labels as float 1-hot encodings.

In [3]:
image_size = 28
num_labels = 10

def reformat(dataset, labels):
  dataset = dataset.reshape((-1, image_size * image_size)).astype(np.float32)
  # Map 1 to [0.0, 1.0, 0.0 ...], 2 to [0.0, 0.0, 1.0 ...]
  labels = (np.arange(num_labels) == labels[:,None]).astype(np.float32)
  return dataset, labels
train_dataset, train_labels = reformat(train_dataset, train_labels)
valid_dataset, valid_labels = reformat(valid_dataset, valid_labels)
test_dataset, test_labels = reformat(test_dataset, test_labels)
print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)

Training set (200000, 784) (200000, 10)
Validation set (10000, 784) (10000, 10)
Test set (10000, 784) (10000, 10)


In [4]:
log_dir = './logs/3_regularization'
#Delete previous logs
if tf.gfile.Exists(log_dir):
    tf.gfile.DeleteRecursively(log_dir)
#Create folder
tf.gfile.MakeDirs(log_dir)

In [60]:
# We can't initialize these variables to 0 - the network will get stuck.
def weight_variable(shape):
    """Create a weight variable with appropriate initialization."""
    initial = tf.truncated_normal(shape, stddev=0.1)
    return tf.Variable(initial)

def bias_variable(shape):
    """Create a bias variable with appropriate initialization."""
    initial = tf.constant(0.1, shape = shape)
    #initial = tf.zeros(shape=shape)
    return tf.Variable(initial)

def variable_summaries(var):
    """Attach a lot of summaries to a Tensor (for TensorBoard visualization)."""
    with tf.name_scope('summaries'):
        mean = tf.reduce_mean(var)
        tf.summary.scalar('mean', mean)

        with tf.name_scope('stddev'):
            stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean)))

        tf.summary.scalar('stddev', stddev)
        tf.summary.scalar('max', tf.reduce_max(var))
        tf.summary.scalar('min', tf.reduce_min(var))
        tf.summary.histogram('histogram', var)

def layer(input_tensor, input_dim, output_dim, layer_name, act = tf.nn.relu):
    # Adding a name scope ensures logical grouping of the layers in the graph.
    with tf.name_scope(layer_name):
        '''VARIABLES'''
        # These are the parameters that we are going to be training. The weight
        # matrix will be initialized using random values following a (truncated)
        # normal distribution. The biases get initialized to zero.

        # This Variable will hold the state of the weights for the layer
        with tf.name_scope('weights'):
            weights = weight_variable([input_dim, output_dim])
            #variable_summaries(weights)

        with tf.name_scope('biases'):
            biases = bias_variable([output_dim])
            #variable_summaries(biases)

        '''TRAINING COMPUTATION'''
        # We multiply the inputs with the weight matrix, and add biases. We compute
        # the softmax and cross-entropy (it's one operation in TensorFlow, because
        # it's very common, and it can be optimized)
        with tf.name_scope('Wx_plus_b'):
            logits = tf.matmul(input_tensor, weights) + biases
            #tf.summary.histogram('logits', logits)

        return logits, weights, biases     
    
def accuracy(predictions, labels, name = 'Train'):
    with tf.name_scope('accuracy_' + name):
        with tf.name_scope('correct_prediction'):
            #correct_prediction = tf.equal(tf.argmax(predictions, axis=1), labels)
            correct_prediction = tf.equal(tf.math.argmax(predictions, axis=1), tf.math.argmax(labels, axis=1))

        #accuracy = (100.0 * np.sum(correct_prediction) / predictions.shape[0])       
        accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
        #tf.summary.scalar('accuracy', accuracy)
    return accuracy

---
Problem 1
---------

Introduce and tune L2 regularization for both logistic and neural network models. Remember that L2 amounts to adding a penalty on the norm of the weights to the loss. In TensorFlow, you can compute the L2 loss for a tensor `t` using `nn.l2_loss(t)`. The right amount of regularization should improve your validation / test accuracy.

---

In [6]:
# This is to expedite the process 
train_subset = 10000
valid_subset = valid_dataset.shape[0]
test_subset = test_dataset.shape[0]

# This is a good beta value to start with
beta = 0.01

graph = tf.Graph()
with graph.as_default():
    # Input data.
    with tf.name_scope('input'):
        with tf.name_scope('Train'):
            tf_train_dataset = tf.placeholder(tf.float32, shape = [None, image_size * image_size], name = 'x-input')
            tf_train_labels = tf.placeholder(tf.float32, shape = [None, num_labels], name = 'y-input')

        with tf.name_scope('Validation'):
            x_valid = tf.placeholder(tf.float32, shape = [None, image_size * image_size], name = 'x-input')
            y_valid = tf.placeholder(tf.float32, shape = [None, num_labels], name = 'y-input')

        with tf.name_scope('Test'):
            x_test = tf.placeholder(tf.float32, shape = [None, image_size * image_size], name = 'x-input')
            y_test = tf.placeholder(tf.float32, shape = [None, num_labels], name = 'y-input')
    
    # Save Image
    with tf.name_scope('input_reshape'):
        image_shaped_input = tf.reshape(tf_train_dataset, [-1, 28, 28, 1])
        tf.summary.image('input', image_shaped_input, 3)
    
    # Create tensor with beta
    tf_beta = tf.constant(beta, name = "beta")
    
    ##Training
    with tf.name_scope('Train'):
        # Run Layer
        # They are variables we want to update and optimize.
        logits, weights, biases = layer(tf_train_dataset, image_size * image_size, num_labels, "Layer")

        # Original loss function
        with tf.name_scope('loss_Original'):
            loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels = tf_train_labels,  logits=logits) )
        
        '''Regularization'''
        # Loss function using L2 Regularization
        with tf.name_scope('Regularization'):
            regularizer = tf.nn.l2_loss(weights)
        
        with tf.name_scope('loss'):
            loss = tf.reduce_mean(loss + tf_beta * regularizer)
        
        '''Optimizer'''
        with tf.name_scope('Optimizer'):
            optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)

        '''Predictions for the training, validation, and test data.'''
        with tf.name_scope('Prediction'):
            train_prediction = tf.nn.softmax(logits)
    
    #Add to Graph
    tf.summary.scalar("loss", loss)

    ##Accuracy
    tr_acc = accuracy(train_prediction, tf_train_labels)

    # Validation
    with tf.name_scope('Validation'):
        with tf.name_scope('Prediction'):
            with tf.name_scope('Wx_plus_b'):
                logits = tf.matmul(x_valid, weights) + biases
                #tf.summary.histogram('logits', logits)
            valid_prediction = tf.nn.softmax(logits)
    v_acc = accuracy(valid_prediction, y_valid, 'Valid')
      
    # Test
    with tf.name_scope('Test'):
        with tf.name_scope('Prediction'):
            with tf.name_scope('Wx_plus_b'):
                logits = tf.matmul(x_test, weights) + biases
                #tf.summary.histogram('logits', logits)
            test_prediction = tf.nn.softmax(logits)
    ts_acc = accuracy(test_prediction, y_test, 'Test')
    
    accuracyValues = [tr_acc , v_acc, ts_acc]

Instructions for updating:
Colocations handled automatically by placer.


In [None]:
# # This is to expedite the process 
# train_subset = 10000
# # This is a good beta value to start with
# beta = 0.01

# graph = tf.Graph()
# with graph.as_default():
#     # Input data.
#     # They're all constants.
#     tf_train_dataset = tf.constant(train_dataset[:train_subset, :])
#     tf_train_labels = tf.constant(train_labels[:train_subset])
#     tf_valid_dataset = tf.constant(valid_dataset)
#     tf_test_dataset = tf.constant(test_dataset)
    
#     # Variables    
#     # They are variables we want to update and optimize.
#     weights = tf.Variable(tf.truncated_normal([image_size * image_size, num_labels]))
#     biases = tf.Variable(tf.zeros([num_labels]))
    
#     # Training computation.
#     logits = tf.matmul(tf_train_dataset, weights) + biases 
#     # Original loss function
#     loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels=tf_train_labels,  logits=logits) )

#     # Loss function using L2 Regularization
#     regularizer = tf.nn.l2_loss(weights)
#     loss = tf.reduce_mean(loss + beta * regularizer)
    
#     # Optimizer.
#     optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)
    
#     # Predictions for the training, validation, and test data.
#     train_prediction = tf.nn.softmax(logits)
#     valid_prediction = tf.nn.softmax( tf.matmul(tf_valid_dataset, weights) + biases )
#     test_prediction = tf.nn.softmax(tf.matmul(tf_test_dataset, weights) + biases)

1. Run Computation & Iterate 

In [7]:
def feed_dict():
    """Make a TensorFlow feed_dict: maps data onto Tensor placeholders."""
    xs = train_dataset[:train_subset, :]
    ys = train_labels[:train_subset]

    #Valid Data
    xv = valid_dataset
    yv = valid_labels

    #Test Data
    xt = test_dataset
    yt = test_labels

    return {tf_train_dataset: xs, tf_train_labels: ys,
            x_valid: xv, y_valid: yv,
            x_test: xt, y_test: yt}

def accuracyOld(predictions, labels):
    return (100.0 * np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1))
          / predictions.shape[0])

In [8]:
num_steps = 801

with tf.Session(graph = graph) as session:
    # This is a one-time operation which ensures the parameters get initialized as
    # we described in the graph: random weights for the matrix, zeros for the
    # biases.
    tf.global_variables_initializer().run()
    # Merge all summary inforation.
    merged = tf.summary.merge_all()
    write = tf.summary.FileWriter(log_dir + "/multinomial", session.graph)
    print('Initialized')
    for step in range(num_steps):
    # Run the computations. We tell .run() that we want to run the optimizer,
    # and get the loss value and the training predictions returned as numpy
    # arrays.
        summary, _, l, acc, predictions = session.run([merged, optimizer, loss, accuracyValues, train_prediction],
                                                  feed_dict = feed_dict())
        write.add_summary(summary, step)
        if (step % 100 == 0):
            print('======================================')
            print('Loss at step {}: {}'.format(step, l))
            print('Training accuracy: %.2f%%' % acc[0])
            print('(Old) Training accuracy: {:.2f}%%'.format(accuracyOld(predictions, 
                                                         train_labels[:train_subset, :])))
            # Calling .eval() on valid_prediction is basically like calling run(), but
            # just to get that one numpy array. Note that it recomputes all its graph
            # dependencies.
            
            # You don't have to do .eval above because we already ran the session for the
            # train_prediction
            print('Validation accuracy: %.2f%%' % acc[1])
            print('(Old) Validation accuracy: {:.2f}%%'.format(accuracyOld(valid_prediction.eval({x_valid: valid_dataset}), 
                                                           valid_labels)))
    write.close()
    print('=============== Finished!! =====================')  
    print('Test accuracy: %.2f%%' % acc[2])
    print('(Old) Test accuracy: {:.2f}%%'.format(accuracyOld(test_prediction.eval({x_test: test_dataset}),
                                                     test_labels)))

Initialized
Loss at step 0: 2.9396636486053467
Training accuracy: 17.28%
(Old) Training accuracy: 17.28%%
Validation accuracy: 17.18%
(Old) Validation accuracy: 50.93%%
Loss at step 100: 0.8092483282089233
Training accuracy: 82.68%
(Old) Training accuracy: 82.68%%
Validation accuracy: 81.86%
(Old) Validation accuracy: 81.89%%
Loss at step 200: 0.7303615808486938
Training accuracy: 83.76%
(Old) Training accuracy: 83.76%%
Validation accuracy: 82.07%
(Old) Validation accuracy: 82.06%%
Loss at step 300: 0.7047470211982727
Training accuracy: 83.93%
(Old) Training accuracy: 83.93%%
Validation accuracy: 82.22%
(Old) Validation accuracy: 82.21%%
Loss at step 400: 0.6958234310150146
Training accuracy: 84.09%
(Old) Training accuracy: 84.09%%
Validation accuracy: 82.16%
(Old) Validation accuracy: 82.16%%
Loss at step 500: 0.6925710439682007
Training accuracy: 84.17%
(Old) Training accuracy: 84.17%%
Validation accuracy: 82.16%
(Old) Validation accuracy: 82.16%%
Loss at step 600: 0.6913259029388428

#### Neural Network with L2 Regularization

-  Hidden Layer using RELUs
    



In [91]:
num_nodes= 1024
batch_size = 128
beta = 0.01

graph = tf.Graph()
with graph.as_default():

    # Input data. For the training data, we use a placeholder that will be fed
    # at run time with a training minibatch.
    with tf.name_scope('Input'):
        tf_train_dataset = tf.placeholder(tf.float32, shape = [None, image_size * image_size], name = 'x-input')
        tf_train_labels = tf.placeholder(tf.float32, shape = [None, num_labels], name = 'y-input')
        
        with tf.name_scope('Validation'):
            tf_valid_dataset = tf.placeholder(tf.float32, shape = [None, image_size * image_size], name = 'x-input')
            tf_valid_labels = tf.placeholder(tf.float32, shape = [None, num_labels], name = 'y-input')

        with tf.name_scope('Test'):
            tf_test_dataset = tf.placeholder(tf.float32, shape = [None, image_size * image_size], name = 'x-input')
            tf_test_labels = tf.placeholder(tf.float32, shape = [None, num_labels], name = 'y-input')

    # Create tensor with beta
    tf_beta = tf.constant(beta, name = "beta")
    
    # Variables.
    with tf.name_scope('Train'):
        # Layer1
        logits_1, weights_1, biases_1 = layer(tf_train_dataset, image_size * image_size, num_nodes, "Layer1")
        
        #Activation function
        relu_layer = tf.nn.relu(logits_1, name = 'activation')
        #tf.summary.histogram('Activations', relu_layer)
        
        # Layer2
        logits_2, weights_2, biases_2 = layer(relu_layer, num_nodes, num_labels, "Layer2")
        
        # Normal loss function
        with tf.name_scope('loss_Original'):
            loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels = tf_train_labels,logits = logits_2))
        
        # Loss function with L2 Regularization with beta=0.01
        with tf.name_scope('Regularization'):
            regularizers = tf.nn.l2_loss(weights_1) + tf.nn.l2_loss(weights_2)

        with tf.name_scope('loss'):
            loss = tf.reduce_mean(loss + tf_beta * regularizers)

        # Optimizer.
        with tf.name_scope('Optimizer'):
            optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)

        '''Predictions for the training, validation, and test data.'''
        # Predictions for the training
        with tf.name_scope('Prediction'):
            train_prediction = tf.nn.softmax(logits_2)

    #Add to Graph
    tf.summary.scalar("loss", loss)
    
    ##Accuracy
    tr_acc = accuracy(train_prediction, tf_train_labels)
    
    # Predictions for validation 
    with tf.name_scope('Validation'):
        with tf.name_scope('Prediction'):
            with tf.name_scope('layer_1'):
                logits_1 = tf.matmul(tf_valid_dataset, weights_1) + biases_1
            with tf.name_scope('Activation'):
                relu_layer= tf.nn.relu(logits_1)
            with tf.name_scope('layer_2'):
                logits_2 = tf.matmul(relu_layer, weights_2) + biases_2
    
            valid_prediction = tf.nn.softmax(logits_2)

    v_acc = accuracy(valid_prediction, tf_valid_labels, 'Valid')

    # Predictions for test
    with tf.name_scope('Test'):
        with tf.name_scope('Prediction'):
            with tf.name_scope('layer_1'):
                logits_1 = tf.matmul(tf_test_dataset, weights_1) + biases_1
            with tf.name_scope('Activation'):
                relu_layer= tf.nn.relu(logits_1)
            with tf.name_scope('layer_2'):
                logits_2 = tf.matmul(relu_layer, weights_2) + biases_2
    
            test_prediction =  tf.nn.softmax(logits_2)
    ts_acc = accuracy(test_prediction, tf_test_labels, 'Test')
    
    #accuracyValues = tf.stack([tr_acc , v_acc, ts_acc])

In [92]:
num_steps = 3001

def feed_dict_test(step):
    # Input data. For the training data, we use a placeholder that will be fed
    # at run time with a training minibatch.
    # Pick an offset within the training data, which has been randomized.
    # Note: we could use better randomization across epochs.
    offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
    # Generate a minibatch.
    batch_data = train_dataset[offset:(offset + batch_size), :]
    batch_labels = train_labels[offset:(offset + batch_size), :]

    # Prepare a dictionary telling the session where to feed the minibatch.
    # The key of the dictionary is the placeholder node of the graph to be fed,
    # and the value is the numpy array to feed to it.
    return {tf_train_dataset: batch_data, tf_train_labels: batch_labels,
            tf_valid_dataset: valid_dataset, tf_valid_labels: valid_labels,
            tf_test_dataset: test_dataset, tf_test_labels: test_labels}

with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()

    # Merge all summary information.
    merged = tf.summary.merge_all()
    write = tf.summary.FileWriter(log_dir + "/1_hidden_layer", session.graph)
    
    print("Initialized")
    for step in range(num_steps):
        # Pick an offset within the training data, which has been randomized.
        # Note: we could use better randomization across epochs.
        offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
        # Generate a minibatch.
        #batch_data = train_dataset[offset:(offset + batch_size), :]
        batch_labels = train_labels[offset:(offset + batch_size), :]
        # Prepare a dictionary telling the session where to feed the minibatch.
        # The key of the dictionary is the placeholder node of the graph to be fed,
        # and the value is the numpy array to feed to it.
#        feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
        #summary, _, l, predictions = session.run([merged, optimizer, loss, train_prediction], feed_dict=feed_dict_test(step))
        summary, _, l, predictions = session.run([merged, optimizer, loss, train_prediction],
                                             feed_dict = feed_dict_test(step))
        
        # Write Summary
        #if (step % 25 == 0):
        write.add_summary(summary, step)
            
        if (step % 500 == 0):
            print('============================================')
            print("Minibatch loss at step {}: {}".format(step, l))
            print("Minibatch accuracy: {:.1f}%".format(accuracyOld(predictions, batch_labels)))
            print("Validation accuracy: {:.1f}%".format(accuracyOld(valid_prediction.eval({tf_valid_dataset: valid_dataset}), valid_labels)))
    #write = tf.summary.FileWriter("./logs/3_regularization", session.graph)
    print('=================Finished!!=====================')
    print("Test accuracy: {:.1f}%".format(accuracyOld(test_prediction.eval({tf_test_dataset: test_dataset}), test_labels)))

Initialized
Minibatch loss at step 0: 35.90025329589844
Minibatch accuracy: 12.5%
Validation accuracy: 27.7%
Minibatch loss at step 500: 1.0640349388122559
Minibatch accuracy: 81.2%
Validation accuracy: 83.8%
Minibatch loss at step 1000: 0.7056258320808411
Minibatch accuracy: 82.8%
Validation accuracy: 83.5%
Minibatch loss at step 1500: 0.7183153033256531
Minibatch accuracy: 83.6%
Validation accuracy: 83.9%
Minibatch loss at step 2000: 0.6869140863418579
Minibatch accuracy: 84.4%
Validation accuracy: 83.5%
Minibatch loss at step 2500: 0.7358001470565796
Minibatch accuracy: 82.0%
Validation accuracy: 83.8%
Minibatch loss at step 3000: 0.7844299077987671
Minibatch accuracy: 80.5%
Validation accuracy: 83.9%
Test accuracy: 90.3%


In [86]:
num_nodes= 1024
batch_size = 128
beta = 0.01

graphFull = tf.Graph()
with graphFull.as_default():
    '''INPUT DATA'''
    # Input data. For the training data, we use a placeholder that will be fed
    # at run time with a training minibatch.
    with tf.name_scope('Input'):
        with tf.name_scope('Train'):
            tf_train_dataset = tf.placeholder(tf.float32, shape = [None, image_size * image_size], name = 'x-input')
            tf_train_labels = tf.placeholder(tf.float32, shape = [None, num_labels], name = 'y-input')

        with tf.name_scope('Validation'):
            x_valid = tf.placeholder(tf.float32, shape = [None, image_size * image_size], name = 'x-input')
            y_valid = tf.placeholder(tf.float32, shape = [None, num_labels], name = 'y-input')

        with tf.name_scope('Test'):
            x_test = tf.placeholder(tf.float32, shape = [None, image_size * image_size], name = 'x-input')
            y_test = tf.placeholder(tf.float32, shape = [None, num_labels], name = 'y-input')
    
    # Create tensor with beta
    tf_beta = tf.constant(beta, name = "beta")

    # Variables.
    with tf.name_scope('Train'):
        # Layer1
        logits_1, weights_1, biases_1 = layer(tf_train_dataset, image_size * image_size, num_nodes, "Layer1")
        
        #Activation function
        relu_layer = tf.nn.relu(logits_1, name = 'activation')
        #tf.summary.histogram('Activations', relu_layer)
        
        # Layer2
        logits_2, weights_2, biases_2 = layer(relu_layer, num_nodes, num_labels, "Layer2")
   
        # Normal loss function
        with tf.name_scope('loss_Original'):
            loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels = tf_train_labels,  logits=logits_2))
        
        '''Regularization'''
        # Loss function using L2 Regularization
        with tf.name_scope('Regularization'):
            regularizer = tf.nn.l2_loss(weights_1) + tf.nn.l2_loss(weights_2)
        
        with tf.name_scope('loss'):
            loss = tf.reduce_mean(loss + tf_beta * regularizer)
        
        '''Optimizer'''
        with tf.name_scope('Optimizer'):
            optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)

        '''Predictions for the training, validation, and test data.'''
        with tf.name_scope('Prediction'):
            train_prediction = tf.nn.softmax(logits_2)
    
    #Add to Graph
    tf.summary.scalar("loss", loss)
            
    ##Accuracy
    tr_acc = accuracy(train_prediction, tf_train_labels)

    # Predictions for validation 
    with tf.name_scope('Validation'):
        with tf.name_scope('Prediction'):
            with tf.name_scope('layer_1'):
                logits_1 = tf.matmul(x_valid, weights_1) + biases_1
                #tf.summary.histogram('logits_1', logits_1)
            with tf.name_scope('Activations'):
                relu_layer = tf.nn.relu(logits_1, name = 'activation')
                #tf.summary.histogram('relu_layer', relu_layer)
            with tf.name_scope('layer_2'):
                logits_2 = tf.matmul(relu_layer, weights_2) + biases_2
                #tf.summary.histogram('logits_2', logits_2)
            valid_prediction = tf.nn.softmax(logits_2)
    v_acc = accuracy(valid_prediction, y_valid, 'Valid')

    # Predictions for test
    with tf.name_scope('Test'):
        with tf.name_scope('Prediction'):
            with tf.name_scope('layer_1'):
                logits_1 = tf.matmul(x_test, weights_1) + biases_1
                #tf.summary.histogram('logits_1', logits_1)
            with tf.name_scope('Activations'):
                relu_layer= tf.nn.relu(logits_1, name = 'activation')
                #tf.summary.histogram('relu_layer', relu_layer)
            with tf.name_scope('layer_2'):
                logits_2 = tf.matmul(relu_layer, weights_2) + biases_2
                #tf.summary.histogram('logits_2', logits_2)
            test_prediction = tf.nn.softmax(logits_2)
    ts_acc = accuracy(test_prediction, y_test, 'Test')
    
    accuracyValues = [tr_acc , v_acc, ts_acc]    

In [89]:
num_steps = 3001

def feed_dict_SGD(step):
    # Input data. For the training data, we use a placeholder that will be fed
    # at run time with a training minibatch.
    # Pick an offset within the training data, which has been randomized.
    # Note: we could use better randomization across epochs.
    offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
    # Generate a minibatch.
    batch_data = train_dataset[offset:(offset + batch_size), :]
    batch_labels = train_labels[offset:(offset + batch_size), :]
    
    #Valid Data
    xv = valid_dataset
    yv = valid_labels

    #Test Data
    xt = test_dataset
    yt = test_labels

    # Prepare a dictionary telling the session where to feed the minibatch.
    # The key of the dictionary is the placeholder node of the graph to be fed,
    # and the value is the numpy array to feed to it.
    return {tf_train_dataset: batch_data, tf_train_labels: batch_labels,
            x_valid: xv, y_valid: yv,
            x_test: xt, y_test: yt}

with tf.Session(graph = graphFull) as session:
    tf.global_variables_initializer().run()

    # Merge all summary information.
    merged = tf.summary.merge_all()
    write = tf.summary.FileWriter(log_dir + "/1_hidden_layer", session.graph)

    print("Initialized with 1-hidden layer")
    for step in range(num_steps):
        # (Old)
        #offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
        #batch_labels = train_labels[offset:(offset + batch_size), :]
        
        # Run Tensors
        #summary, _, l, acc, predictions = session.run([merged, optimizer, loss, accuracyValues, train_prediction],
#                                             feed_dict = feed_dict_SGD(step))

        summary, _, l, predictions = session.run([merged, optimizer, loss, train_prediction], feed_dict=feed_dict_SGD(step))

#        summary, _, l, acc = session.run([merged, optimizer, loss, accuracyValues],
#                                             feed_dict = feed_dict_SGD(step))

        # Write Summary
        #if (step % 25 == 0):
        write.add_summary(summary, step)

        if (step % 500 == 0):
            print('============================================')
            print("Minibatch loss at step {}: {}".format(step, l))
            print("Minibatch accuracy: %.2f%%" % tr_acc.eval({tf_train_dataset: batch_data, tf_train_labels: batch_labels}))
            print("(Old) Minibatch accuracy: {:.2f}".format(accuracyOld(predictions, batch_labels)))
            print("Validation accuracy: %.2f%%" % v_acc.eval({x_valid: valid_dataset, y_valid: valid_labels}))
            print("(Old) Validation accuracy: {:.2f}".format(accuracyOld(valid_prediction.eval({x_valid: valid_dataset}), valid_labels)))

    write.close()
    print('=================Finished!!=====================')
    print("Test accuracy: %.2f%%" % ts_acc.eval({x_test: test_dataset, y_test: test_labels}))
    print("(Old) Test accuracy: {:.2f}".format(accuracyOld(test_prediction.eval({x_test: test_dataset}), test_labels)))

Initialized with 1-hidden layer
Minibatch loss at step 0: 35.51650619506836
Minibatch accuracy: 0.27%
(Old) Minibatch accuracy: 10.16
Validation accuracy: 0.29%
(Old) Validation accuracy: 29.02
Minibatch loss at step 500: 1.0710657835006714
Minibatch accuracy: 0.80%
(Old) Minibatch accuracy: 11.72
Validation accuracy: 0.84%
(Old) Validation accuracy: 83.88
Minibatch loss at step 1000: 0.7011016607284546
Minibatch accuracy: 0.80%
(Old) Minibatch accuracy: 8.59
Validation accuracy: 0.83%
(Old) Validation accuracy: 83.40
Minibatch loss at step 1500: 0.7219771146774292
Minibatch accuracy: 0.82%
(Old) Minibatch accuracy: 10.16
Validation accuracy: 0.84%
(Old) Validation accuracy: 83.83
Minibatch loss at step 2000: 0.6839468479156494
Minibatch accuracy: 0.84%
(Old) Minibatch accuracy: 14.84
Validation accuracy: 0.83%
(Old) Validation accuracy: 83.32
Minibatch loss at step 2500: 0.7321859002113342
Minibatch accuracy: 0.82%
(Old) Minibatch accuracy: 11.72
Validation accuracy: 0.84%
(Old) Valid

---
Problem 2
---------
Let's demonstrate an extreme case of overfitting. Restrict your training data to just a few batches. What happens?

---
#### Continuing from the Neural Network with L2 Regularization above

In [93]:
num_steps = 3001

train_dataset_2 = train_dataset[:500, :]
train_labels_2 = train_labels[:500]

with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()
    print("Initialized")
    for step in range(num_steps):
        # Pick an offset within the training data, which has been randomized.
        # Note: we could use better randomization across epochs.
        offset = (step * batch_size) % (train_labels_2.shape[0] - batch_size)
        # Generate a minibatch.
        batch_data = train_dataset_2[offset:(offset + batch_size), :]
        batch_labels = train_labels_2[offset:(offset + batch_size), :]
        # Prepare a dictionary telling the session where to feed the minibatch.
        # The key of the dictionary is the placeholder node of the graph to be fed,
        # and the value is the numpy array to feed to it.
        feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
        _, l, predictions = session.run([optimizer, loss, train_prediction], feed_dict=feed_dict)
        if (step % 500 == 0):
            print('============================================')
            print("Minibatch loss at step {}: {}".format(step, l))
            print("Minibatch accuracy: {:.1f}%".format(accuracyOld(predictions, batch_labels)))
            print("Validation accuracy: {:.1f}%".format(accuracyOld(valid_prediction.eval({tf_valid_dataset: valid_dataset}), valid_labels)))
    print('=================Finished!!=====================')
    print("Test accuracy: {:.1f}%".format(accuracyOld(test_prediction.eval({tf_test_dataset: test_dataset}), test_labels)))

Initialized
Minibatch loss at step 0: 35.15644073486328
Minibatch accuracy: 6.2%
Validation accuracy: 31.9%
Minibatch loss at step 500: 0.4985537827014923
Minibatch accuracy: 100.0%
Validation accuracy: 77.3%
Minibatch loss at step 1000: 0.27435389161109924
Minibatch accuracy: 100.0%
Validation accuracy: 77.1%
Minibatch loss at step 1500: 0.259139746427536
Minibatch accuracy: 100.0%
Validation accuracy: 77.2%
Minibatch loss at step 2000: 0.25506243109703064
Minibatch accuracy: 100.0%
Validation accuracy: 77.1%
Minibatch loss at step 2500: 0.25356459617614746
Minibatch accuracy: 100.0%
Validation accuracy: 77.3%
Minibatch loss at step 3000: 0.2595043182373047
Minibatch accuracy: 100.0%
Validation accuracy: 77.4%
Test accuracy: 84.2%


As you can see, there's high training accuracy but low validation accuracy. There is overfitting here.


---
Problem 3
---------
Introduce Dropout on the hidden layer of the neural network. Remember: Dropout should only be introduced during training, not evaluation, otherwise your evaluation results would be stochastic as well. TensorFlow provides `nn.dropout()` for that, but you have to make sure it's only inserted during training.

What happens to our extreme overfitting case?

---

In [94]:
num_nodes= 1024
batch_size = 128
beta = 0.01

graph = tf.Graph()
with graph.as_default():

    # Input data. For the training data, we use a placeholder that will be fed
    # at run time with a training minibatch.
    tf_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, image_size * image_size))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)

    # Variables.
    weights_1 = tf.Variable(tf.truncated_normal([image_size * image_size, num_nodes]))
    biases_1 = tf.Variable(tf.zeros([num_nodes]))
    weights_2 = tf.Variable(tf.truncated_normal([num_nodes, num_labels]))
    biases_2 = tf.Variable(tf.zeros([num_labels]))
    
    # Training computation.
    logits_1 = tf.matmul(tf_train_dataset, weights_1) + biases_1
    relu_layer= tf.nn.relu(logits_1)
    # Dropout on hidden layer: RELU layer
    keep_prob = tf.placeholder("float")
    relu_layer_dropout = tf.nn.dropout(relu_layer, keep_prob)
    
    logits_2 = tf.matmul(relu_layer_dropout, weights_2) + biases_2
    # Normal loss function
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels = tf_train_labels , logits = logits_2))
    # Loss function with L2 Regularization with beta=0.01
    regularizers = tf.nn.l2_loss(weights_1) + tf.nn.l2_loss(weights_2)
    loss = tf.reduce_mean(loss + beta * regularizers)

    # Optimizer.
    optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)

    # Predictions for the training
    train_prediction = tf.nn.softmax(logits_2)
    
    # Predictions for validation 
    logits_1 = tf.matmul(tf_valid_dataset, weights_1) + biases_1
    relu_layer= tf.nn.relu(logits_1)
    logits_2 = tf.matmul(relu_layer, weights_2) + biases_2
    
    valid_prediction = tf.nn.softmax(logits_2)
    
    # Predictions for test
    logits_1 = tf.matmul(tf_test_dataset, weights_1) + biases_1
    relu_layer= tf.nn.relu(logits_1)
    logits_2 = tf.matmul(relu_layer, weights_2) + biases_2
    
    test_prediction =  tf.nn.softmax(logits_2)

Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See `tf.nn.softmax_cross_entropy_with_logits_v2`.



In [95]:
num_steps = 3001

with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()
    print("Initialized")
    for step in range(num_steps):
        # Pick an offset within the training data, which has been randomized.
        # Note: we could use better randomization across epochs.
        offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
        # Generate a minibatch.
        batch_data = train_dataset[offset:(offset + batch_size), :]
        batch_labels = train_labels[offset:(offset + batch_size), :]
        # Prepare a dictionary telling the session where to feed the minibatch.
        # The key of the dictionary is the placeholder node of the graph to be fed,
        # and the value is the numpy array to feed to it.
        feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels, keep_prob : 0.5}
        _, l, predictions = session.run([optimizer, loss, train_prediction], feed_dict=feed_dict)
        if (step % 500 == 0):
            print('============================================')
            print("Minibatch loss at step {}: {}".format(step, l))
            print("Minibatch accuracy: {:.1f}%".format(accuracyOld(predictions, batch_labels)))
            print("Validation accuracy: {:.1f}%".format(accuracyOld(valid_prediction.eval({tf_valid_dataset: valid_dataset}), valid_labels)))
    print('=================Finished!!=====================')
    print("Test accuracy: {:.1f}%".format(accuracyOld(test_prediction.eval({tf_test_dataset: test_dataset}), test_labels)))

Initialized
Minibatch loss at step 0: 3645.82177734375
Minibatch accuracy: 12.5%
Validation accuracy: 28.5%
Minibatch loss at step 500: 21.557558059692383
Minibatch accuracy: 80.5%
Validation accuracy: 84.4%
Minibatch loss at step 1000: 0.9626157283782959
Minibatch accuracy: 80.5%
Validation accuracy: 83.1%
Minibatch loss at step 1500: 0.8295962810516357
Minibatch accuracy: 80.5%
Validation accuracy: 83.2%
Minibatch loss at step 2000: 0.7876985669136047
Minibatch accuracy: 83.6%
Validation accuracy: 83.3%
Minibatch loss at step 2500: 0.7596803903579712
Minibatch accuracy: 83.6%
Validation accuracy: 83.5%
Minibatch loss at step 3000: 0.8931594491004944
Minibatch accuracy: 81.2%
Validation accuracy: 83.6%
Test accuracy: 89.9%


#### Extreme Overfitting

---
Problem 4
---------

Try to get the best performance you can using a multi-layer model! The best reported test accuracy using a deep network is [97.1%](http://yaroslavvb.blogspot.com/2011/09/notmnist-dataset.html?showComment=1391023266211#c8758720086795711595).

One avenue you can explore is to add multiple layers.

Another one is to use learning rate decay:

    global_step = tf.Variable(0)  # count the number of steps taken.
    learning_rate = tf.train.exponential_decay(0.5, global_step, ...)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)
 
 ---


#### Model

- 5 hidden layers NN
  -  RELUs
  -  Number of nodes decrease by 50% with each hidden layer that is deeper in the neural net
-  Overfitting measures
  -  L2 Regularization
    -  Learning rate (beta) with exponential decay
  -  Dropout

-  10,000 steps

In [96]:
import math as math

In [97]:
batch_size = 128
beta = 0.001

hidden_nodes_1 = 1024
hidden_nodes_2 = int(hidden_nodes_1 * 0.5)
hidden_nodes_3 = int(hidden_nodes_1 * np.power(0.5, 2))
hidden_nodes_4 = int(hidden_nodes_1 * np.power(0.5, 3))
hidden_nodes_5 = int(hidden_nodes_1 * np.power(0.5, 4))


graph = tf.Graph()
with graph.as_default():

    '''Input Data'''
    # For the training data, we use a placeholder that will be fed
    # at run time with a training minibatch.
    tf_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, image_size * image_size))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)
    
    '''Variables'''
    # Hidden RELU layer 1
    weights_1 = tf.Variable(tf.truncated_normal([image_size * image_size, hidden_nodes_1], stddev=math.sqrt(2.0/(image_size*image_size))))
    biases_1 = tf.Variable(tf.zeros([hidden_nodes_1]))

    # Hidden RELU layer 2
    weights_2 = tf.Variable(tf.truncated_normal([hidden_nodes_1, hidden_nodes_2], stddev=math.sqrt(2.0/hidden_nodes_1)))
    biases_2 = tf.Variable(tf.zeros([hidden_nodes_2]))
    
    # Hidden RELU layer 3
    weights_3 = tf.Variable(tf.truncated_normal([hidden_nodes_2, hidden_nodes_3], stddev=math.sqrt(2.0/hidden_nodes_2)))
    biases_3 = tf.Variable(tf.zeros([hidden_nodes_3]))
    
    # Hidden RELU layer 4
    weights_4 = tf.Variable(tf.truncated_normal([hidden_nodes_3, hidden_nodes_4], stddev=math.sqrt(2.0/hidden_nodes_3)))
    biases_4 = tf.Variable(tf.zeros([hidden_nodes_4]))
    
    # Hidden RELU layer 5
    weights_5 = tf.Variable(tf.truncated_normal([hidden_nodes_4, hidden_nodes_5], stddev=math.sqrt(2.0/hidden_nodes_4)))
    biases_5 = tf.Variable(tf.zeros([hidden_nodes_5]))
    
    # Output layer
    weights_6 = tf.Variable(tf.truncated_normal([hidden_nodes_5, num_labels], stddev=math.sqrt(2.0/hidden_nodes_5)))
    biases_6 = tf.Variable(tf.zeros([num_labels]))
    
    '''Training computation'''
    # Hidden RELU layer 1
    logits_1 = tf.matmul(tf_train_dataset, weights_1) + biases_1
    hidden_layer_1 = tf.nn.relu(logits_1)
    # Dropout on hidden layer: RELU layer
    keep_prob = tf.placeholder("float")
    hidden_layer_1_dropout = tf.nn.dropout(hidden_layer_1, keep_prob)
    
    
    # Hidden RELU layer 2
    logits_2 = tf.matmul(hidden_layer_1_dropout, weights_2) + biases_2
    hidden_layer_2 = tf.nn.relu(logits_2)
    # Dropout on hidden layer: RELU layer
    hidden_layer_2_dropout = tf.nn.dropout(hidden_layer_2, keep_prob)
    
    # Hidden RELU layer 3
    logits_3 = tf.matmul(hidden_layer_2_dropout, weights_3) + biases_3
    hidden_layer_3 = tf.nn.relu(logits_3)
    # Dropout on hidden layer: RELU layer
    hidden_layer_3_dropout = tf.nn.dropout(hidden_layer_3, keep_prob)
    
    # Hidden RELU layer 4
    logits_4 = tf.matmul(hidden_layer_3_dropout, weights_4) + biases_4
    hidden_layer_4 = tf.nn.relu(logits_4)
    # Dropout on hidden layer: RELU layer

    hidden_layer_4_dropout = tf.nn.dropout(hidden_layer_4, keep_prob)
    
    # Hidden RELU layer 5
    logits_5 = tf.matmul(hidden_layer_4_dropout, weights_5) + biases_5
    hidden_layer_5 = tf.nn.relu(logits_5)
    # Dropout on hidden layer: RELU layer
    hidden_layer_5_dropout = tf.nn.dropout(hidden_layer_5, keep_prob)
    
    # Output layer
    logits_6 = tf.matmul(hidden_layer_5_dropout, weights_6) + biases_6 
    
    # Normal loss function
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels = tf_train_labels, logits = logits_6))
    # Loss function with L2 Regularization with decaying learning rate beta=0.5
    regularizers = tf.nn.l2_loss(weights_1) + tf.nn.l2_loss(weights_2) + \
                   tf.nn.l2_loss(weights_3) + tf.nn.l2_loss(weights_4) + \
                   tf.nn.l2_loss(weights_5) + tf.nn.l2_loss(weights_6)
    loss = tf.reduce_mean(loss + beta * regularizers)
    
    '''Optimizer'''
    # Decaying learning rate
    global_step = tf.Variable(0)  # count the number of steps taken.
    start_learning_rate = 0.5
    learning_rate = tf.train.exponential_decay(start_learning_rate, global_step, 100000, 0.96, staircase=True)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)
    
    # Predictions for the training
    train_prediction = tf.nn.softmax(logits_6)
    
    # Predictions for validation 
    valid_logits_1 = tf.matmul(tf_valid_dataset, weights_1) + biases_1
    valid_relu_1 = tf.nn.relu(valid_logits_1)
    
    valid_logits_2 = tf.matmul(valid_relu_1, weights_2) + biases_2
    valid_relu_2 = tf.nn.relu(valid_logits_2)
    
    valid_logits_3 = tf.matmul(valid_relu_2, weights_3) + biases_3
    valid_relu_3 = tf.nn.relu(valid_logits_3)
    
    valid_logits_4 = tf.matmul(valid_relu_3, weights_4) + biases_4
    valid_relu_4 = tf.nn.relu(valid_logits_4)
    
    valid_logits_5 = tf.matmul(valid_relu_4, weights_5) + biases_5
    valid_relu_5 = tf.nn.relu(valid_logits_5)
    
    valid_logits_6 = tf.matmul(valid_relu_5, weights_6) + biases_6
    
    valid_prediction = tf.nn.softmax(valid_logits_6)
    
    # Predictions for test
    test_logits_1 = tf.matmul(tf_test_dataset, weights_1) + biases_1
    test_relu_1 = tf.nn.relu(test_logits_1)
    
    test_logits_2 = tf.matmul(test_relu_1, weights_2) + biases_2
    test_relu_2 = tf.nn.relu(test_logits_2)
    
    test_logits_3 = tf.matmul(test_relu_2, weights_3) + biases_3
    test_relu_3 = tf.nn.relu(test_logits_3)
    
    test_logits_4 = tf.matmul(test_relu_3, weights_4) + biases_4
    test_relu_4 = tf.nn.relu(test_logits_4)
    
    test_logits_5 = tf.matmul(test_relu_4, weights_5) + biases_5
    test_relu_5 = tf.nn.relu(test_logits_5)
    
    test_logits_6 = tf.matmul(test_relu_5, weights_6) + biases_6
    
    test_prediction = tf.nn.softmax(test_logits_6)   
    

In [98]:
num_steps = 15000

with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()
    print("Initialized")
    for step in range(num_steps):
        # Pick an offset within the training data, which has been randomized.
        # Note: we could use better randomization across epochs.
        offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
        # Generate a minibatch.
        batch_data = train_dataset[offset:(offset + batch_size), :]
        batch_labels = train_labels[offset:(offset + batch_size), :]
        # Prepare a dictionary telling the session where to feed the minibatch.
        # The key of the dictionary is the placeholder node of the graph to be fed,
        # and the value is the numpy array to feed to it.
        feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels, keep_prob : 0.5}
        _, l, predictions = session.run([optimizer, loss, train_prediction], feed_dict=feed_dict)
                
        if (step % 500 == 0):
            print('============================================')
            print("Minibatch loss at step {}: {}".format(step, l))
            print("Minibatch accuracy: {:.1f}%".format(accuracyOld(predictions, batch_labels)))
            print("Validation accuracy: {:.1f}%".format(accuracyOld(valid_prediction.eval({tf_valid_dataset: valid_dataset}), valid_labels)))
    print('=================Finished!!=====================')
    print("Test accuracy: {:.1f}%".format(accuracyOld(test_prediction.eval({tf_test_dataset: test_dataset}), test_labels)))

Initialized
Minibatch loss at step 0: 5.573212146759033
Minibatch accuracy: 10.2%
Validation accuracy: 4.6%
Minibatch loss at step 500: 1.9766143560409546
Minibatch accuracy: 76.6%
Validation accuracy: 81.6%
Minibatch loss at step 1000: 1.5036698579788208
Minibatch accuracy: 82.0%
Validation accuracy: 83.6%
Minibatch loss at step 1500: 1.175445318222046
Minibatch accuracy: 82.8%
Validation accuracy: 84.7%
Minibatch loss at step 2000: 0.9739924669265747
Minibatch accuracy: 84.4%
Validation accuracy: 84.8%
Minibatch loss at step 2500: 0.9726921319961548
Minibatch accuracy: 82.0%
Validation accuracy: 85.4%
Minibatch loss at step 3000: 0.9776606559753418
Minibatch accuracy: 82.8%
Validation accuracy: 85.5%
Minibatch loss at step 3500: 1.0429762601852417
Minibatch accuracy: 78.1%
Validation accuracy: 85.0%
Minibatch loss at step 4000: 1.0002951622009277
Minibatch accuracy: 81.2%
Validation accuracy: 85.4%
Minibatch loss at step 4500: 0.7759418487548828
Minibatch accuracy: 82.0%
Validation a