
# Deep Learning
## Assignment 4

Previously in 2_fullyconnected.ipynb and 3_regularization.ipynb, we trained fully connected networks to classify notMNIST characters.

The goal of this assignment is to make the neural network convolutional.


In [1]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import numpy as np
import tensorflow as tf
from six.moves import cPickle as pickle
from six.moves import range
import matplotlib.pyplot as plt

In [2]:
# Pickled notMNIST splits, addressed by a path relative to the working directory.
pickle_file = 'notMNIST.pickle'

# The handle is only needed for the read itself, so the `with` body is kept minimal.
with open(pickle_file, 'rb') as f:
  save = pickle.load(f)

# Unpack each split (images + labels) into notebook-level names.
train_dataset, train_labels = save['train_dataset'], save['train_labels']
valid_dataset, valid_labels = save['valid_dataset'], save['valid_labels']
test_dataset, test_labels = save['test_dataset'], save['test_labels']
del save  # drop the dict reference so the gc can reclaim it

print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)

Training set (200000, 28, 28) (200000,)
Validation set (10000, 28, 28) (10000,)
Test set (10000, 28, 28) (10000,)



Reformat into a TensorFlow-friendly shape:

   - convolutions need the image data formatted as a cube (width by height by #channels)
   - labels as float 1-hot encodings.

In [3]:
# Dataset geometry used by reformat() and by every model below.
# (numpy is already imported as `np` in the first cell, so it is not re-imported here.)
image_size = 28    # images are image_size x image_size pixels
num_labels = 10    # width of the one-hot label rows built by reformat()
num_channels = 1   # grayscale

def reformat(dataset, labels):
  """Reshape images to 4-D float32 and turn class labels into float32 1-hot rows.

  dataset is reshaped to (-1, image_size, image_size, num_channels);
  labels (a 1-D array of class ids) becomes an (N, num_labels) array in which
  row i has a 1.0 in column labels[i] and 0.0 elsewhere.
  Returns the pair (dataset, labels).
  """
  cube_shape = (-1, image_size, image_size, num_channels)
  images = dataset.reshape(cube_shape).astype(np.float32)
  # Broadcasting a column of labels against the row of class ids gives the 1-hot mask.
  class_ids = np.arange(num_labels)
  one_hot = (class_ids == labels[:, None]).astype(np.float32)
  return images, one_hot
# Convert all three splits in place (same names, new shapes).
# NOTE: the inputs are overwritten, so this cell is meant to run once per kernel session.
train_dataset, train_labels = reformat(train_dataset, train_labels)
valid_dataset, valid_labels = reformat(valid_dataset, valid_labels)
test_dataset, test_labels = reformat(test_dataset, test_labels)

# Report the converted shapes for each split.
for split_name, split_data, split_labels in (
    ('Training set', train_dataset, train_labels),
    ('Validation set', valid_dataset, valid_labels),
    ('Test set', test_dataset, test_labels)):
  print(split_name, split_data.shape, split_labels.shape)

Training set (200000, 28, 28, 1) (200000, 10)
Validation set (10000, 28, 28, 1) (10000, 10)
Test set (10000, 28, 28, 1) (10000, 10)


In [4]:
def accuracy(predictions, labels):
  """Return the percentage (0-100) of rows whose arg-max column matches.

  predictions and labels are 2-D arrays with one row per example; a row counts
  as correct when the index of its largest prediction equals the index of its
  largest label entry.
  """
  predicted_classes = np.argmax(predictions, axis=1)
  true_classes = np.argmax(labels, axis=1)
  num_correct = np.sum(predicted_classes == true_classes)
  return 100.0 * num_correct / predictions.shape[0]

Let's build a small network with two convolutional layers, followed by one fully connected layer. Convolutional networks are more expensive computationally, so we'll limit its depth and number of fully connected nodes.

In [5]:
batch_size = 16
patch_size = 5
depth = 16
num_hidden = 64

graph = tf.Graph()

with graph.as_default():

  # Input data: training batches are fed per step through placeholders; the
  # validation and test sets are embedded in the graph as constants.
  tf_train_dataset = tf.placeholder(
    tf.float32, shape=(batch_size, image_size, image_size, num_channels))
  tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
  tf_valid_dataset = tf.constant(valid_dataset)
  tf_test_dataset = tf.constant(test_dataset)

  # Each stride-2 'SAME' convolution maps a side of n pixels to ceil(n / 2), so
  # after the two convolutions in model() the feature map side is
  # ceil(image_size / 4).  (The previous `image_size // 4 * image_size // 4`
  # only produced the right value because of how `//` and `*` associate, and
  # only for sizes divisible by 4.)
  conv_out_size = (image_size + 3) // 4

  # Variables.
  layer1_weights = tf.Variable(tf.truncated_normal(
      [patch_size, patch_size, num_channels, depth], stddev=0.1))
  layer1_biases = tf.Variable(tf.zeros([depth]))
  layer2_weights = tf.Variable(tf.truncated_normal(
      [patch_size, patch_size, depth, depth], stddev=0.1))
  layer2_biases = tf.Variable(tf.constant(1.0, shape=[depth]))
  layer3_weights = tf.Variable(tf.truncated_normal(
      [conv_out_size * conv_out_size * depth, num_hidden], stddev=0.1))
  layer3_biases = tf.Variable(tf.constant(1.0, shape=[num_hidden]))
  layer4_weights = tf.Variable(tf.truncated_normal(
      [num_hidden, num_labels], stddev=0.1))
  layer4_biases = tf.Variable(tf.constant(1.0, shape=[num_labels]))

  # Model.
  def model(data):
    """Return the logits for `data`, a 4-D image batch with a static batch size.

    Pipeline: conv(stride 2) -> ReLU -> conv(stride 2) -> ReLU -> flatten ->
    fully connected + ReLU -> fully connected (logits, no softmax applied).
    """
    conv = tf.nn.conv2d(data, layer1_weights, [1, 2, 2, 1], padding='SAME')
    hidden = tf.nn.relu(conv + layer1_biases)
    conv = tf.nn.conv2d(hidden, layer2_weights, [1, 2, 2, 1], padding='SAME')
    hidden = tf.nn.relu(conv + layer2_biases)
    # The reshape uses the static shape, so the batch dimension must be known
    # at graph-construction time (true for the placeholder and the constants).
    shape = hidden.get_shape().as_list()
    reshape = tf.reshape(hidden, [shape[0], shape[1] * shape[2] * shape[3]])
    hidden = tf.nn.relu(tf.matmul(reshape, layer3_weights) + layer3_biases)
    return tf.matmul(hidden, layer4_weights) + layer4_biases

  # Training computation.
  logits = model(tf_train_dataset)
  # The _v2 op replaces the deprecated softmax_cross_entropy_with_logits; the
  # labels come from a placeholder, so no gradient flows into them either way.
  loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits_v2(labels=tf_train_labels, logits=logits))

  # Optimizer.
  optimizer = tf.train.GradientDescentOptimizer(0.05).minimize(loss)

  # Predictions for the training, validation, and test data.
  train_prediction = tf.nn.softmax(logits)
  valid_prediction = tf.nn.softmax(model(tf_valid_dataset))
  test_prediction = tf.nn.softmax(model(tf_test_dataset))

Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See tf.nn.softmax_cross_entropy_with_logits_v2.



In [6]:
num_steps = 1001

with tf.Session(graph=graph) as session:
  tf.global_variables_initializer().run()
  print('Initialized')
  # Offsets wrap modulo this value, so every slice below holds a full batch.
  wrap_at = train_labels.shape[0] - batch_size
  for step in range(num_steps):
    start = (step * batch_size) % wrap_at
    stop = start + batch_size
    batch_data = train_dataset[start:stop]
    batch_labels = train_labels[start:stop]
    # One optimizer step; also fetch the loss and predictions for reporting.
    _, batch_loss, predictions = session.run(
      [optimizer, loss, train_prediction],
      feed_dict={tf_train_dataset: batch_data, tf_train_labels: batch_labels})
    if step % 500 != 0:
      continue
    print('Minibatch loss at step %d: %f' % (step, batch_loss))
    print('Minibatch accuracy: %.1f%%' % accuracy(predictions, batch_labels))
    print('Validation accuracy: %.1f%%' % accuracy(
      session.run(valid_prediction), valid_labels))
  print('Test accuracy: %.1f%%' % accuracy(session.run(test_prediction), test_labels))

Initialized
Minibatch loss at step 0: 3.195922
Minibatch accuracy: 18.8%
Validation accuracy: 10.4%
Minibatch loss at step 500: 0.860197
Minibatch accuracy: 62.5%
Validation accuracy: 79.8%
Minibatch loss at step 1000: 0.128282
Minibatch accuracy: 100.0%
Validation accuracy: 83.3%
Test accuracy: 90.4%


## Discussion:
This is a simple NN with two layers of Conv, followed by two FC layers.
The architecture is: 

Input(N,28, 28,1)->conv(5,5,16, s=2)->ReLu->conv(5,5,16, s=2)->ReLu->FC(7*7*16)->FC->Out

- The code is a bit unclear and there is no separation between the Conv and fully connected layers.
- With batch_size = 16 and num_steps = 1001, we only see 1001 × 16 = 16,016 training examples, i.e. about 8% of the training dataset of length 200000.
- Steps required to traverse the whole dataset (total number of batches) = 200000 / 16 = 12500.

Next we will rewrite the above code in an easy-to-read format and use it in the next subsection:

In [7]:
# Hyperparameters for the refactored two-conv-layer network.
image_size = 28
batch_size = 16
filter_size = 5
in_image_channnels = 1  # grayscale image
depth = 16              # no of channels in conv layer
num_hidden = 64
learning_rate = 0.05

# tensorflow conv2d requires 4D input and filter tensors:
#   tf.nn.conv2d(input, filter, strides, padding, ...)
#   input  tensor shape = [batch, in_height, in_width, in_channels]
#   filter tensor shape = [filter_height, filter_width, in_channels, out_channels]

#define a function that creates convolutional layer
def create_conv_layer(in_data, num_in_channels, num_out_channels, filter_shape,conv_stride, name):
    """Build one conv -> bias -> ReLU stage and return its output tensor.

    in_data          = 4D input tensor [batch, in_height, in_width, in_channels]
    num_in_channels  = channels of in_data, e.g. 1 (grayscale) or 3 (RGB)
    num_out_channels = number of filters, i.e. the depth of this layer's output
    filter_shape     = first two dims of the kernel, e.g. [3,3] or [5,5]
    conv_stride      = the two spatial strides of the convolution, e.g. [2,2]
    name             = prefix for the variable names (name + '_W', name + '_b')
    """
    # 4D kernel shape expected by tf.nn.conv2d.
    kernel_shape = [filter_shape[0], filter_shape[1], num_in_channels, num_out_channels]
    kernel = tf.Variable(tf.truncated_normal(kernel_shape, stddev=0.1), name=name + '_W')
    offset = tf.Variable(tf.zeros(num_out_channels), name=name + "_b")

    # Batch and channel strides stay at 1; only the two spatial strides vary.
    conv_strides = [1, conv_stride[0], conv_stride[1], 1]
    convolved = tf.nn.conv2d(in_data, kernel, strides=conv_strides, padding='SAME')

    # No pooling stage in this version: the layer ends at the ReLU.
    return tf.nn.relu(convolved + offset)

# The dataset (train_dataset, train_labels, ...) is already formatted.
# Input placeholders: a leading dimension of None lets the same placeholders
# take a training minibatch or a whole evaluation set.
tf_train_dataset = tf.placeholder(tf.float32, shape=(None, image_size, image_size, in_image_channnels))
tf_train_labels = tf.placeholder(tf.float32, shape=(None, num_labels))
# NOTE(review): these two constants are not used by the ops built in this cell;
# evaluation goes through the placeholders above.
tf_valid_dataset = tf.constant(valid_dataset)
tf_test_dataset = tf.constant(test_dataset)

# MODEL definition
# Two conv layers, both with `depth` output channels and stride 2.
layer1 = create_conv_layer(tf_train_dataset, in_image_channnels, depth,
                           filter_shape=[filter_size, filter_size], conv_stride=[2, 2], name='layer1')
layer2 = create_conv_layer(layer1, depth, depth, [filter_size, filter_size], [2, 2], name='layer2')

# Flatten for the fully connected layers.  The flattened width is read from the
# static shape of layer2 ([None, 7, 7, depth] for 28 x 28 inputs: two stride-2
# layers take 28 -> 14 -> 7) instead of being hard-coded as 7 * 7 * 16.
feature_shape = layer2.get_shape().as_list()
flat_dim = feature_shape[1] * feature_shape[2] * feature_shape[3]
flattened = tf.reshape(layer2, [-1, flat_dim])

# weights and bias for the first dense layer
w1 = tf.Variable(tf.truncated_normal([flat_dim, num_hidden], stddev = 0.1), name ='w_dense1')
b1 = tf.Variable(tf.zeros([num_hidden]), name ='b_dense1')
# fully connected layer1
dense_layer1 = tf.nn.relu(tf.matmul(flattened, w1) + b1)

# weights and bias for dense layer2
w2 = tf.Variable(tf.truncated_normal([num_hidden, num_labels], stddev = 0.1), name ='w_dense2')
b2 = tf.Variable(tf.zeros([num_labels]), name = 'b_dense2')
# output layer: raw logits, no activation
dense_layer2 = tf.matmul(dense_layer1, w2) + b2

# get the logits from the model
logits = dense_layer2
# The _v2 op replaces the deprecated softmax_cross_entropy_with_logits; labels
# are fed through a placeholder, so no gradient flows into them either way.
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels=tf_train_labels, logits=logits))

# Optimizer.
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)

prediction = tf.nn.softmax(logits)
# accuracy op: fraction (0..1) of fed examples whose arg-max prediction matches the label
correct_prediction = tf.equal(tf.argmax(prediction, 1), tf.argmax(tf_train_labels, 1))
accuracy_op = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

# Accuracy on the training, validation, or test data is obtained by running
# accuracy_op with the corresponding arrays fed to the two placeholders.

In [8]:
# Training: `epochs` full passes over the training set, then a test-set score.
epochs = 1

with tf.Session() as session:
  tf.global_variables_initializer().run()
  print('Initialized')
  num_train = len(train_labels)
  total_batches = num_train // batch_size  # a trailing partial batch would be skipped
  print(" Total training dataset length: ", num_train)
  print(" total no of batches: ", total_batches)
  for epoch in range(1, epochs + 1):
      # Walk the training set in consecutive, non-overlapping batches.
      for start in range(0, total_batches * batch_size, batch_size):
          stop = start + batch_size
          _, batch_loss = session.run(
              [optimizer, loss],
              feed_dict={tf_train_dataset: train_dataset[start:stop],
                         tf_train_labels: train_labels[start:stop]})
      # Score the whole test set in one run; the reported cost is the loss of
      # the last minibatch of the epoch.
      test_acc = session.run(
          accuracy_op,
          feed_dict={tf_train_dataset: test_dataset, tf_train_labels: test_labels})
      print("Epoch:", epoch, "cost =", "{:.3f}".format(batch_loss), " test accuracy: {:.3f}%".format(test_acc*100))
  print("\nTraining complete!")

Initialized
 Total training dataset length:  200000
 total no of batches:  12500
Epoch: 1 cost = 0.748  test accuracy: 94.830%

Training complete!


WOW! With just 1 epoch over the complete train_dataset, we achieved a test accuracy of 94.8% (the previous run reached about 90%).

Most times, people try to add layers to the model rather than getting a picture of what's happening. Here, even with the same model, we achieved better accuracy by making 100% use of the data.

Let's run through the training dataset three times (epochs = 3) and examine the result.

In [9]:
# Training: same loop as before, now with three passes over the training set.
epochs = 3

with tf.Session() as session:
  tf.global_variables_initializer().run()
  print('Initialized')
  num_train = len(train_labels)
  total_batches = num_train // batch_size  # a trailing partial batch would be skipped
  print(" Total training dataset length: ", num_train)
  print(" total no of batches: ", total_batches)
  for epoch in range(1, epochs + 1):
      # Walk the training set in consecutive, non-overlapping batches.
      for start in range(0, total_batches * batch_size, batch_size):
          stop = start + batch_size
          _, batch_loss = session.run(
              [optimizer, loss],
              feed_dict={tf_train_dataset: train_dataset[start:stop],
                         tf_train_labels: train_labels[start:stop]})
      # Score the whole test set in one run; the reported cost is the loss of
      # the last minibatch of the epoch.
      test_acc = session.run(
          accuracy_op,
          feed_dict={tf_train_dataset: test_dataset, tf_train_labels: test_labels})
      print("Epoch:", epoch, "cost =", "{:.3f}".format(batch_loss), " test accuracy: {:.3f}%".format(test_acc*100))
  print("\nTraining complete!")

Initialized
 Total training dataset length:  200000
 total no of batches:  12500
Epoch: 1 cost = 0.690  test accuracy: 94.640%
Epoch: 2 cost = 0.695  test accuracy: 95.120%
Epoch: 3 cost = 0.669  test accuracy: 95.470%

Training complete!


### Problem 1

The convolutional model above uses convolutions with stride 2 to reduce the dimensionality. Replace the strides by a max pooling operation (nn.max_pool()) of stride 2 and kernel size 2.

- The bar is much higher now, with the 95.47% accuracy obtained from the last model. In this problem we will reuse the model/functions from the last section, change the convolution stride to 1, and introduce a max-pool layer after each convolution.

In [5]:
# Hyperparameters for the max-pool variant of the network.
image_size = 28
batch_size = 16
filter_size = 5
in_image_channnels = 1  # grayscale image
depth1 = 16             # no of channels in conv1 layer
depth2 = 16             # no of channels in conv2 layer
num_hidden = 64
learning_rate = 0.05

# tensorflow conv2d requires 4D input and filter tensors:
#   tf.nn.conv2d(input, filter, strides, padding, ...)
#   input  tensor shape = [batch, in_height, in_width, in_channels]
#   filter tensor shape = [filter_height, filter_width, in_channels, out_channels]

#define a function that creates convolutional layer
def create_conv_layer(in_data, num_in_channels, num_out_channels, filter_shape,conv_stride, name):
    """Build one conv -> bias -> ReLU -> 2x2 max-pool stage and return its output.

    in_data          = 4D input tensor [batch, in_height, in_width, in_channels]
    num_in_channels  = channels of in_data, e.g. 1 (grayscale) or 3 (RGB)
    num_out_channels = number of filters, i.e. the depth of this layer's output
    filter_shape     = first two dims of the kernel, e.g. [3,3] or [5,5]
    conv_stride      = the two spatial strides of the convolution, e.g. [1,1]
    name             = prefix for the variable names (name + '_W', name + '_b')
    """
    # 4D kernel shape expected by tf.nn.conv2d.
    kernel_shape = [filter_shape[0], filter_shape[1], num_in_channels, num_out_channels]
    kernel = tf.Variable(tf.truncated_normal(kernel_shape, stddev=0.1), name=name + '_W')
    offset = tf.Variable(tf.zeros(num_out_channels), name=name + "_b")

    # Batch and channel strides stay at 1; only the two spatial strides vary.
    conv_strides = [1, conv_stride[0], conv_stride[1], 1]
    convolved = tf.nn.conv2d(in_data, kernel, strides=conv_strides, padding='SAME')
    activated = tf.nn.relu(convolved + offset)

    # Max pooling with a 2x2 window moved 2 pixels at a time; the window and
    # stride lists are 4D like the tensors they apply to.
    pool_window = [1, 2, 2, 1]
    pool_strides = [1, 2, 2, 1]
    return tf.nn.max_pool(activated, pool_window, strides=pool_strides, padding='SAME')

# train_dataset / train_labels were already reformatted earlier in the notebook.
# Placeholders use None for the leading dimension so the same graph accepts
# training minibatches as well as larger evaluation sets.
tf_train_dataset = tf.placeholder(
    tf.float32, shape=(None, image_size, image_size, in_image_channnels))
tf_train_labels = tf.placeholder(tf.float32, shape=(None, num_labels))
tf_valid_dataset = tf.constant(valid_dataset)
tf_test_dataset = tf.constant(test_dataset)

# MODEL definition: two conv layers with stride 1 (the layer helper adds the
# max-pool stage).
layer1 = create_conv_layer(tf_train_dataset, in_image_channnels, depth1,
                           filter_shape=[5, 5], conv_stride=[1, 1], name='layer1')
layer2 = create_conv_layer(layer1, depth1, depth2, [5, 5], [1, 1], name='layer2')

# Two stride-2 max-pool stages take 28 x 28 -> 14 x 14 -> 7 x 7, with depth2
# channels, so each example flattens to 7 * 7 * depth2 values.
flat_dim = 7 * 7 * depth2
flattened = tf.reshape(layer2, [-1, flat_dim])

# first fully connected layer (with ReLU)
w1 = tf.Variable(tf.truncated_normal([flat_dim, num_hidden], stddev=0.1), name='w_dense1')
b1 = tf.Variable(tf.zeros([num_hidden]), name='b_dense1')
dense_layer1 = tf.nn.relu(tf.matmul(flattened, w1) + b1)

# second fully connected layer: produces the raw logits
w2 = tf.Variable(tf.truncated_normal([num_hidden, num_labels], stddev=0.1), name='w_dense2')
b2 = tf.Variable(tf.zeros([num_labels]), name='b_dense2')
logits = tf.matmul(dense_layer1, w2) + b2

# mean cross-entropy between the fed labels and the logits
loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits_v2(labels=tf_train_labels, logits=logits))

# Optimizer.
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)

prediction = tf.nn.softmax(logits)
# accuracy op: fraction (0..1) of fed examples whose arg-max prediction matches the label
correct_prediction = tf.equal(tf.argmax(prediction, 1), tf.argmax(tf_train_labels, 1))
accuracy_op = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

# Accuracy on the training, validation, or test data is obtained by running
# accuracy_op with the corresponding arrays fed to the two placeholders.

In [None]:
# Training: one pass over the training set, then a test-set score.
epochs = 1
# Number of test examples scored per session.run call.  Feeding the whole test
# set in a single run is what made this cell exhaust memory, so the evaluation
# is done in bounded chunks instead.
eval_chunk_size = 1000

with tf.Session() as session:
  tf.global_variables_initializer().run()
  print('Initialized')
  total_batches = len(train_labels) // batch_size  # a trailing partial batch is skipped
  print(" Total training dataset length: ", len(train_labels))
  print(" total no of batches: ", total_batches)
  for epoch in range(epochs):
      for i in range(total_batches):
          offset = i * batch_size
          batch_data = train_dataset[offset:(offset + batch_size), :, :, :]
          batch_labels = train_labels[offset:(offset + batch_size), :]
          feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
          _, l = session.run([optimizer, loss], feed_dict=feed_dict)

      # Chunked test accuracy: accuracy_op returns the fraction correct within
      # the fed chunk, so each chunk is weighted by its size to get the overall
      # fraction (this stays exact when the last chunk is shorter).
      num_test = len(test_labels)
      num_correct = 0.0
      for start in range(0, num_test, eval_chunk_size):
          stop = min(start + eval_chunk_size, num_test)
          chunk_acc = session.run(
              accuracy_op,
              feed_dict={tf_train_dataset: test_dataset[start:stop],
                         tf_train_labels: test_labels[start:stop]})
          num_correct += chunk_acc * (stop - start)
      test_acc = num_correct / num_test
      # The reported cost is the loss of the last minibatch of the epoch.
      print("Epoch:", (epoch + 1), "cost =", "{:.3f}".format(l), " test accuracy: {:.2f}%".format(test_acc*100))
  print("\nTraining complete!")

Initialized
 Total training dataset length:  200000
 total no of batches:  12500


# IMP: Error while training
- "An error ocurred while starting the kernel  tensorflow/core/platform/cpu_feature_guard.cc:137]"
 If you received the message above, chances are that Jupyter or your Python IDE ran out of memory.
 
 Solutions:
 - try running the Python script from the command line
 - try deleting unused variables
 - isolate the problem
   - in my case the error occurred while running the test-accuracy operation
   - Solution that worked for me: calculate the test accuracy for the first and second halves of the test set separately
 - [Last resort]: update TensorFlow, or reinstall TensorFlow with its dependencies

In [8]:
# Training with periodic minibatch accuracy and a two-part test evaluation.
epochs = 5
global_step = 0
with tf.Session() as session:
  tf.global_variables_initializer().run()
  print('Initialized')
  num_train = len(train_labels)
  total_batches = num_train // batch_size  # a trailing partial batch would be skipped
  print(" Total training dataset length: ", num_train)
  print(" total no of batches: ", total_batches)
  for epoch in range(epochs):
      for i in range(total_batches):
          start = i * batch_size
          stop = start + batch_size
          batch_feed = {tf_train_dataset: train_dataset[start:stop],
                        tf_train_labels: train_labels[start:stop]}
          _, l = session.run([optimizer, loss], feed_dict=batch_feed)
          global_step += 1
          # Every 2000 steps, score the batch that was just trained on.
          if global_step % 2000 == 0:
              batch_acc = session.run(accuracy_op, feed_dict=batch_feed)
              print("At Step = %d, Minibatch train_accuracy = %f " \
                    % (global_step, batch_acc))
          del batch_feed
      # Score the test set as two halves (split at the midpoint) and average
      # the two accuracies, rather than feeding all of it in a single run.
      midpoint = len(test_labels) // 2
      half_accs = [
          session.run(accuracy_op,
                      feed_dict={tf_train_dataset: test_dataset[part],
                                 tf_train_labels: test_labels[part]})
          for part in (slice(0, midpoint), slice(midpoint, None))]
      test_acc = (half_accs[0] + half_accs[1]) / 2.0
      # The reported cost is the loss of the last minibatch of the epoch.
      print("Epoch:", (epoch + 1), "cost =", "{:.3f}".format(l), " test accuracy: {:.2f}%".format(test_acc*100))
  print("\nTraining complete!")

Initialized
 Total training dataset length:  200000
 total no of batches:  12500
At Step = 2000, Minibatch train_accuracy = 0.937500 
At Step = 4000, Minibatch train_accuracy = 0.937500 
At Step = 6000, Minibatch train_accuracy = 1.000000 
At Step = 8000, Minibatch train_accuracy = 0.937500 
At Step = 10000, Minibatch train_accuracy = 0.937500 
At Step = 12000, Minibatch train_accuracy = 0.875000 
Epoch: 1 cost = 0.511  test accuracy: 95.05%
At Step = 14000, Minibatch train_accuracy = 0.937500 
At Step = 16000, Minibatch train_accuracy = 0.937500 
At Step = 18000, Minibatch train_accuracy = 0.937500 
At Step = 20000, Minibatch train_accuracy = 1.000000 
At Step = 22000, Minibatch train_accuracy = 0.687500 
At Step = 24000, Minibatch train_accuracy = 0.937500 
Epoch: 2 cost = 0.530  test accuracy: 95.76%
At Step = 26000, Minibatch train_accuracy = 1.000000 
At Step = 28000, Minibatch train_accuracy = 0.937500 
At Step = 30000, Minibatch train_accuracy = 0.937500 
At Step = 32000, Miniba

 - Well, that's an improvement from 95.47% to 96.13%.
 - We will later realize that improving accuracy above 95% is a lot tougher and comes in small increments.
 - Also, the training time is now much longer.
 

# Problem 2
Try to get the best performance you can using a convolutional net. Look for example at the classic LeNet5 architecture, adding Dropout, and/or adding learning rate decay.
- Here is the description of LeNet architecture

    * Layer C1 is a convolution layer with 6 feature maps and a 5×5 kernel for each feature map.
    * Layer S2 is a subsampling layer with 6 feature maps and a 2×2 kernel for each feature map.
    * Layer C3 is a convolution layer with 16 feature maps and a 5×5 kernel for each feature map.
    * Layer S4 is a subsampling layer with 16 feature maps and a 2×2 kernel for each feature map.
    * Layer C5 is a convolution layer with 120 feature maps and a 5×5 kernel for each feature map.
    * Layer F6 is a fully connected layer with 84 units.
- The current network has more feature maps in its early layers (16 in both C1 and C2, versus 6 in LeNet's C1). Not sure if the LeNet architecture can improve the accuracy much. However, practising building the LeNet architecture will be helpful.
- Adding dropout, learning rate decay, or even using AdamOptimizer seems to be a good option.


In [14]:
#LeNet architecture
image_size = 28
batch_size = 16
filter_size = 5# we will use same filtersize for c1, c3 and c5
in_image_channnels = 1 #grayscale image
depth1 = 6 #no of channels in conv1 layer
depth2 = 16 #no of channels in conv2 layer
depth3 = 120
num_hidden = 84
learning_rate = 0.05

"""
tensorflow conv2d requires 4D input and filter tensor;
tf.nn.conv2d(
    input,
    filter,
    strides,
    padding,...)
input tensor of shape = [batch, in_height, in_width, in_channels]
filter / kernel tensor of shape = [filter_height, filter_width, in_channels, out_channels]
"""

#define a function that creates convolutional layer
def create_conv_layer(in_data, num_in_channels, num_out_channels, filter_shape,conv_stride, name, use_subsample= 1):
    """Build one conv -> bias -> ReLU stage, optionally followed by 2x2 max-pool.

    in_data          = 4D input tensor [batch, in_height, in_width, in_channels]
    num_in_channels  = channels of in_data, e.g. 1 (grayscale) or 3 (RGB)
    num_out_channels = number of filters, i.e. the depth of this layer's output
    filter_shape     = first two dims of the kernel, e.g. [3,3] or [5,5]
    conv_stride      = the two spatial strides of the convolution, e.g. [1,1]
    name             = prefix for the variable names (name + '_W', name + '_b')
    use_subsample    = pooling is applied only when this equals 1 (the default)
    """
    # 4D kernel shape expected by tf.nn.conv2d.
    kernel_shape = [filter_shape[0], filter_shape[1], num_in_channels, num_out_channels]
    kernel = tf.Variable(tf.truncated_normal(kernel_shape, stddev=0.1), name=name + '_W')
    offset = tf.Variable(tf.zeros(num_out_channels), name=name + "_b")

    # Batch and channel strides stay at 1; only the two spatial strides vary.
    conv_strides = [1, conv_stride[0], conv_stride[1], 1]
    convolved = tf.nn.conv2d(in_data, kernel, strides=conv_strides, padding='SAME')
    activated = tf.nn.relu(convolved + offset)

    # Without subsampling the layer ends at the ReLU.
    if use_subsample != 1:
        return activated

    # Max pooling with a 2x2 window moved 2 pixels at a time; the window and
    # stride lists are 4D like the tensors they apply to.
    pool_window = [1, 2, 2, 1]
    pool_strides = [1, 2, 2, 1]
    return tf.nn.max_pool(activated, pool_window, strides=pool_strides, padding='SAME')

# train_dataset / train_labels were already reformatted earlier in the notebook.
# Placeholders use None for the leading dimension so the same graph accepts
# training minibatches as well as larger evaluation sets.
tf_train_dataset = tf.placeholder(
    tf.float32, shape=(None, image_size, image_size, in_image_channnels))
tf_train_labels = tf.placeholder(tf.float32, shape=(None, num_labels))
tf_valid_dataset = tf.constant(valid_dataset)
tf_test_dataset = tf.constant(test_dataset)

# MODEL definition: three conv layers with stride 1; the third one is built
# with use_subsample=0, i.e. without the pooling stage.
layer1 = create_conv_layer(tf_train_dataset, in_image_channnels, depth1,
                           filter_shape=[5, 5], conv_stride=[1, 1], name='layer1')
layer2 = create_conv_layer(layer1, depth1, depth2, [5, 5], [1, 1], name='layer2')
layer3 = create_conv_layer(layer2, depth2, depth3, [5, 5], [1, 1], name='layer3', use_subsample=0)

# Two max-pool stages of stride 2 take (28,28) -> (14,14) -> (7,7); layer3
# keeps that size with depth3 channels, so each example flattens to
# 7 * 7 * depth3 values.
flat_dim = 7 * 7 * depth3
flattened = tf.reshape(layer3, [-1, flat_dim])

# first fully connected layer (with ReLU)
w1 = tf.Variable(tf.truncated_normal([flat_dim, num_hidden], stddev=0.1), name='w_dense1')
b1 = tf.Variable(tf.zeros([num_hidden]), name='b_dense1')
dense_layer1 = tf.nn.relu(tf.matmul(flattened, w1) + b1)

# second fully connected layer: produces the raw logits
w2 = tf.Variable(tf.truncated_normal([num_hidden, num_labels], stddev=0.1), name='w_dense2')
b2 = tf.Variable(tf.zeros([num_labels]), name='b_dense2')
logits = tf.matmul(dense_layer1, w2) + b2

# mean cross-entropy between the fed labels and the logits
loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits_v2(labels=tf_train_labels, logits=logits))

# Optimizer.
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)

prediction = tf.nn.softmax(logits)
# accuracy op: fraction (0..1) of fed examples whose arg-max prediction matches the label
correct_prediction = tf.equal(tf.argmax(prediction, 1), tf.argmax(tf_train_labels, 1))
accuracy_op = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

# Accuracy on the training, validation, or test data is obtained by running
# accuracy_op with the corresponding arrays fed to the two placeholders.

In [15]:
# Training with periodic minibatch accuracy and a two-part test evaluation.
epochs = 3
step_counter = 0
with tf.Session() as session:
  tf.global_variables_initializer().run()
  print('Initialized')
  num_train = len(train_labels)
  total_batches = num_train // batch_size  # a trailing partial batch would be skipped
  print(" Total training dataset length: ", num_train)
  print(" total no of batches: ", total_batches)
  for epoch in range(epochs):
      for i in range(total_batches):
          start = i * batch_size
          stop = start + batch_size
          batch_feed = {tf_train_dataset: train_dataset[start:stop],
                        tf_train_labels: train_labels[start:stop]}
          _, l = session.run([optimizer, loss], feed_dict=batch_feed)
          step_counter += 1
          # Every 2000 steps, score the batch that was just trained on.
          if step_counter % 2000 == 0:
              batch_acc = session.run(accuracy_op, feed_dict=batch_feed)
              print("At Step = %d, Minibatch train_accuracy = %f " \
                    % (step_counter, batch_acc))
          del batch_feed
      # Score the test set as two halves (split at the midpoint) and average
      # the two accuracies, rather than feeding all of it in a single run.
      midpoint = len(test_labels) // 2
      half_accs = [
          session.run(accuracy_op,
                      feed_dict={tf_train_dataset: test_dataset[part],
                                 tf_train_labels: test_labels[part]})
          for part in (slice(0, midpoint), slice(midpoint, None))]
      test_acc = (half_accs[0] + half_accs[1]) / 2.0
      # The reported cost is the loss of the last minibatch of the epoch.
      print("Epoch:", (epoch + 1), "cost =", "{:.3f}".format(l), " test accuracy: {:.2f}%".format(test_acc*100))
  print("\nTraining complete!")

Initialized
 Total training dataset length:  200000
 total no of batches:  12500
At Step = 2000, Minibatch train_accuracy = 0.937500 
At Step = 4000, Minibatch train_accuracy = 1.000000 
At Step = 6000, Minibatch train_accuracy = 1.000000 
At Step = 8000, Minibatch train_accuracy = 0.937500 
At Step = 10000, Minibatch train_accuracy = 1.000000 
At Step = 12000, Minibatch train_accuracy = 0.937500 
Epoch: 1 cost = 0.499  test accuracy: 95.44%
At Step = 14000, Minibatch train_accuracy = 0.937500 
At Step = 16000, Minibatch train_accuracy = 0.937500 
At Step = 18000, Minibatch train_accuracy = 1.000000 
At Step = 20000, Minibatch train_accuracy = 1.000000 
At Step = 22000, Minibatch train_accuracy = 0.812500 
At Step = 24000, Minibatch train_accuracy = 0.937500 
Epoch: 2 cost = 0.424  test accuracy: 95.83%
At Step = 26000, Minibatch train_accuracy = 0.937500 
At Step = 28000, Minibatch train_accuracy = 0.937500 
At Step = 30000, Minibatch train_accuracy = 0.937500 
At Step = 32000, Miniba

This is almost the same accuracy as in Problem 1 (after the end of epoch 3).

Next we will add learning rate decay and the Adam optimizer to examine their effect.


In [30]:
# LeNet-style architecture: hyperparameters for this experiment.
image_size = 28
batch_size = 16
filter_size = 5  # same filter size intended for the C1, C3 and C5 conv layers
in_image_channnels = 1  # grayscale image
depth1 = 6  # number of output channels of conv layer 1
depth2 = 16  # number of output channels of conv layer 2
depth3 = 120  # number of output channels of conv layer 3
num_hidden = 84  # width of the first fully connected layer
learning_rate = 0.05
# Step counter handed to the optimizer's minimize() call. It is bookkeeping,
# not a model parameter, so it is kept out of the trainable-variables
# collection.
global_step = tf.Variable(0, trainable=False)

# tf.nn.conv2d(input, filter, strides, padding, ...) requires 4-D tensors:
#   input  tensor shape = [batch, in_height, in_width, in_channels]
#   filter tensor shape = [filter_height, filter_width, in_channels, out_channels]

#define a function that creates convolutional layer
def create_conv_layer(in_data, num_in_channels, num_out_channels,
                      filter_shape, conv_stride, name, use_subsample=1):
    """Build one conv -> bias -> ReLU stage, optionally followed by 2x2 max pooling.

    Args:
        in_data: input handed to ``tf.nn.conv2d`` (expected to be 4-D:
            [batch, height, width, channels]).
        num_in_channels: channel count of the input, e.g. 1 (grayscale) or 3 (RGB).
        num_out_channels: number of filters, i.e. the depth of the output.
        filter_shape: indexable pair [height, width] of the kernel, e.g. [5, 5].
        conv_stride: indexable pair of convolution strides along the two
            spatial axes.
        name: string prefix; the variables are named ``name + '_W'`` and
            ``name + '_b'``.
        use_subsample: max pooling (2x2 window, stride 2, 'SAME' padding) is
            appended only when this compares equal to 1.

    Returns:
        The output of the ReLU, or of the max-pool when subsampling is enabled.
    """
    # Kernel variable: [filter_height, filter_width, in_channels, out_channels].
    kernel_dims = [filter_shape[0], filter_shape[1], num_in_channels, num_out_channels]
    kernel = tf.Variable(tf.truncated_normal(kernel_dims, stddev=0.1), name=name + '_W')
    offsets = tf.Variable(tf.zeros(num_out_channels), name=name + "_b")

    # Convolution ('SAME' padding), bias, then the non-linearity.
    convolved = tf.nn.conv2d(
        in_data,
        kernel,
        strides=[1, conv_stride[0], conv_stride[1], 1],
        padding='SAME',
    )
    activated = tf.nn.relu(convolved + offsets)

    if use_subsample == 1:
        # Pool window and pool strides are both [1, 2, 2, 1].
        pool_window = [1, 2, 2, 1]
        return tf.nn.max_pool(activated, pool_window, strides=[1, 2, 2, 1], padding='SAME')
    return activated

# The dataset (train_dataset, train_labels) is already reformatted.
# Input placeholders: the leading None dimension lets the same placeholders be
# fed batches of any size, not only batch_size.
tf_train_dataset = tf.placeholder(
    tf.float32, shape=(None, image_size, image_size, in_image_channnels))
tf_train_labels = tf.placeholder(tf.float32, shape=(None, num_labels))
# No op defined in this cell consumes these two constants; they are kept in
# case other cells reference the names.
tf_valid_dataset = tf.constant(valid_dataset)
tf_test_dataset = tf.constant(test_dataset)

# MODEL definition
# Three conv layers sharing the configured filter size; the first two are
# followed by 2x2 max pooling, the third is not.
lenet_filter_shape = [filter_size, filter_size]
layer1 = create_conv_layer(tf_train_dataset, in_image_channnels, depth1,
                           filter_shape=lenet_filter_shape, conv_stride=[1, 1],
                           name='layer1')
layer2 = create_conv_layer(layer1, depth1, depth2, lenet_filter_shape, [1, 1],
                           name='layer2')
layer3 = create_conv_layer(layer2, depth2, depth3, lenet_filter_shape, [1, 1],
                           name='layer3', use_subsample=0)

# Flatten the conv output before the two fully connected layers.
# The per-example size is read from layer3's static shape instead of being
# hard-coded. With the current settings it is 7 x 7 x depth3, because the two
# stride-2 max-pools take the image from (28,28) -> (14,14) -> (7,7).
_, flat_height, flat_width, flat_channels = layer3.get_shape().as_list()
flat_dim = flat_height * flat_width * flat_channels
flattened = tf.reshape(layer3, [-1, flat_dim])

# Weights and bias for fully connected layer 1.
w1 = tf.Variable(tf.truncated_normal([flat_dim, num_hidden], stddev=0.1), name='w_dense1')
b1 = tf.Variable(tf.zeros([num_hidden]), name='b_dense1')
dense_layer1 = tf.nn.relu(tf.matmul(flattened, w1) + b1)

# Weights and bias for fully connected layer 2, which produces the logits.
w2 = tf.Variable(tf.truncated_normal([num_hidden, num_labels], stddev=0.1), name='w_dense2')
b2 = tf.Variable(tf.zeros([num_labels]), name='b_dense2')
logits = tf.matmul(dense_layer1, w2) + b2

# Mean softmax cross-entropy over the fed batch.
loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits_v2(labels=tf_train_labels, logits=logits))

# Optimizer: plain gradient descent with a constant learning rate; global_step
# is passed to minimize() so it is incremented on every update.
# Alternatives left disabled in this cell:
#   learning_rate = tf.train.exponential_decay(0.5, global_step, decay_steps=1000,
#                                              decay_rate=0.65, staircase=True)
#   optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(
    loss, global_step=global_step)

prediction = tf.nn.softmax(logits)
# Accuracy op: fraction of fed examples whose arg-max prediction matches the
# arg-max of the one-hot label.
correct_prediction = tf.equal(tf.argmax(prediction, 1), tf.argmax(tf_train_labels, 1))
accuracy_op = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

# Accuracy on training, validation or test data is obtained by running
# accuracy_op with that data fed to the two placeholders via feed_dict.

In [31]:
# Training: run minibatch SGD for a fixed number of epochs and report the test
# accuracy after each epoch.
epochs = 5
step_counter = 0
with tf.Session() as session:
    tf.global_variables_initializer().run()
    print('Initialized')
    # Integer division: a trailing partial batch is dropped.
    total_batches = train_labels.shape[0] // batch_size
    print(" Total training dataset length: ", len(train_labels))
    print(" total no of batches: ", total_batches)

    # The test set is evaluated in two halves rather than in a single run.
    num_test = len(test_labels)
    test_len = num_test // 2
    test_chunks = [
        (test_dataset[:test_len], test_labels[:test_len]),
        (test_dataset[test_len:], test_labels[test_len:]),
    ]

    # Defined up front so the epoch summary still prints if there are no batches.
    batch_loss = float('nan')
    for epoch in range(epochs):
        for i in range(total_batches):
            offset = i * batch_size
            feed_dict = {
                tf_train_dataset: train_dataset[offset:(offset + batch_size)],
                tf_train_labels: train_labels[offset:(offset + batch_size)],
            }
            _, batch_loss = session.run([optimizer, loss], feed_dict=feed_dict)
            step_counter += 1
            if step_counter % 2000 == 0:
                # Accuracy on the minibatch just trained on, measured after
                # the parameter update.
                print("At Step = %d, Minibatch train_accuracy = %f "
                      % (step_counter, session.run(accuracy_op, feed_dict=feed_dict)))

        # Weight each chunk's accuracy by its sample count so the result is the
        # exact overall accuracy even when the two halves differ in size.
        weighted_correct = 0.0
        for chunk_data, chunk_labels in test_chunks:
            if len(chunk_labels) == 0:
                continue
            chunk_acc = session.run(
                accuracy_op,
                feed_dict={tf_train_dataset: chunk_data, tf_train_labels: chunk_labels})
            weighted_correct += chunk_acc * len(chunk_labels)
        test_acc = weighted_correct / num_test if num_test else float('nan')

        # "cost" is the loss of the last minibatch of the epoch, not an epoch mean.
        print("Epoch:", (epoch + 1), "cost =", "{:.3f}".format(batch_loss),
              " test accuracy: {:.2f}%".format(test_acc*100))
    print("\nTraining complete!")

Initialized
 Total training dataset length:  200000
 total no of batches:  12500
At Step = 2000, Minibatch train_accuracy = 0.937500 
At Step = 4000, Minibatch train_accuracy = 1.000000 
At Step = 6000, Minibatch train_accuracy = 1.000000 
At Step = 8000, Minibatch train_accuracy = 0.937500 
At Step = 10000, Minibatch train_accuracy = 0.937500 
At Step = 12000, Minibatch train_accuracy = 0.875000 
Epoch: 1 cost = 0.776  test accuracy: 95.35%
At Step = 14000, Minibatch train_accuracy = 0.937500 
At Step = 16000, Minibatch train_accuracy = 1.000000 
At Step = 18000, Minibatch train_accuracy = 0.937500 
At Step = 20000, Minibatch train_accuracy = 1.000000 
At Step = 22000, Minibatch train_accuracy = 0.812500 
At Step = 24000, Minibatch train_accuracy = 0.937500 
Epoch: 2 cost = 0.562  test accuracy: 95.78%
At Step = 26000, Minibatch train_accuracy = 1.000000 
At Step = 28000, Minibatch train_accuracy = 0.937500 
At Step = 30000, Minibatch train_accuracy = 1.000000 
At Step = 32000, Miniba