In [1]:
import tensorflow as tf
# Remember the current TensorFlow log verbosity in old_v so that a later cell
# can put it back once the dataset has been loaded.
old_v = tf.logging.get_verbosity()
# Lower the verbosity to ERROR so less severe log messages (e.g. warnings)
# are not printed in the meantime.
tf.logging.set_verbosity(tf.logging.ERROR)

  from ._conv import register_converters as _register_converters


<h2>Extract MNIST data</h2>
<p style="font-size:20px">You can change the <code>one_hot</code> encoding option when loading the data.</p>

In [2]:
from tensorflow.examples.tutorials.mnist import input_data
# Load the MNIST dataset from the MNIST_data/ directory with one-hot encoded
# labels; set one_hot=False to change the label encoding.
mnist = input_data.read_data_sets("MNIST_data/",one_hot=True)
# Loading is done: restore the log verbosity that was saved in old_v before it
# was lowered to ERROR.
tf.logging.set_verbosity(old_v)

Successfully downloaded train-images-idx3-ubyte.gz 9912422 bytes.
Extracting MNIST_data/train-images-idx3-ubyte.gz
Successfully downloaded train-labels-idx1-ubyte.gz 28881 bytes.
Extracting MNIST_data/train-labels-idx1-ubyte.gz
Successfully downloaded t10k-images-idx3-ubyte.gz 1648877 bytes.
Extracting MNIST_data/t10k-images-idx3-ubyte.gz
Successfully downloaded t10k-labels-idx1-ubyte.gz 4542 bytes.
Extracting MNIST_data/t10k-labels-idx1-ubyte.gz


<h2>Define hyperparameters</h2>

In [38]:
# Sizes of the three dataset splits, read from the loaded MNIST object.
num_train = mnist.train.num_examples            # 55,000
num_validation = mnist.validation.num_examples  # 5,000
num_test = mnist.test.num_examples              # 10,000

# Optimisation settings.
lr = 0.005        # learning rate handed to the optimizer
num_steps = 500   # number of training steps
batch_size = 256  # number of examples per training batch

# Network layout: input width, the two hidden-layer widths, and output width.
num_input, num_classes = 784, 10
n_hidden_1, n_hidden_2 = 200, 100

<h2>Define placeholder and Variables</h2>

In [39]:
# Start from an empty default graph so that re-running this cell does not keep
# adding placeholders and variables to the previous graph.
tf.reset_default_graph()

# Fix the graph-level random seed so the randomly initialised weights below are
# the same on every run. This has to happen after the reset, because the seed
# is stored on the (new) default graph.
# NOTE(review): this does not control the order in which training batches are
# drawn from the dataset -- presumably that shuffling is seeded separately.
RANDOM_SEED = 42
tf.set_random_seed(RANDOM_SEED)

# Graph inputs: X receives a batch of input rows (num_input features each) and
# Y the matching label rows (num_classes entries each). The leading None lets
# the batch size vary between runs.
X = tf.placeholder(tf.float32,[None,num_input],name='X')
Y = tf.placeholder(tf.int32,[None,num_classes],name='Y')

# Weight initializer shared by all layers. Alternative kept for the
# initialization comparison in the results table:
#initializer = tf.initializers.random_normal()
initializer = tf.contrib.layers.variance_scaling_initializer()

# Weight matrices, one per layer: input -> hidden 1 -> hidden 2 -> output.
weights = {
    'W1': tf.Variable(initializer([num_input, n_hidden_1]),name='W1'),
    'W2': tf.Variable(initializer([n_hidden_1, n_hidden_2]),name='W2'),
    'Wout': tf.Variable(initializer([n_hidden_2, num_classes]),name='Wout')
}

# Bias vectors, one per layer, all starting at zero.
biases = {
    'b1': tf.Variable(tf.zeros(shape=[n_hidden_1]),name='b1'),
    'b2': tf.Variable(tf.zeros(shape=[n_hidden_2]),name='b2'),
    'bout': tf.Variable(tf.zeros(shape=[num_classes]),name='bout')
}

<h2>Define neural network</h2>

In [40]:
#define a neural net model
def neural_net(x):
    """Build the forward pass of the two-hidden-layer network.

    Each hidden layer computes relu(input @ W + b) using the module-level
    ``weights`` and ``biases`` dictionaries; the output layer applies only the
    affine transform, so the result is raw class scores (logits).

    Args:
        x: input tensor that can be matrix-multiplied with ``weights['W1']``.

    Returns:
        The output-layer tensor ``hidden_2 @ Wout + bout`` (no activation).
    """
    hidden = x
    # Hidden layers share the same affine-then-ReLU pattern.
    for w_key, b_key in (('W1', 'b1'), ('W2', 'b2')):
        pre_activation = tf.add(tf.matmul(hidden, weights[w_key]), biases[b_key])
        hidden = tf.nn.relu(pre_activation)
    # Output layer: affine only, no non-linearity.
    return tf.add(tf.matmul(hidden, weights['Wout']), biases['bout'])

<h2>Define cost function and accuracy</h2>

In [41]:
#predicted labels
# Raw class scores (logits) produced by the network for each input row.
logits = neural_net(X)

# Loss: softmax cross-entropy for each example, averaged over the batch.
per_example_loss = tf.nn.softmax_cross_entropy_with_logits_v2(labels=Y, logits=logits)
loss = tf.reduce_mean(per_example_loss, name='loss')

# Adam optimizer; running train_op applies one update that minimizes the loss.
optimizer = tf.train.AdamOptimizer(learning_rate=lr)
train_op = optimizer.minimize(loss)

# A prediction counts as correct when the highest-scoring class matches the
# position of the largest entry in the label row.
predicted_class = tf.argmax(logits, axis=1)
true_class = tf.argmax(Y, axis=1)
correct_pred = tf.equal(predicted_class, true_class)

# Accuracy is the mean of the correctness flags after casting them to floats.
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32), name='accuracy')

# Op that initializes all variables; built after the optimizer so that any
# variables the optimizer added to the graph are included as well.
init = tf.global_variables_initializer()

<h2>Execute training</h2>

In [42]:
with tf.Session() as sess:
    # Give every variable its initial value before the first training step.
    sess.run(init)

    for step in range(num_steps):
        # Draw the next training batch; the same feed dict serves both the
        # update and the progress report below.
        batch_x, batch_y = mnist.train.next_batch(batch_size)
        batch_feed = {X:batch_x, Y:batch_y}
        # One optimization step on this batch.
        sess.run(train_op, feed_dict=batch_feed)
        # Every 100th step, report accuracy on the batch that was just used
        # for the update (a training-batch figure, not a held-out one).
        if not step % 100:
            batch_acc = sess.run(accuracy, feed_dict=batch_feed)
            print("step "+str(step)+", Accuracy= {:.3f}".format(batch_acc))

    print("Training finished!")

    # Final evaluation on the complete test split.
    test_acc = sess.run(accuracy, feed_dict={X:mnist.test.images, Y:mnist.test.labels})
    print("Testing ACcuracy:", test_acc)

step 0, Accuracy= 0.293
step 100, Accuracy= 0.957
step 200, Accuracy= 0.961
step 300, Accuracy= 0.973
step 400, Accuracy= 1.000
Training finished!
Testing ACcuracy: 0.9683


<h2>Your results</h2>

<table>
  <tr>
      <th> Activation function </th>
      <th> Initialization method </th>
      <th> Optimizer </th>
      <th> Learning Rate</th>
      <th> Total number of neurons in the 2 hidden layers</th>
      <th> Testing accuracy</th>
  </tr>
  <tr>
      <td> tanh</td>
      <td> random normal</td>
      <td> Adam</td>
      <td> 0.001</td>
      <td> 300</td>
      <td> 0.875</td>
  </tr>
    <tr>
      <td> relu</td>
      <td> random normal</td>
      <td> Adam</td>
      <td> 0.001</td>
      <td> 300</td>
      <td> 0.893</td>
  </tr>
   <tr>
      <td> relu</td>
      <td> He initialization</td>
      <td> Adam</td>
      <td> 0.001</td>
      <td> 300</td>
      <td> 0.961</td>
  </tr>
     <tr>
      <td> relu</td>
      <td> He initialization</td>
      <td> Adam</td>
      <td> 0.001</td>
      <td> 500</td>
      <td> 0.963</td>
  </tr>
   <tr>
      <td> relu</td>
      <td> He initialization</td>
      <td> Adam</td>
      <td> 0.005</td>
      <td> 500</td>
      <td> 0.968</td>
  </tr>
  <tr>
      <td> relu</td>
      <td> He initialization</td>
      <td> Adam</td>
      <td> 0.01</td>
      <td> 300</td>
      <td> 0.926</td>
  </tr>
  <tr>
      <td> relu</td>
      <td> He initialization</td>
      <td> SGD</td>
      <td> 0.005</td>
      <td> 300</td>
      <td> 0.916</td>
  </tr>
</table>

<h3>Conclusion</h3>
<p>After tuning the hyperparameters, I conclude that choosing ReLU as the activation function makes the performance much better.</p>
<p>With ReLU, He initialization gives the best results.</p>
<p>For the learning rate, values between 0.001 and 0.01 are generally suitable for reaching a good local minimum.</p>
<p>Using the Adam optimizer is also essential for better optimization.</p>
<p>For the number of neurons, a few hundred is a good choice.</p>