# deeper multilayer perception with xavier

In [1]:
# load package and mnist
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets('../MNIST_data/', one_hot=True)
%matplotlib inline  
print ("PACKAGES LOADED")

Instructions for updating:
Please use alternatives such as official/mnist/dataset.py from tensorflow/models.
Instructions for updating:
Please write your own downloading logic.
Instructions for updating:
Please use tf.data to implement this functionality.
Extracting ../MNIST_data/train-images-idx3-ubyte.gz
Instructions for updating:
Please use tf.data to implement this functionality.
Extracting ../MNIST_data/train-labels-idx1-ubyte.gz
Instructions for updating:
Please use tf.one_hot on tensors.
Extracting ../MNIST_data/t10k-images-idx3-ubyte.gz
Extracting ../MNIST_data/t10k-labels-idx1-ubyte.gz
Instructions for updating:
Please use alternatives such as official/mnist/dataset.py from tensorflow/models.
PACKAGES LOADED


## Xavier init

In [2]:
# Xavier init
"""Set the parameter initialization using the method described.
This method is designed to keep the scale of the gradients roughly the same
in all layers.
Xavier Glorot and Yoshua Bengio (2010):
       Understanding the difficulty of training deep feedforward neural
       networks. International conference on artificial intelligence and
       statistics.
Args:
n_inputs: The number of input nodes into each output.
n_outputs: The number of output nodes for each input.
uniform: If true use a uniform distribution, otherwise use a normal.
Returns:
An initializer.
"""
def xavier_init(n_inputs, n_outputs, uniform=True):
    if uniform:
        # 6 is used in the paper
        init_range = tf.sqrt(6.0 / (n_inputs + n_inputs))
        return tf.random_uniform_initializer(-init_range, init_range)
    else:
        # 3 gives us approximately the same limits as above since this repicks
        # values greater than 2 standard deviations from the mean.
        stddev = tf.sqrt(3.0 / (n_inputs + n_outputs))
        return tf.truncated_normal_initializer(stddev=stddev)

In [3]:
# parameters 
learning_rate = 0.001
training_epoch = 50
batch_size = 100
display_step = 5

# network parameters
n_input = 784  # 28 x 28
n_hidden_1 = 256 # 1st layer num features
n_hidden_2 = 256 # 2nd layer num features
n_hidden_3 = 256 # 3rd layer num features
n_hidden_4 = 256 # 4th layer num features
n_classes = 10  # classes(0 ~9 digits)

# tf graph
x = tf.placeholder("float", [None, n_input])
y = tf.placeholder("float", [None, n_classes])
dropout_keep_prob = tf.placeholder("float")

# weight and bias
# Store layers weight & bias
weights = {
    'h1': tf.get_variable("h1", shape=[n_input, n_hidden_1],    initializer=xavier_init(n_input,n_hidden_1)),
    'h2': tf.get_variable("h2", shape=[n_hidden_1, n_hidden_2], initializer=xavier_init(n_hidden_1,n_hidden_2)),
    'h3': tf.get_variable("h3", shape=[n_hidden_2, n_hidden_3], initializer=xavier_init(n_hidden_2,n_hidden_3)),
    'h4': tf.get_variable("h4", shape=[n_hidden_3, n_hidden_4], initializer=xavier_init(n_hidden_3,n_hidden_4)),
    'out': tf.get_variable("out", shape=[n_hidden_4, n_classes], initializer=xavier_init(n_hidden_4,n_classes))
}
biases = {
    'b1': tf.Variable(tf.random_normal([n_hidden_1])),
    'b2': tf.Variable(tf.random_normal([n_hidden_2])),
    'b3': tf.Variable(tf.random_normal([n_hidden_3])),
    'b4': tf.Variable(tf.random_normal([n_hidden_4])),
    'out': tf.Variable(tf.random_normal([n_classes]))
}

Instructions for updating:
Colocations handled automatically by placer.


## model

In [4]:
def multilayer_perception(_x, _weights, _biases, _keep_prob):
    layer_1 = tf.nn.dropout(tf.nn.relu(tf.add(tf.matmul(_x, _weights['h1']), _biases['b1'])), _keep_prob)
    layer_2 = tf.nn.dropout(tf.nn.relu(tf.add(tf.matmul(layer_1, _weights['h2']), _biases['b2'])), _keep_prob)
    layer_3 = tf.nn.dropout(tf.nn.relu(tf.add(tf.matmul(layer_2, _weights['h3']), _biases['b3'])), _keep_prob)
    layer_4 = tf.nn.dropout(tf.nn.relu(tf.add(tf.matmul(layer_3, _weights['h4']), _biases['b4'])), _keep_prob)
    return(tf.add(tf.matmul(layer_4, _weights['out']), _biases['out']))

# construct model
pred = multilayer_perception(x, weights, biases, dropout_keep_prob)

# loss
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y))
# optimizer
optimizer = tf.train.AdamOptimizer(learning_rate).minimize(loss)

# accuracy
corr = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
accuracy = tf.reduce_mean(tf.cast(corr, "float"))

# init
init = tf.global_variables_initializer()

print("network ready")

Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See `tf.nn.softmax_cross_entropy_with_logits_v2`.

network ready


In [5]:
# launch the graph
sess = tf.Session()
sess.run(init)

# training 
for epoch in range(training_epoch):
    avg_loss = 0
    n_batch = int(mnist.train.num_examples / batch_size)
    for batch in range(n_batch):
        batch_xs, batch_ys = mnist.train.next_batch(batch_size)
        train_feed = {x: batch_xs, y: batch_ys, dropout_keep_prob: 0.7}
        sess.run(optimizer, feed_dict=train_feed)
        feed = {x: batch_xs, y: batch_ys, dropout_keep_prob: 1.0}
        avg_loss += sess.run(loss, feed_dict=feed)
    avg_loss = avg_loss / n_batch
    
    # display
    if (epoch + 1) % display_step == 0:
        train_accur = sess.run(accuracy, feed_dict=feed)
        test_feed = {x: mnist.test.images, y: mnist.test.labels, dropout_keep_prob: 1.0}
        test_accur = sess.run(accuracy, feed_dict=test_feed)
        print("Epoch: [%03d/%03d]  " "avg loss: %.5f  " "train accuracy: %.3f  " "test accuracy: %.3f  "
             %(epoch + 1, training_epoch, avg_loss, train_accur, test_accur))
    

Epoch: [005/050]  avg loss: 0.06288  train accuracy: 1.000  test accuracy: 0.975  
Epoch: [010/050]  avg loss: 0.03345  train accuracy: 1.000  test accuracy: 0.979  
Epoch: [015/050]  avg loss: 0.01984  train accuracy: 0.990  test accuracy: 0.981  
Epoch: [020/050]  avg loss: 0.01446  train accuracy: 1.000  test accuracy: 0.981  
Epoch: [025/050]  avg loss: 0.01002  train accuracy: 0.990  test accuracy: 0.981  
Epoch: [030/050]  avg loss: 0.00790  train accuracy: 1.000  test accuracy: 0.983  
Epoch: [035/050]  avg loss: 0.00621  train accuracy: 1.000  test accuracy: 0.983  
Epoch: [040/050]  avg loss: 0.00539  train accuracy: 0.990  test accuracy: 0.982  
Epoch: [045/050]  avg loss: 0.00463  train accuracy: 1.000  test accuracy: 0.983  
Epoch: [050/050]  avg loss: 0.00447  train accuracy: 1.000  test accuracy: 0.982  


In [6]:
# close
sess.close()

开始learning_rate设置为0.01，loss会一直上升