In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import sys

from tensorflow.examples.tutorials.mnist import input_data

import tensorflow as tf

  from ._conv import register_converters as _register_converters


In [2]:
# Import data
data_dir = '/tmp/tensorflow/mnist/input_data'
mnist = input_data.read_data_sets(data_dir, one_hot=True)
# 导入数据，选择one-hot编码，数据集为10分类问题

Extracting /tmp/tensorflow/mnist/input_data\train-images-idx3-ubyte.gz
Extracting /tmp/tensorflow/mnist/input_data\train-labels-idx1-ubyte.gz
Extracting /tmp/tensorflow/mnist/input_data\t10k-images-idx3-ubyte.gz
Extracting /tmp/tensorflow/mnist/input_data\t10k-labels-idx1-ubyte.gz


In [3]:
# 定义placeholder，用于后续feed数据
x = tf.placeholder(tf.float32, [None, 784])
y_ = tf.placeholder(tf.float32, [None, 10])
learning_rate = tf.placeholder(tf.float32)    # 学习率

## 使用layers构建卷积神经网络

tf.layers.conv2d（）中参数很多，一般只需要设置一些关键参数，如inputs、filters、kernel_size、padding、activation，其他的都使用默认值，权重默认是不进行初始化（可能就是初始化为零），这样网络没办法训练，使用高斯分布的初始化效果很好，所以这边也进行一些额外参数设置，参数设置如下：   

inputs————输入[28,28,1]的图像，高和宽为28个像素，深度方向为1，只有一个通道   
filters————第一层网络先尝试使用32个卷积核   
kernel_size——首先尝试[5,5]的的卷积核    
strides=(1, 1)——直接使用默认步长 1     
padding='SAME'——零填充，默认是valid（不进行填充），但一般用'SAME'比较多，主要是不让卷积操作改变图像的尺寸，把降维的功能全部交给池化层    
activation=tf.nn.relu——使用relu作为激活函数  
kernel_initializer=tf.truncated_normal_initializer(stddev=0.1)——卷积核权重参数默认不进行初始化，这里初始化为高斯分布  
bias_initializer=tf.constant_initializer(0.1)——偏差默认使用0进行初始化，这里选择常数0.1  

其他参数暂时使用默认值

该网络的构建思路如下：  
28x28x1---(kernel:5x5x32)---28x28x32---(pooling 2x2)--14x14x32---(kernel:5x5x64)---14x14x64---(pooling 2x2)---7x7x64([3136])---(FC)---[1024]--(FC)--[10]

In [4]:
# 将图像数据还原为28*28*1的格式，作为输入，高和宽为28像素，通道数为1
with tf.name_scope('reshape'):
  x_image = tf.reshape(x, [-1, 28, 28, 1])

#定义第一层卷积层
with tf.name_scope('conv1'):
  h_conv1 = tf.layers.conv2d(x_image, 32, [5,5], padding='SAME', activation=tf.nn.relu, 
                             kernel_initializer=tf.truncated_normal_initializer(stddev=0.1), 
                             bias_initializer=tf.constant_initializer(0.1))

# Pooling layer - downsamples by 2X.
with tf.name_scope('pool1'):
  h_pool1 = tf.layers.max_pooling2d(h_conv1, pool_size=[2,2], strides=[2, 2], padding='VALID')

# Second convolutional layer -- maps 32 feature maps to 64.
with tf.name_scope('conv2'):
  h_conv2 = tf.layers.conv2d(h_pool1, 64, [5,5],padding='SAME', activation=tf.nn.relu, 
                             kernel_initializer=tf.truncated_normal_initializer(stddev=0.1), 
                             bias_initializer=tf.constant_initializer(0.1))

# Second pooling layer.
with tf.name_scope('pool2'):
  h_pool2 = tf.layers.max_pooling2d(h_conv2, pool_size=[2,2], strides=[2, 2], padding='VALID')

# Fully connected layer 1 -- after 2 round of downsampling, our 28x28 image
# is down to 7x7x64 feature maps -- maps this to 1024 features.
with tf.name_scope('fc1'):
  h_pool2_flat = tf.layers.flatten(h_pool2)
  h_fc1 = tf.layers.dense(h_pool2_flat, 1024, activation=tf.nn.relu)

# Dropout - controls the complexity of the model, prevents co-adaptation of
# features.
with tf.name_scope('dropout'):
  keep_prob = tf.placeholder(tf.float32)
  h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)

# Map the 1024 features to 10 classes, one for each digit
with tf.name_scope('fc2'):
  y = tf.layers.dense(h_fc1_drop, 10, activation=None)




# So here we use tf.nn.softmax_cross_entropy_with_logits on the raw
# outputs of 'y', and then average across the batch.
cross_entropy = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y))

l2_loss = tf.add_n( [tf.nn.l2_loss(w) for w in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)] )
total_loss = cross_entropy + 7e-5*l2_loss
train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(total_loss)

sess = tf.Session()
init_op = tf.global_variables_initializer()
sess.run(init_op)
# Train
for step in range(3000):
  batch_xs, batch_ys = mnist.train.next_batch(100)
  lr = 0.01
  _, loss, l2_loss_value, total_loss_value = sess.run(
               [train_step, cross_entropy, l2_loss, total_loss], 
               feed_dict={x: batch_xs, y_: batch_ys, learning_rate:lr, keep_prob:0.5})
  
  if (step+1) % 100 == 0:
    print('step %d, entropy loss: %f, l2_loss: %f, total loss: %f' % 
            (step+1, loss, l2_loss_value, total_loss_value))
    # Test trained model
    correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    print(sess.run(accuracy, feed_dict={x: batch_xs, y_: batch_ys, keep_prob:0.5}))
  if (step+1) % 1000 == 0:
    print(sess.run(accuracy, feed_dict={x: mnist.test.images,
                                    y_: mnist.test.labels, keep_prob:1.0}))



Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See tf.nn.softmax_cross_entropy_with_logits_v2.

step 100, entropy loss: 0.712071, l2_loss: 985.892944, total loss: 0.781084
0.79
step 200, entropy loss: 0.404489, l2_loss: 986.878662, total loss: 0.473571
0.85
step 300, entropy loss: 0.382436, l2_loss: 987.444214, total loss: 0.451557
0.91
step 400, entropy loss: 0.286373, l2_loss: 987.867310, total loss: 0.355524
0.93
step 500, entropy loss: 0.268985, l2_loss: 988.232178, total loss: 0.338161
0.94
step 600, entropy loss: 0.189156, l2_loss: 988.487671, total loss: 0.258350
0.97
step 700, entropy loss: 0.315328, l2_loss: 988.713989, total loss: 0.384538
0.94
step 800, entropy loss: 0.147726, l2_loss: 988.917847, total loss: 0.216950
0.98
step 900, entropy loss: 0.259858, l2_loss: 989.116638, total loss: 0.329096
0.96
step 1000, entropy loss: 0.135578, l2_loss: 989.268005, total loss: 0.204827
0.95


小结：  
1、一般在训练阶段，需要进行神经元的随机失活（dropout），保持0.5左右的比例，防止或减轻过拟合，而在测试阶段则不进行dropout，不丢弃参数，故保持为1.0  
2、经过初始化的权重，训练网络的初期，准确率就达到0.79--0.85--0.91--0.93，进行了参数的初始化后，网络在起点就收敛的很快，效果更好些   
3、池化层主要负责对输入数据空间维度降采样，一般使用2x2感受野，步长为2，池化后将丢弃75%的数据。若采用更大的感受野尺寸，可能会因为操作太激烈，数据丢失太多导致算法性能变差  
4、选择L2正则
  
以上的各种参数均由经验和参考代码得出并做一些优化，后续的代码和此代码类似，只针对部分参数微调，查看参数对训练结果的影响  

测试集上最大正确率：1.0  
训练集上最大正确率：0.9787  

## 1. 正则化因子增大 7e-3

In [5]:
# 将图像数据还原为28*28*1的格式，作为输入，高和宽为28像素，通道数为1
with tf.name_scope('reshape'):
  x_image = tf.reshape(x, [-1, 28, 28, 1])

#定义第一层卷积层
with tf.name_scope('conv1'):
  h_conv1 = tf.layers.conv2d(x_image, 32, [5,5], padding='SAME', activation=tf.nn.relu, 
                             kernel_initializer=tf.truncated_normal_initializer(stddev=0.1), 
                             bias_initializer=tf.constant_initializer(0.1))

# Pooling layer - downsamples by 2X.
with tf.name_scope('pool1'):
  h_pool1 = tf.layers.max_pooling2d(h_conv1, pool_size=[2,2], strides=[2, 2], padding='VALID')

# Second convolutional layer -- maps 32 feature maps to 64.
with tf.name_scope('conv2'):
  h_conv2 = tf.layers.conv2d(h_pool1, 64, [5,5],padding='SAME', activation=tf.nn.relu, 
                             kernel_initializer=tf.truncated_normal_initializer(stddev=0.1), 
                             bias_initializer=tf.constant_initializer(0.1))

# Second pooling layer.
with tf.name_scope('pool2'):
  h_pool2 = tf.layers.max_pooling2d(h_conv2, pool_size=[2,2], strides=[2, 2], padding='VALID')

# Fully connected layer 1 -- after 2 round of downsampling, our 28x28 image
# is down to 7x7x64 feature maps -- maps this to 1024 features.
with tf.name_scope('fc1'):
  h_pool2_flat = tf.layers.flatten(h_pool2)
  h_fc1 = tf.layers.dense(h_pool2_flat, 1024, activation=tf.nn.relu)

# Dropout - controls the complexity of the model, prevents co-adaptation of
# features.
with tf.name_scope('dropout'):
  keep_prob = tf.placeholder(tf.float32)
  h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)

# Map the 1024 features to 10 classes, one for each digit
with tf.name_scope('fc2'):
  y = tf.layers.dense(h_fc1_drop, 10, activation=None)




# So here we use tf.nn.softmax_cross_entropy_with_logits on the raw
# outputs of 'y', and then average across the batch.
cross_entropy = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y))

l2_loss = tf.add_n( [tf.nn.l2_loss(w) for w in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)] )
total_loss = cross_entropy + 7e-3*l2_loss
train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(total_loss)

sess = tf.Session()
init_op = tf.global_variables_initializer()
sess.run(init_op)
# Train
for step in range(3000):
  batch_xs, batch_ys = mnist.train.next_batch(100)
  lr = 0.01
  _, loss, l2_loss_value, total_loss_value = sess.run(
               [train_step, cross_entropy, l2_loss, total_loss], 
               feed_dict={x: batch_xs, y_: batch_ys, learning_rate:lr, keep_prob:0.5})
  
  if (step+1) % 100 == 0:
    print('step %d, entropy loss: %f, l2_loss: %f, total loss: %f' % 
            (step+1, loss, l2_loss_value, total_loss_value))
    # Test trained model
    correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    print(sess.run(accuracy, feed_dict={x: batch_xs, y_: batch_ys, keep_prob:0.5}))
  if (step+1) % 1000 == 0:
    print(sess.run(accuracy, feed_dict={x: mnist.test.images,
                                    y_: mnist.test.labels, keep_prob:1.0}))


step 100, entropy loss: 0.803969, l2_loss: 1941.744141, total loss: 14.396178
0.83
step 200, entropy loss: 0.512000, l2_loss: 1915.971802, total loss: 13.923803
0.91
step 300, entropy loss: 0.313385, l2_loss: 1890.097900, total loss: 13.544071
0.93
step 400, entropy loss: 0.189875, l2_loss: 1864.437988, total loss: 13.240941
0.95
step 500, entropy loss: 0.306793, l2_loss: 1839.048584, total loss: 13.180133
0.89
step 600, entropy loss: 0.242895, l2_loss: 1813.929932, total loss: 12.940405
0.94
step 700, entropy loss: 0.358895, l2_loss: 1789.142578, total loss: 12.882894
0.88
step 800, entropy loss: 0.328615, l2_loss: 1764.688965, total loss: 12.681438
0.93
step 900, entropy loss: 0.202620, l2_loss: 1740.521484, total loss: 12.386271
0.95
step 1000, entropy loss: 0.167415, l2_loss: 1716.698730, total loss: 12.184306
0.96
0.9583
step 1100, entropy loss: 0.087045, l2_loss: 1693.206055, total loss: 11.939488
0.99
step 1200, entropy loss: 0.102258, l2_loss: 1670.014404, total loss: 11.792360

增大正则因子，意味着增大对权重和模型复杂度的抑制，降低过拟合程度，从训练结果可看出确实有作用，训练集的正确率和测试集正确率差值变小了，但这个正则因子貌似太大了，以至于训练的正确率整体被拉低了

测试集上最大正确率：0.99  
训练集上最大正确率：0.9771  

## 2. 增大学习率 0.5

In [6]:
# 将图像数据还原为28*28*1的格式，作为输入，高和宽为28像素，通道数为1
with tf.name_scope('reshape'):
  x_image = tf.reshape(x, [-1, 28, 28, 1])

#定义第一层卷积层
with tf.name_scope('conv1'):
  h_conv1 = tf.layers.conv2d(x_image, 32, [5,5], padding='SAME', activation=tf.nn.relu, 
                             kernel_initializer=tf.truncated_normal_initializer(stddev=0.1), 
                             bias_initializer=tf.constant_initializer(0.1))

# Pooling layer - downsamples by 2X.
with tf.name_scope('pool1'):
  h_pool1 = tf.layers.max_pooling2d(h_conv1, pool_size=[2,2], strides=[2, 2], padding='VALID')

# Second convolutional layer -- maps 32 feature maps to 64.
with tf.name_scope('conv2'):
  h_conv2 = tf.layers.conv2d(h_pool1, 64, [5,5],padding='SAME', activation=tf.nn.relu, 
                             kernel_initializer=tf.truncated_normal_initializer(stddev=0.1), 
                             bias_initializer=tf.constant_initializer(0.1))

# Second pooling layer.
with tf.name_scope('pool2'):
  h_pool2 = tf.layers.max_pooling2d(h_conv2, pool_size=[2,2], strides=[2, 2], padding='VALID')

# Fully connected layer 1 -- after 2 round of downsampling, our 28x28 image
# is down to 7x7x64 feature maps -- maps this to 1024 features.
with tf.name_scope('fc1'):
  h_pool2_flat = tf.layers.flatten(h_pool2)
  h_fc1 = tf.layers.dense(h_pool2_flat, 1024, activation=tf.nn.relu)

# Dropout - controls the complexity of the model, prevents co-adaptation of
# features.
with tf.name_scope('dropout'):
  keep_prob = tf.placeholder(tf.float32)
  h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)

# Map the 1024 features to 10 classes, one for each digit
with tf.name_scope('fc2'):
  y = tf.layers.dense(h_fc1_drop, 10, activation=None)




# So here we use tf.nn.softmax_cross_entropy_with_logits on the raw
# outputs of 'y', and then average across the batch.
cross_entropy = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y))

l2_loss = tf.add_n( [tf.nn.l2_loss(w) for w in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)] )
total_loss = cross_entropy + 7e-5*l2_loss
train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(total_loss)

sess = tf.Session()
init_op = tf.global_variables_initializer()
sess.run(init_op)
# Train
for step in range(3000):
  batch_xs, batch_ys = mnist.train.next_batch(100)
  lr = 0.5
  _, loss, l2_loss_value, total_loss_value = sess.run(
               [train_step, cross_entropy, l2_loss, total_loss], 
               feed_dict={x: batch_xs, y_: batch_ys, learning_rate:lr, keep_prob:0.5})
  
  if (step+1) % 100 == 0:
    print('step %d, entropy loss: %f, l2_loss: %f, total loss: %f' % 
            (step+1, loss, l2_loss_value, total_loss_value))
    # Test trained model
    correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    print(sess.run(accuracy, feed_dict={x: batch_xs, y_: batch_ys, keep_prob:0.5}))
  if (step+1) % 1000 == 0:
    print(sess.run(accuracy, feed_dict={x: mnist.test.images,
                                    y_: mnist.test.labels, keep_prob:1.0}))



step 100, entropy loss: 2.301675, l2_loss: 401911094181888.000000, total loss: 28133777408.000000
0.15
step 200, entropy loss: 2.293039, l2_loss: 399107420061696.000000, total loss: 27937519616.000000
0.15
step 300, entropy loss: 2.298238, l2_loss: 396323475947520.000000, total loss: 27742644224.000000
0.12
step 400, entropy loss: 2.304266, l2_loss: 393559127621632.000000, total loss: 27549138944.000000
0.12
step 500, entropy loss: 2.313757, l2_loss: 390813569777664.000000, total loss: 27356950528.000000
0.12
step 600, entropy loss: 2.305824, l2_loss: 388087003742208.000000, total loss: 27166091264.000000
0.11
step 700, entropy loss: 2.302546, l2_loss: 385380134158336.000000, total loss: 26976610304.000000
0.16
step 800, entropy loss: 2.291070, l2_loss: 382691853729792.000000, total loss: 26788429824.000000
0.17
step 900, entropy loss: 2.303253, l2_loss: 380022430892032.000000, total loss: 26601570304.000000
0.05
step 1000, entropy loss: 2.291650, l2_loss: 377371597209600.000000, total

原本学习率为0.01，变为0.5，可发现训练结果惨不忍睹，模型根本无法收敛，训练结果只比随机猜测的0.1正确率大一点点，说明学习率太大了

测试集上最大正确率：0.17  
训练集上最大正确率：0.1135  

## 3. 学习率使用0.05，迭代次数5000

In [13]:
# 将图像数据还原为28*28*1的格式，作为输入，高和宽为28像素，通道数为1
with tf.name_scope('reshape'):
  x_image = tf.reshape(x, [-1, 28, 28, 1])

#定义第一层卷积层
with tf.name_scope('conv1'):
  h_conv1 = tf.layers.conv2d(x_image, 32, [5,5], padding='SAME', activation=tf.nn.relu, 
                             kernel_initializer=tf.truncated_normal_initializer(stddev=0.1), 
                             bias_initializer=tf.constant_initializer(0.1))

# Pooling layer - downsamples by 2X.
with tf.name_scope('pool1'):
  h_pool1 = tf.layers.max_pooling2d(h_conv1, pool_size=[2,2], strides=[2, 2], padding='VALID')

# Second convolutional layer -- maps 32 feature maps to 64.
with tf.name_scope('conv2'):
  h_conv2 = tf.layers.conv2d(h_pool1, 64, [5,5],padding='SAME', activation=tf.nn.relu, 
                             kernel_initializer=tf.truncated_normal_initializer(stddev=0.1), 
                             bias_initializer=tf.constant_initializer(0.1))

# Second pooling layer.
with tf.name_scope('pool2'):
  h_pool2 = tf.layers.max_pooling2d(h_conv2, pool_size=[2,2], strides=[2, 2], padding='VALID')

# Fully connected layer 1 -- after 2 round of downsampling, our 28x28 image
# is down to 7x7x64 feature maps -- maps this to 1024 features.
with tf.name_scope('fc1'):
  h_pool2_flat = tf.layers.flatten(h_pool2)
  h_fc1 = tf.layers.dense(h_pool2_flat, 1024, activation=tf.nn.relu)

# Dropout - controls the complexity of the model, prevents co-adaptation of
# features.
with tf.name_scope('dropout'):
  keep_prob = tf.placeholder(tf.float32)
  h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)

# Map the 1024 features to 10 classes, one for each digit
with tf.name_scope('fc2'):
  y = tf.layers.dense(h_fc1_drop, 10, activation=None)




# So here we use tf.nn.softmax_cross_entropy_with_logits on the raw
# outputs of 'y', and then average across the batch.
cross_entropy = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y))

l2_loss = tf.add_n( [tf.nn.l2_loss(w) for w in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)] )
total_loss = cross_entropy + 7e-5*l2_loss
train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(total_loss)

sess = tf.Session()
init_op = tf.global_variables_initializer()
sess.run(init_op)
# Train
for step in range(5000):
  batch_xs, batch_ys = mnist.train.next_batch(100)
  lr = 0.05
  _, loss, l2_loss_value, total_loss_value = sess.run(
               [train_step, cross_entropy, l2_loss, total_loss], 
               feed_dict={x: batch_xs, y_: batch_ys, learning_rate:lr, keep_prob:0.5})
  
  if (step+1) % 100 == 0:
    print('step %d, entropy loss: %f, l2_loss: %f, total loss: %f' % 
            (step+1, loss, l2_loss_value, total_loss_value))
    # Test trained model
    correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    print(sess.run(accuracy, feed_dict={x: batch_xs, y_: batch_ys, keep_prob:0.5}))
  if (step+1) % 1000 == 0:
    print(sess.run(accuracy, feed_dict={x: mnist.test.images,
                                    y_: mnist.test.labels, keep_prob:1.0}))



step 100, entropy loss: 0.311653, l2_loss: 9288.714844, total loss: 0.961863
0.94
step 200, entropy loss: 0.292706, l2_loss: 9284.415039, total loss: 0.942615
0.95
step 300, entropy loss: 0.234343, l2_loss: 9279.426758, total loss: 0.883903
0.97
step 400, entropy loss: 0.180753, l2_loss: 9274.128906, total loss: 0.829942
0.97
step 500, entropy loss: 0.065057, l2_loss: 9268.803711, total loss: 0.713874
0.99
step 600, entropy loss: 0.147728, l2_loss: 9263.299805, total loss: 0.796159
0.96
step 700, entropy loss: 0.105183, l2_loss: 9257.693359, total loss: 0.753222
0.99
step 800, entropy loss: 0.068860, l2_loss: 9252.121094, total loss: 0.716509
0.97
step 900, entropy loss: 0.077154, l2_loss: 9246.436523, total loss: 0.724405
1.0
step 1000, entropy loss: 0.031039, l2_loss: 9240.721680, total loss: 0.677889
0.99
0.9773
step 1100, entropy loss: 0.031446, l2_loss: 9234.909180, total loss: 0.677890
0.97
step 1200, entropy loss: 0.100049, l2_loss: 9229.095703, total loss: 0.746086
0.98
step 13

学习率调整为0.05，在这几次试验中收敛最快的一组参数了，第一次个batch迭代就达到了0.94的准确率，说明这个学习率选择的相当不错，既不会太小（训练慢，过拟合），也不会太大（无法收敛），而且照这个趋势，继续训练迭代貌似可以进一步提升准确率

测试集上最大正确率：1.0  
训练集上最大正确率:0.9915  


## 4. kernel size调整为 9*9（深度保持不变）

In [7]:
# 将图像数据还原为28*28*1的格式，作为输入，高和宽为28像素，通道数为1
with tf.name_scope('reshape'):
  x_image = tf.reshape(x, [-1, 28, 28, 1])

#定义第一层卷积层
with tf.name_scope('conv1'):
  h_conv1 = tf.layers.conv2d(x_image, 32, [9,9], padding='SAME', activation=tf.nn.relu, 
                             kernel_initializer=tf.truncated_normal_initializer(stddev=0.1), 
                             bias_initializer=tf.constant_initializer(0.1))

# Pooling layer - downsamples by 2X.
with tf.name_scope('pool1'):
  h_pool1 = tf.layers.max_pooling2d(h_conv1, pool_size=[2,2], strides=[2, 2], padding='VALID')

# Second convolutional layer -- maps 32 feature maps to 64.
with tf.name_scope('conv2'):
  h_conv2 = tf.layers.conv2d(h_pool1, 64, [9,9],padding='SAME', activation=tf.nn.relu, 
                             kernel_initializer=tf.truncated_normal_initializer(stddev=0.1), 
                             bias_initializer=tf.constant_initializer(0.1))

# Second pooling layer.
with tf.name_scope('pool2'):
  h_pool2 = tf.layers.max_pooling2d(h_conv2, pool_size=[2,2], strides=[2, 2], padding='VALID')

# Fully connected layer 1 -- after 2 round of downsampling, our 28x28 image
# is down to 7x7x64 feature maps -- maps this to 1024 features.
with tf.name_scope('fc1'):
  h_pool2_flat = tf.layers.flatten(h_pool2)
  h_fc1 = tf.layers.dense(h_pool2_flat, 1024, activation=tf.nn.relu)

# Dropout - controls the complexity of the model, prevents co-adaptation of
# features.
with tf.name_scope('dropout'):
  keep_prob = tf.placeholder(tf.float32)
  h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)

# Map the 1024 features to 10 classes, one for each digit
with tf.name_scope('fc2'):
  y = tf.layers.dense(h_fc1_drop, 10, activation=None)




# So here we use tf.nn.softmax_cross_entropy_with_logits on the raw
# outputs of 'y', and then average across the batch.
cross_entropy = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y))

l2_loss = tf.add_n( [tf.nn.l2_loss(w) for w in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)] )
total_loss = cross_entropy + 7e-5*l2_loss
train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(total_loss)

sess = tf.Session()
init_op = tf.global_variables_initializer()
sess.run(init_op)
# Train
for step in range(3000):
  batch_xs, batch_ys = mnist.train.next_batch(100)
  lr = 0.01
  _, loss, l2_loss_value, total_loss_value = sess.run(
               [train_step, cross_entropy, l2_loss, total_loss], 
               feed_dict={x: batch_xs, y_: batch_ys, learning_rate:lr, keep_prob:0.5})
  
  if (step+1) % 100 == 0:
    print('step %d, entropy loss: %f, l2_loss: %f, total loss: %f' % 
            (step+1, loss, l2_loss_value, total_loss_value))
    # Test trained model
    correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    print(sess.run(accuracy, feed_dict={x: batch_xs, y_: batch_ys, keep_prob:0.5}))
  if (step+1) % 1000 == 0:
    print(sess.run(accuracy, feed_dict={x: mnist.test.images,
                                    y_: mnist.test.labels, keep_prob:1.0}))



step 100, entropy loss: 0.486888, l2_loss: 4383.535645, total loss: 0.793735
0.84
step 200, entropy loss: 0.451044, l2_loss: 4383.639648, total loss: 0.757899
0.87
step 300, entropy loss: 0.372829, l2_loss: 4383.531738, total loss: 0.679676
0.91
step 400, entropy loss: 0.235089, l2_loss: 4383.339355, total loss: 0.541923
0.97
step 500, entropy loss: 0.215314, l2_loss: 4383.067871, total loss: 0.522129
0.98
step 600, entropy loss: 0.243519, l2_loss: 4382.764648, total loss: 0.550312
0.96
step 700, entropy loss: 0.125505, l2_loss: 4382.443359, total loss: 0.432276
0.97
step 800, entropy loss: 0.254385, l2_loss: 4382.099609, total loss: 0.561132
0.94
step 900, entropy loss: 0.115422, l2_loss: 4381.747559, total loss: 0.422145
0.97
step 1000, entropy loss: 0.147517, l2_loss: 4381.381348, total loss: 0.454214
0.97
0.9656
step 1100, entropy loss: 0.108037, l2_loss: 4380.986816, total loss: 0.414706
0.96
step 1200, entropy loss: 0.084366, l2_loss: 4380.604980, total loss: 0.391009
1.0
step 13

调整kernel size为9x9（深度不变），采用更大的感受野，卷积时重复的地方也多，感觉应该可以更细致的学到特征，相对于5x5的卷积核，它确实学到了更多特征，训练集正确率有所提升，但缺点是造成神经元个数增加了很多，从结果来看，貌似有点过拟合

测试集上最大正确率：1.0  
训练集上最大正确率：0.982  


## 5. kernel 数量由 32和64 变为 16和32，size不变

In [8]:
# 将图像数据还原为28*28*1的格式，作为输入，高和宽为28像素，通道数为1
with tf.name_scope('reshape'):
  x_image = tf.reshape(x, [-1, 28, 28, 1])

#定义第一层卷积层
with tf.name_scope('conv1'):
  h_conv1 = tf.layers.conv2d(x_image, 16, [5,5], padding='SAME', activation=tf.nn.relu, 
                             kernel_initializer=tf.truncated_normal_initializer(stddev=0.1), 
                             bias_initializer=tf.constant_initializer(0.1))

# Pooling layer - downsamples by 2X.
with tf.name_scope('pool1'):
  h_pool1 = tf.layers.max_pooling2d(h_conv1, pool_size=[2,2], strides=[2, 2], padding='VALID')

# Second convolutional layer -- maps 32 feature maps to 64.
with tf.name_scope('conv2'):
  h_conv2 = tf.layers.conv2d(h_pool1, 32, [5,5],padding='SAME', activation=tf.nn.relu, 
                             kernel_initializer=tf.truncated_normal_initializer(stddev=0.1), 
                             bias_initializer=tf.constant_initializer(0.1))

# Second pooling layer.
with tf.name_scope('pool2'):
  h_pool2 = tf.layers.max_pooling2d(h_conv2, pool_size=[2,2], strides=[2, 2], padding='VALID')

# Fully connected layer 1 -- after 2 round of downsampling, our 28x28 image
# is down to 7x7x64 feature maps -- maps this to 1024 features.
with tf.name_scope('fc1'):
  h_pool2_flat = tf.layers.flatten(h_pool2)
  h_fc1 = tf.layers.dense(h_pool2_flat, 1024, activation=tf.nn.relu)

# Dropout - controls the complexity of the model, prevents co-adaptation of
# features.
with tf.name_scope('dropout'):
  keep_prob = tf.placeholder(tf.float32)
  h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)

# Map the 1024 features to 10 classes, one for each digit
with tf.name_scope('fc2'):
  y = tf.layers.dense(h_fc1_drop, 10, activation=None)




# So here we use tf.nn.softmax_cross_entropy_with_logits on the raw
# outputs of 'y', and then average across the batch.
cross_entropy = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y))

l2_loss = tf.add_n( [tf.nn.l2_loss(w) for w in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)] )
total_loss = cross_entropy + 7e-5*l2_loss
train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(total_loss)

sess = tf.Session()
init_op = tf.global_variables_initializer()
sess.run(init_op)
# Train
for step in range(3000):
  batch_xs, batch_ys = mnist.train.next_batch(100)
  lr = 0.01
  _, loss, l2_loss_value, total_loss_value = sess.run(
               [train_step, cross_entropy, l2_loss, total_loss], 
               feed_dict={x: batch_xs, y_: batch_ys, learning_rate:lr, keep_prob:0.5})
  
  if (step+1) % 100 == 0:
    print('step %d, entropy loss: %f, l2_loss: %f, total loss: %f' % 
            (step+1, loss, l2_loss_value, total_loss_value))
    # Test trained model
    correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    print(sess.run(accuracy, feed_dict={x: batch_xs, y_: batch_ys, keep_prob:0.5}))
  if (step+1) % 1000 == 0:
    print(sess.run(accuracy, feed_dict={x: mnist.test.images,
                                    y_: mnist.test.labels, keep_prob:1.0}))



step 100, entropy loss: 1.006719, l2_loss: 5064.546875, total loss: 1.361237
0.73
step 200, entropy loss: 0.510245, l2_loss: 5065.219238, total loss: 0.864810
0.8
step 300, entropy loss: 0.449359, l2_loss: 5065.266113, total loss: 0.803928
0.89
step 400, entropy loss: 0.243581, l2_loss: 5065.145020, total loss: 0.598141
0.96
step 500, entropy loss: 0.333537, l2_loss: 5064.938965, total loss: 0.688083
0.94
step 600, entropy loss: 0.225112, l2_loss: 5064.624512, total loss: 0.579636
0.95
step 700, entropy loss: 0.379347, l2_loss: 5064.306641, total loss: 0.733849
0.92
step 800, entropy loss: 0.231903, l2_loss: 5063.961914, total loss: 0.586380
0.9
step 900, entropy loss: 0.182925, l2_loss: 5063.563477, total loss: 0.537375
0.94
step 1000, entropy loss: 0.249117, l2_loss: 5063.143555, total loss: 0.603537
0.97
0.9548
step 1100, entropy loss: 0.170168, l2_loss: 5062.730469, total loss: 0.524559
0.95
step 1200, entropy loss: 0.156117, l2_loss: 5062.301758, total loss: 0.510478
0.95
step 130

调整kernel的数量，由原本的32和64变为16和32，size不变，发现不管是在训练集还是测试集，性能都有所下降，说明更少的kernel数量造成了网络学习到的特征不够，有点欠拟合

测试集上最大正确率：0.99  
训练集上最大正确率：0.9775  


## 6. 使用单个卷积层

In [9]:
# 将图像数据还原为28*28*1的格式，作为输入，高和宽为28像素，通道数为1
with tf.name_scope('reshape'):
  x_image = tf.reshape(x, [-1, 28, 28, 1])

#定义第一层卷积层
with tf.name_scope('conv1'):
  h_conv1 = tf.layers.conv2d(x_image, 64, [7,7], padding='SAME', activation=tf.nn.relu, 
                             kernel_initializer=tf.truncated_normal_initializer(stddev=0.1), 
                             bias_initializer=tf.constant_initializer(0.1))

# Pooling layer - downsamples by 2X.
with tf.name_scope('pool1'):
  h_pool1 = tf.layers.max_pooling2d(h_conv1, pool_size=[2,2], strides=[2, 2], padding='VALID')



# Fully connected layer 1 -- after 2 round of downsampling, our 28x28 image
# is down to 7x7x64 feature maps -- maps this to 1024 features.
with tf.name_scope('fc1'):
  h_pool1_flat = tf.layers.flatten(h_pool1)
  h_fc1 = tf.layers.dense(h_pool1_flat, 1024, activation=tf.nn.relu)

# Dropout - controls the complexity of the model, prevents co-adaptation of
# features.
with tf.name_scope('dropout'):
  keep_prob = tf.placeholder(tf.float32)
  h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)

# Map the 1024 features to 10 classes, one for each digit
with tf.name_scope('fc2'):
  y = tf.layers.dense(h_fc1_drop, 10, activation=None)




# So here we use tf.nn.softmax_cross_entropy_with_logits on the raw
# outputs of 'y', and then average across the batch.
cross_entropy = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y))

l2_loss = tf.add_n( [tf.nn.l2_loss(w) for w in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)] )
total_loss = cross_entropy + 7e-5*l2_loss
train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(total_loss)

sess = tf.Session()
init_op = tf.global_variables_initializer()
sess.run(init_op)
# Train
for step in range(3000):
  batch_xs, batch_ys = mnist.train.next_batch(100)
  lr = 0.01
  _, loss, l2_loss_value, total_loss_value = sess.run(
               [train_step, cross_entropy, l2_loss, total_loss], 
               feed_dict={x: batch_xs, y_: batch_ys, learning_rate:lr, keep_prob:0.5})
  
  if (step+1) % 100 == 0:
    print('step %d, entropy loss: %f, l2_loss: %f, total loss: %f' % 
            (step+1, loss, l2_loss_value, total_loss_value))
    # Test trained model
    correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    print(sess.run(accuracy, feed_dict={x: batch_xs, y_: batch_ys, keep_prob:0.5}))
  if (step+1) % 1000 == 0:
    print(sess.run(accuracy, feed_dict={x: mnist.test.images,
                                    y_: mnist.test.labels, keep_prob:1.0}))



step 100, entropy loss: 0.696585, l2_loss: 6033.401855, total loss: 1.118923
0.89
step 200, entropy loss: 0.492299, l2_loss: 6033.671387, total loss: 0.914656
0.85
step 300, entropy loss: 0.325712, l2_loss: 6033.562012, total loss: 0.748061
0.93
step 400, entropy loss: 0.356928, l2_loss: 6033.257812, total loss: 0.779256
0.93
step 500, entropy loss: 0.359615, l2_loss: 6032.893066, total loss: 0.781918
0.88
step 600, entropy loss: 0.350746, l2_loss: 6032.481445, total loss: 0.773020
0.88
step 700, entropy loss: 0.338226, l2_loss: 6032.006836, total loss: 0.760467
0.91
step 800, entropy loss: 0.414592, l2_loss: 6031.524414, total loss: 0.836798
0.9
step 900, entropy loss: 0.315013, l2_loss: 6031.010254, total loss: 0.737184
0.9
step 1000, entropy loss: 0.189375, l2_loss: 6030.511719, total loss: 0.611511
0.97
0.9374
step 1100, entropy loss: 0.241904, l2_loss: 6029.981445, total loss: 0.664003
0.95
step 1200, entropy loss: 0.284403, l2_loss: 6029.437500, total loss: 0.706464
0.94
step 130

以上的方案都是使用两个卷积层，分别用32个 5x5x1 的卷积核和64个 5x5x32 的卷积核，第一个卷积层对原图像的感受野为5x5，第二个卷积层对上一层数据的感受野为5x5，换算为对原图像的感受野为7x7，故尝试直接使用7x7x64的单个卷积层训练  

训练前期收敛的非常快，最初就达到0.89准确率，但后期乏力，最终的训练结果也没有两个小卷积核训练的结果好  
测试集上最大正确率：0.98  
训练集上最大正确率：0.9669  


此卷积网络构建方式不好，原因如下：  
1、参数太多  
两个小卷积的参数：(5x5x32+32)+(5x5x64+64)=2496个参数（含bias）  
一个大卷积的参数：7x7x64+64=3200个参数（含bias）  
如果层数变多后，参数的个数差距更加明显  
2、特征提取能力略弱  
多个卷积层与非线性的激活层交替的结构，比单一卷积层的结构更能提取出深层的更好的特征，可以表达出输入数据中更多个强力特征  

## 总结

所有训练结果中：  
测试集上最大正确率：1.0  
训练集上最大正确率:0.9915  
