Deep Learning
=============

Assignment 4
------------

Previously in `2_fullyconnected.ipynb` and `3_regularization.ipynb`, we trained fully connected networks to classify [notMNIST](http://yaroslavvb.blogspot.com/2011/09/notmnist-dataset.html) characters.

The goal of this assignment is to make the neural network convolutional.

In [1]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import numpy as np
import tensorflow as tf
from six.moves import cPickle as pickle
from six.moves import range

  from ._conv import register_converters as _register_converters


In [2]:
# Pickle file holding the notMNIST splits, resolved against the working directory.
pickle_file = 'notMNIST.pickle'

# NOTE(review): unpickling can execute arbitrary code embedded in the file, so
# only load a pickle that you created yourself or otherwise trust.
with open(pickle_file, 'rb') as f:
  save = pickle.load(f)

# Pull the three (dataset, labels) pairs out of the unpickled container.
train_dataset, train_labels = save['train_dataset'], save['train_labels']
valid_dataset, valid_labels = save['valid_dataset'], save['valid_labels']
test_dataset, test_labels = save['test_dataset'], save['test_labels']
del save  # drop the container reference so the gc can reclaim it

# Print the shape of every split as a quick sanity check.
print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)

Training set (200000, 28, 28) (200000,)
Validation set (10000, 28, 28) (10000,)
Test set (10000, 28, 28) (10000,)


Reformat into a TensorFlow-friendly shape:
- convolutions need the image data formatted as a cube (width by height by #channels)
- labels as float 1-hot encodings.

In [3]:
# Side length of the square input images, number of target classes and
# number of colour channels.
image_size = 28
num_labels = 10
num_channels = 1 # grayscale

import numpy as np

def reformat(dataset, labels):
  """Reshape the images to 4-D float32 and 1-hot encode the labels.

  Args:
    dataset: array that can be reshaped to
      (-1, image_size, image_size, num_channels).
    labels: 1-D array of class ids.

  Returns:
    A (dataset, labels) tuple: the float32 image array of shape
    (N, image_size, image_size, num_channels) and the float32 1-hot label
    array of shape (N, num_labels).
  """
  cube_shape = (-1, image_size, image_size, num_channels)
  images = dataset.reshape(cube_shape).astype(np.float32)
  # Broadcasting a column of class ids against the row 0..num_labels-1 marks,
  # for every sample, the single position that matches its label.
  class_ids = np.arange(num_labels)
  one_hot = (labels[:, np.newaxis] == class_ids).astype(np.float32)
  return images, one_hot

# Convert every split; each pair of names is rebound to the reshaped images
# and 1-hot labels returned by reformat().
# NOTE(review): because the names are rebound, running this cell a second time
# without reloading the data passes the already 1-hot labels back into
# reformat() - presumably unintended; re-run the loading cell first.
train_dataset, train_labels = reformat(train_dataset, train_labels)
valid_dataset, valid_labels = reformat(valid_dataset, valid_labels)
test_dataset, test_labels = reformat(test_dataset, test_labels)
# Print the new shapes of all three splits.
print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)

Training set (200000, 28, 28, 1) (200000, 10)
Validation set (10000, 28, 28, 1) (10000, 10)
Test set (10000, 28, 28, 1) (10000, 10)


In [4]:
# Show the first test label to check the 1-hot encoding produced by reformat().
test_labels[0]

array([0., 0., 0., 1., 0., 0., 0., 0., 0., 0.], dtype=float32)

In [5]:
def accuracy(predictions, labels):
  """Return the percentage of rows whose arg-max matches the label's arg-max.

  Args:
    predictions: 2-D array with one row of class scores per sample.
    labels: 2-D array of the same layout (e.g. 1-hot encoded labels).

  Returns:
    100 * (number of rows where both arg-max indices agree) / number of rows
    in `predictions`.
  """
  predicted_classes = np.argmax(predictions, 1)
  true_classes = np.argmax(labels, 1)
  num_correct = np.sum(predicted_classes == true_classes)
  return 100.0 * num_correct / predictions.shape[0]

Let's build a small network with two convolutional layers, followed by one fully connected layer. Convolutional networks are more expensive computationally, so we'll limit its depth and number of fully connected nodes.

In [6]:
batch_size = 16
patch_size = 5
depth = 16
num_hidden = 64

graph = tf.Graph()

with graph.as_default():

  # Input data. Training minibatches are fed through placeholders at run time;
  # the validation and test sets are embedded in the graph as constants.
  tf_train_dataset = tf.placeholder(
    tf.float32, shape=(batch_size, image_size, image_size, num_channels))
  tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
  tf_valid_dataset = tf.constant(valid_dataset)
  tf_test_dataset = tf.constant(test_dataset)

  # Each stride-2 'SAME' convolution outputs ceil(n / 2) rows and columns, so
  # after the two convolutions the feature map is ceil(image_size / 4) wide.
  # (The previous `image_size // 4 * image_size // 4` only gave the right
  # value because 28 is a multiple of 4: `*` and `//` bind left to right.)
  conv_out_size = (image_size + 3) // 4

  # Variables.
  layer1_weights = tf.Variable(tf.truncated_normal(
      [patch_size, patch_size, num_channels, depth], stddev=0.1))
  layer1_biases = tf.Variable(tf.zeros([depth]))
  layer2_weights = tf.Variable(tf.truncated_normal(
      [patch_size, patch_size, depth, depth], stddev=0.1))
  layer2_biases = tf.Variable(tf.constant(1.0, shape=[depth]))
  layer3_weights = tf.Variable(tf.truncated_normal(
      [conv_out_size * conv_out_size * depth, num_hidden], stddev=0.1))
  layer3_biases = tf.Variable(tf.constant(1.0, shape=[num_hidden]))
  layer4_weights = tf.Variable(tf.truncated_normal(
      [num_hidden, num_labels], stddev=0.1))
  layer4_biases = tf.Variable(tf.constant(1.0, shape=[num_labels]))

  # Model.
  def model(data):
    """Return the logits for `data`, a 4-D image tensor with a static shape.

    Two stride-2 convolutions with ReLU, a fully connected ReLU layer and a
    linear output layer. All weights are the variables defined above, so every
    call shares the same parameters.
    """
    conv = tf.nn.conv2d(data, layer1_weights, [1, 2, 2, 1], padding='SAME')
    hidden = tf.nn.relu(conv + layer1_biases)
    conv = tf.nn.conv2d(hidden, layer2_weights, [1, 2, 2, 1], padding='SAME')
    hidden = tf.nn.relu(conv + layer2_biases)
    # Flatten each feature-map cuboid into one row for the dense layers.
    shape = hidden.get_shape().as_list()
    reshape = tf.reshape(hidden, [shape[0], shape[1] * shape[2] * shape[3]])
    hidden = tf.nn.relu(tf.matmul(reshape, layer3_weights) + layer3_biases)
    return tf.matmul(hidden, layer4_weights) + layer4_biases

  # Training computation. The _v2 op replaces the deprecated
  # softmax_cross_entropy_with_logits; the labels are a placeholder, so no
  # gradient reaches them and the loss value is unchanged.
  logits = model(tf_train_dataset)
  loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits_v2(
      labels=tf_train_labels, logits=logits))

  # Optimizer.
  optimizer = tf.train.GradientDescentOptimizer(0.05).minimize(loss)

  # Predictions for the training, validation, and test data.
  train_prediction = tf.nn.softmax(logits)
  valid_prediction = tf.nn.softmax(model(tf_valid_dataset))
  test_prediction = tf.nn.softmax(model(tf_test_dataset))

Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See `tf.nn.softmax_cross_entropy_with_logits_v2`.



In [7]:
# num_steps = 1001

# with tf.Session(graph=graph) as session:
#   tf.global_variables_initializer().run()
#   print('Initialized')
#   for step in range(num_steps):
#     offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
#     batch_data = train_dataset[offset:(offset + batch_size), :, :, :]
#     batch_labels = train_labels[offset:(offset + batch_size), :]
#     feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
#     _, l, predictions = session.run(
#       [optimizer, loss, train_prediction], feed_dict=feed_dict)
#     if (step % 50 == 0):
#       print('Minibatch loss at step %d: %f' % (step, l))
#       print('Minibatch accuracy: %.1f%%' % accuracy(predictions, batch_labels))
#       print('Validation accuracy: %.1f%%' % accuracy(
#         valid_prediction.eval(), valid_labels))
#   print('Test accuracy: %.1f%%' % accuracy(test_prediction.eval(), test_labels))

---
Problem 1
---------

The convolutional model above uses convolutions with stride 2 to reduce the dimensionality. Replace the strides by a max pooling operation (`nn.max_pool()`) of stride 2 and kernel size 2.

---

In [15]:
batch_size = 16
patch_size = 5
depth = 25
num_hidden = 64
num_channels = 1
image_size = 28
num_labels = 10

graph = tf.Graph()

with graph.as_default():

  # Input data. Training minibatches are fed through placeholders at run time;
  # the validation and test sets are embedded in the graph as constants.
  tf_train_dataset = tf.placeholder(
    tf.float32, shape=(batch_size, image_size, image_size, num_channels))
  tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
  tf_valid_dataset = tf.constant(valid_dataset)
  tf_test_dataset = tf.constant(test_dataset)

  # The stride-1 'SAME' convolutions keep the spatial size and each stride-2
  # 'SAME' max pool outputs ceil(n / 2) rows and columns, so after two pools
  # the feature map is ceil(image_size / 4) wide. (The previous
  # `image_size // 4 * image_size // 4` only gave the right value because 28
  # is a multiple of 4: `*` and `//` bind left to right.)
  pooled_size = (image_size + 3) // 4

  # Variables.
  layer1_weights = tf.Variable(tf.truncated_normal(
      [patch_size, patch_size, num_channels, depth], stddev=0.1))
  layer1_biases = tf.Variable(tf.zeros([depth]))
  layer2_weights = tf.Variable(tf.truncated_normal(
      [patch_size, patch_size, depth, depth], stddev=0.1))
  layer2_biases = tf.Variable(tf.constant(1.0, shape=[depth]))
  layer3_weights = tf.Variable(tf.truncated_normal(
      [pooled_size * pooled_size * depth, num_hidden], stddev=0.1))
  layer3_biases = tf.Variable(tf.constant(1.0, shape=[num_hidden]))
  layer4_weights = tf.Variable(tf.truncated_normal(
      [num_hidden, num_labels], stddev=0.1))
  layer4_biases = tf.Variable(tf.constant(1.0, shape=[num_labels]))

  # Model.
  def model(data, train=False):
    """Return the logits for `data`, a 4-D image tensor with a static shape.

    Two (stride-1 convolution, ReLU, 2x2 max pool) stages, a fully connected
    ReLU layer and a linear output layer. All weights are the variables
    defined above, so every call shares the same parameters. The static
    tensor shapes are printed while the graph is being built.

    Args:
      data: input image tensor.
      train: when true, dropout is applied to the fully connected layer.
    """
    conv = tf.nn.conv2d(data, layer1_weights, [1, 1, 1, 1], padding='SAME')
    print(conv.get_shape())
    hidden = tf.nn.relu(conv + layer1_biases)
    pooled = tf.nn.max_pool(hidden, [1, 2, 2, 1], [1, 2, 2, 1], padding="SAME")
    print(pooled.get_shape())
    conv = tf.nn.conv2d(pooled, layer2_weights, [1, 1, 1, 1], padding="SAME")
    hidden = tf.nn.relu(conv + layer2_biases)
    pooled = tf.nn.max_pool(hidden, [1, 2, 2, 1], [1, 2, 2, 1], padding="SAME")
    print(pooled.get_shape())
    # Flatten each feature-map cuboid into one row for the dense layers.
    shape = pooled.get_shape().as_list()
    reshape = tf.reshape(pooled, [shape[0], shape[1] * shape[2] * shape[3]])
    hidden = tf.nn.relu(tf.matmul(reshape, layer3_weights) + layer3_biases)

    # Dropout is only part of the training sub-graph.
    if train:
      hidden = tf.nn.dropout(hidden, 0.5)

    return tf.matmul(hidden, layer4_weights) + layer4_biases

  # Training computation. The _v2 op replaces the deprecated
  # softmax_cross_entropy_with_logits; the labels are a placeholder, so no
  # gradient reaches them and the loss value is unchanged.
  logits = model(tf_train_dataset, True)
  loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits_v2(
      labels=tf_train_labels, logits=logits))

  # L2 penalty over every weight and bias of the network.
  regularizers = (tf.nn.l2_loss(layer1_weights) + tf.nn.l2_loss(layer1_biases) +
                  tf.nn.l2_loss(layer2_weights) + tf.nn.l2_loss(layer2_biases) +
                  tf.nn.l2_loss(layer3_weights) + tf.nn.l2_loss(layer3_biases) +
                  tf.nn.l2_loss(layer4_weights) + tf.nn.l2_loss(layer4_biases)
                 )
  # Add the regularization term to the loss.
  loss += 3e-4 * regularizers

  # Optimizer: set up a variable that's incremented once per batch and
  # controls the learning rate decay.
  batch = tf.Variable(0)
  # Decay once per epoch, using an exponential schedule starting at 0.01.
  learning_rate = tf.train.exponential_decay(
      0.01,                # Base learning rate.
      batch * batch_size,  # Current index into the dataset.
      train_labels.shape[0],          # Decay step.
      0.95,                # Decay rate.
      staircase=True)
  # Use simple momentum for the optimization.
  optimizer = tf.train.MomentumOptimizer(learning_rate,
                                         0.9).minimize(loss,
                                                       global_step=batch)

  # Predictions for the training, validation, and test data.
  train_prediction = tf.nn.softmax(logits)
  valid_prediction = tf.nn.softmax(model(tf_valid_dataset))
  test_prediction = tf.nn.softmax(model(tf_test_dataset))

(16, 28, 28, 25)
(16, 14, 14, 25)
(16, 7, 7, 25)
(10000, 28, 28, 25)
(10000, 14, 14, 25)
(10000, 7, 7, 25)
(10000, 28, 28, 25)
(10000, 14, 14, 25)
(10000, 7, 7, 25)


In [16]:
num_steps = 3001

with tf.Session(graph=graph) as session:
  # Give every variable in the graph its initial value before training.
  tf.global_variables_initializer().run()
  print('Initialized')
  for step in range(num_steps):
    # Slide a batch_size-wide window over the training set, wrapping around
    # before the window would run past the end of the data.
    offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
    batch_data = train_dataset[offset:offset + batch_size, ...]
    batch_labels = train_labels[offset:offset + batch_size, ...]
    feed_dict = {tf_train_dataset: batch_data, tf_train_labels: batch_labels}
    # One optimisation step; also fetch the loss and the batch predictions.
    _, l, predictions = session.run(
      fetches=[optimizer, loss, train_prediction], feed_dict=feed_dict)
    # Only report progress on every 50th step.
    if step % 50 != 0:
      continue
    print('Minibatch loss at step %d: %f' % (step, l))
    print('Minibatch accuracy: %.1f%%' % accuracy(predictions, batch_labels))
    print('Validation accuracy: %.1f%%' % accuracy(
      session.run(valid_prediction), valid_labels))
  # Final score on the held-out test set.
  print('Test accuracy: %.1f%%' % accuracy(
    session.run(test_prediction), test_labels))

Initialized
Minibatch loss at step 0: 6.097877
Minibatch accuracy: 6.2%
Validation accuracy: 10.7%
Minibatch loss at step 50: 2.249521
Minibatch accuracy: 31.2%
Validation accuracy: 27.0%
Minibatch loss at step 100: 2.272937
Minibatch accuracy: 12.5%
Validation accuracy: 20.9%
Minibatch loss at step 150: 2.127567
Minibatch accuracy: 12.5%
Validation accuracy: 26.5%
Minibatch loss at step 200: 1.814353
Minibatch accuracy: 37.5%
Validation accuracy: 50.2%
Minibatch loss at step 250: 1.801741
Minibatch accuracy: 25.0%
Validation accuracy: 69.4%
Minibatch loss at step 300: 1.705539
Minibatch accuracy: 31.2%
Validation accuracy: 70.8%
Minibatch loss at step 350: 1.253408
Minibatch accuracy: 68.8%
Validation accuracy: 72.8%
Minibatch loss at step 400: 1.145070
Minibatch accuracy: 62.5%
Validation accuracy: 68.3%
Minibatch loss at step 450: 0.960605
Minibatch accuracy: 75.0%
Validation accuracy: 78.8%
Minibatch loss at step 500: 1.549298
Minibatch accuracy: 81.2%
Validation accuracy: 78.9%
Mi

---
Problem 2
---------

Try to get the best performance you can using a convolutional net. Look for example at the classic [LeNet5](http://yann.lecun.com/exdb/lenet/) architecture, adding Dropout, and/or adding learning rate decay.

---

For reference: I did not come up with this solution myself; I copied it from the following solution that I found online.

https://github.com/sjuvekar/Udacity-Deep-Learning/blob/master/L3/4_convolutions.ipynb

In [19]:
import time
import sys

# Training minibatch size; lenet() reads this module-level name.
batch_size=64
# NOTE(review): PIXEL_DEPTH and VALIDATION_SIZE are not referenced anywhere in
# the visible code - presumably left over from the copied example.
PIXEL_DEPTH = 255
VALIDATION_SIZE = 5000  # Size of the validation set.
SEED = 66478  # Set to None for random seed.
NUM_EPOCHS = 10  # Number of passes over the training set.
EVAL_BATCH_SIZE = 64  # Batch size used when evaluating in batches.
EVAL_FREQUENCY = 100  # Number of steps between evaluations.

def lenet(argv=None):  # pylint: disable=unused-argument
  """Build, train and evaluate a LeNet-style convolutional network.

  The network is two (5x5 convolution, ReLU, 2x2 max pool) stages followed by
  a 512-unit fully connected layer (with 50% dropout while training) and a
  linear output layer. It is trained with momentum SGD on minibatches of the
  module-level `train_dataset` / `train_labels`, using an exponentially
  decaying learning rate and L2 regularization of the fully connected
  parameters. Progress is printed every EVAL_FREQUENCY steps and the test
  accuracy is printed once training has finished.

  The model is built in its own `tf.Graph`, so calling this function again
  (for example by re-running the cell) does not add more nodes and variables
  to the default graph.

  Args:
    argv: Unused.

  Returns:
    None. All results are reported through `print`.
  """
  train_size = train_labels.shape[0]

  # The stride-1 'SAME' convolutions keep the spatial size and each stride-2
  # 'SAME' max pool outputs ceil(n / 2) rows and columns, so after two pools
  # the feature map is ceil(image_size / 4) wide. (The previous
  # `image_size // 4 * image_size // 4` only gave the right value because 28
  # is a multiple of 4: `*` and `//` bind left to right.)
  pooled_size = (image_size + 3) // 4

  with tf.Graph().as_default():
    # This is where training samples and labels are fed to the graph.
    # These placeholder nodes will be fed a batch of training data at each
    # training step using the {feed_dict} argument to the Run() call below.
    train_data_node = tf.placeholder(
        tf.float32,
        shape=(batch_size, image_size, image_size, num_channels))
    train_labels_node = tf.placeholder(tf.float32,
                                       shape=(batch_size, num_labels))
    eval_data = tf.placeholder(
        tf.float32,
        shape=(EVAL_BATCH_SIZE, image_size, image_size, num_channels))

    # The variables below hold all the trainable weights. They are passed an
    # initial value which will be assigned when we call:
    # {tf.global_variables_initializer().run()}
    conv1_weights = tf.Variable(
        tf.truncated_normal([5, 5, num_channels, 32],  # 5x5 filter, depth 32.
                            stddev=0.1,
                            seed=SEED))
    conv1_biases = tf.Variable(tf.zeros([32]))
    conv2_weights = tf.Variable(
        tf.truncated_normal([5, 5, 32, 64],
                            stddev=0.1,
                            seed=SEED))
    conv2_biases = tf.Variable(tf.constant(0.1, shape=[64]))
    fc1_weights = tf.Variable(  # fully connected, depth 512.
        tf.truncated_normal(
            [pooled_size * pooled_size * 64, 512],
            stddev=0.1,
            seed=SEED))
    fc1_biases = tf.Variable(tf.constant(0.1, shape=[512]))
    fc2_weights = tf.Variable(
        tf.truncated_normal([512, num_labels],
                            stddev=0.1,
                            seed=SEED))
    fc2_biases = tf.Variable(tf.constant(0.1, shape=[num_labels]))

    # We will replicate the model structure for the training subgraph, as well
    # as the evaluation subgraphs, while sharing the trainable parameters.
    def model(data, train=False):
      """The Model definition."""
      # 2D convolution, with 'SAME' padding (i.e. the output feature map has
      # the same size as the input). Note that {strides} is a 4D array whose
      # shape matches the data layout: [image index, y, x, depth].
      conv = tf.nn.conv2d(data,
                          conv1_weights,
                          strides=[1, 1, 1, 1],
                          padding='SAME')
      # Bias and rectified linear non-linearity.
      relu = tf.nn.relu(tf.nn.bias_add(conv, conv1_biases))
      # Max pooling. The kernel size spec {ksize} also follows the layout of
      # the data. Here we have a pooling window of 2, and a stride of 2.
      pool = tf.nn.max_pool(relu,
                            ksize=[1, 2, 2, 1],
                            strides=[1, 2, 2, 1],
                            padding='SAME')
      conv = tf.nn.conv2d(pool,
                          conv2_weights,
                          strides=[1, 1, 1, 1],
                          padding='SAME')
      relu = tf.nn.relu(tf.nn.bias_add(conv, conv2_biases))
      pool = tf.nn.max_pool(relu,
                            ksize=[1, 2, 2, 1],
                            strides=[1, 2, 2, 1],
                            padding='SAME')
      # Reshape the feature map cuboid into a 2D matrix to feed it to the
      # fully connected layers.
      pool_shape = pool.get_shape().as_list()
      reshape = tf.reshape(
          pool,
          [pool_shape[0], pool_shape[1] * pool_shape[2] * pool_shape[3]])
      # Fully connected layer. Note that the '+' operation automatically
      # broadcasts the biases.
      hidden = tf.nn.relu(tf.matmul(reshape, fc1_weights) + fc1_biases)
      # Add a 50% dropout during training only. Dropout also scales
      # activations such that no rescaling is needed at evaluation time.
      if train:
        hidden = tf.nn.dropout(hidden, 0.5, seed=SEED)
      return tf.matmul(hidden, fc2_weights) + fc2_biases

    # Training computation: logits + cross-entropy loss. The _v2 op replaces
    # the deprecated softmax_cross_entropy_with_logits; the labels are a
    # placeholder, so no gradient reaches them and the loss is unchanged.
    logits = model(train_data_node, True)
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(
        logits=logits, labels=train_labels_node))

    # L2 regularization for the fully connected parameters.
    regularizers = (tf.nn.l2_loss(fc1_weights) + tf.nn.l2_loss(fc1_biases) +
                    tf.nn.l2_loss(fc2_weights) + tf.nn.l2_loss(fc2_biases))
    # Add the regularization term to the loss.
    loss += 5e-4 * regularizers

    # Optimizer: set up a variable that's incremented once per batch and
    # controls the learning rate decay.
    batch = tf.Variable(0)
    # Decay once per epoch, using an exponential schedule starting at 0.01.
    learning_rate = tf.train.exponential_decay(
        0.01,                # Base learning rate.
        batch * batch_size,  # Current index into the dataset.
        train_size,          # Decay step.
        0.95,                # Decay rate.
        staircase=True)
    # Use simple momentum for the optimization.
    optimizer = tf.train.MomentumOptimizer(learning_rate,
                                           0.9).minimize(loss,
                                                         global_step=batch)

    # Predictions for the current training minibatch.
    train_prediction = tf.nn.softmax(logits)

    # Predictions for the test and validation, which we'll compute less often.
    eval_prediction = tf.nn.softmax(model(eval_data))

    # Small utility function to evaluate a dataset by feeding batches of data
    # to {eval_data} and pulling the results from {eval_prediction}.
    # Saves memory and enables this to run on smaller GPUs.
    def eval_in_batches(data, sess):
      """Get all predictions for a dataset by running it in small batches."""
      size = data.shape[0]
      if size < EVAL_BATCH_SIZE:
        raise ValueError("batch size for evals larger than dataset: %d" % size)
      predictions = np.ndarray(shape=(size, num_labels), dtype=np.float32)
      for begin in range(0, size, EVAL_BATCH_SIZE):
        end = begin + EVAL_BATCH_SIZE
        if end <= size:
          predictions[begin:end, :] = sess.run(
              eval_prediction,
              feed_dict={eval_data: data[begin:end, ...]})
        else:
          # Last, partial batch: evaluate the final EVAL_BATCH_SIZE samples
          # and keep only the rows that have not been filled in yet.
          batch_predictions = sess.run(
              eval_prediction,
              feed_dict={eval_data: data[-EVAL_BATCH_SIZE:, ...]})
          predictions[begin:, :] = batch_predictions[begin - size:, :]
      return predictions

    # Create a local session to run the training.
    start_time = time.time()
    with tf.Session() as sess:
      # Run all the initializers to prepare the trainable parameters.
      tf.global_variables_initializer().run()
      print('Initialized!')
      # Loop through training steps.
      for step in range(int(NUM_EPOCHS * train_size) // batch_size):
        # Compute the offset of the current minibatch in the data.
        # Note that we could use better randomization across epochs.
        offset = (step * batch_size) % (train_size - batch_size)
        batch_data = train_dataset[offset:(offset + batch_size), ...]
        batch_labels = train_labels[offset:(offset + batch_size)]
        # This dictionary maps the batch data (as a numpy array) to the
        # node in the graph it should be fed to.
        feed_dict = {train_data_node: batch_data,
                     train_labels_node: batch_labels}
        # Run the graph and fetch some of the nodes.
        _, l, lr, predictions = sess.run(
            [optimizer, loss, learning_rate, train_prediction],
            feed_dict=feed_dict)
        if step % EVAL_FREQUENCY == 0:
          elapsed_time = time.time() - start_time
          start_time = time.time()
          print('Step %d (epoch %.2f), %.1f ms' %
                (step, float(step) * batch_size / train_size,
                 1000 * elapsed_time / EVAL_FREQUENCY))
          print('Minibatch loss: %.3f, learning rate: %.6f' % (l, lr))
          print('Minibatch accuracy: %.1f%%' % accuracy(predictions, batch_labels))
          print('Validation accuracy: %.1f%%' % accuracy(
              eval_in_batches(valid_dataset, sess), valid_labels))
          sys.stdout.flush()
      # Finally print the result!
      test_accuracy = accuracy(eval_in_batches(test_dataset, sess), test_labels)
      print('Test accuracy: %.1f%%' % test_accuracy)
    
lenet()

Initialized!
Step 0 (epoch 0.00), 4.3 ms
Minibatch loss: 9.657, learning rate: 0.010000
Minibatch accuracy: 18.8%
Validation accuracy: 18.9%
Step 100 (epoch 0.03), 150.5 ms
Minibatch loss: 3.617, learning rate: 0.010000
Minibatch accuracy: 87.5%
Validation accuracy: 81.2%
Step 200 (epoch 0.06), 149.9 ms
Minibatch loss: 3.594, learning rate: 0.010000
Minibatch accuracy: 85.9%
Validation accuracy: 83.0%
Step 300 (epoch 0.10), 148.6 ms
Minibatch loss: 3.852, learning rate: 0.010000
Minibatch accuracy: 70.3%
Validation accuracy: 83.9%
Step 400 (epoch 0.13), 148.1 ms
Minibatch loss: 3.937, learning rate: 0.010000
Minibatch accuracy: 78.1%
Validation accuracy: 85.1%
Step 500 (epoch 0.16), 148.1 ms
Minibatch loss: 3.345, learning rate: 0.010000
Minibatch accuracy: 92.2%
Validation accuracy: 85.5%
Step 600 (epoch 0.19), 148.3 ms
Minibatch loss: 3.478, learning rate: 0.010000
Minibatch accuracy: 84.4%
Validation accuracy: 86.2%
Step 700 (epoch 0.22), 150.7 ms
Minibatch loss: 3.326, learning rat

Validation accuracy: 90.4%
Step 6200 (epoch 1.98), 149.0 ms
Minibatch loss: 2.135, learning rate: 0.009500
Minibatch accuracy: 90.6%
Validation accuracy: 90.0%
Step 6300 (epoch 2.02), 148.2 ms
Minibatch loss: 2.128, learning rate: 0.009025
Minibatch accuracy: 89.1%
Validation accuracy: 90.0%
Step 6400 (epoch 2.05), 148.3 ms
Minibatch loss: 2.078, learning rate: 0.009025
Minibatch accuracy: 89.1%
Validation accuracy: 90.0%
Step 6500 (epoch 2.08), 153.2 ms
Minibatch loss: 1.843, learning rate: 0.009025
Minibatch accuracy: 93.8%
Validation accuracy: 90.3%
Step 6600 (epoch 2.11), 151.9 ms
Minibatch loss: 1.895, learning rate: 0.009025
Minibatch accuracy: 92.2%
Validation accuracy: 90.2%
Step 6700 (epoch 2.14), 149.6 ms
Minibatch loss: 1.985, learning rate: 0.009025
Minibatch accuracy: 90.6%
Validation accuracy: 90.5%
Step 6800 (epoch 2.18), 149.7 ms
Minibatch loss: 1.768, learning rate: 0.009025
Minibatch accuracy: 95.3%
Validation accuracy: 90.2%
Step 6900 (epoch 2.21), 149.9 ms
Minibatch

Validation accuracy: 91.2%
Step 12400 (epoch 3.97), 148.2 ms
Minibatch loss: 1.341, learning rate: 0.008574
Minibatch accuracy: 89.1%
Validation accuracy: 91.4%
Step 12500 (epoch 4.00), 149.1 ms
Minibatch loss: 1.112, learning rate: 0.008145
Minibatch accuracy: 100.0%
Validation accuracy: 91.1%
Step 12600 (epoch 4.03), 148.4 ms
Minibatch loss: 1.344, learning rate: 0.008145
Minibatch accuracy: 87.5%
Validation accuracy: 91.2%
Step 12700 (epoch 4.06), 150.2 ms
Minibatch loss: 1.243, learning rate: 0.008145
Minibatch accuracy: 90.6%
Validation accuracy: 91.3%
Step 12800 (epoch 4.10), 148.6 ms
Minibatch loss: 1.363, learning rate: 0.008145
Minibatch accuracy: 89.1%
Validation accuracy: 91.3%
Step 12900 (epoch 4.13), 149.8 ms
Minibatch loss: 1.081, learning rate: 0.008145
Minibatch accuracy: 96.9%
Validation accuracy: 91.2%
Step 13000 (epoch 4.16), 153.1 ms
Minibatch loss: 1.375, learning rate: 0.008145
Minibatch accuracy: 89.1%
Validation accuracy: 91.2%
Step 13100 (epoch 4.19), 156.4 ms


Step 18500 (epoch 5.92), 154.6 ms
Minibatch loss: 0.819, learning rate: 0.007738
Minibatch accuracy: 95.3%
Validation accuracy: 91.8%
Step 18600 (epoch 5.95), 153.5 ms
Minibatch loss: 0.836, learning rate: 0.007738
Minibatch accuracy: 92.2%
Validation accuracy: 92.1%
Step 18700 (epoch 5.98), 156.8 ms
Minibatch loss: 0.759, learning rate: 0.007738
Minibatch accuracy: 95.3%
Validation accuracy: 91.9%
Step 18800 (epoch 6.02), 157.1 ms
Minibatch loss: 0.814, learning rate: 0.007351
Minibatch accuracy: 92.2%
Validation accuracy: 91.9%
Step 18900 (epoch 6.05), 155.1 ms
Minibatch loss: 0.908, learning rate: 0.007351
Minibatch accuracy: 93.8%
Validation accuracy: 91.7%
Step 19000 (epoch 6.08), 155.2 ms
Minibatch loss: 0.764, learning rate: 0.007351
Minibatch accuracy: 95.3%
Validation accuracy: 91.7%
Step 19100 (epoch 6.11), 156.6 ms
Minibatch loss: 0.984, learning rate: 0.007351
Minibatch accuracy: 89.1%
Validation accuracy: 91.8%
Step 19200 (epoch 6.14), 169.1 ms
Minibatch loss: 0.784, learn

Validation accuracy: 92.1%
Step 24700 (epoch 7.90), 151.0 ms
Minibatch loss: 0.563, learning rate: 0.006983
Minibatch accuracy: 96.9%
Validation accuracy: 92.4%
Step 24800 (epoch 7.94), 153.0 ms
Minibatch loss: 0.739, learning rate: 0.006983
Minibatch accuracy: 87.5%
Validation accuracy: 92.3%
Step 24900 (epoch 7.97), 151.0 ms
Minibatch loss: 0.606, learning rate: 0.006983
Minibatch accuracy: 92.2%
Validation accuracy: 92.4%
Step 25000 (epoch 8.00), 156.4 ms
Minibatch loss: 0.686, learning rate: 0.006634
Minibatch accuracy: 90.6%
Validation accuracy: 92.2%
Step 25100 (epoch 8.03), 151.7 ms
Minibatch loss: 0.732, learning rate: 0.006634
Minibatch accuracy: 95.3%
Validation accuracy: 92.2%
Step 25200 (epoch 8.06), 162.3 ms
Minibatch loss: 0.555, learning rate: 0.006634
Minibatch accuracy: 95.3%
Validation accuracy: 92.3%
Step 25300 (epoch 8.10), 170.6 ms
Minibatch loss: 0.604, learning rate: 0.006634
Minibatch accuracy: 95.3%
Validation accuracy: 92.4%
Step 25400 (epoch 8.13), 151.9 ms
M

Step 30800 (epoch 9.86), 159.8 ms
Minibatch loss: 0.485, learning rate: 0.006302
Minibatch accuracy: 93.8%
Validation accuracy: 92.7%
Step 30900 (epoch 9.89), 161.2 ms
Minibatch loss: 0.580, learning rate: 0.006302
Minibatch accuracy: 90.6%
Validation accuracy: 92.5%
Step 31000 (epoch 9.92), 154.4 ms
Minibatch loss: 0.557, learning rate: 0.006302
Minibatch accuracy: 93.8%
Validation accuracy: 92.5%
Step 31100 (epoch 9.95), 153.0 ms
Minibatch loss: 0.670, learning rate: 0.006302
Minibatch accuracy: 90.6%
Validation accuracy: 92.6%
Step 31200 (epoch 9.98), 154.4 ms
Minibatch loss: 0.427, learning rate: 0.006302
Minibatch accuracy: 96.9%
Validation accuracy: 92.6%
Test accuracy: 97.2%
