In [2]:
# Fetch the MNIST data set through TensorFlow's tutorial helper; the files
# live under MNIST_data/. one_hot=True requests one-hot encoded labels.
from tensorflow.examples.tutorials.mnist import input_data

mnist = input_data.read_data_sets(
    "MNIST_data/",
    one_hot=True
)

Extracting MNIST_data/train-images-idx3-ubyte.gz
Extracting MNIST_data/train-labels-idx1-ubyte.gz
Extracting MNIST_data/t10k-images-idx3-ubyte.gz
Extracting MNIST_data/t10k-labels-idx1-ubyte.gz


## Know your data and terms

Each data point consists of an image and its label.

### Images

The images are called `xs`. They are the handwritten individual digits. 

#### Flattening

Each pixel is represented by a floating point number describing how dark that pixel is (0 being white and 1 being black). The images are also `flattened`: the 2D grid of pixels is unrolled into a single 1D array.

The images are 28 pixels wide and 28 pixels tall, so to flatten an image we remove its 2D structure and create an array of length 784 (784=28\*28). Such an array is a vector in a 784-dimensional vector space.

### Labels

The labels are called `ys`. They correspond to each image and identify what the digit really is.

### Tensors

A tensor is an n-dimensional array. In this case, `mnist.train.images` is a tensor that has dimensions `[55000, 784]` because we have 55,000 images, and each image is represented by a 784-dimensional vector (array, if you will).

### One-Hot Vectors

The labels are represented as one-hot vectors, which are a way of encoding a value in an array: every entry is `0` except for a single `1` at the index of the value being encoded. For example, each of our labels is going to be one of the digits `0-9`. So, a one-hot vector for `0` is: `[1, 0, 0, 0, 0, 0, 0, 0, 0, 0]` and a one-hot vector for `5` is: `[0, 0, 0, 0, 0, 1, 0, 0, 0, 0]`.

`mnist.train.labels` is a tensor that has dimensions `[55000, 10]` because we have 55,000 images, and each image has a one-hot vector that represents the label of that image.

## Know your model

### Softmax Regression

This tutorial uses softmax regression to classify what the digits are. For each image, it gives the probability of that image being each of the ten digits. "For example, our model might look at a picture of a nine and be 80% sure it's a nine, but give a 5% chance to it being an eight (because of the top loop) and a bit of probability to all the others because it isn't sure."

In [12]:
import tensorflow as tf

# Open an interactive session; later cells call .run() / .eval() without
# naming a session, so they depend on this one being active.
sess = tf.InteractiveSession()

# Build the variable-initializer op and execute it in the session.
# NOTE(review): the op presumably only covers variables that already exist
# when it is built -- confirm every tf.Variable cell has run before this one.
init = tf.initialize_all_variables()
sess.run(init)

In [4]:
# `x` is a placeholder: a slot in the graph that is filled with real data
# (through feed_dict) when the graph runs. It stands for a 2-D float32
# tensor of flattened images. The shape [None, 784] reads as "any number of
# rows, 784 columns", so a batch of any size can be fed in.
x = tf.placeholder(
    tf.float32,
    shape=[None, 784]
)

In [5]:
# tf.Variable holds the model parameters that training modifies.
# W is the weight matrix (784 inputs x 10 classes) and b is the bias vector
# (one entry per class). Both start out filled with zeros.
W = tf.Variable(
    tf.zeros([784, 10])
)
b = tf.Variable(
    tf.zeros([10])
)

In [6]:
# The softmax model itself: matrix-multiply the inputs x by the weights W
# (tf.matmul), add the bias b, then pass the result through softmax.
y = tf.nn.softmax(
    tf.matmul(x, W) + b
)

In [7]:
# Cost function: cross-entropy between the true labels y_ and the model
# output y. y_ is a placeholder that is fed with the one-hot labels
# (any number of rows, 10 columns).
y_ = tf.placeholder(tf.float32, [None, 10])
# Clip the probabilities away from zero before taking the log: log(0) is
# -inf and 0 * -inf is NaN, which would poison the loss and every gradient.
# For probabilities >= 1e-10 the result is unchanged.
cross_entropy = tf.reduce_mean(
    -tf.reduce_sum(
        y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0)),
        reduction_indices=[1]  # sum over the 10 classes of each example
    )
)

In [22]:
# The optimizer adds the operations that modify the variables so as to
# reduce the cost function handed to minimize(). GradientDescentOptimizer
# (learning rate 0.5 here) is one of several optimization algorithms that
# could be used in its place.
train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy)
# Initialize the variables here, after W and b have been created. When the
# notebook is run top to bottom, the initializer built in the session cell
# predates these variables, so training would otherwise read uninitialized
# values. Running it again is harmless: training starts from scratch anyway.
sess.run(tf.initialize_all_variables())
# Run 1000 training steps, each on a fresh mini-batch of 50 examples;
# batch[0] holds the images and batch[1] the matching one-hot labels.
for i in range(1000):
    batch = mnist.train.next_batch(50)
    train_step.run(feed_dict={x: batch[0], y_: batch[1]})

In [25]:
# A prediction is correct when the index of the largest model output
# matches the index of the 1 in the one-hot label.
correct_prediction = tf.equal(
    tf.argmax(y, 1),
    tf.argmax(y_, 1)
)
# Cast the booleans to floats and average them to get the fraction correct.
accuracy = tf.reduce_mean(
    tf.cast(correct_prediction, tf.float32)
)
# Evaluate on the held-out test set.
print(
    "Accuracy is:",
    accuracy.eval(feed_dict={x: mnist.test.images, y_: mnist.test.labels})
)

Accuracy is: 0.9181


## Multilayer Convolutional Network

An improved version of the above network

In [26]:
# Weights start as small random noise rather than identical values: this
# breaks the symmetry between units and helps avoid zero gradients.
def weight_variable(shape):
    """Create a weight variable of the requested shape.

    The initial values are drawn from a truncated normal distribution with
    a standard deviation of 0.1.
    """
    return tf.Variable(tf.truncated_normal(shape, stddev=0.1))
def bias_variable(shape):
    """Create a bias variable of the requested shape.

    Every entry starts at the constant value 0.1.
    """
    return tf.Variable(tf.constant(0.1, shape=shape))

In [27]:
def conv2d(x, W):
    """Convolve input `x` with filter `W`.

    Uses a stride of 1 in every dimension and "SAME" padding.
    """
    unit_strides = [1, 1, 1, 1]
    return tf.nn.conv2d(x, W, strides=unit_strides, padding="SAME")
def max_pool_2x2(x):
    """Max-pool `x` with a 2x2 window.

    The window moves with a stride of 2 along the two middle dimensions
    and "SAME" padding is used.
    """
    window = [1, 2, 2, 1]
    step = [1, 2, 2, 1]
    return tf.nn.max_pool(x, ksize=window, strides=step, padding="SAME")

In [28]:
# First convolutional layer
# Restore the 2D structure of the flattened input: -1 lets the batch size
# be inferred, followed by 28 x 28 pixels and 1 channel.
x_image = tf.reshape(x, [-1, 28, 28, 1])

# The layer computes 32 features (32 output channels) for each 5x5 patch
# of its single input channel; there is one bias per output channel.
W_conv1 = weight_variable([5, 5, 1, 32])
b_conv1 = bias_variable([32])

# Convolve, add the bias, apply ReLU, then max-pool with a 2x2 window.
h_conv1 = tf.nn.relu(
    conv2d(x_image, W_conv1) + b_conv1
)
h_pool1 = max_pool_2x2(h_conv1)

In [31]:
# Second convolutional layer
# This layer computes 64 features (64 output channels) for each 5x5 patch,
# reading the 32 channels produced by the first layer.
W_conv2 = weight_variable([5, 5, 32, 64])
b_conv2 = bias_variable([64])

# Same recipe as the first layer: convolve, add bias, ReLU, 2x2 max-pool.
h_conv2 = tf.nn.relu(
    conv2d(h_pool1, W_conv2) + b_conv2
)
h_pool2 = max_pool_2x2(h_conv2)

In [32]:
# Densely connected layer
# By now the image size has been reduced to 7x7 (with 64 channels). This
# layer takes all of that as input and processes it with 1024 neurons.
W_fc1 = weight_variable([7 * 7 * 64, 1024])
b_fc1 = bias_variable([1024])

# Flatten the pooling output into a batch of vectors (-1 = inferred batch
# size), multiply by the weight matrix, add the bias, and apply ReLU.
h_pool2_flat = tf.reshape(h_pool2, [-1, 7 * 7 * 64])
h_fc1 = tf.nn.relu(
    tf.matmul(h_pool2_flat, W_fc1) + b_fc1
)

In [35]:
# Dropout -- used here to reduce overfitting.
# keep_prob is a placeholder so that a different value can be fed for
# training (0.5 below) and for evaluation (1.0 below).
keep_prob = tf.placeholder(tf.float32)
h_fc1_drop = tf.nn.dropout(
    h_fc1,
    keep_prob
)

In [36]:
# Readout layer: a softmax regression on top of the 1024 dropout-regularized
# features, producing one probability per digit class.
W_fc2 = weight_variable([1024, 10])
b_fc2 = bias_variable([10])

y_conv = tf.nn.softmax(
    tf.matmul(h_fc1_drop, W_fc2) + b_fc2
)

In [38]:
# Training and Evaluating
# Cross-entropy between the one-hot labels y_ and the network output y_conv.
# The probabilities are clipped away from zero before the log: log(0) is
# -inf and 0 * -inf is NaN, which would poison the loss and every gradient.
# For probabilities >= 1e-10 the result is unchanged.
cross_entropy = tf.reduce_mean(
    -tf.reduce_sum(
        y_ * tf.log(tf.clip_by_value(y_conv, 1e-10, 1.0)),
        reduction_indices=[1]  # sum over the 10 classes of each example
    )
)
# this uses a different method of optimizing than above. The ADAM optimizer
# is more sophisticated than the Gradient Descent Optimizer
train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)
# A prediction is correct when the most probable class matches the label;
# accuracy is the fraction of correct predictions.
correct_prediction = tf.equal(tf.argmax(y_conv, 1), tf.argmax(y_, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
# Initialize only now, after every variable of the network (and whatever
# variables the optimizer added) has been created.
sess.run(tf.initialize_all_variables())
for i in range(20000):
    batch = mnist.train.next_batch(50)
    if i % 100 == 0:
        # Report accuracy on the current mini-batch every 100 steps, with
        # dropout switched off (keep_prob 1.0).
        train_accuracy = accuracy.eval(feed_dict= {
                x: batch[0], y_: batch[1], keep_prob: 1.0
            })
        print("Step:", i, "Training accuracy:", train_accuracy)
    # One training step with dropout active (keep_prob 0.5).
    train_step.run(feed_dict={x: batch[0], y_: batch[1], keep_prob: 0.5})
# Final evaluation on the test set, again without dropout.
print("Test accuracy: %g"%accuracy.eval(feed_dict={
            x: mnist.test.images, y_: mnist.test.labels, keep_prob: 1.0
        }))

Step: 0 Training accuracy: 0.18
Step: 100 Training accuracy: 0.96
Step: 200 Training accuracy: 0.92
Step: 300 Training accuracy: 0.9
Step: 400 Training accuracy: 0.98
Step: 500 Training accuracy: 0.96
Step: 600 Training accuracy: 0.92
Step: 700 Training accuracy: 0.92
Step: 800 Training accuracy: 0.96
Step: 900 Training accuracy: 0.98
Step: 1000 Training accuracy: 0.98
Step: 1100 Training accuracy: 0.96
Step: 1200 Training accuracy: 1.0
Step: 1300 Training accuracy: 0.94
Step: 1400 Training accuracy: 1.0
Step: 1500 Training accuracy: 1.0
Step: 1600 Training accuracy: 0.92
Step: 1700 Training accuracy: 0.96
Step: 1800 Training accuracy: 1.0
Step: 1900 Training accuracy: 0.96
Step: 2000 Training accuracy: 0.98
Step: 2100 Training accuracy: 0.96
Step: 2200 Training accuracy: 0.96
Step: 2300 Training accuracy: 0.96
Step: 2400 Training accuracy: 0.98
Step: 2500 Training accuracy: 0.96
Step: 2600 Training accuracy: 0.98
Step: 2700 Training accuracy: 0.98
Step: 2800 Training accuracy: 0.98
St

KeyboardInterrupt: 