### Train CNN on CIFAR10 Dataset

In [None]:
import numpy as np
import os
import sys
import tensorflow as tf
import numpy as np
import os
import tarfile
import pickle
import subprocess
import sys
from sklearn.datasets import fetch_mldata
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from urllib.request import urlretrieve

#### Parameters
Here we define the parameters for our training. We will be using simple stochastic gradient descent with momentum. We are using standard values for the learning rate and momentum. We have set the number of epochs to 3 since this is designed to run on Azure Notebooks with limited computational resources. If you run this on your own machine or a VM with a GPU you can increase this to 10 or more so you get a better idea of how the model trains. The batch size is the number of examples to group together in a batch. The size of your batch may be limited by the memory available on your GPU if you run this notebook on a GPU enabled machine.

In [None]:
# Parameters
EPOCHS = 3  # number of passes the training loop makes over the training set
BATCHSIZE = 64  # default minibatch size used by minibatch_from
LR = 0.01  # default learning rate passed to the momentum optimizer
MOMENTUM = 0.9  # default momentum passed to the momentum optimizer
N_CLASSES = 10 # There are 10 classes in the CIFAR10 dataset

#### Functions for downloading and preparing the data
The CIFAR10 dataset comes as a number of pickle files: 5 for training and 1 for testing. These pickle files contain the data as numpy arrays, which is convenient since we don't have to transform the images into numpy arrays ourselves. Each row of the dataset is a flattened 3x32x32 image. The dimensions represent the color channels, height and width of the image, in that order.

In [None]:
def read_pickle(src):
    """Load and return the object pickled in the file at ``src``.

    The file is opened in binary mode and unpickled with
    ``encoding='latin1'``.

    NOTE: unpickling can execute arbitrary code, so only call this on
    files that come from a trusted source.
    """
    with open(src, 'rb') as handle:
        return pickle.load(handle, encoding='latin1')

In [None]:
def process_cifar():
    """Read the CIFAR-10 pickle batches from ``./cifar-10-batches-py``.

    Returns:
        A tuple ``(x_train, y_train, x_test, y_test)``. The training
        arrays are the 'data' and 'labels' entries of ``data_batch_1``
        through ``data_batch_5`` concatenated in that order; the test
        arrays are the 'data' entry and the 'labels' entry (converted to
        an array) of ``test_batch``.
    """
    print('Preparing train set...')
    train_data = []
    train_labels = []
    for batch_no in range(1, 6):
        batch = read_pickle('./cifar-10-batches-py/data_batch_{0}'.format(batch_no))
        train_data.append(batch['data'])
        train_labels.append(batch['labels'])
    x_train = np.concatenate(train_data)
    y_train = np.concatenate(train_labels)

    print('Preparing test set...')
    test_batch = read_pickle('./cifar-10-batches-py/test_batch')

    return x_train, y_train, test_batch['data'], np.asarray(test_batch['labels'])

In [None]:
def load_cifar(src="http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz"):
    """Load the CIFAR10 dataset, downloading it first if necessary.

    The batch files are read with ``process_cifar``. If that raises
    ``FileNotFoundError``, the archive at ``src`` is downloaded to
    ``./delete.me``, extracted into the current directory and the
    downloaded archive is removed before the batches are read again.

    Args:
        src: URL of the CIFAR-10 python tarball.

    Returns:
        Whatever ``process_cifar`` returns:
        ``(x_train, y_train, x_test, y_test)``.
    """
    try:
        return process_cifar()
    except FileNotFoundError:
        print('Data does not exist. Downloading ' + src)

    archive = './delete.me'
    try:
        urlretrieve(src, archive)
        print('Extracting files...')
        # NOTE(review): extractall() trusts the member paths stored in the
        # archive; only point `src` at a source you trust.
        with tarfile.open(archive) as tar:
            tar.extractall()
    finally:
        # Remove the temporary download even when the download or the
        # extraction fails, so a partial archive is never left behind.
        if os.path.exists(archive):
            os.remove(archive)
    return process_cifar()

In order to be able to use the data we have to do some preprocessing.
* First we need to scale the pixel value to be between 0 and 1
* Then we need to reshape each flat row vector into a 3-dimensional 32x32x3 array with the color channels last
* Then convert the data to the appropriate data type so that it is treated properly by the following functions but also to reduce memory overhead

In [None]:
def prepare_cifar(x_train, y_train, x_test, y_test):
    """Scale, reshape and cast the raw CIFAR-10 arrays for the CNN.

    Args:
        x_train, x_test: pixel arrays whose rows can be reshaped to
            (3, 32, 32), i.e. channel-first images. Values are assumed
            to lie in 0..255 since they are divided by 255.
        y_train, y_test: label arrays.

    Returns:
        A tuple ``(x_train, y_train, x_test, y_test)`` where the images
        are float32 arrays of shape (N, 32, 32, 3) in height, width,
        channel order, and the labels are int32 arrays.
    """
    def _to_channels_last(pixels):
        # Cast before scaling so the intermediate array is float32 rather
        # than a float64 copy twice the size.
        scaled = pixels.astype(np.float32) / 255.0
        # (N, C, H, W) -> (N, H, W, C). Swapping axes 1 and 3 instead would
        # also exchange H and W and therefore transpose every image.
        return np.transpose(scaled.reshape(-1, 3, 32, 32), (0, 2, 3, 1))

    return (_to_channels_last(x_train),
            y_train.astype(np.int32),
            _to_channels_last(x_test),
            y_test.astype(np.int32))

#### Functions for serving the data to the CNN 
The data will be fed to the NN in minibatches. A minibatch is a set of examples taken from the training set. 

In [None]:
def minibatch_from(X, y, batchsize=BATCHSIZE, shuffle=False):
    """Generate consecutive ``(X, y)`` minibatches.

    Args:
        X: examples, sliced along the first axis.
        y: labels; must have the same length as ``X``.
        batchsize: maximum number of examples per minibatch; the last
            minibatch is shorter when the length is not a multiple.
        shuffle: when true, ``X`` and ``y`` are passed through
            ``shuffle_data`` before being sliced.

    Yields:
        Tuples ``(X_batch, y_batch)`` of matching slices.

    Raises:
        Exception: if ``X`` and ``y`` differ in length. As this is a
            generator, the check runs when iteration starts.
    """
    n_examples = len(X)
    if n_examples != len(y):
        raise Exception("The length of X {} and y {} don't match".format(n_examples, len(y)))

    if shuffle:
        X, y = shuffle_data(X, y)

    for start in range(0, n_examples, batchsize):
        stop = start + batchsize
        yield X[start:stop], y[start:stop]

It is desirable during training to shuffle the data so that each minibatch isn't always the same. During evaluation or testing though this is not necessary.

In [None]:
def shuffle_data(X, y):
    """Return ``X`` and ``y`` reordered by one shared random permutation.

    The permutation is drawn from numpy's global random state, and the
    same index order is applied to both arrays so that examples stay
    aligned with their labels.
    """
    order = np.arange(len(X))
    np.random.shuffle(order)  # shuffles the index array in place
    shuffled_X = X[order]
    shuffled_y = y[order]
    return shuffled_X, shuffled_y

#### Creating and initialising the model
Below is the definition of our model. It isn't very deep and only has 2 convolutional layers. Both convolution layers have 50 kernels, each with a dimension of 3 by 3, and both use relu activation. The output of the second convolution layer is then passed through 2 by 2 max pooling. After that we need to reshape our Tensor into a 2D matrix with the first dimension being the size of our batch. We then pass it into a fully connected layer of 512 nodes, which also uses relu activation. Finally we introduce our final dense layer which has 10 outputs, one for each of our classes.

In [None]:
def create_model(model_input, n_classes=N_CLASSES, data_format='channels_last'):
    """Build the CNN graph and return its output logits.

    Layers, in order: two 3x3 convolutions with 50 filters each ('same'
    padding, relu), a 2x2 max pool with stride 2, a reshape to
    ``[-1, 50*16*16]``, a 512-unit dense layer with relu, and a final
    dense layer named 'output' with ``n_classes`` units and no activation.

    Args:
        model_input: input tensor fed to the first convolution.
        n_classes: number of units in the output layer.
        data_format: passed to the convolution and pooling layers.

    Returns:
        The tensor produced by the 'output' dense layer.
    """
    # Both convolution layers share exactly the same configuration.
    conv_settings = dict(filters=50,
                         kernel_size=(3, 3),
                         padding='same',
                         data_format=data_format,
                         activation=tf.nn.relu)
    net = tf.layers.conv2d(model_input, **conv_settings)
    net = tf.layers.conv2d(net, **conv_settings)
    net = tf.layers.max_pooling2d(net,
                                  pool_size=(2, 2),
                                  strides=(2, 2),
                                  padding='valid',
                                  data_format=data_format)
    # Collapse everything but the batch dimension before the dense layers.
    net = tf.reshape(net, shape=[-1, 50*16*16])
    net = tf.layers.dense(net, 512, activation=tf.nn.relu)
    return tf.layers.dense(net, n_classes, name='output')

An important element in training a neural network is defining the loss function and the optimisation method to use. Here we are using cross entropy as our loss function and stochastic gradient descent (SGD) with momentum as our optimisation method. SGD is the standard optimisation method for Deep Learning. The two parameters we have to define are the learning rate and the momentum.

In [None]:
def init_model_training(m, labels, learning_rate=LR, momentum=MOMENTUM):
    """Build the training op for the model.

    Args:
        m: logits tensor produced by the model.
        labels: tensor of integer class labels matching ``m``.
        learning_rate: learning rate for ``tf.train.MomentumOptimizer``.
        momentum: momentum for ``tf.train.MomentumOptimizer``.

    Returns:
        The op returned by ``optimizer.minimize`` for the mean sparse
        softmax cross-entropy loss.
    """
    # Use the `labels` argument. This used to read the global placeholder
    # `y`, so the parameter was silently ignored and the function only
    # worked when a global of that name happened to exist.
    cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=m, labels=labels)
    loss = tf.reduce_mean(cross_entropy)
    optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate, momentum=momentum)
    return optimizer.minimize(loss)

#### Load the CIFAR10 data

In [None]:
%%time
# Data into format for library
# load_cifar() returns the raw (x_train, y_train, x_test, y_test) arrays,
# downloading the archive first if the batch files are missing;
# prepare_cifar() then scales, reshapes and casts them.
x_train, y_train, x_test, y_test = prepare_cifar(*load_cifar())
# Sanity check: show the shapes and dtypes of the prepared arrays.
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)
print(x_train.dtype, x_test.dtype, y_train.dtype, y_test.dtype)

#### Create the model
Our images (X) are 32x32 and have 3 color channels. Here we are defining our Tensor with the color channel last. The y represents the class label, which goes from 0 to 9.

In [None]:
%%time
# Place-holders
# X: batch of float32 images, 32x32 with 3 channels last; the batch size is left open (None)
X = tf.placeholder(tf.float32, shape=[None, 32, 32, 3])
# y: one int32 class label per example in the batch
y = tf.placeholder(tf.int32, shape=[None])

# Initialise model
# Uses the create_model defaults; `model` is the tensor of output-layer logits
model = create_model(X)

In [None]:
%%time
# Training op: one momentum-optimizer step on the cross-entropy loss of `model` against `y`
train_model = init_model_training(model, y)
# NOTE(review): presumably this has to be created after the training op so that
# any variables the optimizer adds are initialised as well - confirm.
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)

# Accuracy logging
# `correct` is true for each example whose label is the top-1 prediction;
# `accuracy` is the mean of those flags over the fed batch.
correct = tf.nn.in_top_k(model, y, 1)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

#### Train the model
Here we train the model for the number of epochs we defined at the top of the notebook. During this process we execute the forward pass, calculate the loss and then propagate the error backwards and update the weights. This can take a considerable amount of time depending on the computational resources you have at your disposal.

In [None]:
%%time
# One pass over the (reshuffled) training set per epoch, one optimizer step per minibatch.
for j in range(EPOCHS):
    for data, label in minibatch_from(x_train, y_train, shuffle=True):
        sess.run(train_model, feed_dict={X: data, y: label})
    # Log
    # `data` and `label` still hold the last minibatch of the epoch here, so this
    # is the accuracy on that single minibatch, not on the whole training set.
    acc_train = sess.run(accuracy, feed_dict={X: data, y: label})
    print(j, "Train accuracy:", acc_train)

#### Test the model

In [None]:
%%time
y_guess = list()
for data, label in minibatch_from(x_test, y_test):
    pred = tf.argmax(model,1)
    output = sess.run(pred, feed_dict={X: data})
    y_guess.append(output)

In [None]:
# Fraction of test examples whose predicted class equals the true label:
# the per-minibatch predictions are joined back into one array and compared with y_test.
print("Accuracy: ", sum(np.concatenate(y_guess) == y_test)/float(len(y_test)))