# Convolutional Neural Network on MNIST data
___

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt

from datetime import datetime
from scipy.io import loadmat
from sklearn.utils import shuffle

In [2]:
# Load the labelled training table and the test table from CSV.
# `train` carries a 'label' column next to the pixel columns (it is split off in the next cell).
train = pd.read_csv('../input/MNIST/train.csv')
Xtest = pd.read_csv('../input/MNIST/test.csv')

In [3]:
# Separate the pixel columns (features) from the 'label' column (targets).
Xtrain = train.drop(columns='label')
Ytrain = train['label']

print('Xtrain shape: ', Xtrain.shape)
print('Xtest shape : ', Xtest.shape)

# Last expression of the cell: displays the first rows of the feature table.
Xtrain.head()

Xtrain shape:  (42000, 784)
Xtest shape :  (28000, 784)


Unnamed: 0,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Cast to float32 and divide by 255
# (presumably the raw pixels are 0-255 intensities, which this maps into [0, 1]).
Xtrain = Xtrain.astype(np.float32) / 255.
Xtest = Xtest.astype(np.float32)/ 255.

In [5]:
# Reshape Xtrain from flat 784-pixel rows to 28 x 28 single-channel images.
# One vectorized reshape replaces the per-row Python loop and the float64
# np.zeros scratch buffer, so the data keeps the float32 dtype it was
# normalised to (the same dtype as the X placeholder it is later fed into).
Xtrain = np.array(Xtrain)

# `arr` stays bound on purpose: a later cell runs `del arr`.
arr = Xtrain.reshape(-1, 28, 28)

# Append the trailing channel axis -> (num_images, 28, 28, 1).
Xtrain = arr[..., np.newaxis]
Xtrain.shape

(42000, 28, 28, 1)

In [6]:
# Reshape Xtest from flat 784-pixel rows to 28 x 28 single-channel images.
# One vectorized reshape replaces the per-row Python loop and the float64
# np.zeros scratch buffer, so the data keeps the float32 dtype it was
# normalised to.
Xtest = np.array(Xtest)

# `arr` stays bound on purpose: a later cell runs `del arr`.
arr = Xtest.reshape(-1, 28, 28)

# Append the trailing channel axis -> (num_images, 28, 28, 1).
Xtest = arr[..., np.newaxis]
Xtest.shape

(28000, 28, 28, 1)

In [7]:
# Hold out every example after the first 32000 as the validation set;
# the first 32000 remain the training set.
Xtrain, Xvalid = Xtrain[:32000], Xtrain[32000:]
Ytrain, Yvalid = Ytrain[:32000], Ytrain[32000:]

print('Xtrain shape: ', Xtrain.shape)
print('Xvalid shape: ', Xvalid.shape)
print('Xtest shape : ', Xtest.shape)

Xtrain shape:  (32000, 28, 28, 1)
Xvalid shape:  (10000, 28, 28, 1)
Xtest shape :  (28000, 28, 28, 1)


In [8]:
# Remove the temporary name left over from the reshape cells.
# NOTE: Xtest was built as a view of this buffer (arr[..., np.newaxis]), so this
# only drops the name; the underlying memory stays alive as long as Xtest does.
del arr

In [9]:
def convpool(X, W, b):
    """Run one conv -> bias -> max-pool -> ReLU stage.

    Args:
        X: Input tensor passed to ``tf.nn.conv2d``.
        W: Convolution filter tensor.
        b: Bias added to the convolution output via ``tf.nn.bias_add``.

    Returns:
        The ReLU of the max-pooled, biased convolution output. The
        convolution uses stride 1 and the pooling uses a 2x2 window with
        stride 2, both with 'SAME' padding.
    """
    unit_stride = [1, 1, 1, 1]
    pool_window = [1, 2, 2, 1]  # used as both window size and stride

    biased = tf.nn.bias_add(
        tf.nn.conv2d(X, W, strides=unit_stride, padding='SAME'),
        b,
    )
    pooled = tf.nn.max_pool(biased, ksize=pool_window, strides=pool_window, padding='SAME')
    return tf.nn.relu(pooled)

In [10]:
def init_filter(shape, poolsz):
    """Return randomly initialised float32 convolution filter weights.

    Draws standard-normal values and scales them by
    ``1 / sqrt(fan_in + fan_out)``.

    Args:
        shape: Filter shape. The last entry is the number of output feature
            maps, the one before it the number of input maps, and the
            leading entries the kernel dimensions, e.g. ``(5, 5, 1, 20)``.
        poolsz: Pooling window size, e.g. ``(2, 2)``.

    Returns:
        A ``np.float32`` array with the requested shape.
    """
    # Inputs feeding each output unit: kernel area * input maps.
    fan_in = np.prod(shape[:-1])
    # Output maps * kernel area, reduced by the pooling area. The division is
    # applied to the whole product, not to each kernel dimension separately.
    fan_out = shape[-1] * np.prod(shape[:-2]) / np.prod(poolsz)
    w = np.random.randn(*shape) / np.sqrt(fan_in + fan_out)
    return w.astype(np.float32)

In [11]:
# gradient descent params
# max_iter: number of passes over the training set (outer training loop)
# print_period: the validation set is evaluated every print_period batches
# N: number of training examples
# batch_sz: examples per mini-batch (also the fixed batch dimension of the placeholders)
# n_batches: whole batches per pass; any remainder of N is not visited
max_iter = 10
print_period = 10
N = Xtrain.shape[0]
batch_sz = 500
n_batches = N // batch_sz

In [12]:
# initial weights
# M: number of hidden units in the fully connected layer
# K: number of output classes (columns of the final logits)
M = 500
K = 10
poolsz = (2, 2)

W1_shape = (5, 5, 1, 20) # (filter_height, filter_width, num_color_channels, num_feature_maps)
W1_init = init_filter(W1_shape, poolsz)
b1_init = np.zeros(W1_shape[-1], dtype=np.float32) # one bias per output feature map

W2_shape = (5, 5, 20, 50) # (filter_height, filter_width, old_num_feature_maps, num_feature_maps)
W2_init = init_filter(W2_shape, poolsz)
b2_init = np.zeros(W2_shape[-1], dtype=np.float32)

# Optional third conv layer, currently disabled:
# W3_shape = (5, 5, 50, 80) # (filter_height, filter_width, old_num_feature_maps, num_feature_maps)
# W3_init = init_filter(W3_shape, poolsz)
# b3_init = np.zeros(W3_shape[-1], dtype=np.float32)

# vanilla ANN weights
# 7*7 is the feature-map size left after two stride-2 poolings of a 28x28 input,
# so the flattened conv output has W2_shape[-1]*7*7 values per example.
# These two matrices are float64 here; they are cast to float32 when the variables are created.
W4_init = np.random.randn(W2_shape[-1]*7*7, M) / np.sqrt(W2_shape[-1]*7*7 + M)
b4_init = np.zeros(M, dtype=np.float32)
W5_init = np.random.randn(M, K) / np.sqrt(M + K)
b5_init = np.zeros(K, dtype=np.float32)

In [13]:
# define variables and expressions
# using None as the first shape element takes up too much RAM unfortunately
# -> the batch dimension is fixed, so every feed must hold exactly batch_sz examples
X = tf.placeholder(tf.float32, shape=(batch_sz, 28, 28, 1), name='X')
T = tf.placeholder(tf.int32, shape=(batch_sz, ), name='T')

# Trainable parameters, all stored as float32:
# W1/b1 and W2/b2 are the two conv layers, W4/b4 the hidden dense layer, W5/b5 the output layer.
W1 = tf.Variable(W1_init.astype(np.float32))
b1 = tf.Variable(b1_init.astype(np.float32))
W2 = tf.Variable(W2_init.astype(np.float32))
b2 = tf.Variable(b2_init.astype(np.float32))
# W3 = tf.Variable(W3_init.astype(np.float32))
# b3 = tf.Variable(b3_init.astype(np.float32))
W4 = tf.Variable(W4_init.astype(np.float32))
b4 = tf.Variable(b4_init.astype(np.float32))
W5 = tf.Variable(W5_init.astype(np.float32))
b5 = tf.Variable(b5_init.astype(np.float32))

In [14]:
# feedforward operation
Z1 = convpool(X, W1, b1)
Z2 = convpool(Z1, W2, b2)
# Z3 = convpool(Z2, W3, b3)

# Flatten Z2 so it can feed the fully connected layer.
# Each convpool stage halves height and width (28 -> 14 -> 7) and W2 produces
# 50 feature maps, so Z2 is [batch_sz, 7, 7, 50] and flattens to [batch_sz, 2450].
Z2_shape = Z2.get_shape().as_list()
Z2r = tf.reshape(Z2, [Z2_shape[0], np.prod(Z2_shape[1:])])

# Hidden dense layer with ReLU, then the output layer producing unnormalised logits.
Z3 = tf.nn.relu(tf.matmul(Z2r, W4) + b4)
Yish = tf.matmul(Z3, W5) + b5

In [15]:
# Cross-entropy between the logits (Yish) and the integer class labels (T),
# summed (not averaged) over the batch.
cost = tf.reduce_sum(
    tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=Yish,
        labels=T
    )
)

# RMSProp with learning rate 0.0001, decay 0.99 and momentum 0.9.
train_op = tf.train.RMSPropOptimizer(0.0001, decay=0.99, momentum=0.9).minimize(cost)

In [16]:
def error_rate(p, t):
    """
    Fraction of predictions that disagree with their targets.

    p: predictions
    t: targets, compared with p via `!=`
    Returns the mean of the mismatch mask.
    """
    mismatched = p != t
    return np.mean(mismatched)

In [17]:
# we'll use this to calculate the error rate
predict_op = tf.argmax(Yish, 1)

t0 = datetime.now()
LL = []  # validation cost recorded at every evaluation

W1_val = None
W2_val = None

# The placeholders have a fixed batch dimension, so the validation set is
# scored in batch_sz-sized chunks. Rows that do not fill a whole chunk are
# never evaluated, so they are excluded from the error rate as well.
n_valid_batches = len(Xvalid) // batch_sz
n_scored = n_valid_batches * batch_sz

init = tf.global_variables_initializer()
with tf.Session() as session:
    session.run(init)

    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz)]
            Ybatch = Ytrain[j*batch_sz:(j*batch_sz + batch_sz)]

            if len(Xbatch) == batch_sz:
                session.run(train_op, feed_dict={X: Xbatch, T: Ybatch})
                if j % print_period == 0:
                    # due to RAM limitations we need to have a fixed size input
                    # so as a result, we have this ugly total cost and prediction computation
                    test_cost = 0
                    prediction = np.zeros(n_scored)
                    for k in range(n_valid_batches):
                        Xtestbatch = Xvalid[k*batch_sz:(k*batch_sz + batch_sz)]
                        Ytestbatch = Yvalid[k*batch_sz:(k*batch_sz + batch_sz)]
                        # Fetch cost and predictions together: one forward pass
                        # per validation batch instead of two.
                        batch_cost, batch_pred = session.run(
                            [cost, predict_op],
                            feed_dict={X: Xtestbatch, T: Ytestbatch},
                        )
                        test_cost += batch_cost
                        prediction[k*batch_sz:(k*batch_sz + batch_sz)] = batch_pred
                    err = error_rate(prediction, Yvalid[:n_scored])
                    print("Cost / err at iteration i=%d, j=%d: %.3f / %.3f" % (i, j, test_cost, err))
                    LL.append(test_cost)
    # Snapshot the learned conv filters while the session is still open.
    W1_val = W1.eval()
    W2_val = W2.eval()

Cost / err at iteration i=0, j=0: 22834.486 / 0.863
Cost / err at iteration i=0, j=10: 10794.899 / 0.216
Cost / err at iteration i=0, j=20: 6474.265 / 0.190
Cost / err at iteration i=0, j=30: 3574.635 / 0.109
Cost / err at iteration i=0, j=40: 2601.829 / 0.079
Cost / err at iteration i=0, j=50: 2211.186 / 0.061
Cost / err at iteration i=0, j=60: 1788.690 / 0.053
Cost / err at iteration i=1, j=0: 1740.824 / 0.052
Cost / err at iteration i=1, j=10: 1469.365 / 0.045
Cost / err at iteration i=1, j=20: 1376.782 / 0.042
Cost / err at iteration i=1, j=30: 1238.146 / 0.037
Cost / err at iteration i=1, j=40: 1125.487 / 0.034
Cost / err at iteration i=1, j=50: 1076.982 / 0.033
Cost / err at iteration i=1, j=60: 953.381 / 0.029
Cost / err at iteration i=2, j=0: 909.250 / 0.028
Cost / err at iteration i=2, j=10: 829.911 / 0.026
Cost / err at iteration i=2, j=20: 816.137 / 0.026
Cost / err at iteration i=2, j=30: 824.625 / 0.026
Cost / err at iteration i=2, j=40: 731.469 / 0.023
Cost / err at itera