# End-to-End model building process for MNIST

This is a complete end-to-end build at MNIST with Theano + Keras + Python 2.7 on Ubuntu 16.04 + GTX 1080 Ti.

Using Data Augmentation and Batch Normalization.

Note: MNIST is great way to revise basics about CNNs because it's very fast to train (28x28 images) and there are plenty  benchmarks available on best approaches.
Also Keras contains a copy of MNIST.

In [1]:
import os, sys
#Create references to important directories we will use over and over
#current_dir = os.getcwd()

#Allow relative imports to directories above lesson3/
sys.path.insert(1, os.path.join(sys.path[0], '..'))

In [2]:
#Import modules
import utils; reload(utils)
from utils import *
from __future__ import division, print_function

#In Jupyter notebooks, we need to run this command before doing any plotting
%matplotlib inline

 https://github.com/Theano/Theano/wiki/Converting-to-the-new-gpu-back-end%28gpuarray%29

Using gpu device 0: GeForce GTX 1080 Ti (CNMeM is disabled, cuDNN 5103)
Using Theano backend.


# Setup


In [3]:
batch_size = 64

In [4]:
from keras.datasets import mnist
(X_train, y_train), (X_test, y_test) = mnist.load_data()
(X_train.shape, y_train.shape, X_test.shape, y_test.shape)


((60000, 28, 28), (60000,), (10000, 28, 28), (10000,))

#### Preprocessing: add missing color and OneHot

MNIST are greyscale images while  Keras expect RGB (so 3 color channels) images so we need to add an empty dimension (the missing "color channel") to MNIST to avoid errors.

In [5]:
X_test = np.expand_dims(X_test, 1)
X_train = np.expand_dims(X_train, 1)
X_train.shape

(60000, 1, 28, 28)

Also we need to *onehot* encode the labels (*y_values*) because they are actual real figures (0,1,2,3 etc), so *Softmax* can approximate the result with a very high value close to 1.

In [6]:
#examples
y_train[:5]

array([5, 0, 4, 1, 9], dtype=uint8)

In [7]:
#Onehot encoding
y_train = onehot(y_train)
y_test = onehot(y_test)

y_train[:5]

array([[ 0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.],
       [ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.]])

#### Normalization
We normalize the input by substracting the mean and dividing by the Standard Deviation


In [8]:
mean_px = X_train.mean().astype(np.float32)
std_px = X_train.std().astype(np.float32)

In [9]:
def norm_input(x): return (x-mean_px) / std_px

# 1. Linear Model

We need a linear model which needs to:
- normalize the input, 
- flatten it as a simple vector instead of an image,
- create a Dense layer with 10 outputs under *Softmax*.


In [10]:
def get_lin_model():
    model = Sequential([
        Lambda(norm_input, input_shape = (1, 28, 28)),
        Flatten(),
        Dense(10, activation='softmax')    
    ])
    model.compile(Adam(), loss='categorical_crossentropy', metrics=['accuracy'])
    return model

In [11]:
lm = get_lin_model()

  .format(self.name, input_shape))


In [12]:
lm.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
lambda_1 (Lambda)                (None, 1, 28, 28)     0           lambda_input_1[0][0]             
____________________________________________________________________________________________________
flatten_1 (Flatten)              (None, 784)           0           lambda_1[0][0]                   
____________________________________________________________________________________________________
dense_1 (Dense)                  (None, 10)            7850        flatten_1[0][0]                  
Total params: 7,850
Trainable params: 7,850
Non-trainable params: 0
____________________________________________________________________________________________________


In [13]:
#Basic Gen without data augmentation
gen = image.ImageDataGenerator()
batches = gen.flow(X_train, y_train, batch_size = batch_size)
test_batches = gen.flow(X_test, y_test, batch_size = batch_size)

In [14]:
# Let's do a first single epoch run with standard params, including lr=0.001
lm.fit_generator(batches, batches.n, nb_epoch=1,
                validation_data=test_batches, nb_val_samples=test_batches.n)

Epoch 1/1


<keras.callbacks.History at 0x7f3baf845250>

In [15]:
lm.optimizer.lr=0.1

In [16]:
lm.fit_generator(batches, batches.n, nb_epoch=1,
                validation_data=test_batches, nb_val_samples=test_batches.n)

Epoch 1/1


<keras.callbacks.History at 0x7f3bc78f64d0>

In [17]:
lm.optimizer.lr=0.01

In [18]:
lm.fit_generator(batches, batches.n, nb_epoch=5,
                validation_data=test_batches, nb_val_samples=test_batches.n)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f3bbb639190>

# 2. Single dense layer

In [19]:
# Let's add one hidden layer fully-connected, like what people called "Neural Networks" in the 80-90's.
def get_fc_model():
    model = Sequential([
        Lambda(norm_input, input_shape = (1, 28, 28)),
        Flatten(),
        Dense(512, activation='relu'),
        Dense(10, activation='softmax')    
    ])
    model.compile(Adam(), loss='categorical_crossentropy', metrics=['accuracy'])
    return model

In [20]:
fc = get_fc_model()

  .format(self.name, input_shape))


In [21]:
#And same routine as before
fc.fit_generator(batches, batches.n, nb_epoch=1,
                validation_data=test_batches, nb_val_samples=test_batches.n)

Epoch 1/1


<keras.callbacks.History at 0x7f3b9f526c50>

In [22]:
lm.optimizer.lr=0.1

In [23]:
fc.fit_generator(batches, batches.n, nb_epoch=6,
                validation_data=test_batches, nb_val_samples=test_batches.n)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f3b9e51bc10>

In [24]:
lm.optimizer.lr=0.01

In [25]:
fc.fit_generator(batches, batches.n, nb_epoch=8,
                validation_data=test_batches, nb_val_samples=test_batches.n)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f3b9e51bb50>

# 3. Basic 'VGG-style' CNN

In [26]:
def get_model():
    model = Sequential([
        Lambda(norm_input, input_shape=(1,28,28)),
        Convolution2D(32,3,3, activation='relu'),
        Convolution2D(32,3,3, activation='relu'),
        MaxPooling2D(),
        Convolution2D(64,3,3, activation='relu'),
        Convolution2D(64,3,3, activation='relu'),
        MaxPooling2D(),
        Flatten(),
        Dense(512, activation='relu'),
        Dense(10, activation='softmax')        
    ])
    model.compile(Adam(), loss='categorical_crossentropy', metrics=['accuracy'])
    return model

In [27]:
model = get_model()

  .format(self.name, input_shape))


In [28]:
#And same routine as before
model.fit_generator(batches, batches.n, nb_epoch=1,
                validation_data=test_batches, nb_val_samples=test_batches.n)

Epoch 1/1


<keras.callbacks.History at 0x7f3b9c9e33d0>

In [29]:
lm.optimizer.lr=0.1
model.fit_generator(batches, batches.n, nb_epoch=6,
                validation_data=test_batches, nb_val_samples=test_batches.n)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f3b9287cb50>

In [30]:
lm.optimizer.lr=0.01
model.fit_generator(batches, batches.n, nb_epoch=8,
                validation_data=test_batches, nb_val_samples=test_batches.n)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x7f3b9287c590>

With lr=0.01, we see a drop in val_acc 0.9922 over train_acc:0.9979 so we are now clearly overfitting compared to lr=0.1

#### Which is great news: we now know we have a model which is complex enough to handle our data.
#### "*Start by overfitting* " said Jeremy Howard, then move to 5-steps dance...

##### Recall on 5-steps to reduce overfitting:
    1. Add more data (not always possible, e.g. Kaggle competitions)
    2. Use Data Augmentation (duplicate+tweak the images)
    3. Use architectures that generalize well
    4. Add regularization via Batch Normalization
    5. Reduce architecture complexity




# Data Augmentation

In [31]:
model = get_model()

  .format(self.name, input_shape))


In [45]:
# https://keras.io/preprocessing/image/#imagedatagenerator
gen = image.ImageDataGenerator(rotation_range=8, width_shift_range=0.08, height_shift_range=0.08,
                               zoom_range=0.08, shear_range=0.3) #, channel_shift_range=0.08)
batches = gen.flow(X_train, y_train, batch_size = batch_size)
test_batches = gen.flow(X_test, y_test, batch_size = batch_size)

In [33]:
#And same routine as before
model.fit_generator(batches, batches.n, nb_epoch=1,
                validation_data=test_batches, nb_val_samples=test_batches.n)

Epoch 1/1


<keras.callbacks.History at 0x7f3bb9cc4190>

In [34]:
lm.optimizer.lr=0.1
model.fit_generator(batches, batches.n, nb_epoch=6,
                validation_data=test_batches, nb_val_samples=test_batches.n)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x7f3bb9592e90>

In [35]:
lm.optimizer.lr=0.01
model.fit_generator(batches, batches.n, nb_epoch=8,
                validation_data=test_batches, nb_val_samples=test_batches.n)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x7f3bb9592750>

In [36]:
lm.optimizer.lr=0.001
model.fit_generator(batches, batches.n, nb_epoch=10,
                validation_data=test_batches, nb_val_samples=test_batches.n)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x7f3bb9592e10>

# BatchNormalization + Data Augmentation

In [37]:
def get_model_bn():
    model = Sequential([
        Lambda(norm_input, input_shape=(1,28,28)),
        Convolution2D(32,3,3, activation='relu'),
        BatchNormalization(axis=1),
        Convolution2D(32,3,3, activation='relu'),
        MaxPooling2D(),
        BatchNormalization(axis=1),
        Convolution2D(64,3,3, activation='relu'),
        BatchNormalization(axis=1),
        Convolution2D(64,3,3, activation='relu'),
        MaxPooling2D(),
        Flatten(),
        BatchNormalization(axis=1),
        Dense(512, activation='relu'),
        BatchNormalization(axis=1),
        Dense(10, activation='softmax')        
    ])
    model.compile(Adam(), loss='categorical_crossentropy', metrics=['accuracy'])
    return model

In [38]:
model = get_model_bn()

  .format(self.name, input_shape))


In [39]:
#And same routine as before
model.fit_generator(batches, batches.n, nb_epoch=1,
                validation_data=test_batches, nb_val_samples=test_batches.n)

Epoch 1/1


<keras.callbacks.History at 0x7f3bb7668390>

In [40]:
lm.optimizer.lr=0.1
model.fit_generator(batches, batches.n, nb_epoch=6,
                validation_data=test_batches, nb_val_samples=test_batches.n)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x7f3baf8f4f90>

In [41]:
lm.optimizer.lr=0.01
model.fit_generator(batches, batches.n, nb_epoch=8,
                validation_data=test_batches, nb_val_samples=test_batches.n)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x7f3bb95aa050>

In [46]:
lm.optimizer.lr=0.001
model.fit_generator(batches, batches.n, nb_epoch=10,
                validation_data=test_batches, nb_val_samples=test_batches.n)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x7f3bb9273090>

# BatchNormalization + Dropout + Data Augmentation

In [49]:
def get_model_bn_do():
    model = Sequential([
        Lambda(norm_input, input_shape=(1,28,28)),
        Convolution2D(32,3,3, activation='relu'),
        BatchNormalization(axis=1),
        Convolution2D(32,3,3, activation='relu'),
        MaxPooling2D(),
        BatchNormalization(axis=1),
        Convolution2D(64,3,3, activation='relu'),
        BatchNormalization(axis=1),
        Convolution2D(64,3,3, activation='relu'),
        MaxPooling2D(),
        Flatten(),
        BatchNormalization(axis=1),
        Dense(512, activation='relu'),
        BatchNormalization(axis=1),
        Dropout(0.5),
        Dense(10, activation='softmax')        
    ])
    model.compile(Adam(), loss='categorical_crossentropy', metrics=['accuracy'])
    return model

In [50]:
model = get_model_bn_do()

  .format(self.name, input_shape))


In [51]:
#And same routine as before
model.fit_generator(batches, batches.n, nb_epoch=1,
                validation_data=test_batches, nb_val_samples=test_batches.n)

Epoch 1/1


<keras.callbacks.History at 0x7f3b8e32a250>

In [52]:
lm.optimizer.lr=0.1
model.fit_generator(batches, batches.n, nb_epoch=6,
                validation_data=test_batches, nb_val_samples=test_batches.n)

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<keras.callbacks.History at 0x7f3b8ed3c510>

In [53]:
lm.optimizer.lr=0.01
model.fit_generator(batches, batches.n, nb_epoch=8,
                validation_data=test_batches, nb_val_samples=test_batches.n)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x7f3b8ed95ed0>

In [54]:
lm.optimizer.lr=0.001
model.fit_generator(batches, batches.n, nb_epoch=10,
                validation_data=test_batches, nb_val_samples=test_batches.n)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f3b8ec26110>

# Ensembling

In [55]:
def fit_model():
    model = get_model_bn_do()
    
    model.fit_generator(batches, batches.n, nb_epoch=1, verbose=0,
                validation_data=test_batches, nb_val_samples=test_batches.n)
    
    model.optimizer.lr = 0.1
    model.fit_generator(batches, batches.n, nb_epoch=6, verbose=0,
                validation_data=test_batches, nb_val_samples=test_batches.n)
    
    model.optimizer.lr = 0.01
    model.fit_generator(batches, batches.n, nb_epoch=8, verbose=0,
                validation_data=test_batches, nb_val_samples=test_batches.n)
    
    model.optimizer.lr = 0.001
    model.fit_generator(batches, batches.n, nb_epoch=10, verbose=0,
                validation_data=test_batches, nb_val_samples=test_batches.n)
    
    return model
   
    

In [56]:
models = [fit_model() for i in range(6)]

  .format(self.name, input_shape))
  .format(self.name, input_shape))
  .format(self.name, input_shape))
  .format(self.name, input_shape))
  .format(self.name, input_shape))
  .format(self.name, input_shape))


In [59]:
for i,m in enumerate(models):
    m.save_weights('cbb-mnist-' + str(i) + '.pkl')

In [66]:
evals = np.array([m.evaluate(X_test, y_test, batch_size = 256) for m in models])



In [67]:
evals.mean(axis=0)

array([ 0.0135,  0.9956])

In [68]:
all_preds = np.stack([m.predict(X_test, batch_size = 256) for m in models])

In [69]:
all_preds.shape

(6, 10000, 10)

In [70]:
avg_preds = all_preds.mean(axis=0)

In [71]:
keras.metrics.categorical_accuracy(y_test, avg_preds).eval()

array(0.9969000220298767, dtype=float32)