In [9]:
####################################
## Load, make and save data sets: ##
####################################

from pipeline.dataops.load import formatted_mnist
from pipeline.dataops.preproc import save_gnoisy, save_corrupted

# Load MNIST data set from Keras
(input_shape, (x_train, y_train), (x_test, y_test)) = formatted_mnist()

## Make noisy and corrupted datasets
#gauss_params = ((0,8), (0,32),(0,128))
#save_gnoisy(x_train, gauss_params)

#percent_lerr_tuple = (0.05, 0.15, 0.50)
#save_corrupted(percent_lerr_tuple, y_train)

# Truncate data (testing only). Set TRUNCATE_FOR_TESTING to False to keep the
# full arrays returned by formatted_mnist().
TRUNCATE_FOR_TESTING = True
N_TRAIN_SAMPLES = 1000  # leading training samples kept when truncating
N_TEST_SAMPLES = 500    # leading test samples kept when truncating

if TRUNCATE_FOR_TESTING:
    # Slice only the leading (sample) axis; all remaining axes are kept whole.
    x_train, y_train = x_train[:N_TRAIN_SAMPLES], y_train[:N_TRAIN_SAMPLES]
    x_test, y_test = x_test[:N_TEST_SAMPLES], y_test[:N_TEST_SAMPLES]

# (inputs, labels) pairs consumed by the training cell.
train, test = (x_train, y_train), (x_test, y_test)

Using TensorFlow backend.


In [14]:
'''Trains a simple convnet on the MNIST dataset.
Gets to 99.25% test accuracy after 12 epochs
(there is still a lot of margin for parameter tuning).
16 seconds per epoch on a GRID K520 GPU.
TODO: edit this comment, add a citation, and make sure docstrings are in place.
'''
from __future__ import print_function # Do we really need this? Where is the old print function called??
"""returns (callbacks, model), where callbacks is a tuple of callbacks
"""
import os #avoid importing the entire module
import keras #avoid importing the entire module
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPooling2D

## checkpointer callback imports ##
###################################
from keras.callbacks import ModelCheckpoint, TensorBoard
from pipeline.keras.callbacks import LossHistory

# Identifier used to name this run's results directory and saved files.
model_id = "scrap"

# Unpack the (inputs, labels) pairs prepared by the data-loading cell.
x_train, y_train = train[0], train[1]
x_test, y_test = test[0], test[1]

# Training hyperparameters.
epochs = 2
batch_size = 16
num_classes = 10

# Input images are 28x28 with a single trailing channel axis.
img_rows = img_cols = 28
input_shape = (img_rows, img_cols, 1)

# Two stacked 3x3 convolutions, max-pooling and dropout, followed by a dense
# classifier head with a softmax over num_classes outputs.
cnn_layers = [
    Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=input_shape),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.25),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(num_classes, activation='softmax'),
]
model = Sequential(cnn_layers)

# Categorical cross-entropy loss, Adadelta with its default settings, and
# accuracy tracked as the reported metric.
model.compile(
    loss=keras.losses.categorical_crossentropy,
    optimizer=keras.optimizers.Adadelta(),
    metrics=['accuracy'],
)

# If model directory does not exist, make one
model_name = "mnist_cnn_{}".format(model_id)
model_dir = "./results/" + model_name
# The {epoch} / {val_loss} placeholders are filled in by Keras each time it
# writes a checkpoint, so this template must stay unformatted here.
checkfile = "/chckpt_weights.{epoch:02d}-{val_loss:.2f}.hdf5"
filepath = model_dir + checkfile
if not os.path.exists(model_dir):
    os.makedirs(model_dir)

## Callbacks
loss_history = LossHistory()
# Warning: don't use .format() method {} conflicts with keras parsing, tuple out of bounds error
# save_best_only=True: a checkpoint is written only when val_loss improves.
checkpointer = ModelCheckpoint(filepath=filepath, verbose=1, save_best_only=True)
history = model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs,
                    verbose=0, validation_data=(x_test, y_test),
                    callbacks=[checkpointer, loss_history])

# Save the final model as an HDF5 file *inside* the run directory. The previous
# expression concatenated model_dir and the file name without a path separator
# and re-applied the "mnist_cnn_" prefix to model_name, so the file landed next
# to the run directory as "./results/mnist_cnn_<id>mnist_cnn_mnist_cnn_<id>".
model.save(os.path.join(model_dir, model_name + ".h5"))

# Score the trained model on the test split, then report data sizes and metrics.
score = model.evaluate(x_test, y_test, verbose=0)
n_train, n_test = x_train.shape[0], x_test.shape[0]
test_loss, test_accuracy = score[0], score[1]

print('x_train shape:', x_train.shape)
print(n_train, 'train samples')
print(n_test, 'test samples')
print('Test loss:', test_loss)
print('Test accuracy:', test_accuracy)

Epoch 00000: val_loss improved from inf to 1.03619, saving model to ./results/mnist_cnn_scrap/chckpt_weights.00-1.04.hdf5
Epoch 00001: val_loss improved from 1.03619 to 0.47838, saving model to ./results/mnist_cnn_scrap/chckpt_weights.01-0.48.hdf5
x_train shape: (1000, 28, 28, 1)
1000 train samples
500 test samples
Test loss: 0.478375179291
Test accuracy: 0.856000000477


In [21]:
# Print the per-layer table (layer type, output shape, parameter count) for the model.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_7 (Conv2D)            (None, 26, 26, 32)        320       
_________________________________________________________________
conv2d_8 (Conv2D)            (None, 24, 24, 64)        18496     
_________________________________________________________________
max_pooling2d_4 (MaxPooling2 (None, 12, 12, 64)        0         
_________________________________________________________________
dropout_7 (Dropout)          (None, 12, 12, 64)        0         
_________________________________________________________________
flatten_4 (Flatten)          (None, 9216)              0         
_________________________________________________________________
dense_7 (Dense)              (None, 128)               1179776   
_________________________________________________________________
dropout_8 (Dropout)          (None, 128)               0         
__________

In [25]:
# Re-evaluate on the test split; the error rate is the complement of accuracy.
evaluation = model.evaluate(x_test, y_test, verbose=0)
loss, accuracy = evaluation
error_rate = 1 - accuracy

[0.4783751792907715, 0.85600000047683711]

In [None]:

        #TensorBoard(log_dir='./logs', histogram_freq=0, write_graph=True, write_images=False)
################
################

from keras.models import model_from_json
# Round-trip check: write the architecture (JSON) and weights (HDF5) to disk,
# rebuild the model from those two files, and confirm it still evaluates.

# serialize model to JSON
model_json = model.to_json()
with open("model.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save_weights("model.h5")
print("Saved model to disk")

# later...

# load json and create model
# (context manager so the handle is closed even if read() raises)
with open("model.json", "r") as json_file:
    loaded_model_json = json_file.read()
loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights("model.h5")
print("Loaded model from disk")

# evaluate loaded model on test data
# The JSON/weights pair carries no training configuration, so the model is
# compiled again with the same loss, optimizer and metric before evaluating.
loaded_model.compile(loss=keras.losses.categorical_crossentropy,
                     optimizer=keras.optimizers.Adadelta(),
                     metrics=['accuracy'])
score = loaded_model.evaluate(x_test, y_test, verbose=0)
print("%s: %.2f%%" % (loaded_model.metrics_names[1], score[1]*100))
# Show the names that label the entries of `score`. The original trailing
# expression `loaded_model.metric` is not the attribute used above
# (`metrics_names`) and would raise AttributeError.
loaded_model.metrics_names

In [None]:
# Check the above:

#y_pred = model.predict(x_test)
#print(class_correct[0]/class_totals[0])
#print("model accuracy", accuracy)
#print(np.sum(class_correct)/np.sum(class_totals))

# Implementation Notes, Fixes, Etc:

In [1]:
# In preproc.py
#### Image Noise ####
## Use hdf5 instead ... faster, more secure, better integration, more scalable??

In [None]:
# shebang line in package files?? Needed??
#try to avoid importing the entire module

@Binil, @biosopher: I've updated the code. Because Keras switched its backend from Theano to TensorFlow, the image dimension ordering changed as well, from (channels, width, height) to (width, height, channels). Once this change has been made, the script performs well.

In [3]:
##In confusion matrix, native rounding as local setting to plot function, not working ##
#This isn't working
#np.set_printoptions(precision=2)

# Do we really need this? Where is the old print function called??
# tabs or spaces, choose and make uniform


## To Do:

In [None]:
# Check MNIST data set is correct
# Check that objectives are met
# Check style guide and your code: tabs or spaces, etc.
# Choose and defend CNN architecture, parameters, optimizers, etc.
# Debug the rest of the code in notebook
# Finish Answers
# Add Filtering image preprocessing, before and after



# Add visualizations of convolutional filters on image maps ...
# tsne embedding
# Add bonus, cross validation of model, end to end done right ... (perhaps without parameter tuning)
# Add bonus, train ensemble of models at different label corruptions and plot at a given parameter
# Add bonus, implement custom loss function robust against label error corruption, and compare results with previous results.
# Time training/cnn diagnostic calculations ... compute memory/resources for each code snippet
# plot graph of model architecture

# To Do:

1. Hypothesis testing on labels: interpret and, if need be, correct the results; they don't make sense
2. Style guide, coding practices, design, and python idioms
2.5 Thoroughly test all code, write unit tests for code in Nose/unittest
3. Finish exposition, include paper results/references to back up work
3.5 Commit results to a new GitHub repo, and add commits ...
4. Cross-Validate, more critically evaluate model ... improve and discuss improvements (easy to discuss/do first)
5. Actually extend results, other models reproducing #3


Scrap

In [None]:
# Sum of squared pixel values per training image: square element-wise, then
# reduce axis 1 twice (assumes x_train squeezes to (n, rows, cols) - confirm).
images = np.squeeze(x_train)
print(np.sum(np.sum(np.multiply(images, images), axis=1), axis=1))

In [None]:
# Scratch inspection. As a notebook cell, only the last expression's value is displayed.
# Shape of x_train after selecting index 0 on its fourth axis.
x_train[:,:,:,0].shape
# Index of the largest entry along axis 1 (presumably one-hot labels -> class ids; confirm).
np.argmax(y_train, axis=1)
# NOTE(review): NumPy rounds exact halves to the nearest even value, so this should give 8.0, not 9.0 - confirm.
np.round(8.5)