This notebook trains the network on the SVHN dataset.

In [1]:
%env THEANO_FLAGS=floatX=float32, device=gpu2
import numpy as np
import theano
import theano.tensor as T
import lasagne
from __future__ import print_function

from BC_layers import DenseLayer, Conv2DLayer
from BC_utils import calculate_update
from pylearn2.datasets.svhn import SVHN
from pylearn2.utils import serial

env: THEANO_FLAGS=floatX=float32, device=gpu2


Using gpu device 2: GeForce GTX TITAN X (CNMeM is disabled, cuDNN Version is too old. Update to v5, was 4004.)


The original implementation uses a script for pre-processing. I also found a preprocessing script [here](https://github.com/lisa-lab/pylearn2/blob/master/pylearn2/scripts/papers/maxout/svhn_preprocessing.py), which appears to come from the code for the Maxout Network. To make sure I get the same preprocessing result, I use the same code as in the link.

Moreover, the code for downloading and splitting the data is from [here](https://raw.githubusercontent.com/lisa-lab/pylearn2/master/pylearn2/scripts/datasets/download_svhn.sh), also in the pylearn2 library. Although we could also use the code from the previous homeworks, here I use that code to keep the overall training pipeline consistent.

In [2]:
# Logging setup: messages sent to the root logger are echoed to the notebook
# output stream and appended to 'experiment.log', formatted as the bare message.
import logging

logger = logging.getLogger('')
logger.setLevel(logging.DEBUG)

formatter = logging.Formatter('%(message)s')
ch = logging.StreamHandler()
fh = logging.FileHandler('experiment.log')

# Attach the stream handler first, then the file handler, both at DEBUG level.
for handler in (ch, fh):
    handler.setLevel(logging.DEBUG)
    handler.setFormatter(formatter)
    logger.addHandler(handler)

In [4]:
# Make sure the SVHN data has already been downloaded into the directory below.

# You may need to change this directory on a different machine.
%env SVHN_LOCAL_PATH=/mnt/hdd2/yluo/data/SVHN 
# Run the two local preparation scripts in order.
# NOTE(review): presumably these split the data and then preprocess it -- confirm against the scripts.
%run make_SVHN.py
%run SVHN_preprocess.py



env: SVHN_LOCAL_PATH=/mnt/hdd2/yluo/data/SVHN
GCN processing data from 0 to 5000
GCN processing data from 5000 to 10000
GCN processing data from 10000 to 15000
GCN processing data from 15000 to 20000
GCN processing data from 20000 to 25000
GCN processing data from 25000 to 30000
GCN processing data from 30000 to 35000
GCN processing data from 35000 to 40000
GCN processing data from 40000 to 45000
GCN processing data from 45000 to 50000
GCN processing data from 50000 to 55000
GCN processing data from 55000 to 60000
GCN processing data from 60000 to 65000
GCN processing data from 65000 to 70000
GCN processing data from 70000 to 75000
GCN processing data from 75000 to 80000
GCN processing data from 80000 to 85000
GCN processing data from 85000 to 90000
GCN processing data from 90000 to 95000
GCN processing data from 95000 to 100000
GCN processing data from 100000 to 105000
GCN processing data from 105000 to 110000
GCN processing data from 110000 to 115000
GCN processing data from 115000 t

Again, here I use the same code to load the data.

In [3]:
print('Loading SVHN dataset')
%env SVHN_LOCAL_PATH=/mnt/hdd2/yluo/data/SVHN   

train_set = SVHN(
    which_set= 'splitted_train',
    path= "${SVHN_LOCAL_PATH}",
    axes= ['b', 'c', 0, 1])
     
valid_set = SVHN(
    which_set= 'valid',
    path= "${SVHN_LOCAL_PATH}",
    axes= ['b', 'c', 0, 1])
    
test_set = SVHN(
    which_set= 'test',
    path= "${SVHN_LOCAL_PATH}",
    axes= ['b', 'c', 0, 1])
    
# bc01 format
# print train_set.X.shape
train_set.X = np.reshape(train_set.X,(-1,3,32,32)).astype('float32')
valid_set.X = np.reshape(valid_set.X,(-1,3,32,32)).astype('float32')
test_set.X = np.reshape(test_set.X,(-1,3,32,32)).astype('float32')
    
# for hinge loss 
# make targets onehot
def make_onehot(dataset, category=10):
    """Convert a column of integer labels into a one-hot int32 matrix.

    `dataset` is indexed as `dataset[:, 0]`, so it must be a 2-D array whose
    first column holds the labels. A label `k` switches on column `k - 1`,
    i.e. labels are treated as 1-based (a label of 0 wraps around to the
    last column through negative indexing).

    Returns an array of shape (dataset.shape[0], category).
    """
    # Row j of the identity matrix is exactly the one-hot vector for column j,
    # so picking rows by (label - 1) yields the encoded targets.
    identity = np.eye(category, dtype=np.int32)
    return identity[dataset[:, 0] - 1]

# Re-encode each split's targets: one-hot {0, 1} becomes {-1, +1}, stored as int32.
for labelled in (train_set, valid_set, test_set):
    labelled.y = (2 * make_onehot(labelled.y) - 1.).astype('int32')

Loading SVHN dataset
env: SVHN_LOCAL_PATH=/mnt/hdd2/yluo/data/SVHN




In [4]:
# Wrap the inputs and targets of every split in Theano shared variables
# (borrow=True, so the underlying arrays are not copied on the host side).
from theano import shared

train_set_x, train_set_y = [shared(arr, borrow=True)
                            for arr in (train_set.X, train_set.y)]
valid_set_x, valid_set_y = [shared(arr, borrow=True)
                            for arr in (valid_set.X, valid_set.y)]
test_set_x, test_set_y = [shared(arr, borrow=True)
                          for arr in (test_set.X, test_set.y)]

In [5]:
from lasagne.layers import batch_norm
def make_network(input_shape, net_arch, net_spec):
    """Stack layers on top of an input layer according to a declarative spec.

    Parameters
    ----------
    input_shape : tuple
        Passed as `shape` to `lasagne.layers.InputLayer`.
    net_arch : sequence of str
        Layer kinds, applied in order. Supported values are 'noise',
        'dropout', 'reshape', 'cnn', 'bn', 'maxpool' and 'feedforward_<act>'
        with <act> one of tanh, sigmoid, softmax, linear, rectify, leaky,
        identity.
    net_spec : sequence of dict
        Keyword arguments for the layer at the same position in `net_arch`.

    Returns
    -------
    dict
        {'in': the input layer, 'out': the last layer of the stack}.

    Raises
    ------
    ValueError
        If an entry of `net_arch` is not a supported layer kind. Unknown
        entries used to be skipped silently (or, for an unknown
        'feedforward_*', to reuse a stale nonlinearity / raise NameError).
    """
    assert len(net_arch) == len(net_spec), \
        'net_arch and net_spec must have the same length'

    # Layer kinds that are built as constructor(previous_layer, **spec).
    builders = {
        'noise': lasagne.layers.GaussianNoiseLayer,
        'dropout': lasagne.layers.DropoutLayer,
        'reshape': lasagne.layers.ReshapeLayer,
        'cnn': Conv2DLayer,
        'bn': lasagne.layers.batch_norm,
        'maxpool': lasagne.layers.MaxPool2DLayer,
    }
    # Dense layer kinds, keyed by full name, mapped to their nonlinearity.
    dense_nonlinearities = {
        'feedforward_tanh': lasagne.nonlinearities.tanh,
        'feedforward_sigmoid': lasagne.nonlinearities.sigmoid,
        'feedforward_softmax': lasagne.nonlinearities.softmax,
        'feedforward_linear': lasagne.nonlinearities.linear,
        'feedforward_rectify': lasagne.nonlinearities.rectify,
        'feedforward_leaky': lasagne.nonlinearities.leaky_rectify,
        'feedforward_identity': lasagne.nonlinearities.identity,
    }

    layer = lasagne.layers.InputLayer(shape=input_shape)
    layers = {'in': layer}

    for kind, spec in zip(net_arch, net_spec):
        if kind in builders:
            # Always rebind `layer`: the 'noise' branch previously discarded
            # the new layer, so noise never became part of the network.
            layer = builders[kind](layer, **spec)
        elif kind in dense_nonlinearities:
            layer = DenseLayer(layer,
                               nonlinearity=dense_nonlinearities[kind],
                               **spec)
        else:
            raise ValueError('unknown layer type in net_arch: %r' % (kind,))

    layers['out'] = layer
    return layers

In [6]:
# Network configuration. `binary` and `stochastic` are forwarded unchanged to
# every conv / dense layer spec (and `binary` also to the update computation).
binary = True
stochastic = True


def _bc_spec(**extra):
    """Spec dict carrying the shared binary/stochastic flags plus `extra`."""
    spec = {'binary': binary, 'stochastic': stochastic}
    spec.update(extra)
    return spec


def _conv_spec(num_filters):
    """Spec for a 3x3 convolution with pad 1 and `num_filters` filters."""
    return _bc_spec(num_filters=num_filters, filter_size=(3, 3), pad=1)


def _dense_spec(num_units):
    """Spec for a dense layer with `num_units` units."""
    return _bc_spec(num_units=num_units)


def _bn_spec(number):
    """Spec for the `number`-th batch-normalisation step."""
    return {'name': 'batch norm %d' % number}


def _pool_spec():
    """Spec for 2x2 max pooling."""
    return {'pool_size': (2, 2)}


# One (layer kind, keyword arguments) pair per layer, in stacking order.
_net_stack = [
    ('cnn', _conv_spec(64)), ('bn', _bn_spec(1)),
    ('cnn', _conv_spec(64)), ('maxpool', _pool_spec()), ('bn', _bn_spec(2)),
    ('cnn', _conv_spec(128)), ('bn', _bn_spec(3)),
    ('cnn', _conv_spec(128)), ('maxpool', _pool_spec()), ('bn', _bn_spec(4)),
    ('cnn', _conv_spec(256)), ('bn', _bn_spec(5)),
    ('cnn', _conv_spec(256)), ('maxpool', _pool_spec()), ('bn', _bn_spec(6)),
    ('feedforward_rectify', _dense_spec(1024)), ('bn', _bn_spec(7)),
    ('feedforward_rectify', _dense_spec(1024)), ('bn', _bn_spec(8)),
    ('feedforward_identity', _dense_spec(10)), ('bn', _bn_spec(9)),
]

# Split the pairs into the two parallel lists that make_network consumes.
net_arch, net_spec = (list(column) for column in zip(*_net_stack))

In [7]:
# Build the layer stack for 3x32x32 inputs with an unspecified batch size;
# `Layer` is the dict returned by make_network (keys 'in' and 'out').
input_shape=(None, 3, 32, 32)
Layer = make_network(input_shape,net_arch,net_spec)

# Symbolic variables used by the cost expressions and compiled functions.
# NOTE(review): `input` shadows the Python builtin of the same name for the rest of the notebook.
input = T.ftensor4('input')
Y = T.imatrix('Y')
LR = T.fscalar('learning_rate')

In [8]:
# cost function
# squared hinge loss: mean over all entries of max(0, 1 - Y * output)^2
# cost_tr is built on the deterministic=False forward pass (used for training).
cost_tr = T.mean(T.sqr(T.maximum(0.,1. - Y * lasagne.layers.get_output(Layer['out'], 
                                                                     {Layer['in']: input}, deterministic=False))))

# cost_cv is the same loss on the deterministic=True forward pass.
cost_cv = T.mean(T.sqr(T.maximum(0.,1. - Y * lasagne.layers.get_output(Layer['out'], 
                                                                     {Layer['in']: input}, deterministic=True))))
# error_cv is the fraction of rows whose argmax prediction differs from the
# argmax of Y, again on the deterministic=True forward pass.
error_cv = T.mean(T.neq(T.argmax(lasagne.layers.get_output(Layer['out'], \
                                                           {Layer['in']: input}, deterministic=True), axis=1), 
                        T.argmax(Y, axis=1)),dtype='float32')

In [9]:
# compute update
# Build the parameter updates for the training cost from the project-local helper;
# the result is later passed as `updates` to theano.function.
# NOTE(review): how `binary` changes the update rule is defined in BC_utils -- not visible here.
update = calculate_update(Layer, cost_tr, LR, binary=binary)

In [19]:
batch_size = 1024

# Number of full minibatches in each split; the floor division drops any
# trailing partial batch.
n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size
n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] // batch_size
n_test_batches = test_set_x.get_value(borrow=True).shape[0] // batch_size

# Symbolic minibatch index fed to every compiled function.
index = T.lscalar()


def _minibatch_givens(data_x, data_y):
    """Map `input` and `Y` onto the `index`-th minibatch of the given shared data."""
    lo, hi = index * batch_size, (index + 1) * batch_size
    return {input: data_x[lo:hi], Y: data_y[lo:hi]}


import time
start_time = time.time()

# Each 'Time spent' below is cumulative since start_time, not per function.
logger.info('Computing function for training...')
train_model = theano.function(
    inputs=[index, LR],
    outputs=cost_tr,
    updates=update,
    givens=_minibatch_givens(train_set_x, train_set_y))
print('Time spent: ', time.time() - start_time)

logger.info('Compiling cost function for testing...')
test_model = theano.function(
    inputs=[index],
    outputs=error_cv,
    givens=_minibatch_givens(test_set_x, test_set_y))
print('Time spent: ', time.time() - start_time)

logger.info('Compiling cost function for validation...')
validate_model = theano.function(
    inputs=[index],
    outputs=error_cv,
    givens=_minibatch_givens(valid_set_x, valid_set_y))
print('Time spent: ', time.time() - start_time)

INFO:root:Computing function for training...
Computing function for training...
INFO:root:Compiling cost function for testing...
Compiling cost function for testing...


Time spent:  8.615265131
Time spent: 

INFO:root:Compiling cost function for validation...
Compiling cost function for validation...


 9.26482701302
Time spent:  9.91819596291


In [25]:
# training function
import timeit
import inspect
import time



def train_nn(train_model, validate_model, test_model, LR,
            n_train_batches, n_valid_batches, n_test_batches, n_epochs,
            verbose=True, LR_final=0.0003):
    """Train for a fixed number of epochs with a geometrically decaying learning rate.

    Adapted from the HW3 training function. There is no early stopping: all
    `n_epochs` epochs are always run. After each epoch the mean of the
    per-batch validation errors is computed, and whenever it improves on the
    best value so far, the mean per-batch test error is evaluated as well.

    Parameters
    ----------
    train_model : callable
        Called as `train_model(minibatch_index, LR)` for every training batch.
    validate_model, test_model : callable
        Called as `f(batch_index)`; expected to return the error on that batch.
    LR : float
        Initial learning rate (must be non-zero). It is multiplied by a
        constant factor after every epoch so that it reaches `LR_final`
        after `n_epochs` epochs.
    n_train_batches, n_valid_batches, n_test_batches : int
        Number of minibatches to iterate over in each split.
    n_epochs : int
        Number of epochs to run. With a value <= 0 no training happens and
        only the summary is printed.
    verbose : bool
        If true, print validation (and test) errors after each epoch.
    LR_final : float
        Learning rate targeted at the end of training; the default keeps the
        previously hard-coded value.

    Returns
    -------
    None
        Results are reported through `print` and the module-level `logger`.
    """
    best_validation_loss = np.inf
    # 0 means "no epoch has improved on the initial value yet"; without this
    # initialisation the summary crashed whenever validation never improved
    # (for example when the loss is NaN from the first epoch on).
    best_epoch = 0
    test_score = 0.

    # Per-epoch multiplicative decay taking LR to LR_final in n_epochs steps.
    # Guarded so that n_epochs <= 0 does not divide by zero.
    if n_epochs > 0:
        LR_decay = (LR_final / LR) ** (1. / n_epochs)
    else:
        LR_decay = 1.

    epoch = 0
    while epoch < n_epochs:
        epoch = epoch + 1
        start_epoch_time = time.time()

        for minibatch_index in range(n_train_batches):
            train_model(minibatch_index, LR)

        validation_losses = [validate_model(i) for i
                             in range(n_valid_batches)]
        this_validation_loss = np.mean(validation_losses)

        if verbose:
            print('epoch %i, validation error %f %%' %
                  (epoch, this_validation_loss * 100.))

        # if we got the best validation score until now
        if this_validation_loss < best_validation_loss:
            # save best validation score and epoch number
            best_validation_loss = this_validation_loss
            best_epoch = epoch

            # test it on the test set
            test_losses = [test_model(i) for i in range(n_test_batches)]
            test_score = np.mean(test_losses)

            if verbose:
                print(('     epoch %i, test error of '
                       'best model %f %%') %
                      (epoch, test_score * 100.))

        # decay learning rate
        LR *= LR_decay

        logger.info("Epoch {} took {} seconds.\n".format(
            epoch, time.time() - start_epoch_time))

    # Print out summary
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at epoch %i, '
          'with test performance %f %%' %
          (best_validation_loss * 100., best_epoch, test_score * 100.))

In [None]:
# Train the network: 100 epochs, starting from a learning rate of 0.1.
print('... training')

train_nn(train_model, validate_model, test_model, LR=0.1,
         n_train_batches=n_train_batches,
         n_valid_batches=n_valid_batches,
         n_test_batches=n_test_batches,
         n_epochs=100, verbose=True)