In [1]:
# Load IPython's "autoreload" extension, which re-imports modules before user code runs.
# Mode 2 reloads all imported modules automatically each time code typed at the
# IPython prompt is executed, so edits to local .py files are picked up without a restart.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import gpustat

# Query the current state of every GPU and keep the index of the one whose
# used/total memory ratio is the lowest (on a tie, the first GPU listed wins).
stats = gpustat.GPUStatCollection.new_query()
bestGPU = int(min(
    stats,
    key=lambda gpu: float(gpu.entry['memory.used']) / float(gpu.entry['memory.total'])
).entry['index'])

print("setGPU: Setting GPU to: {}".format(bestGPU))
# Make only the selected device visible to CUDA; devices are numbered by PCI bus id
# (presumably so the index agrees with the numbering reported by gpustat).
os.environ.update({
    'CUDA_DEVICE_ORDER': 'PCI_BUS_ID',
    'CUDA_VISIBLE_DEVICES': str(bestGPU),
})

setGPU: Setting GPU to: 0



# MODEL:
- 1 modèle normal
- 1 modèle dont le learning rate est divisé par 2 régulièrement, ce qui apporte une meilleure généralisation qu'un petit learning rate utilisé directement
- 1 modèle avec weight decay (L²), qui améliore nettement l'orthogonalisation et donc la généralisation pour les SGD (#Simon'sArticle)

(+ effets positifs des petits minibatchs sur la généralisation)


### Import packages

In [3]:
# IMPORT
import numpy

import keras
from keras import backend as K
from keras import optimizers
from keras import regularizers
from keras.callbacks import LearningRateScheduler
from keras.datasets import cifar10 # we can use also cifar100
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import Flatten
from keras.layers import Activation
from keras.layers.convolutional import Conv2D
from keras.layers.convolutional import MaxPooling2D
from keras.models import Sequential
from keras.models import load_model
from keras.utils import plot_model
from keras.utils.vis_utils import model_to_dot

import matplotlib.pyplot as plt

import sklearn
import sklearn.metrics

import tensorflow as tf

from IPython.display import SVG


#print(sklearn.__version__)

Using TensorFlow backend.


In [4]:
# LOAD DATA
# CIFAR-10 has 10 target classes.
num_classes = 10
# Keras returns the dataset already split into a training and a test set.
(x_train, y_train), (x_test, y_test) = cifar10.load_data()
print('x_train shape:', x_train.shape)
print(x_train.shape[0], 'train samples')
print(x_test.shape[0], 'test samples')

# One-hot encode the integer class labels of both splits.
y_train, y_test = (
    keras.utils.to_categorical(labels, num_classes) for labels in (y_train, y_test)
)
# Convert the images to float32 and rescale the pixel values by 1/255.
x_train, x_test = (images.astype('float32') / 255 for images in (x_train, x_test))

x_train shape: (50000, 32, 32, 3)
50000 train samples
10000 test samples


# TODO
batch normalization après chaque couche (dépend de la taille du réseau)

initialisation : poids aléatoires ?

In [5]:
# MODEL 0
def model_0(lr):
  """Build and compile the baseline CNN (no weight decay).

  The layer stack follows the Keras example
  https://github.com/keras-team/keras/blob/master/examples/cifar10_cnn.py :
  two Conv-ReLU-Conv-ReLU-MaxPool-Dropout blocks (32, then 64 filters), then
  Flatten-Dense(512)-ReLU-Dropout and a softmax classifier.

  Reads the notebook globals `x_train` (input shape) and `num_classes`.

  # Arguments
      lr: learning rate passed to the SGD optimizer.
  # Returns
      The compiled Sequential model (categorical cross-entropy loss,
      accuracy metric).
  """
  layers = [
    # Block 1: 32 filters. The first layer also fixes the input shape.
    Conv2D(32, (3, 3), padding='same', input_shape=x_train.shape[1:]),
    Activation('relu'),
    Conv2D(32, (3, 3)),
    Activation('relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.25),
    # Block 2: 64 filters.
    Conv2D(64, (3, 3), padding='same'),
    Activation('relu'),
    Conv2D(64, (3, 3)),
    Activation('relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.25),
    # Classifier head.
    Flatten(),
    Dense(512),
    Activation('relu'),
    Dropout(0.5),
    Dense(num_classes),
    Activation('softmax'),
  ]
  model = Sequential(layers)
  model.compile(optimizer=optimizers.SGD(lr),
                loss='categorical_crossentropy',
                metrics=['accuracy'])
  return model

In [None]:
# MODEL 1
def model_1(lr, weightdecay):
  """Build and compile the CNN with L2 weight decay on its dense layers.

  Same layer stack as the Keras example
  https://github.com/keras-team/keras/blob/master/examples/cifar10_cnn.py ,
  except that the kernels of the two Dense layers carry an L2 penalty.
  The convolutional layers are not regularized.

  Reads the notebook globals `x_train` (input shape) and `num_classes`.

  # Arguments
      lr: learning rate passed to the SGD optimizer.
      weightdecay: L2 factor applied to the Dense kernels.
  # Returns
      The compiled Sequential model (categorical cross-entropy loss,
      accuracy metric).
  """
  feature_extractor = [
    Conv2D(32, (3, 3), padding='same', input_shape=x_train.shape[1:]),
    Activation('relu'),
    Conv2D(32, (3, 3)),
    Activation('relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.25),
    Conv2D(64, (3, 3), padding='same'),
    Activation('relu'),
    Conv2D(64, (3, 3)),
    Activation('relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.25),
  ]
  # Each Dense layer gets its own regularizer instance.
  classifier = [
    Flatten(),
    Dense(512, kernel_regularizer=regularizers.l2(weightdecay)),
    Activation('relu'),
    Dropout(0.5),
    Dense(num_classes, kernel_regularizer=regularizers.l2(weightdecay)),
    Activation('softmax'),
  ]
  model = Sequential(feature_extractor + classifier)
  model.compile(optimizer=optimizers.SGD(lr),
                loss='categorical_crossentropy',
                metrics=['accuracy'])
  return model

In [None]:
# MODEL 2
def model_2(lr, weightdecay):
  """Build and compile the model trained with the halving learning-rate schedule.

  The network is exactly the one built by `model_1` (same layers, same L2 weight
  decay on the Dense kernels, same SGD optimizer and loss). What distinguishes
  model 2 is only how it is trained later in the notebook, where a
  LearningRateScheduler callback is passed to `fit`. Delegating to `model_1`
  removes the duplicated definition and keeps both architectures in sync.

  # Arguments
      lr: initial learning rate passed to the SGD optimizer.
      weightdecay: L2 factor applied to the Dense kernels.
  # Returns
      A freshly built, compiled Sequential model.
  """
  return model_1(lr, weightdecay)

## (11/2/2019) New models

### RESNET V1

In [8]:
def resnet_layer(inputs,
                 num_filters=16,
                 kernel_size=3,
                 strides=1,
                 activation='relu',
                 batch_normalization=True,
                 conv_first=True,
                 use_bias = True,
                 batchnorm_training = True,
                 name = '',
                 weight_decay=0.): #1e-4
    """2D Convolution-Batch Normalization-Activation stack builder
    # Arguments
        inputs (tensor): input tensor from input image or previous layer
        num_filters (int): Conv2D number of filters
        kernel_size (int): Conv2D square kernel dimensions
        strides (int): Conv2D square stride dimensions
        activation (string): activation name, or None for no activation
        batch_normalization (bool): whether to include batch normalization
        conv_first (bool): conv-bn-activation (True) or
            bn-activation-conv (False)
        use_bias (bool): whether the convolution has a bias term
        batchnorm_training (bool): passed as both `center` and `scale` of the
            BatchNormalization layer
        name (string): prefix of the layer names; the layers are named
            name+'_conv', name+'_batch' and name+'_act'
        weight_decay (float): L2 factor for the convolution kernel;
            no regularizer is attached when it is not > 0
    # Returns
        x (tensor): tensor as input to the next layer
    """
    # Only names imported by the notebook are used here: the bare `l2` and
    # `BatchNormalization` were never imported and raised NameError.
    regularizer = regularizers.l2(weight_decay) if weight_decay > 0. else None
    conv = Conv2D(num_filters,
                  kernel_size=kernel_size,
                  strides=strides,
                  padding='same',
                  kernel_initializer='he_normal',
                  use_bias=use_bias,
                  name=name + '_conv',
                  kernel_regularizer=regularizer)

    def normalize_and_activate(tensor):
        # Optional batch normalization followed by the optional activation.
        if batch_normalization:
            tensor = keras.layers.BatchNormalization(center=batchnorm_training,
                                                     scale=batchnorm_training,
                                                     name=name + '_batch')(tensor)
        if activation is not None:
            tensor = Activation(activation, name=name + '_act')(tensor)
        return tensor

    if conv_first:
        return normalize_and_activate(conv(inputs))
    return conv(normalize_and_activate(inputs))

In [9]:
def resnet_v1(input_shape, depth, num_classes=10, use_bias = True, batchnorm_training = True, weight_decay = 0.):
    """ResNet Version 1 Model builder [a]
    Stacks of 2 x (3 x 3) Conv2D-BN-ReLU
    Last ReLU is after the shortcut connection.
    At the beginning of each stage, the feature map size is halved (downsampled)
    by a convolutional layer with strides=2, while the number of filters is
    doubled. Within each stage, the layers have the same number of filters and
    the same feature map sizes.
    Features maps sizes:
    stage 0: 32x32, 16
    stage 1: 16x16, 32
    stage 2:  8x8,  64
    The Number of parameters is approx the same as Table 6 of [a]:
    ResNet20 0.27M
    ResNet32 0.46M
    ResNet44 0.66M
    ResNet56 0.85M
    ResNet110 1.7M
    [a] He et al., "Deep Residual Learning for Image Recognition"
        (https://arxiv.org/abs/1512.03385)
    # Arguments
        input_shape (tensor): shape of input image tensor
        depth (int): number of core convolutional layers, must be 6n+2
        num_classes (int): number of classes (CIFAR10 has 10)
        use_bias (bool): forwarded to every convolution and to the final Dense
        batchnorm_training (bool): forwarded to every resnet_layer call
        weight_decay (float): forwarded to every resnet_layer call; the final
            Dense layer is not regularized
    # Returns
        model (Model): Keras model instance
    # Raises
        ValueError: if depth is not of the form 6n+2
    """
    if (depth - 2) % 6 != 0:
        raise ValueError('depth should be 6n+2 (eg 20, 32, 44 in [a])')
    # Start model definition.
    num_filters = 16
    num_res_blocks = int((depth - 2) // 6)

    # Options passed unchanged to every resnet_layer call.
    shared = dict(use_bias=use_bias,
                  batchnorm_training=batchnorm_training,
                  weight_decay=weight_decay)

    # `Input`, `AveragePooling2D` and `Model` are reached through the imported
    # `keras` package: the bare names were never imported and raised NameError.
    inputs = keras.layers.Input(shape=input_shape)
    x = resnet_layer(inputs=inputs, name='first', **shared)
    # Instantiate the stack of residual units
    for stack in range(3):
        for res_block in range(num_res_blocks):
            block_name = str(stack) + '_' + str(res_block)
            # First block of every stage except the first one downsamples.
            downsample = stack > 0 and res_block == 0
            strides = 2 if downsample else 1
            y = resnet_layer(inputs=x,
                             num_filters=num_filters,
                             strides=strides,
                             name=block_name + '_1',
                             **shared)
            y = resnet_layer(inputs=y,
                             num_filters=num_filters,
                             activation=None,
                             name=block_name + '_2',
                             **shared)
            if downsample:
                # linear projection residual shortcut connection to match
                # changed dims
                x = resnet_layer(inputs=x,
                                 num_filters=num_filters,
                                 kernel_size=1,
                                 strides=strides,
                                 activation=None,
                                 batch_normalization=False,
                                 name=block_name + '_strided',
                                 **shared)
            x = keras.layers.add([x, y])
            x = Activation('relu')(x)
        num_filters *= 2

    # Add classifier on top.
    # v1 does not use BN after last shortcut connection-ReLU
    x = keras.layers.AveragePooling2D(pool_size=8)(x)
    y = Flatten()(x)
    outputs = Dense(num_classes,
                    activation='softmax',
                    kernel_initializer='he_normal',
                    use_bias=use_bias,
                    name='last_dense')(y)

    # Instantiate model.
    return keras.models.Model(inputs=inputs, outputs=outputs)

### VGG

In [10]:
#==============================================================================
# VGG model from pytorchblog: http://torch.ch/blog/2015/07/30/cifar.html
# Was also used by "The marginal value of adaptive gradient methods in machine learning"
#==============================================================================

def VGG_pytorchBlogStyle(input_shape, nbstages, nblayers, nbfilters,nbclasses,weight_decay=0., 
        kernel_constraint = None, kernel_initializer='glorot_uniform', include_top = True, use_batchnorm = True,
        batchnorm_training = True, use_bias = True, act = 'relu', dropout = 0., kernel_size = (3,3),
        batchnorm_position = 'before'):
    '''Build a VGG-style convolutional network.

    Each stage is a run of Conv2D layers (each followed by optional batch
    normalization and an activation, with Dropout between the convolutions of a
    stage: 0.3 in stage 0, 0.4 in later stages) closed by a 2x2 max pooling.
    With include_top, a Flatten-Dropout-Dense(512)-ReLU-Dropout-Dense-softmax
    classifier is appended.

    # Arguments
        input_shape: (height, width) or full shape tuple; for a 2-tuple, 3
            channels are added according to K.image_data_format()
        nbstages: number of spatial dimension levels
        nblayers: list with nbstages elements containing the
            number of convolutional layers per stage
        nbfilters: list with nbstages elements containing the
            number of filters of the convolutional layers of each stage
        nbclasses: number of units of the final Dense layer
        weight_decay: L2 factor for the Conv2D and Dense kernels
            (no regularizer when not > 0)
        kernel_constraint: only applied on convolutional layers
        kernel_initializer: initializer of the Conv2D and Dense kernels
        include_top: whether to append the classifier
        use_batchnorm: whether to insert BatchNormalization layers
        batchnorm_training: passed as `center` and `scale` of every
            BatchNormalization layer
        use_bias: whether Conv2D and Dense layers have a bias
        act: 'leaky' selects LeakyReLU(alpha=0.3) in the convolutional stages;
            any other value selects ReLU
        dropout: accepted for compatibility but currently unused (the dropout
            rates are fixed in the body)
        kernel_size: Conv2D kernel size
        batchnorm_position: 'before' or 'after' the non-linearity
    # Returns
        The (uncompiled) Keras Model.
    # Raises
        ValueError: if nblayers or nbfilters does not have nbstages elements,
            or batchnorm_position is invalid.
    '''
    data_format = K.image_data_format()
    if data_format == 'channels_last':
        if len(input_shape) == 2:
            input_shape = input_shape + (3,)
        channel_axis = -1
    elif data_format == 'channels_first':
        if len(input_shape) == 2:
            input_shape = (3,) + input_shape
        channel_axis = 1

    if len(nblayers) != nbstages:
        raise ValueError('nblayers should contain one element per stage.')
    if len(nbfilters) != nbstages:
        raise ValueError('nbfilters should contain one element per stage.')

    if batchnorm_position not in ['after','before']:
        raise ValueError('batchnorm_position argument should be either \'after\' or \'before\'')

    # Layers the notebook never imports by name (l2, Input, BatchNormalization,
    # LeakyReLU, Model) are reached through the imported `regularizers` and
    # `keras` modules; the bare names raised NameError.
    regularizer = regularizers.l2(weight_decay) if weight_decay > 0. else None

    def batchnorm(name=None):
        # All BatchNormalization layers of the network share these settings.
        return keras.layers.BatchNormalization(axis=channel_axis, name=name,
                                               center=batchnorm_training,
                                               scale=batchnorm_training)

    bn_before = use_batchnorm and batchnorm_position == 'before'
    bn_after = use_batchnorm and batchnorm_position == 'after'

    input_model = keras.layers.Input(shape=input_shape)
    x = input_model

    for s in range(nbstages):
        for l in range(nblayers[s]):
            prefix = 'stage' + str(s) + '_layer' + str(l)
            x = Conv2D(nbfilters[s], kernel_size=kernel_size, padding='same',
                       name=prefix + '_conv',
                       kernel_constraint=kernel_constraint,
                       kernel_initializer=kernel_initializer,
                       kernel_regularizer=regularizer,
                       use_bias=use_bias)(x)

            if bn_before:
                x = batchnorm(prefix + '_batch')(x)

            # Compare by value: `act is not 'leaky'` tested object identity, which
            # is not guaranteed for equal strings (and is a SyntaxWarning on 3.8+).
            if act != 'leaky':
                x = Activation('relu', name=prefix + '_relu')(x)
            else:
                x = keras.layers.LeakyReLU(alpha=0.3, name=prefix + '_relu')(x)

            if bn_after:
                x = batchnorm(prefix + '_batch')(x)

            # Dropout between the convolutions of a stage, not after the last one.
            if l < nblayers[s] - 1:
                x = Dropout(0.3 if s == 0 else 0.4)(x)

        x = MaxPooling2D((2,2), strides=(2,2), name='stage' + str(s) + '_pool')(x)

    if include_top:
        x = Flatten()(x)
        x = Dropout(0.5)(x)
        x = Dense(512, kernel_initializer=kernel_initializer, use_bias=use_bias,
                  kernel_regularizer=regularizer, name='dense1')(x)
        if bn_before:
            x = batchnorm()(x)
        x = Activation('relu')(x)
        if bn_after:
            x = batchnorm()(x)
        x = Dropout(0.5)(x)
        x = Dense(nbclasses, name='last_dense', kernel_initializer=kernel_initializer,
                  use_bias=use_bias, kernel_regularizer=regularizer)(x)
        x = Activation('softmax', name='predictions')(x)

    return keras.models.Model(input_model, x)
    
    
def VGG_pytorchBlog(weight_decay=0.0005):
    """Build the 5-stage VGG network with a fixed, file-backed initialization.

    The network has 2, 2, 3, 3 and 3 convolutions per stage with 64, 128, 256,
    512 and 512 filters, 10 outputs, he_normal initialization, no biases, and
    BatchNormalization layers created with center=False and scale=False.

    The first call writes the freshly initialized weights to
    'VGG_pytorchBlog_initial_weights.h5' in the working directory. Later calls
    load that file, so every model starts from the same weights.

    # Arguments
        weight_decay: L2 factor forwarded to VGG_pytorchBlogStyle.
    # Returns
        The (uncompiled) Keras model.
    """
    weights_location = 'VGG_pytorchBlog_initial_weights.h5'
    model = VGG_pytorchBlogStyle(
        (32,32), 5, [2,2,3,3,3], [64,128,256,512,512], 10,
        weight_decay=weight_decay,
        batchnorm_training=False,
        use_bias=False,
        kernel_initializer='he_normal',
    )

    if os.path.isfile(weights_location):
        # Reuse the stored initialization.
        model.load_weights(weights_location)
    else:
        # First use: persist this initialization for later calls.
        model.save_weights(weights_location)
    return model


In [None]:
# PRE-TRAINING
# Hyper-parameters shared by the three training runs below.
batch_size = 32
epochs = 25
# build the models: all three use SGD with learning rate 0.1;
# models 1 and 2 additionally get an L2 weight decay of 0.001.
model0 = model_0(lr=0.1)
model1 = model_1(lr=0.1, weightdecay=0.001)
model2 = model_2(lr=0.1, weightdecay=0.001)

In [None]:
# plot the model: convert model0's layer graph to Graphviz "dot" form and
# display the rendered SVG inline.
SVG(model_to_dot(model0).create(prog='dot', format='svg'))

In [None]:
# get weights: keep an in-memory copy of each model's weights as they are at this
# point of the notebook (in cell order, before any of the fit() calls below).
weights01 = model0.get_weights()
weights11 = model1.get_weights()
weights21 = model2.get_weights()

In [None]:
# Save the models and the initial weights
# The target directory must exist before Keras can write the HDF5 files,
# so create it on a fresh checkout instead of failing on the first save.
os.makedirs("models", exist_ok=True)
model0.save("models/model0.h5")
model1.save("models/model1.h5")
model2.save("models/model2.h5")
model0.save_weights("models/model0_weights0.h5")
model1.save_weights("models/model1_weights0.h5")
model2.save_weights("models/model2_weights0.h5")

In [None]:
# The training cells below only call fit() when this flag is False;
# no augmented training path is implemented in this notebook.
data_augmentation = False

In [None]:
# Train model 0 on the raw (non-augmented) training images.
# The test split is passed as validation data, so 'val_*' metrics are test-set metrics.
if not data_augmentation:
    print('Not using data augmentation.')
    history0 = model0.fit(
        x_train, y_train,
        validation_data=(x_test, y_test),
        epochs=epochs,
        batch_size=batch_size,
        shuffle=True,
        verbose=1,
    )

In [None]:
# Plot training & validation accuracy values, then the loss values.
# Keras < 2.3 logs the 'accuracy' metric under 'acc'/'val_acc', later releases
# under 'accuracy'/'val_accuracy'; use whichever key this run produced.
history_dict = history0.history
acc_key = 'acc' if 'acc' in history_dict else 'accuracy'

for metric_key, label in ((acc_key, 'Accuracy'), ('loss', 'Loss')):
    plt.plot(history_dict[metric_key])
    plt.plot(history_dict['val_' + metric_key])
    plt.title('Model ' + label.lower())
    plt.ylabel(label)
    plt.xlabel('Epoch')
    # 'Test': the validation data passed to fit() is the test split.
    plt.legend(['Train', 'Test'], loc='upper left')
    plt.show()

In [None]:
# Store model 0's weights as they are after the training cell above
# (the "25" in the file name presumably refers to epochs = 25).
model0.save_weights("models/model0_weights25.h5")

In [None]:
# Train model 1 (weight decay) on the raw (non-augmented) training images.
# The test split is passed as validation data, so 'val_*' metrics are test-set metrics.
if not data_augmentation:
    print('Not using data augmentation.')
    history1 = model1.fit(
        x_train, y_train,
        validation_data=(x_test, y_test),
        epochs=epochs,
        batch_size=batch_size,
        shuffle=True,
    )

In [None]:
# Plot training & validation accuracy values, then the loss values.
# Keras < 2.3 logs the 'accuracy' metric under 'acc'/'val_acc', later releases
# under 'accuracy'/'val_accuracy'; use whichever key this run produced.
history_dict = history1.history
acc_key = 'acc' if 'acc' in history_dict else 'accuracy'

for metric_key, label in ((acc_key, 'Accuracy'), ('loss', 'Loss')):
    plt.plot(history_dict[metric_key])
    plt.plot(history_dict['val_' + metric_key])
    plt.title('Model ' + label.lower())
    plt.ylabel(label)
    plt.xlabel('Epoch')
    # 'Test': the validation data passed to fit() is the test split.
    plt.legend(['Train', 'Test'], loc='upper left')
    plt.show()

In [None]:
# Store model 1's weights as they are after the training cell above
# (the "25" in the file name presumably refers to epochs = 25).
model1.save_weights("models/model1_weights25.h5")

In [None]:
def scheduler(epoch):
    """Learning-rate schedule for model2: halve the rate at epoch index 10 and 20.

    The function works on the optimizer of the notebook-level `model2`: at the
    two milestone epochs it announces the change and overwrites the optimizer's
    learning rate with half its current value. For every epoch it returns the
    learning rate currently stored in the optimizer.

    # Arguments
        epoch: epoch index supplied by the caller.
    # Returns
        The optimizer's learning rate after any halving.
    """
    lr_variable = model2.optimizer.lr
    if epoch in (10, 20):
        current_lr = K.get_value(lr_variable)
        halved_lr = K.get_value(lr_variable) / 2
        print("Changing learning rate "+str(current_lr)+" to "+str(halved_lr))
        K.set_value(lr_variable, halved_lr)
    return K.get_value(lr_variable)

In [None]:
# Wrap `scheduler` in a Keras callback so that it is consulted with the epoch index
# during training; the callback is passed to model2.fit() below.
change_lr = LearningRateScheduler(scheduler)

In [None]:
# Train model 2 (weight decay + halving learning-rate schedule) on the raw images.
# The test split is passed as validation data, so 'val_*' metrics are test-set metrics.
if not data_augmentation:
    print('Not using data augmentation.')
    history2 = model2.fit(
        x_train, y_train,
        validation_data=(x_test, y_test),
        epochs=epochs,
        batch_size=batch_size,
        callbacks=[change_lr],
        shuffle=True,
    )

In [None]:
# Plot training & validation accuracy values, then the loss values.
# Keras < 2.3 logs the 'accuracy' metric under 'acc'/'val_acc', later releases
# under 'accuracy'/'val_accuracy'; use whichever key this run produced.
history_dict = history2.history
acc_key = 'acc' if 'acc' in history_dict else 'accuracy'

for metric_key, label in ((acc_key, 'Accuracy'), ('loss', 'Loss')):
    plt.plot(history_dict[metric_key])
    plt.plot(history_dict['val_' + metric_key])
    plt.title('Model ' + label.lower())
    plt.ylabel(label)
    plt.xlabel('Epoch')
    # 'Test': the validation data passed to fit() is the test split.
    plt.legend(['Train', 'Test'], loc='upper left')
    plt.show()

In [None]:
# Store model 2's weights as they are after the training cell above
# (the "25" in the file name presumably refers to epochs = 25).
model2.save_weights("models/model2_weights25.h5")

In [None]:
# EVALUATION
# Final evaluation of the three models on the test split.
def _evaluate_and_report(model, error_format):
    """Evaluate `model` on the test set, print loss, accuracy and error rate, return the scores."""
    scores = model.evaluate(x_test, y_test, verbose=1)
    print('Test loss:', scores[0])
    print('Test accuracy:', scores[1])
    # Error rate in percent = 100 - accuracy in percent.
    print(error_format % (100-scores[1]*100))
    return scores

scores0 = _evaluate_and_report(model0, "Model 0 Error: %.2f%%")
scores1 = _evaluate_and_report(model1, "Model 1 CNN Error: %.2f%%")
scores2 = _evaluate_and_report(model2, "Model 2 Error: %.2f%%")