# Train a ConvNet!
We now have a generic solver and a bunch of modularized layers. It's time to put it all together, and train a ConvNet to recognize the classes in CIFAR-10. In this notebook we will walk you through training a simple two-layer ConvNet and then set you free to build the best net that you can to perform well on CIFAR-10.

Open up the file `cs231n/classifiers/convnet.py`; you will see that the `two_layer_convnet` function computes the loss and gradients for a two-layer ConvNet. Note that this function uses the "sandwich" layers defined in `cs231n/layer_utils.py`. 

In [None]:
# As usual, a bit of setup

import numpy as np
import matplotlib.pyplot as plt
from cs231n.classifier_trainer import ClassifierTrainer
from cs231n.gradient_check import eval_numerical_gradient
from cs231n.classifiers.convnet import *

# Notebook-wide plotting configuration. The rcParams below become the defaults
# for every figure drawn by later cells.
%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

def rel_error(x, y):
    """Return the largest elementwise relative error between x and y.

    For each element the error is |x - y| / max(1e-8, |x| + |y|); the 1e-8
    floor keeps the ratio defined when both values are zero. The maximum over
    all elements is returned.
    """
    abs_diff = np.abs(x - y)
    scale = np.maximum(1e-8, np.abs(x) + np.abs(y))
    return np.max(abs_diff / scale)

In [None]:
# Import data
# Every benchmark has one CSV of training samples and one of test samples in
# DATA_DIR, named "<split>samples_<benchmark>.csv".
DATA_DIR = "train_test_data"


def load_samples(split, benchmark):
    """Load one benchmark's samples from its comma-delimited CSV file.

    Args:
        split: "train" or "test"; selects the file-name prefix.
        benchmark: benchmark name used as the file-name suffix, e.g. "flac".

    Returns:
        The array parsed by np.loadtxt from
        "<DATA_DIR>/<split>samples_<benchmark>.csv".
    """
    # Handing np.loadtxt the path (instead of an open() handle that is never
    # closed) lets it open and close the file itself, so no descriptors leak.
    path = "%s/%ssamples_%s.csv" % (DATA_DIR, split, benchmark)
    return np.loadtxt(path, delimiter=",")


train_ramspeed   = load_samples("train", "ramspeed")
train_flac       = load_samples("train", "flac")
train_aobench    = load_samples("train", "aobench")
train_git        = load_samples("train", "git")
train_gimp       = load_samples("train", "gimp")
train_7zip       = load_samples("train", "7zip")
train_cyclictest = load_samples("train", "cyclictest")
train_opencv     = load_samples("train", "opencv")
train_phpbench   = load_samples("train", "phpbench")
train_stream     = load_samples("train", "stream")
train_ttest      = load_samples("train", "ttest")

test_ramspeed    = load_samples("test", "ramspeed")
test_flac        = load_samples("test", "flac")
test_aobench     = load_samples("test", "aobench")
test_git         = load_samples("test", "git")
test_gimp        = load_samples("test", "gimp")
test_7zip        = load_samples("test", "7zip")
test_cyclictest  = load_samples("test", "cyclictest")
test_opencv      = load_samples("test", "opencv")
test_phpbench    = load_samples("test", "phpbench")
test_stream      = load_samples("test", "stream")
test_ttest       = load_samples("test", "ttest")


In [None]:
# print data shapes
# The arrays are listed in the order their shapes are reported.
train_arrays = (train_ramspeed, train_flac, train_aobench, train_git,
                train_gimp, train_7zip, train_cyclictest, train_opencv,
                train_phpbench, train_stream, train_ttest)
test_arrays = (test_ramspeed, test_flac, test_aobench, test_git,
               test_gimp, test_7zip, test_cyclictest, test_opencv,
               test_phpbench, test_stream, test_ttest)

for heading, arrays in (('Train data shapes:', train_arrays),
                        ('Test data shapes:', test_arrays)):
    print(heading)
    for samples in arrays:
        print(samples.shape)

In [None]:
# from cs231n.data_utils import load_CIFAR10

# def get_CIFAR10_data(num_training=49000, num_validation=1000, num_test=1000):
#     """
#     Load the CIFAR-10 dataset from disk and perform preprocessing to prepare
#     it for the two-layer neural net classifier. These are the same steps as
#     we used for the SVM, but condensed to a single function.  
#     """
    # Load the raw CIFAR-10 data
#     cifar10_dir = 'cs231n/datasets/cifar-10-batches-py'
#     X_train, y_train, X_test, y_test = load_CIFAR10(cifar10_dir)

    
    # Subsample the data
#     mask = range(num_training, num_training + num_validation)
#     X_val = X_train[mask]
#     y_val = y_train[mask]
#     mask = range(num_training)
#     X_train = X_train[mask]
#     y_train = y_train[mask]
#     mask = range(num_test)
#     X_test = X_test[mask]
#     y_test = y_test[mask]

#     # Normalize the data: subtract the mean image
#     mean_image = np.mean(X_train, axis=0)
#     X_train -= mean_image
#     X_val -= mean_image
#     X_test -= mean_image
    
#     # Transpose so that channels come first
#     X_train = X_train.transpose(0, 3, 1, 2).copy()
#     X_val = X_val.transpose(0, 3, 1, 2).copy()
#     X_test = X_test.transpose(0, 3, 1, 2).copy()

#     return X_train, y_train, X_val, y_val, X_test, y_test


# # Invoke the above function to get our data.
# X_train, y_train, X_val, y_val, X_test, y_test = get_CIFAR10_data()
# print ('Train data shape: ', X_train.shape)
# print ('Train labels shape: ', y_train.shape)
# print ('Validation data shape: ', X_val.shape)
# print ('Validation labels shape: ', y_val.shape)
# print ('Test data shape: ', X_test.shape)
# print ('Test labels shape: ', y_test.shape)
##################################################

# train_ramspeed = np.loadtxt(open("train_test_data/trainsamples_ramspeed.csv","rb"),delimiter=",",skiprows=0)
# train_flac = np.loadtxt(open("train_test_data/trainsamples_flac.csv","rb"),delimiter=",",skiprows=0)
# test_ramspeed = np.loadtxt(open("train_test_data/testsamples_ramspeed.csv","rb"),delimiter=",",skiprows=0)
# test_flac = np.loadtxt(open("train_test_data/testsamples_flac.csv","rb"),delimiter=",",skiprows=0)

num_classes = 11

# Label order (0-based class indices, cast to int64 before training):
#   7zip--0, cyclictest--1, ramspeed--2, aobench--3, gimp--4,
#   stream--5, flac--6, git--7, ttest--8, opencv--9, phpbench--10


def _prepend_label(samples, label):
    """Return `samples` with a constant label column inserted as column 0.

    Args:
        samples: 2-D array with one sample per row.
        label: class index written into the new first column.

    Returns:
        Array with one more column than `samples`; column 0 holds `label`
        as a float and the remaining columns are the original features.
    """
    label_column = np.full((samples.shape[0], 1), float(label))
    return np.hstack((label_column, samples))


# Insert labels into the first column of each train/test data array before
# shuffling, so every row keeps its class through the shuffle.
lab_train_7zip       = _prepend_label(train_7zip, 0)
lab_test_7zip        = _prepend_label(test_7zip, 0)

lab_train_cyclictest = _prepend_label(train_cyclictest, 1)
lab_test_cyclictest  = _prepend_label(test_cyclictest, 1)

lab_train_ramspeed   = _prepend_label(train_ramspeed, 2)
lab_test_ramspeed    = _prepend_label(test_ramspeed, 2)

lab_train_aobench    = _prepend_label(train_aobench, 3)
lab_test_aobench     = _prepend_label(test_aobench, 3)

lab_train_gimp       = _prepend_label(train_gimp, 4)
lab_test_gimp        = _prepend_label(test_gimp, 4)

lab_train_stream     = _prepend_label(train_stream, 5)
lab_test_stream      = _prepend_label(test_stream, 5)

lab_train_flac       = _prepend_label(train_flac, 6)
lab_test_flac        = _prepend_label(test_flac, 6)

lab_train_git        = _prepend_label(train_git, 7)
lab_test_git         = _prepend_label(test_git, 7)

lab_train_ttest      = _prepend_label(train_ttest, 8)
lab_test_ttest       = _prepend_label(test_ttest, 8)

lab_train_opencv     = _prepend_label(train_opencv, 9)
lab_test_opencv      = _prepend_label(test_opencv, 9)

lab_train_phpbench   = _prepend_label(train_phpbench, 10)
lab_test_phpbench    = _prepend_label(test_phpbench, 10)



In [None]:
# Shape of one sample once its flat feature row is viewed as a 1-channel image.
sample_shape = (1, 50, 40)


def _split_features_labels(labelled):
    """Split a labelled matrix into image-shaped features and integer labels.

    Args:
        labelled: 2-D array whose column 0 is the class label and whose
            remaining columns are one flattened sample per row.

    Returns:
        (X, y): X has shape (N,) + sample_shape and is an independent copy,
        so later in-place normalisation cannot write back into `labelled`;
        y is the int64 label vector of length N.
    """
    X = labelled[:, 1:].reshape((labelled.shape[0],) + sample_shape).copy()
    y = labelled[:, 0].astype(np.int64)
    return X, y


Train = np.vstack((lab_train_7zip,lab_train_cyclictest,lab_train_ramspeed,lab_train_aobench,lab_train_gimp,
                   lab_train_stream,lab_train_flac,lab_train_git,lab_train_ttest,
                   lab_train_opencv,lab_train_phpbench))
np.random.shuffle(Train)
X_train, y_train = _split_features_labels(Train)

Test = np.vstack((lab_test_7zip,lab_test_cyclictest,lab_test_ramspeed,lab_test_aobench,lab_test_gimp,
                  lab_test_stream,lab_test_flac,lab_test_git,lab_test_ttest,
                  lab_test_opencv,lab_test_phpbench))
np.random.shuffle(Test)
# First half of the shuffled test rows is used for validation, the rest for
# testing. Integer division replaces np.int, which was removed from NumPy.
maskidx = Test.shape[0] // 2
X_val, y_val = _split_features_labels(Test[:maskidx])
X_test, y_test = _split_features_labels(Test[maskidx:])

# Normalize the data: subtract the mean image.
# The mean is taken over the training set only, so no statistics from the
# validation/test samples leak into preprocessing.
mean_image = np.mean(X_train, axis=0)
X_train -= mean_image
X_val -= mean_image
X_test -= mean_image

print(train_aobench.shape)
print(train_ttest.shape)
print(lab_test_git.shape)
print(lab_test_stream.shape)
print(Train.shape)
print(Test.shape)
print ('Train data shape: ', X_train.shape)
print ('Train labels shape: ', y_train.shape)
print ('Validation data shape: ', X_val.shape)
print ('Validation labels shape: ', y_val.shape)
print ('Test data shape: ', X_test.shape)
print ('Test labels shape: ', y_test.shape)

In [None]:
# Dump the validation label vector for a quick visual check.
print(y_val)
# Initialise the sample index k; the next cell assigns k again before using it.
k=0

In [None]:
# Show one random training sample next to the mean image.
# X_train has already had mean_image subtracted, so the original sample is
# recovered by adding the mean back, and the stored row is itself the
# mean-subtracted version (subtracting the mean again would remove it twice).
k = np.random.randint(X_train.shape[0])
print(y_train[k],'k=',k)
centered = X_train[k, 0]
mean_plane = mean_image[0]

plt.subplot(1, 3, 1)
plt.imshow(centered + mean_plane)
plt.title('original')
plt.subplot(1, 3, 2)
plt.imshow(mean_plane)
plt.title('mean image')
plt.subplot(1, 3, 3)
plt.imshow(centered)
plt.title('mean-subtracted')
plt.show()


# Sanity check loss
After you build a new network, one of the first things you should do is sanity check the loss. When we use the softmax loss, we expect the loss for random weights (and no regularization) to be about `log(C)` for `C` classes. When we add regularization this should go up.

In [None]:
# Sanity-check the loss of a freshly initialised network on random data, using
# the same number of classes as the real dataset so the expected value matches
# the model that is trained later.
input_shape = (1, 50, 40)
num_classes = 11
model = init_two_layer_convnet(input_shape=input_shape, num_classes=num_classes)
# model = init_two_layer_convnet(num_filters=3, filter_size=3, input_shape=input_shape)
X = np.random.randn(100, *input_shape)
y = np.random.randint(num_classes, size=100)

loss, _ = two_layer_convnet(X, model, y, reg=0)

# Sanity check: Loss should be about log(11) = 2.3979
print ('Sanity check loss (no regularization): ', loss)

# Sanity check: Loss should go up when you add regularization
loss, _ = two_layer_convnet(X, model, y, reg=1)
print ('Sanity check loss (with regularization): ', loss)

# Gradient check
After the loss looks reasonable, you should always use numeric gradient checking to make sure that your backward pass is correct. When you use numeric gradient checking you should use a small amount of artificial data and a small number of neurons at each layer.

In [None]:
# Compare analytic gradients against numerical ones on a tiny random problem.
num_inputs = 2
input_shape = (1, 50, 40)
reg = 0.0
num_classes = 11
X = np.random.randn(num_inputs, *input_shape)
y = np.random.randint(num_classes, size=num_inputs)

# model = init_two_layer_convnet(num_filters=3, filter_size=3, input_shape=input_shape)
model = init_two_layer_convnet(weight_scale=1e-3, bias_scale=0, input_shape=input_shape,
                           num_classes=num_classes, num_filters=3, filter_size=3)
# The same `reg` is passed to the analytic and the numerical evaluation so the
# two gradients are always computed for the same loss.
loss, grads = two_layer_convnet(X, model, y, reg=reg)

# The loss closure ignores its argument: eval_numerical_gradient is handed
# model[param_name] itself, and f reads the parameters back through `model`.
f = lambda _: two_layer_convnet(X, model, y, reg=reg)[0]
for param_name in sorted(grads):
    param_grad_num = eval_numerical_gradient(f, model[param_name], verbose=False, h=1e-6)
    e = rel_error(param_grad_num, grads[param_name])
    print ('%s max relative error: %e' % (param_name, e))

# Overfit small data
A nice trick is to train your model with just a few training samples. You should be able to overfit small datasets, which will result in very high training accuracy and comparatively low validation accuracy.

In [None]:
# Dump the full training label vector for inspection.
print(y_train)


In [None]:
# Use a two-layer ConvNet to overfit a handful (10) of training examples.
num_classes = 11
num_overfit = 10
model = init_two_layer_convnet(weight_scale=1e-3, bias_scale=0, input_shape=(1, 50, 40),
                           num_classes=num_classes, num_filters=32, filter_size=5)
trainer = ClassifierTrainer()
# Draw the indices from the actual training-set size rather than a hard-coded
# bound, so they can never fall outside X_train and every row is eligible.
ridx = np.random.randint(X_train.shape[0], size=num_overfit)
print('ridx =',ridx)
best_model, loss_history, train_acc_history, val_acc_history = trainer.train(
          X_train[ridx], y_train[ridx], X_val, y_val, model, two_layer_convnet,
          reg=0.001, momentum=0.9, learning_rate=0.0001, batch_size=2, num_epochs=10,
          verbose=True)

Plotting the loss, training accuracy, and validation accuracy should show clear overfitting:

In [None]:
# Top panel: loss per iteration. Bottom panel: train/val accuracy per epoch.
fig, (loss_ax, acc_ax) = plt.subplots(2, 1)

loss_ax.plot(loss_history)
loss_ax.set_xlabel('iteration')
loss_ax.set_ylabel('loss')

acc_ax.plot(train_acc_history)
acc_ax.plot(val_acc_history)
acc_ax.legend(['train', 'val'], loc='upper left')
acc_ax.set_xlabel('epoch')
acc_ax.set_ylabel('accuracy')
plt.show()

# Train the net
Once the above works, training the net is the next thing to try. You can set the `acc_frequency` parameter to change the frequency at which the training and validation set accuracies are tested. If your parameters are set properly, you should see the training and validation accuracy start to improve within a hundred iterations, and you should be able to train a reasonable model with just one epoch.

Using the parameters below you should be able to get around 50% accuracy on the validation set.

In [None]:
# Train on the full training set.
# The output layer must have one unit per class: the labels run from 0 to 10,
# so a smaller layer would be indexed out of range by the softmax loss.
num_classes = 11
model = init_two_layer_convnet(weight_scale=1e-3, bias_scale=0, input_shape=(1, 50, 40),
                           num_classes=num_classes, num_filters=3, filter_size=3)
trainer = ClassifierTrainer()
best_model, loss_history, train_acc_history, val_acc_history = trainer.train(
          X_train, y_train, X_val, y_val, model, two_layer_convnet,
          reg=0.001, momentum=0.9, learning_rate=0.0005, batch_size=30, num_epochs=5,
          acc_frequency=10, verbose=True)

In [None]:
# Loss curve on top, accuracy curves underneath.
loss_panel = plt.subplot(2, 1, 1)
loss_panel.plot(loss_history)
loss_panel.set(xlabel='iteration', ylabel='loss')

acc_panel = plt.subplot(2, 1, 2)
for history in (train_acc_history, val_acc_history):
    acc_panel.plot(history)
acc_panel.legend(['train', 'val'], loc='upper left')
acc_panel.set(xlabel='epoch', ylabel='accuracy')
plt.show()

In [None]:
# Run the trained network on the test set without passing labels; the result
# is fed to softmax/argmax below, i.e. it is used as per-class scores.
scores_test = two_layer_convnet(X_test, best_model)
def softmax(x):
    """Row-wise softmax of a 2-D score matrix.

    Args:
        x: array of shape (N, C) holding one row of class scores per sample.

    Returns:
        Array of the same shape in which every row is non-negative and sums
        to 1.
    """
    # Subtracting each row's maximum leaves the result unchanged but keeps
    # np.exp from overflowing to inf (and the ratio from becoming NaN) when
    # the scores are large.
    shifted = x - np.max(x, axis=1, keepdims=True)
    x_exp = np.exp(shifted)
    s = x_exp / np.sum(x_exp, axis=1, keepdims=True)
    return s
# Turn scores into probabilities, take the most probable class per sample and
# report the test accuracy; the last line lists the misclassified indices.
probs = softmax(scores_test)
classes = probs.argmax(axis=1)
print('y_test =',y_test)
print('classes =',classes)
is_correct = (classes == y_test)
pred_acc = is_correct.mean()
print('pred_acc =', pred_acc)
print('probs =',probs)
np.where(classes != y_test)

In [None]:
# Scatter the winning-class probability of every test sample against its index.
yy = probs.max(axis=1)
xx = range(len(yy))
plt.scatter(xx, yy)

# Visualize weights
We can visualize the convolutional weights from the first layer. If everything worked properly, these will usually be edges and blobs of various colors and orientations.

In [None]:
# Re-draw the training curves: (panel index, series to plot, x label, y label).
panels = (
    (1, (loss_history,), 'iteration', 'loss'),
    (2, (train_acc_history, val_acc_history), 'epoch', 'accuracy'),
)
for position, series_list, x_label, y_label in panels:
    plt.subplot(2, 1, position)
    for series in series_list:
        plt.plot(series)
    if position == 2:
        # Only the accuracy panel carries two curves and needs a legend.
        plt.legend(['train', 'val'], loc='upper left')
    plt.xlabel(x_label)
    plt.ylabel(y_label)
plt.show()

In [None]:
# Inspect the first-layer filters and the RGB grid built from them.
# `grid` is computed here instead of relying on a later cell having been run
# first, so this cell works on a fresh kernel with Restart & Run All.
from cs231n.vis_utils import visualize_grid

print(best_model['W1'].shape)
grid = visualize_grid(best_model['W1'].transpose(0, 2, 3, 1))
# Stack the grid three times along the depth axis (one copy per colour channel).
grid3 = np.dstack((grid,grid,grid))
grid3.shape

In [None]:
from cs231n.vis_utils import visualize_grid

# Move the channel axis of W1 last, tile the filters into one grid image and
# replicate that grid along the depth axis three times before display.
first_layer_filters = best_model['W1'].transpose(0, 2, 3, 1)
grid = visualize_grid(first_layer_filters)
grid3 = np.dstack([grid] * 3)
plt.imshow(grid3.astype('uint8'))

In [None]:
# Dump the raw first-layer weights of the best model.
print(best_model['W1'])
