# Week 2, day 2 - CNN with less overfitting (by D. Tuia, 2020)

**1. Reload yesterday's LeNet and play around with the parameters**

*Preliminaries (given to you)*

In [0]:
#install MXNet and load the libraries to start the day
!pip install mxnet-cu101==1.6.0b20191122
from mxnet import autograd, gluon, init, np, npx
from mxnet.gluon import nn
npx.set_np()

#get the data
!pip install git+https://github.com/d2l-ai/d2l-en
import d2l
# Mini-batch size handed to the Fashion-MNIST loader below.
batch_size = 256
# The loader returns two iterators, unpacked here as the training and the test iterator.
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size=batch_size)

*Create an instance of LeNet as yesterday*

In [0]:
# Build LeNet (d2l section 6.6.1) as a plain sequential stack of layers.
net = nn.Sequential()

# Two convolution + average-pooling stages, followed by three dense layers.
net.add(
    nn.Conv2D(channels=6, kernel_size=5, padding=2, activation='sigmoid'),  # conv 1, with padding
    nn.AvgPool2D(pool_size=2, strides=2),                                   # pool 1, non-overlapping stride
    nn.Conv2D(channels=16, kernel_size=5, activation='sigmoid'),            # conv 2, no padding this time
    nn.AvgPool2D(pool_size=2, strides=2),                                   # pool 2, non-overlapping stride
    nn.Dense(120, activation='sigmoid'),                                    # fc1
    nn.Dense(84, activation='sigmoid'),                                     # fc2
    nn.Dense(10),                                                           # classifier
)

# A single random dummy input with the size of one image of the dataset.
X = np.random.uniform(size=(1, 1, 28, 28))

# Initialise all weights with random values.
net.initialize()

# Feed the dummy input through the network one layer at a time and print the
# shape after each layer, to check that everything is wired up as intended.
for layer in net:
    X = layer(X)
    print(layer.name, 'output size:\t', X.shape)


*Load the accuracy evaluation function you wrote yesterday (do not forget to activate the GPU)*

In [0]:
# Device (context) on which the network's first parameter lives.
# NOTE: this is a GPU only if the net was initialised on one; it simply reports where the weights currently are.
ctx = list(net.collect_params().values())[0].list_ctx()[0]
print (ctx)

# write the evaluation function
def evaluate_accuracy_gpu(net, data_iter, ctx=None): #data_iter is the dataset used for accuracy evaluation
    """Return the classification accuracy of `net` over all batches of `data_iter`.

    Args:
        net: The network to evaluate.
        data_iter: Iterable yielding `(X, y)` mini-batches.
        ctx: Device on which to run the evaluation. When omitted, the device
            holding the network's first parameter is used.

    Returns:
        The accumulated `d2l.accuracy` count divided by the total number of
        labels seen.
    """
    if not ctx:
        # No device given: evaluate wherever the model's first parameter lives.
        # (No printing here: this function runs once per epoch during training
        # and would otherwise clutter the output next to the live plot.)
        ctx = list(net.collect_params().values())[0].list_ctx()[0]
    metric = d2l.Accumulator(2)  # num_correct_examples, num_examples
    for X, y in data_iter:
        # Move the mini-batch to the device selected above before the forward pass.
        X, y = X.as_in_context(ctx), y.as_in_context(ctx)
        metric.add(d2l.accuracy(net(X), y), y.size)
    return metric[0]/metric[1]

*Load the training function you wrote yesterday*

In [0]:
def train_ch5(net, train_iter, test_iter, num_epochs, lr, ctx=d2l.try_gpu()):
  """Train `net` with mini-batch SGD and plot the learning curves while training.

  The network is re-initialised (Xavier) on `ctx` at the start of every call,
  so each call is an independent training run.

  Args:
    net: The network to train.
    train_iter: Iterable of `(X, y)` training mini-batches (must support `len`).
    test_iter: Iterable of `(X, y)` mini-batches used for the test accuracy.
    num_epochs: Number of passes over `train_iter`.
    lr: Learning rate passed to the SGD trainer.
    ctx: Device to train on. The default is evaluated once, when the function
      is defined.
  """
  # Start from fresh weights (in case there was some manual stuff going on in between).
  net.initialize(force_reinit=True, ctx=ctx, init=init.Xavier())
  loss = gluon.loss.SoftmaxCrossEntropyLoss()
  trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': lr})
  animator = d2l.Animator(xlabel='epoch', xlim=[0, num_epochs],
                          legend=['train loss', 'train acc', 'test acc'])
  timer = d2l.Timer()

  for epoch in range(num_epochs):
    # Running sums for this epoch: train_loss, train_acc, num_examples.
    metric = d2l.Accumulator(3)
    for batch_idx, (X, y) in enumerate(train_iter):  # one mini-batch per iteration
      timer.start()
      X, y = X.as_in_context(ctx), y.as_in_context(ctx)
      with autograd.record():
        y_hat = net(X)
        batch_loss = loss(y_hat, y)
      batch_loss.backward()
      num_samples = X.shape[0]
      trainer.step(num_samples)
      # Accumulate summed loss, accuracy count and sample count of this mini-batch.
      metric.add(batch_loss.sum(), d2l.accuracy(y_hat, y), num_samples)
      timer.stop()
      train_loss = metric[0] / metric[2]
      train_acc = metric[1] / metric[2]
      # Refresh the training curves every 50 mini-batches.
      if (batch_idx + 1) % 50 == 0:
        animator.add(epoch + batch_idx / len(train_iter), (train_loss, train_acc, None))
    # The test accuracy is measured and plotted once per epoch.
    test_acc = evaluate_accuracy_gpu(net, test_iter)
    animator.add(epoch + 1, (None, None, test_acc))
  print('loss %.3f, train acc %.3f, test acc %.3f' % (train_loss, train_acc, test_acc))
  print('%.1f examples/sec on %s' % (metric[2] * num_epochs / timer.sum(), ctx))

*Train the models with the different hyperparameters and compare the results*

In [0]:
# Same settings as yesterday: learning rate 0.9, 10 epochs.
lr = 0.9
num_epochs = 10
train_ch5(net, train_iter, test_iter, num_epochs, lr)

#all yours now, use the parameters from the slide.
#...

***2. Batch Normalization***

*Write the BatchNormalization function (7.5)* 

In [0]:
def batch_norm(X, gamma, beta, moving_mean,moving_var, eps, momentum):
  """Batch-normalise the activations `X` (exercise template, see d2l section 7.5).

  This is a skeleton: replace the `raise` at the bottom with your own
  implementation, following the numbered steps below. Until then, calling the
  function fails with a clear `NotImplementedError` instead of the cell being
  unparseable.

  Raises:
    NotImplementedError: Always, until the exercise has been completed.
  """
  # 1. check whether the model is training or not, create two cases
  # 2. Define behavior in test
  # 3. Define behavior in training
  #     - check whether the input is 2D (fully connected layer) or 4D (normal conv layer)
  #         - calculate avg and var in each case
  #     - compute y_hat (the normalized values of the activations)
  #     - update the moving mean and var for the test phase
  #     - update activations using the linear fit (with gamma and beta)
  # 4. return relevant variables
  raise NotImplementedError('batch_norm is an exercise: implement it following the steps in the comments')

*Create your BatchNorm class* 

In [0]:
class BatchNorm(nn.Block):
  """Batch-normalisation layer (exercise template, see d2l section 7.5).

  This is a skeleton: both methods raise `NotImplementedError` until you
  replace their bodies with your own implementation.
  """

  # you'll need 2 functions:
  #
  # __init__()
  #
  #   In this one you initialize parameters:
  #     - gamma as a vector of ones
  #     - beta as a vector of zeroes
  #     - moving_mean and moving_var as as many zeroes as you have features
  #
  # forward()
  #
  # here you need to do 3 things:
  #     1. make sure everything is in the same context (basically copy the moving_mean and _var to the context where the data are)
  #     2. run the batchnorm function you just wrote
  #     3. return the result

  def __init__(self, num_features, num_dims, **kwargs):
    # Suggested signature (number of features, and 2 or 4 input dimensions);
    # adapt it if your design differs.
    raise NotImplementedError('BatchNorm.__init__ is an exercise: initialize gamma, beta, moving_mean and moving_var')

  def forward(self, X):
    raise NotImplementedError('BatchNorm.forward is an exercise: call your batch_norm function and return its result')

*Create an instance of LeNet, but this time with BatchNorm*

In [0]:
#...

*Train it*

In [0]:
# Suggested settings: learning rate 0.9, 10 epochs.
lr = 0.9
num_epochs = 10

#...

***3. Dropout***

*Write the Dropout function (chapter 4.6)*

In [0]:
def dropout(X,drop_prob):
  """Mask a fraction `drop_prob` of the coefficients of `X` (exercise template, see d2l section 4.6).

  This is a skeleton: replace the `raise` below with your own implementation.
  Until then, calling the function fails with a clear `NotImplementedError`
  instead of the cell being unparseable.

  Args:
    X: Array whose coefficients are to be masked.
    drop_prob: Fraction of coefficients to mask (the check cell tries 0, 0.1,
      0.5 and 1).

  Raises:
    NotImplementedError: Always, until the exercise has been completed.
  """
  raise NotImplementedError('dropout is an exercise: implement it following chapter 4.6')

*Test that it is doing what you want*

In [0]:
# Create a test array (here simply the integers 0..99, so that masked entries are easy to spot).
# Try masking a fraction of 0, 0.1, 0.5 and 1 of the coefficients. Does it work?

Z = np.arange(100)  # optionally append .reshape(20,5) to try a 2-D input
print(dropout(Z,0.5))

*Create your Dropout class* 

In [0]:
class Dropout(nn.Block):
  """Dropout layer (exercise template).

  This is a skeleton: both methods raise `NotImplementedError` until you
  replace their bodies with your own implementation.
  """

  # Here you really know the drill. There is no code from the book to help you this time :)
  # If you're lost, you can always check out the BatchNorm class you just wrote.
  # This one is much simpler than the BatchNorm!
  # - In the init function you basically just declare the dropout percentage
  # - in the forward, just remember that dropout is used only during training.

  def __init__(self, drop_prob, **kwargs):
    # Suggested signature; adapt it if your design differs.
    raise NotImplementedError('Dropout.__init__ is an exercise: declare the dropout percentage')

  def forward(self, X):
    raise NotImplementedError('Dropout.forward is an exercise: apply dropout only during training')

*Create an instance of LeNet, but this time with BatchNorm AND dropout*

In [0]:
# Create and instantiate the new LeNet. BN in between the convolution and the non linear activation

*Train it!*

In [0]:
# Suggested settings: learning rate 0.9, 10 epochs.
lr = 0.9
num_epochs = 10

# If it crashes and the error says something about "out of context",
# it is because your dropout mask is not on the GPU together with the neurons' weights.
# You just need to copy the mask (copyto) to the context where the weights are.
# Go back to your dropout function and fix it (as you did for the BatchNorm)!

*Compare the different results obtained (all the learning curves). Can you see the effects of the different strategies (hyperparameters, BatchNorm, Dropout)?*

**You have made it. Congrats!!!!**