In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

### Training Procedure for neural network

- Define the neural network that has some learnable parameters (weights)
- Iterate over a dataset of inputs
- Process each input through the network
- Compute the loss (how far the output is from the actual value)
- Propagate the gradients back into the network's parameters
- Update the weights of the network, typically using a simple update rule  
  `weight = weight - learning_rate * gradient`

## Define the network

In [2]:
class Net(nn.Module):
    """LeNet-style CNN: two conv + max-pool stages followed by three linear layers.

    Maps a (N, 1, 32, 32) batch of single-channel images to (N, 10) raw scores.
    """

    def __init__(self):
        super().__init__()
        # Feature extractor: 1 -> 6 -> 16 channels, both with 5x5 kernels.
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5)
        self.conv2 = nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5)

        # Classifier head: 16 feature maps of 5x5 (= 400 values) -> 120 -> 84 -> 10.
        self.fc1 = nn.Linear(in_features=16 * 5 * 5, out_features=120)
        self.fc2 = nn.Linear(in_features=120, out_features=84)
        self.fc3 = nn.Linear(in_features=84, out_features=10)

    def forward(self, input):
        """Run the forward pass.

        Args:
            input: tensor of shape (N, 1, 32, 32), where N is the batch size.

        Returns:
            Tensor of shape (N, 10); no activation is applied to the last layer.
        """
        # C1: convolution, 1 -> 6 channels, then ReLU       -> (N, 6, 28, 28)
        x = F.relu(self.conv1(input))
        # S2: 2x2 max pooling, no learnable parameters       -> (N, 6, 14, 14)
        x = F.max_pool2d(x, kernel_size=2)
        # C3: convolution, 6 -> 16 channels, then ReLU       -> (N, 16, 10, 10)
        x = F.relu(self.conv2(x))
        # S4: 2x2 max pooling, no learnable parameters       -> (N, 16, 5, 5)
        x = F.max_pool2d(x, kernel_size=2)
        # Flatten everything except the batch dimension      -> (N, 400)
        x = x.flatten(start_dim=1)
        # F5: fully connected + ReLU                         -> (N, 120)
        x = F.relu(self.fc1(x))
        # F6: fully connected + ReLU                         -> (N, 84)
        x = F.relu(self.fc2(x))
        # OUTPUT: fully connected, raw scores                -> (N, 10)
        return self.fc3(x)


net = Net()
print(net)

Net(
  (conv1): Conv2d(1, 6, kernel_size=(5, 5), stride=(1, 1))
  (conv2): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))
  (fc1): Linear(in_features=400, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (fc3): Linear(in_features=84, out_features=10, bias=True)
)


We only had to define the `forward` function; the `backward` function (where gradients are computed) is automatically defined for you by autograd.


In [3]:
# Gather every learnable tensor of the network: one weight and one bias per layer.
params = [p for p in net.parameters()]
print(len(params))
# The first entry is conv1's weight: (out_channels, in_channels, kernel_h, kernel_w).
print(params[0].shape)

10
torch.Size([6, 1, 5, 5])


In [4]:
# Push one random single-channel 32x32 image through the network.
# The network expects 32x32 inputs (fc1 is sized for 16 * 5 * 5 features),
# so MNIST images would have to be resized to 32x32 before being fed in.
# NOTE: the name `input` shadows Python's builtin input() in this notebook.
input = torch.randn(1,1,32,32)
out = net(input)
print(out)

tensor([[-0.0135, -0.0121,  0.0557,  0.0909, -0.0077,  0.0821, -0.0901,  0.1119,
         -0.0371,  0.0049]], grad_fn=<AddmmBackward0>)


In [5]:
# Zero the gradient buffers of all parameters, then backpropagate from `out`
# using a random (1, 10) tensor as the upstream gradient. `out` is not a
# scalar, so backward() needs an explicit gradient of the same shape.
net.zero_grad()
out.backward(torch.randn(1, 10))

So far we have:
- Defined a neural network
- Processed an input and called backward

### Still left
- Computing the loss
- Updating the weights of the network

> #### Extra learning: Squeeze and unsqueeze operations
> - Squeeze operation: removes dimensions of size 1
> - Unsqueeze operation: inserts a dimension of size 1 at the given position

In [6]:
# An all-zero tensor with a single size-1 axis (dim 2) to experiment on.
x = torch.zeros((2, 2, 1, 2))
x.shape

torch.Size([2, 2, 1, 2])

In [7]:
# Insert a new size-1 axis at position 1. The out-of-place `unsqueeze` is used
# instead of `unsqueeze_` so that `x` itself is left untouched: re-running this
# cell then always shows the same shape instead of adding one more axis each time.
x_unsqueezed = x.unsqueeze(1)
x_unsqueezed.shape

torch.Size([2, 1, 2, 1, 2])

In [8]:
# Drop every size-1 axis. The out-of-place `squeeze` returns a new view and
# leaves `x` unchanged, so this cell can be re-run without altering `x`.
x.squeeze().shape

torch.Size([2, 2, 2])

## Loss Function

In [9]:
# Forward pass on the random image created earlier in the notebook.
output = net(input)

# A made-up target for illustration, reshaped to (1, 10) so it lines up with `output`.
target = torch.randn(10).view(1, -1)

# Mean-squared error between the network's prediction and the target.
criterion = nn.MSELoss()
loss = criterion(output, target)
print(loss)

tensor(1.0133, grad_fn=<MseLossBackward0>)


Now, if you follow loss in the backward direction, using its .grad_fn attribute, you will see a graph of computations that looks like this:

input -> conv2d -> relu -> maxpool2d -> conv2d -> relu -> maxpool2d
      -> flatten -> linear -> relu -> linear -> relu -> linear
      -> MSELoss
      -> loss

So, when we call `loss.backward()`, the whole graph is differentiated w.r.t. the network's parameters, and all tensors in the graph that have `requires_grad=True` will have their `.grad` tensor accumulated with the gradient.

In [10]:
# Walk the autograd graph backwards from the loss, one step per line.
print(loss.grad_fn)  # MSELoss
print(loss.grad_fn.next_functions)  # inputs of the MSELoss node: the Linear node behind `output`, and None for the second input (presumably `target`, which needs no gradient)
print(loss.grad_fn.next_functions[0][0])  # Linear (shown as AddmmBackward0)
print(loss.grad_fn.next_functions[0][0].next_functions[0][0])  # first input of that Linear node: an AccumulateGrad leaf (presumably the layer's bias), not ReLU

<MseLossBackward0 object at 0x329b28dc0>
((<AddmmBackward0 object at 0x329b28e50>, 0), (None, 0))
<AddmmBackward0 object at 0x329b28e50>
<AccumulateGrad object at 0x3299f75b0>


## Backprop
To backpropagate we will just call loss.backward()
> Make sure to clear the existing gradients first; otherwise the new gradients will be accumulated into them.

In [11]:
# Clear the gradients left over from the earlier backward pass so that the
# upcoming loss.backward() is not added on top of them.
net.zero_grad()

print('conv1.bias.grad before backward')
print(net.conv1.bias.grad)  # None in the recorded output: the gradient has been cleared

conv1.bias.grad before backward
None


In [12]:
# Backpropagate the MSE loss; autograd fills in .grad for the parameters.
loss.backward()

print('conv1.bias.grad after backward')
print(net.conv1.bias.grad)  # now a 6-element tensor, one entry per conv1 output channel

conv1.bias.grad after backward
tensor([ 0.0209, -0.0084,  0.0012, -0.0224,  0.0194,  0.0026])


## Update the weights
Simplest update rule used in SGD

`weight = weight - learning_rate * gradient`

In [13]:
# One manual step of plain SGD: weight = weight - learning_rate * gradient.
learning_rate = 0.01

# The update must not be recorded by autograd, hence torch.no_grad(). This
# replaces the legacy `.data` access, which modifies tensors behind autograd's
# back and can hide in-place modification errors.
with torch.no_grad():
    for param in net.parameters():
        if param.grad is None:
            # Parameter received no gradient in the last backward pass.
            continue
        param.sub_(param.grad * learning_rate)

Besides this plain SGD rule there are various other weight-update rules, such as Nesterov-SGD, Adam, RMSProp, etc.
`torch.optim` implements all of these methods (including SGD) for you.

In [15]:
import torch.optim as optim

# Create the optimizer once, handing it the parameters it should update.
optimizer = optim.SGD(net.parameters(), lr=0.01)

# One iteration of the training loop:
optimizer.zero_grad()            # clear gradients from the previous step
out = net(input)                 # fresh forward pass with the current weights
# Compute the loss from THIS forward pass (`out`), not from the stale `output`
# of an earlier cell, whose graph was already consumed by a previous backward().
loss = criterion(out, target)    # criterion is the MSE loss
loss.backward()                  # gradients of the loss w.r.t. the parameters
optimizer.step()                 # apply the update