In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

## Define the network

In [13]:
class Net(nn.Module):
    """Small convolutional classifier: two conv/pool stages followed by three linear layers.

    With a 1-channel 32x32 input, the two 5x5 convolutions and the two 2x2
    max-pools leave a 16x5x5 feature map, i.e. the 400 features ``fc1`` is
    built for. The network returns 10 raw scores per sample; no activation is
    applied after ``fc3``.
    """

    def __init__(self):
        super().__init__()
        # Convolution stages: 1 -> 6 -> 16 channels, each with a 5x5 square kernel.
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5)
        self.conv2 = nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5)
        # Affine stages (y = Wx + b): 400 -> 120 -> 84 -> 10.
        self.fc1 = nn.Linear(in_features=16 * 5 * 5, out_features=120)
        self.fc2 = nn.Linear(in_features=120, out_features=84)
        self.fc3 = nn.Linear(in_features=84, out_features=10)

    def forward(self, x):
        """Run the network on ``x`` and return one row of 10 scores per sample."""
        # Stage 1: convolve, rectify, then max-pool over a (2, 2) window.
        pooled = F.max_pool2d(F.relu(self.conv1(x)), kernel_size=(2, 2))
        # Stage 2: same pattern; a single number is enough for a square window.
        pooled = F.max_pool2d(F.relu(self.conv2(pooled)), kernel_size=2)
        # Collapse everything except the batch dimension into one feature axis.
        flat = pooled.view(-1, self.numFlatFeatures(pooled))
        hidden = F.relu(self.fc1(flat))
        hidden = F.relu(self.fc2(hidden))
        return self.fc3(hidden)

    def numFlatFeatures(self, x):
        """Return the number of values per sample of ``x``: the product of all dimensions after the first."""
        count = 1
        for extent in x.shape[1:]:  # skip the leading (batch) dimension
            count = count * extent
        return count

In [14]:
# Build one instance of the network; leaving the bare name on the last line
# makes the notebook display the module's layer summary.
net = Net()
net

Net(
  (conv1): Conv2d(1, 6, kernel_size=(5, 5), stride=(1, 1))
  (conv2): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))
  (fc1): Linear(in_features=400, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (fc3): Linear(in_features=84, out_features=10, bias=True)
)

In [15]:
# Gather every learnable tensor the module registered, then report how many
# there are and inspect the first one (conv1's .weight).
params = [*net.parameters()]
conv1_weight = params[0]
print(len(params))
print(conv1_weight.size())
print(conv1_weight)

10
torch.Size([6, 1, 5, 5])
Parameter containing:
tensor([[[[ 0.1624,  0.1444,  0.1983,  0.0406,  0.0589],
          [ 0.0419, -0.1010,  0.1208, -0.1485, -0.1332],
          [-0.1856,  0.1890,  0.1222,  0.1257,  0.1768],
          [-0.1096,  0.0783,  0.1537,  0.0903, -0.1707],
          [ 0.1794, -0.1863,  0.0115,  0.0059,  0.1740]]],


        [[[ 0.1924,  0.1619, -0.0467, -0.0621, -0.1755],
          [ 0.0878, -0.1189, -0.1305, -0.0622, -0.0122],
          [-0.0902, -0.1703,  0.1011, -0.0229, -0.1176],
          [ 0.0927, -0.1430,  0.0915, -0.1945, -0.0742],
          [-0.1589, -0.1323,  0.1851,  0.1627, -0.1052]]],


        [[[-0.1976, -0.1096,  0.1768,  0.0020,  0.0651],
          [ 0.1407, -0.1972,  0.0910, -0.1625, -0.1270],
          [ 0.0022, -0.0793,  0.1976,  0.1763, -0.0278],
          [ 0.1373,  0.0131, -0.1943, -0.1590,  0.1304],
          [ 0.0119, -0.1412,  0.1617, -0.1863,  0.1075]]],


        [[[-0.1182, -0.1977, -0.1244,  0.0465,  0.1671],
          [-0.1551,  0.006

In [16]:
# A random batch holding a single 1-channel 32x32 image.
# NOTE(review): the name `input` shadows Python's built-in input(); later cells
# reuse this variable, so renaming it means updating them as well.
input = torch.randn(1, 1, 32, 32)
out = net(input)  # forward pass; the result has one row of 10 scores
out

tensor([[ 0.1107, -0.0139, -0.0533,  0.0678, -0.0490,  0.1316,  0.0785, -0.0664,
          0.1026,  0.0507]], grad_fn=<AddmmBackward>)

In [17]:
# Zero the gradient buffers of all parameters, then backpropagate with random gradients.
# `out` is not a scalar, so backward() is given an explicit gradient with the same (1, 10) shape.
net.zero_grad()
out.backward(torch.randn(1, 10))

Recap:
· **torch.Tensor** - A multi-dimensional array with support for autograd operations like backward(). Also holds the gradient w.r.t. the tensor.
· **nn.Module** - Neural network module. Convenient way of encapsulating parameters, with helpers for moving them to GPU, exporting, loading, etc.
· **nn.Parameter** - A kind of Tensor, that is automatically registered as a parameter when assigned as an attribute to a Module.
· **autograd.Function** - Implements forward and backward definitions of an autograd operation. Every Tensor operation creates at least a single Function node that connects to functions that created a Tensor and encodes its history.

## Loss function 

In [18]:
criterion = nn.MSELoss()  # mean-squared error between prediction and target

output = net(input)
# A dummy target, reshaped to (1, 10) so that it lines up with `output`.
target = torch.randn(10).view(1, -1)

loss = criterion(output, target)
loss

tensor(2.1517, grad_fn=<MseLossBackward>)

Now, if you follow loss in the backward direction, using its .grad_fn attribute, you will see a graph of computations that looks like this:

input -> conv2d -> relu -> maxpool2d -> conv2d -> relu -> maxpool2d
      -> view -> linear -> relu -> linear -> relu -> linear
      -> MSELoss
      -> loss

In [20]:
print(loss.grad_fn)  # the MSELoss node that produced `loss`
print(loss.grad_fn.next_functions[0][0])  # first input of the loss: the last Linear layer (shown as Addmm)
# First input of that Addmm node. The recorded output shows AccumulateGrad (a
# leaf parameter, presumably fc3's bias -- TODO confirm), not the ReLU node.
print(loss.grad_fn.next_functions[0][0].next_functions[0][0])

<MseLossBackward object at 0x117366940>
<AddmmBackward object at 0x1173669b0>
<AccumulateGrad object at 0x117366940>


## Backpropagation

In [21]:
# To backpropagate the error, all we have to do is call loss.backward().
# Clear the existing gradients first, though; otherwise the new gradients are accumulated into the existing ones.

# NOTE(review): newer PyTorch releases default to zero_grad(set_to_none=True), in which
# case the first print below shows None instead of zeros -- confirm against the installed version.
net.zero_grad() # zeroes the gradient buffers of all parameters

print('conv1.bias.grad before backward')
print(net.conv1.bias.grad)
loss.backward()
print('conv1.bias.grad after backward')
print(net.conv1.bias.grad)

conv1.bias.grad before backward
tensor([0., 0., 0., 0., 0., 0.])
conv1.bias.grad after backward
tensor([-0.0307,  0.0146, -0.0419, -0.0168,  0.0133,  0.0211])


## Updating the weights

The simplest update rule used in practice is Stochastic Gradient Descent (SGD):

weight = weight - learning_rate * gradient

In [22]:
learningRate = 0.01
# Plain SGD step applied by hand: weight = weight - learningRate * gradient.
# The in-place update runs under torch.no_grad() so that autograd does not
# record it; this replaces the legacy `.data` access, which sidesteps
# autograd's tracking of in-place modifications.
with torch.no_grad():
    for f in net.parameters():
        if f.grad is None:
            # No gradient was computed for this parameter, so there is nothing to apply.
            continue
        f.sub_(f.grad * learningRate)

In [23]:
# NOTE(review): notebook convention is to keep imports in the first cell; consider moving this one there.
import torch.optim as optim

# Create the optimizer: SGD over all of the network's parameters with learning rate 0.01.
optimizer = optim.SGD(net.parameters(), lr=0.01)

# One training iteration (repeat these steps inside your training loop):
optimizer.zero_grad()   # zero the gradient buffers
output = net(input)  # forward pass
loss = criterion(output, target)
loss.backward()  # compute the gradients of the loss
optimizer.step()    # Does the update