This implementation computes the forward pass using operations on PyTorch Tensors, and uses PyTorch autograd to compute gradients.

A PyTorch Tensor represents a node in a computational graph. If `x` is a Tensor that has `x.requires_grad=True`, then after a backward pass `x.grad` is another Tensor holding the gradient of some scalar value (for example, the loss) with respect to `x`.

In [1]:
import torch

In [2]:
# Element type used for every Tensor created in this example.
dtype = torch.float
# Prefer the first CUDA GPU, but fall back to the CPU so the example still
# runs on machines where CUDA is not available.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [3]:
# Problem dimensions:
#   N     - batch size
#   D_in  - input dimension
#   H     - hidden dimension
#   D_out - output dimension
N = 64
D_in = 1000
H = 100
D_out = 10

In [4]:
# Random Tensors holding the input batch (N x D_in) and the target batch
# (N x D_out).
# requires_grad=False states explicitly that gradients with respect to these
# Tensors are not needed during the backward pass.
x = torch.randn((N, D_in), device=device, dtype=dtype, requires_grad=False)
y = torch.randn((N, D_out), device=device, dtype=dtype, requires_grad=False)

In [5]:
# Randomly initialised weight Tensors: w1 is (D_in x H), w2 is (H x D_out).
# requires_grad=True asks autograd to compute gradients with respect to these
# Tensors during the backward pass.
w1 = torch.randn((D_in, H), device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn((H, D_out), device=device, dtype=dtype, requires_grad=True)

In [6]:
# Step size for the manual gradient-descent update below.
learning_rate = 1e-6

for t in range(500):
    # Forward pass: multiply by w1, clamp negatives to zero, multiply by w2.
    # The backward pass is not written by hand here, so no references to the
    # intermediate values are kept.
    y_pred = torch.mm(torch.clamp(torch.mm(x, w1), min=0), w2)

    # Sum of squared differences between targets and predictions.
    # loss is a zero-dimensional Tensor; loss.item() returns the single value
    # it holds as a Python number.
    loss = torch.sum((y - y_pred) ** 2)
    print(t, loss.item())

    # Backward pass via autograd: computes the gradient of loss with respect
    # to every Tensor that has requires_grad=True. Afterwards w1.grad and
    # w2.grad hold the gradients of the loss with respect to w1 and w2.
    loss.backward()

    # Manual gradient-descent step. It runs under torch.no_grad() because the
    # weights have requires_grad=True and the update itself must not be
    # tracked by autograd.
    # Alternatives: operate on weight.data / weight.grad.data (tensor.data
    # shares storage with the tensor but does not track history), or use
    # torch.optim.SGD.
    with torch.no_grad():
        for weight in (w1, w2):
            # In-place update so the Tensor keeps its identity.
            weight.sub_(learning_rate * weight.grad)
            # Zero the gradient after the update, ready for the next
            # iteration's backward pass.
            weight.grad.zero_()

0 43594272.0
1 44423648.0
2 46254032.0
3 38844480.0
4 24431808.0
5 11646675.0
6 5290227.0
7 2790811.75
8 1825779.25
9 1373406.875
10 1105636.625
11 917843.125
12 774215.5625
13 659463.0
14 565873.1875
15 488581.46875
16 424162.34375
17 370096.96875
18 324360.40625
19 285515.4375
20 252389.53125
21 223947.140625
22 199350.8125
23 177998.84375
24 159388.125
25 143129.765625
26 128887.0
27 116340.4296875
28 105244.796875
29 95401.1171875
30 86651.109375
31 78853.2421875
32 71878.5390625
33 65627.859375
34 60014.4140625
35 54963.6640625
36 50406.1875
37 46297.359375
38 42588.0
39 39224.19140625
40 36164.51953125
41 33377.0078125
42 30836.9375
43 28521.720703125
44 26403.8828125
45 24465.0
46 22687.322265625
47 21056.12109375
48 19556.087890625
49 18176.09375
50 16904.923828125
51 15732.8818359375
52 14651.111328125
53 13652.0419921875
54 12728.546875
55 11873.875
56 11082.37109375
57 10348.82421875
58 9668.458984375
59 9037.068359375
60 8450.5625
61 7905.638671875
62 7398.9609375
63 6927.8