In [0]:
import torch

dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0") # Uncomment this to run on GPU

In [0]:
# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

In [0]:
# Create random Tensors to hold input and outputs.
# Setting requires_grad=False indicates that we do not need to compute gradients
# with respect to these Tensors during the backward pass.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

In [0]:
# Create random Tensors for weights.
# Setting requires_grad=True indicates that we want to compute gradients with
# respect to these Tensors during the backward pass.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

In [5]:
learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y using operations on Tensors; these
    # are exactly the same operations we used to compute the forward pass using
    # Tensors, but we do not need to keep references to intermediate values since
    # we are not implementing the backward pass by hand.
    y_pred = x.mm(w1).clamp(min=0).mm(w2)

    # Compute and print loss using operations on Tensors.
    # Now loss is a Tensor of shape (1,)
    # loss.item() gets the a scalar value held in the loss.
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())

    # Use autograd to compute the backward pass. This call will compute the
    # gradient of loss with respect to all Tensors with requires_grad=True.
    # After this call w1.grad and w2.grad will be Tensors holding the gradient
    # of the loss with respect to w1 and w2 respectively.
    loss.backward()

    # Manually update weights using gradient descent. Wrap in torch.no_grad()
    # because weights have requires_grad=True, but we don't need to track this
    # in autograd.
    # An alternative way is to operate on weight.data and weight.grad.data.
    # Recall that tensor.data gives a tensor that shares the storage with
    # tensor, but doesn't track history.
    # You can also use torch.optim.SGD to achieve this.
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad
        # Manually zero the gradients after updating weights
        w1.grad.zero_()
        w2.grad.zero_()

0 29982280.0
1 29005064.0
2 33916552.0
3 38849860.0
4 37770340.0
5 27887158.0
6 15635558.0
7 7203771.5
8 3325100.5
9 1787720.75
10 1172491.25
11 884169.1875
12 717651.5
13 602948.25
14 515084.0
15 444246.8125
16 385582.28125
17 336362.40625
18 294699.46875
19 259166.90625
20 228702.5
21 202486.90625
22 179859.875
23 160219.0
24 143104.953125
25 128134.53125
26 114991.09375
27 103418.46875
28 93192.9140625
29 84138.65625
30 76101.2265625
31 68947.75
32 62566.70703125
33 56862.63671875
34 51753.953125
35 47169.6796875
36 43048.3671875
37 39336.7421875
38 35989.59765625
39 32966.8125
40 30231.337890625
41 27755.0
42 25508.70703125
43 23467.92578125
44 21611.05078125
45 19919.1328125
46 18376.390625
47 16968.03515625
48 15680.025390625
49 14501.1201171875
50 13421.32421875
51 12430.9931640625
52 11522.4794921875
53 10688.0927734375
54 9920.876953125
55 9214.69921875
56 8564.580078125
57 7965.54150390625
58 7412.76611328125
59 6902.54736328125
60 6430.8701171875
61 5994.884765625
62 5591.64

In [0]:
test_x = torch.randn(64, 1000)
text_y = torch.randn(64, 10)

In [9]:

y_pred = test_x.mm(w1).clamp(min=0).mm(w2)
loss = (y_pred - y).pow(2).sum()
print(loss.item())

14135488.0
