PyTorch practice: implement a two-layer network and fit it to random data.

## PyTorch: manual gradients

In [10]:
import torch
# Everything runs on the CPU here. To use a GPU when one is available, swap in:
#   device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = torch.device('cpu')

# Problem dimensions.
N = 64        # batch size
D_in = 1000   # input dimension
H = 100       # hidden dimension
D_out = 10    # output dimension

In [11]:
# torch.randn samples from the standard normal distribution (mean 0,
# variance 1), so values are unbounded; uniform samples on [0, 1) would be
# torch.rand instead. The leading size arguments give the tensor's shape.
# Every tensor is created on the configured `device` so that changing the
# device setting actually takes effect.

# x: random inputs, shape (N, D_in) = (64, 1000)
x = torch.randn(N, D_in, device=device)
# y: random targets, shape (N, D_out) = (64, 10)
y = torch.randn(N, D_out, device=device)

# Randomly initialize the weights
# w1: shape (D_in, H) = (1000, 100)
# w2: shape (H, D_out) = (100, 10)
w1 = torch.randn(D_in, H, device=device)
w2 = torch.randn(H, D_out, device=device)

# Step size for the gradient-descent updates
learning_rate = 1e-6

In [12]:
for t in range(500):
    # Forward pass. Hidden pre-activation: (N, D_in) @ (D_in, H) -> (N, H).
    h = torch.mm(x, w1)
    # ReLU expressed as a clamp: entries below 0 become 0, the rest pass through.
    # (In general clamp limits values to [min, max]; for example
    #  torch.clamp(torch.arange(12), 2, 10) -> [2, 2, 2, 3, 4, ..., 9, 10, 10].)
    h_relu = torch.clamp(h, min=0)
    # Network output: (N, H) @ (H, D_out) -> (N, D_out).
    y_pred = torch.mm(h_relu, w2)

    # The loss is the sum of squared errors. It is a tensor; .item() extracts
    # the plain Python number for printing.
    residual = y_pred - y
    loss = residual.pow(2).sum()
    print(t, loss.item())

    # Backward pass: apply the chain rule by hand.
    # Gradient of the sum-of-squares loss with respect to the prediction.
    grad_y_pred = 2.0 * residual
    # Gradient for w2, shape (H, D_out); t() is the matrix transpose.
    grad_w2 = torch.mm(h_relu.t(), grad_y_pred)
    # Propagate the gradient back through the second layer.
    grad_h_relu = torch.mm(grad_y_pred, w2.t())
    # ReLU blocks the gradient wherever its input was negative.
    grad_h = grad_h_relu.clone()
    grad_h[h < 0] = 0
    # Gradient for w1, shape (D_in, H).
    grad_w1 = torch.mm(x.t(), grad_h)

    # Gradient-descent step, updating both weight matrices in place.
    w1.sub_(learning_rate * grad_w1)
    w2.sub_(learning_rate * grad_w2)

0 29514046.0
1 26733540.0
2 28648754.0
3 30371482.0
4 28165574.0
5 21231412.0
6 13011388.0
7 6909567.5
8 3561758.75
9 1967752.5
10 1233059.0
11 872658.375
12 673819.5625
13 548086.375
14 458876.75
15 390535.0625
16 335881.6875
17 290794.125
18 253107.75
19 221276.671875
20 194194.28125
21 171024.203125
22 151096.0
23 133891.328125
24 118987.921875
25 106027.921875
26 94710.03125
27 84808.5703125
28 76107.703125
29 68438.1015625
30 61667.578125
31 55667.8203125
32 50334.95703125
33 45588.859375
34 41354.71875
35 37571.2734375
36 34183.171875
37 31148.646484375
38 28423.19921875
39 25969.828125
40 23756.310546875
41 21757.6875
42 19949.89453125
43 18312.73046875
44 16828.34375
45 15479.5126953125
46 14254.2626953125
47 13139.888671875
48 12126.9072265625
49 11202.53515625
50 10357.4326171875
51 9584.15234375
52 8876.32421875
53 8228.6494140625
54 7634.48193359375
55 7088.458984375
56 6586.560546875
57 6124.7119140625
58 5699.05419921875
59 5306.427734375
60 4944.0380859375
61 4609.317382

## PyTorch Autograd

In [13]:
# Fresh weights for the autograd version. requires_grad=True makes autograd
# track operations on them so that loss.backward() fills in w1.grad / w2.grad.
# They are created on the same device as the input x, so the matrix products
# in the training loop never mix tensors from different devices.
w1 = torch.randn(D_in, H, device=x.device, requires_grad=True)
w2 = torch.randn(H, D_out, device=x.device, requires_grad=True)
# Step size for the gradient-descent updates
learning_rate = 1e-6

In [14]:
for t in range(500):
    # Forward pass: linear layer -> ReLU (clamp at 0) -> linear layer.
    hidden = torch.mm(x, w1)
    y_pred = torch.mm(torch.clamp(hidden, min=0), w2)
    # Sum-of-squares loss; .item() gives the plain Python number for printing.
    loss = torch.sum(torch.pow(y_pred - y, 2))
    print(t, loss.item())

    # Let autograd compute the gradients of the loss into w1.grad and w2.grad.
    loss.backward()

    # The weight update itself must not be tracked by autograd.
    with torch.no_grad():
        for weight in (w1, w2):
            weight.sub_(learning_rate * weight.grad)
            # backward() accumulates into .grad rather than overwriting it,
            # so reset it to zero here; otherwise the next iteration's
            # gradient would be added on top of this one.
            weight.grad.zero_()

0 32797328.0
1 28249210.0
2 24371408.0
3 19176796.0
4 13479629.0
5 8626683.0
6 5324640.0
7 3340290.75
8 2215490.75
9 1570646.25
10 1182403.625
11 932090.1875
12 758843.5
13 631386.1875
14 533060.75
15 454789.5
16 391070.34375
17 338327.90625
18 294226.375
19 257036.0625
20 225441.078125
21 198441.34375
22 175229.609375
23 155212.765625
24 137868.140625
25 122795.015625
26 109628.7734375
27 98087.4609375
28 87930.9765625
29 78982.21875
30 71076.765625
31 64069.18359375
32 57843.40234375
33 52308.83984375
34 47372.390625
35 42959.578125
36 39007.8203125
37 35464.5
38 32277.966796875
39 29408.609375
40 26821.45703125
41 24486.390625
42 22375.12109375
43 20462.5859375
44 18728.91015625
45 17156.7109375
46 15727.833984375
47 14427.65625
48 13243.8779296875
49 12165.27734375
50 11181.021484375
51 10282.220703125
52 9461.1884765625
53 8710.3369140625
54 8023.37548828125
55 7394.34912109375
56 6818.01123046875
57 6289.521484375
58 5804.44482421875
59 5359.0439453125
60 4950.0146484375
61 4574.

## PyTorch: Defining new autograd functions