### 用numpy实现两层神经网络

一个全连接ReLU神经网络，一个隐藏层，没有bias。用来从x预测y，使用L2 Loss。
- $h = XW_1$
- $a = \max(0, h)$
- $\hat{y} = aW_2$

这一实现完全使用numpy来计算前向神经网络，loss，和反向传播。

- forward pass
- loss
- backward pass

numpy ndarray是一个普通的n维array。它不知道任何关于深度学习或者梯度（gradient）的知识，也不知道计算图（computation graph），只是一种用来进行数学运算的数据结构。


In [1]:
import torch
import numpy as np

In [6]:
# Problem size: 64 samples per batch, 1000 input features,
# 100 hidden units, 10 output values.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and random regression targets.
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# Randomly initialised weights of the two bias-free layers.
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6
for step in range(500):
    # ---- Forward pass ----
    h = x @ w1                   # (N, H) hidden pre-activation
    h_relu = np.maximum(h, 0)    # (N, H) ReLU
    y_pred = h_relu @ w2         # (N, D_out) prediction

    # Loss: squared error summed over all entries (a sum, not a mean).
    loss = ((y_pred - y) ** 2).sum()
    print(step, loss)

    # ---- Backward pass (chain rule, written out by hand) ----
    # d loss / d y_pred for loss = sum((y_pred - y)^2)
    grad_y_pred = 2.0 * (y_pred - y)

    # For C = A @ B and a scalar f(C):
    #   df/dB = A^T @ df/dC   and   df/dA = df/dC @ B^T
    # Applied to y_pred = h_relu @ w2:
    grad_w2 = h_relu.T @ grad_y_pred
    grad_h_relu = grad_y_pred @ w2.T

    # ReLU passes the upstream gradient through unchanged except where
    # its input was negative, where the gradient is zero.
    grad_h = np.where(h < 0, 0.0, grad_h_relu)

    # Same matrix rule applied to h = x @ w1.
    grad_w1 = x.T @ grad_h

    # ---- Gradient-descent step (in place) ----
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2
    
    

0 27714280.294375356
1 23873274.62406103
2 23406200.352810062
3 22963584.517222192
4 20720525.65836962
5 16413249.945964457
6 11414805.538624102
7 7178049.785304103
8 4306614.777427594
9 2599917.623948256
10 1647246.6618929557
11 1119466.160870905
12 816870.582663557
13 632556.99545114
14 511362.74185552547
15 425776.517518524
16 361359.69520255085
17 310657.8490437366
18 269499.1860509651
19 235323.97635969537
20 206567.9459344337
21 182135.26377790686
22 161193.55849814083
23 143154.55449373543
24 127521.18143150111
25 113910.70701649057
26 102013.38169289999
27 91583.83845642237
28 82444.13879118575
29 74382.1248553228
30 67263.84452277675
31 60940.60859016608
32 55319.11000611327
33 50316.97044888718
34 45850.22773690379
35 41856.995382755
36 38271.61727852036
37 35043.45938404368
38 32132.15520779055
39 29500.651578228273
40 27119.625461204967
41 24959.261787320767
42 22998.232966472515
43 21214.89459370781
44 19590.82708550986
45 18110.58071200543
46 16757.70567674047
47 15520.74

396 0.0025752326009491986
397 0.00247515373555108
398 0.0023789632043652933
399 0.0022865040734573556
400 0.0021977118220309092
401 0.0021123587520826817
402 0.0020302892347992983
403 0.001951457911901201
404 0.0018756813716594742
405 0.001802877357466298
406 0.001732895342225189
407 0.0016655931078166035
408 0.0016009215652863038
409 0.001538769234478714
410 0.0014790634919728024
411 0.0014217171496768962
412 0.0013665364070614913
413 0.001313493096533085
414 0.001262520028094783
415 0.0012135528081861496
416 0.001166493373746424
417 0.0011212338848733492
418 0.0010777430514373979
419 0.001035955399865482
420 0.000995780341560838
421 0.0009571995831132612
422 0.0009200780194099638
423 0.0008843982170291497
424 0.000850105552716124
425 0.0008171471744322476
426 0.0007855078125321062
427 0.0007550648112169347
428 0.0007257947961694914
429 0.0006976666817909746
430 0.0006706278073166595
431 0.0006446501314097373
432 0.0006196795967790744
433 0.0005956729335255919
434 0.000572606083198723

In [7]:
# Run the forward pass once more with the current weights.
h = x @ w1                   # (N, H) hidden pre-activation
h_relu = np.maximum(h, 0)    # (N, H) ReLU
y_pred = h_relu @ w2         # (N, D_out) prediction

In [8]:
# Element-wise residual between the predictions and the targets.
y_pred - y

array([[ 1.05110327e-04, -2.63352155e-05, -5.73828636e-05,
        -2.09138425e-06,  2.73136413e-05,  8.27495539e-06,
         4.35795897e-05,  3.09584225e-05,  3.35605745e-05,
        -6.71537479e-05],
       [ 2.79553468e-04,  1.48501111e-05, -1.30061001e-04,
        -3.39434622e-05,  1.85718459e-04,  1.13747091e-05,
         1.40793967e-04,  2.24341874e-04,  1.33794301e-04,
        -3.77309832e-04],
       [-4.01496772e-04,  4.72114847e-05,  7.38627938e-05,
         6.76257910e-05, -1.94492843e-04,  4.18434292e-06,
        -1.21872393e-04, -1.65429209e-04, -1.18891804e-04,
         4.81334229e-04],
       [-7.03520583e-05,  1.22584514e-07,  2.06905357e-05,
        -1.01227222e-05, -4.32009221e-05, -1.20740368e-05,
        -1.83714334e-05, -5.86883588e-05,  2.80216541e-05,
         9.59792918e-05],
       [-7.50743908e-04, -1.33502299e-04,  1.93381726e-04,
         9.75927236e-05, -3.58418547e-04,  6.84461315e-06,
        -4.24087004e-04, -3.07936080e-04, -2.22414711e-04,
         7.

### PyTorch : Tensors
这次使用PyTorch tensors来创建前向神经网络，计算损失，以及反向传播

一个PyTorch Tensor很像一个numpy的ndarray。但是它和numpy ndarray最大的区别是，PyTorch Tensor可以在CPU或者GPU上运算。如果想要在GPU上运算，就需要把Tensor换成cuda类型。

In [12]:
# Problem size: 64 samples per batch, 1000 input features,
# 100 hidden units, 10 output values.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and random regression targets.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Randomly initialised weights of the two bias-free layers.
w1 = torch.randn(D_in, H)
w2 = torch.randn(H, D_out)

learning_rate = 1e-6
for step in range(500):
    # ---- Forward pass ----
    h = x @ w1                   # (N, H); matrix product (mm) of 2-D tensors
    h_relu = h.clamp(min=0)      # (N, H); ReLU by clamping from below at 0
    y_pred = h_relu @ w2         # (N, D_out)

    # Loss: squared error summed over all entries (a sum, not a mean).
    # .item() turns the 0-dim tensor into a plain Python number.
    loss = (y_pred - y).pow(2).sum().item()
    print(step, loss)

    # ---- Backward pass (chain rule, written out by hand) ----
    # d loss / d y_pred for loss = sum((y_pred - y)^2)
    grad_y_pred = 2.0 * (y_pred - y)

    # For C = A @ B and a scalar f(C):
    #   df/dB = A^T @ df/dC   and   df/dA = df/dC @ B^T
    # Applied to y_pred = h_relu @ w2:
    grad_w2 = h_relu.t() @ grad_y_pred
    grad_h_relu = grad_y_pred @ w2.t()

    # ReLU passes the upstream gradient through unchanged except where
    # its input was negative, where the gradient is zero.
    grad_h = grad_h_relu.masked_fill(h < 0, 0.0)

    # Same matrix rule applied to h = x @ w1.
    grad_w1 = x.t() @ grad_h

    # ---- Gradient-descent step (in place) ----
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2
    
    

0 44340452.0
1 41517384.0
2 34854688.0
3 23330022.0
4 12675163.0
5 6313179.5
6 3379460.75
7 2101286.0
8 1494456.0
9 1155651.75
10 934958.625
11 775061.875
12 651944.5
13 554152.0
14 475121.3125
15 410102.8125
16 356096.0625
17 310871.03125
18 272670.96875
19 240189.203125
20 212413.703125
21 188534.953125
22 167881.015625
23 149933.03125
24 134290.6875
25 120634.53125
26 108643.140625
27 98070.25
28 88706.3828125
29 80384.8125
30 72968.8359375
31 66345.453125
32 60418.15625
33 55097.0
34 50318.5625
35 46013.7421875
36 42128.6015625
37 38617.50390625
38 35436.62109375
39 32553.0234375
40 29933.86328125
41 27551.34765625
42 25382.6328125
43 23405.67578125
44 21601.5546875
45 19952.685546875
46 18444.41015625
47 17063.822265625
48 15799.29296875
49 14639.7470703125
50 13574.44921875
51 12595.666015625
52 11695.4130859375
53 10866.390625
54 10102.529296875
55 9398.390625
56 8749.98046875
57 8151.15869140625
58 7597.86181640625
59 7086.71728515625
60 6613.2822265625
61 6174.8671875
62 5768.

445 0.0011547222966328263
446 0.0011248656082898378
447 0.0010947181144729257
448 0.001064821146428585
449 0.0010367126669734716
450 0.0010096692712977529
451 0.000984044512733817
452 0.0009574094437994063
453 0.000932955474127084
454 0.0009087243815883994
455 0.0008842700626701117
456 0.0008628013893030584
457 0.0008413600153289735
458 0.0008179211290553212
459 0.0007969554862938821
460 0.0007782942266203463
461 0.0007582566468045115
462 0.0007393472478725016
463 0.0007216527010314167
464 0.0007036342867650092
465 0.0006878654821775854
466 0.0006714672781527042
467 0.0006559599423781037
468 0.0006384645821526647
469 0.0006228298880159855
470 0.0006095882272347808
471 0.000594916520640254
472 0.0005809934227727354
473 0.0005678344168700278
474 0.000553650374058634
475 0.0005412573809735477
476 0.0005289450054988265
477 0.0005172871169634163
478 0.0005061918636783957
479 0.0004944927059113979
480 0.0004837377346120775
481 0.00047292199451476336
482 0.0004627836460713297
483 0.0004523911

In [16]:
# Let autograd compute the gradients of a tiny scalar expression.
x, w, b = (torch.tensor(value, requires_grad=True) for value in (1.0, 2.0, 3.0))

y = w * x + b  # y = 2 * 1 + 3

# Populates .grad on every leaf tensor that requires a gradient.
y.backward()

# Expected values: dy/dw = x, dy/dx = w, dy/db = 1.
for leaf in (w, x, b):
    print(leaf.grad)

tensor(1.)
tensor(2.)
tensor(1.)


In [26]:
# Problem size: 64 samples per batch, 1000 input features,
# 100 hidden units, 10 output values.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and random regression targets (plain data, no gradient).
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Weights are the only tensors autograd has to track.
w1 = torch.randn(D_in, H, requires_grad=True)
w2 = torch.randn(H, D_out, requires_grad=True)

learning_rate = 1e-6
for step in range(500):
    # Forward pass: linear -> ReLU (clamp at 0) -> linear, as one expression.
    y_pred = torch.clamp(x @ w1, min=0) @ w2

    # Loss: squared error summed over all entries (a sum, not a mean).
    loss = (y_pred - y).pow(2).sum()
    print(step, loss.item())

    # Backward pass: loss carries the computation graph that produced it,
    # so backward() fills in .grad for w1 and w2.
    loss.backward()

    # The update itself must not be recorded in the graph.
    with torch.no_grad():
        for weight in (w1, w2):
            weight -= learning_rate * weight.grad
            # Gradients accumulate across backward() calls; reset them.
            weight.grad.zero_()
    
    

0 43383524.0
1 45032768.0
2 45802312.0
3 37713832.0
4 23075608.0
5 11124077.0
6 5135250.5
7 2765812.75
8 1821799.875
9 1372280.0
10 1103315.0
11 913423.8125
12 767492.9375
13 650425.6875
14 554894.9375
15 476065.0625
16 410384.34375
17 355341.53125
18 309020.78125
19 269711.875
20 236199.703125
21 207522.59375
22 182861.40625
23 161654.046875
24 143282.765625
25 127298.890625
26 113343.375
27 101146.328125
28 90465.8515625
29 81059.1484375
30 72758.546875
31 65425.7734375
32 58934.6796875
33 53181.1640625
34 48061.46484375
35 43497.84765625
36 39423.1796875
37 35777.14453125
38 32506.30859375
39 29568.228515625
40 26926.517578125
41 24546.314453125
42 22400.083984375
43 20460.810546875
44 18707.6484375
45 17121.353515625
46 15683.935546875
47 14381.2119140625
48 13198.0380859375
49 12122.560546875
50 11143.373046875
51 10250.4814453125
52 9435.6591796875
53 8691.6962890625
54 8012.18359375
55 7390.111328125
56 6820.7919921875
57 6299.07568359375
58 5821.17431640625
59 5382.826171875
60

436 7.405706855934113e-05
437 7.258396362885833e-05
438 7.154374907258898e-05
439 7.044464291539043e-05
440 6.934581324458122e-05
441 6.843885057605803e-05
442 6.726589344907552e-05
443 6.615858001168817e-05
444 6.518443115055561e-05
445 6.407518230844289e-05
446 6.288644362939522e-05
447 6.230582221178338e-05
448 6.121361366240308e-05
449 6.016744009684771e-05
450 5.9232388593954965e-05
451 5.844907718710601e-05
452 5.7558721891837195e-05
453 5.665751814376563e-05
454 5.610697189695202e-05
455 5.515987504622899e-05
456 5.43284259038046e-05
457 5.360416616895236e-05
458 5.2867530030198395e-05
459 5.215457349549979e-05
460 5.123047230881639e-05
461 5.046385194873437e-05
462 4.996985080651939e-05
463 4.9356327508576214e-05
464 4.877954415860586e-05
465 4.8128793423529714e-05
466 4.749433355755173e-05
467 4.6986995585029945e-05
468 4.661127240979113e-05
469 4.5581808080896735e-05
470 4.514830652624369e-05
471 4.459774208953604e-05
472 4.4119093217886984e-05
473 4.350998642621562e-05
474 4

In [20]:
# Build the two-layer network once and run a single backward pass so the
# resulting gradients can be inspected.
N, D_in, H, D_out = 64, 1000, 100, 10
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# requires_grad defaults to False; enable it so autograd tracks the weights.
w1 = torch.randn(D_in, H, requires_grad=True)
w2 = torch.randn(H, D_out, requires_grad=True)

y_pred = x.mm(w1).clamp(min=0).mm(w2)

# Loss: squared error summed over all entries.
loss = (y_pred - y).pow(2).sum()

# Clear previously accumulated gradients so they do not add up.
# A leaf tensor's .grad stays None until the first backward() call, so the
# reset has to be skipped in that case instead of calling zero_() on None.
for param in (w1, w2):
    if param.grad is not None:
        param.grad.zero_()

loss.backward()

In [22]:
w1.grad # x and y are plain data and need no gradient

tensor([[  3093.3018,  -7588.9399, -13514.6123,  ..., -19400.3535,
          -2476.9756,  -4226.0620],
        [ -2718.6970,  -1226.5280,  -3534.9387,  ...,   6022.7979,
            413.4329,    481.7133],
        [-14556.5361,  -3409.4729,  -9587.1611,  ...,  13039.0361,
          -3153.3518,  -2631.1833],
        ...,
        [  -423.3299,  -4233.6787,  10940.6963,  ...,  -4680.3760,
           4415.4307,  -5975.8413],
        [  7672.3774,   8124.5806,  -7207.7900,  ..., -20495.6602,
           5734.2793,  -3332.6897],
        [ -4430.2803,  -4046.4294, -20743.7480,  ...,    373.9221,
           4640.7866,   -453.3057]])

### PyTorch:nn
这次使用PyTorch中nn这个库来构建网络。PyTorch autograd负责构建计算图，并自动计算gradient。

In [30]:
import torch.nn as nn

N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and random regression targets.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# The hand-managed w1 / w2 tensors are replaced by nn layers.
# Unlike the hand-written version, nn.Linear also carries a bias: x @ W^T + b.
model = nn.Sequential(
    nn.Linear(D_in, H),
    nn.ReLU(),
    nn.Linear(H, D_out),
)

# To train on a GPU instead: model = model.cuda()

# Squared error summed over all entries (reduction='sum'), not averaged.
loss_fn = nn.MSELoss(reduction="sum")

learning_rate = 1e-6
for step in range(500):
    # Forward pass: calling the module runs model.forward().
    y_pred = model(x)

    loss = loss_fn(y_pred, y)
    print(step, loss.item())

    # Drop gradients left over from the previous step before backward().
    model.zero_grad()
    loss.backward()

    # Plain gradient-descent step on every parameter, outside the graph.
    with torch.no_grad():
        for param in model.parameters():
            param.sub_(learning_rate * param.grad)
           
    
    

0 636.3040771484375
1 635.82275390625
2 635.3419189453125
3 634.8623046875
4 634.3840942382812
5 633.9065551757812
6 633.4299926757812
7 632.9537963867188
8 632.4784545898438
9 632.003662109375
10 631.5304565429688
11 631.058349609375
12 630.5867309570312
13 630.1156005859375
14 629.64501953125
15 629.1747436523438
16 628.705078125
17 628.23583984375
18 627.7671508789062
19 627.299072265625
20 626.8314208984375
21 626.3643188476562
22 625.8976440429688
23 625.4318237304688
24 624.9664306640625
25 624.5015258789062
26 624.037109375
27 623.5731201171875
28 623.109619140625
29 622.6466064453125
30 622.1839599609375
31 621.7218017578125
32 621.260009765625
33 620.798583984375
34 620.3377075195312
35 619.87744140625
36 619.417724609375
37 618.958984375
38 618.500732421875
39 618.04296875
40 617.5857543945312
41 617.13037109375
42 616.67578125
43 616.2223510742188
44 615.769287109375
45 615.3168334960938
46 614.8646240234375
47 614.4129028320312
48 613.961669921875
49 613.510986328125
50 613

409 480.6643371582031
410 480.35601806640625
411 480.0479431152344
412 479.739990234375
413 479.43218994140625
414 479.12469482421875
415 478.8179016113281
416 478.5111083984375
417 478.2047424316406
418 477.8986511230469
419 477.59271240234375
420 477.28704833984375
421 476.9815673828125
422 476.6762390136719
423 476.3712158203125
424 476.06683349609375
425 475.7628479003906
426 475.4593811035156
427 475.1566162109375
428 474.854736328125
429 474.5531311035156
430 474.25177001953125
431 473.95062255859375
432 473.6498107910156
433 473.3491516113281
434 473.04876708984375
435 472.74853515625
436 472.44854736328125
437 472.14874267578125
438 471.8492736816406
439 471.5499267578125
440 471.25103759765625
441 470.9521789550781
442 470.653564453125
443 470.3553466796875
444 470.0571594238281
445 469.75921630859375
446 469.4615173339844
447 469.1639709472656
448 468.86669921875
449 468.5697021484375
450 468.2730407714844
451 467.9769287109375
452 467.6809387207031
453 467.38519287109375
454

初始化得不好，经过尝试优化，可以修改学习率，也可以修改weight的初始分布

In [31]:
import torch.nn as nn

N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and random regression targets.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# The hand-managed w1 / w2 tensors are replaced by nn layers.
# Unlike the hand-written version, nn.Linear also carries a bias: x @ W^T + b.
model = nn.Sequential(
    nn.Linear(D_in, H),
    nn.ReLU(),
    nn.Linear(H, D_out),
)

# To train on a GPU instead: model = model.cuda()

# Squared error summed over all entries (reduction='sum'), not averaged.
loss_fn = nn.MSELoss(reduction="sum")

# Larger step size than the 1e-6 used by the hand-written versions.
learning_rate = 1e-3
for step in range(500):
    # Forward pass: calling the module runs model.forward().
    y_pred = model(x)

    loss = loss_fn(y_pred, y)
    print(step, loss.item())

    # Drop gradients left over from the previous step before backward().
    model.zero_grad()
    loss.backward()

    # Plain gradient-descent step on every parameter, outside the graph.
    with torch.no_grad():
        for param in model.parameters():
            param.sub_(learning_rate * param.grad)
           
    
    

0 653.2801513671875
1 329.4708557128906
2 199.94845581054688
3 116.31727600097656
4 64.49711608886719
5 35.6845588684082
6 19.946537017822266
7 11.540924072265625
8 7.128383636474609
9 5.155335426330566
10 5.2912702560424805
11 8.68885612487793
12 19.29680824279785
13 46.13177490234375
14 104.62676239013672
15 185.23240661621094
16 207.2353057861328
17 100.58937072753906
18 28.32972526550293
19 7.766585350036621
20 3.017179012298584
21 1.4837015867233276
22 0.8466441631317139
23 0.5249067544937134
24 0.34298229217529297
25 0.2321019321680069
26 0.16144752502441406
27 0.11450716853141785
28 0.08250533044338226
29 0.06026343256235123
30 0.04447316378355026
31 0.033090852200984955
32 0.024799402803182602
33 0.0186863262206316
34 0.014151763170957565
35 0.010759028606116772
36 0.008211780339479446
37 0.006287937518209219
38 0.0048291501589119434
39 0.0037175023462623358
40 0.002868962474167347
41 0.0022192371543496847
42 0.0017203883035108447
43 0.0013361969031393528
44 0.00103985937312245

338 1.509957848859489e-12
339 1.429470149021117e-12
340 1.468586428680918e-12
341 1.6004913323472425e-12
342 1.2260720867393604e-12
343 1.2635178276917958e-12
344 1.5207807886261082e-12
345 1.2199953504030137e-12
346 1.4506528573862698e-12
347 1.249445750854672e-12
348 1.3208833983188728e-12
349 1.1916430299116487e-12
350 1.216158142074153e-12
351 1.2847612513786077e-12
352 1.1751544832724892e-12
353 1.2387910792652224e-12
354 1.3761811785625855e-12
355 1.307324799630638e-12
356 1.3984029862898484e-12
357 1.2615367734822303e-12
358 1.317131191440335e-12
359 1.3648569037114089e-12
360 1.445025414430201e-12
361 1.3663765214763646e-12
362 1.6763976274855596e-12
363 1.5372641310948398e-12
364 1.5219534616958685e-12
365 1.6021844224597959e-12
366 1.6001946946328505e-12
367 1.4100898183475041e-12
368 1.438896636389575e-12
369 1.5560494516161927e-12
370 1.5827069472715283e-12
371 1.4598885251723703e-12
372 1.5586810271292495e-12
373 1.541515938334459e-12
374 1.6352968241692412e-12
375 1.30480

In [33]:
import torch.nn as nn

N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and random regression targets.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# The hand-managed w1 / w2 tensors are replaced by nn layers.
# Unlike the hand-written version, nn.Linear also carries a bias: x @ W^T + b.
model = nn.Sequential(
    nn.Linear(D_in, H),
    nn.ReLU(),
    nn.Linear(H, D_out),
)

# Re-draw both weight matrices from a normal distribution (normal_ with its
# default arguments) so they are initialised like the torch.randn weights of
# the hand-written versions. The biases keep their default initialisation.
for layer in (model[0], model[2]):
    nn.init.normal_(layer.weight)

# To train on a GPU instead: model = model.cuda()

# Squared error summed over all entries (reduction='sum'), not averaged.
loss_fn = nn.MSELoss(reduction="sum")

learning_rate = 1e-6
for step in range(500):
    # Forward pass: calling the module runs model.forward().
    y_pred = model(x)

    loss = loss_fn(y_pred, y)
    print(step, loss.item())

    # Drop gradients left over from the previous step before backward().
    model.zero_grad()
    loss.backward()

    # Plain gradient-descent step on every parameter, outside the graph.
    with torch.no_grad():
        for param in model.parameters():
            param.sub_(learning_rate * param.grad)
           
    
    

0 27708702.0
1 24887600.0
2 25340236.0
3 25742772.0
4 23597378.0
5 18670904.0
6 12564335.0
7 7568198.0
8 4351439.5
9 2572502.75
10 1633915.5
11 1134351.125
12 851627.75
13 677628.25
14 560494.75
15 475130.59375
16 409151.9375
17 356025.375
18 312101.0
19 275163.25
20 243724.65625
21 216742.859375
22 193421.28125
23 173168.453125
24 155491.375
25 140009.96875
26 126375.390625
27 114328.5078125
28 103659.6640625
29 94178.8203125
30 85727.3359375
31 78175.53125
32 71412.1953125
33 65341.2421875
34 59873.66015625
35 54947.16796875
36 50496.52734375
37 46467.68359375
38 42815.08984375
39 39494.5390625
40 36471.80859375
41 33719.1484375
42 31207.552734375
43 28910.73828125
44 26808.318359375
45 24881.75
46 23113.33984375
47 21489.505859375
48 19996.625
49 18621.755859375
50 17354.203125
51 16184.8701171875
52 15104.3408203125
53 14105.30859375
54 13180.9296875
55 12325.05859375
56 11532.1669921875
57 10796.904296875
58 10114.2255859375
59 9480.8515625
60 8892.0380859375
61 8344.4091796875
62

406 0.0028808568604290485
407 0.002780915005132556
408 0.002686124062165618
409 0.0025948514230549335
410 0.0025073911529034376
411 0.002419346012175083
412 0.0023383081424981356
413 0.0022627413272857666
414 0.0021887351758778095
415 0.0021137534640729427
416 0.002043834887444973
417 0.0019782036542892456
418 0.0019145046826452017
419 0.0018505352782085538
420 0.0017940937541425228
421 0.0017350974958389997
422 0.00168029242195189
423 0.0016261949203908443
424 0.0015753160696476698
425 0.0015246303519234061
426 0.0014777599135413766
427 0.0014340539928525686
428 0.001389635493978858
429 0.001348669989965856
430 0.0013054203009232879
431 0.0012666612165048718
432 0.00122979620937258
433 0.0011916152434423566
434 0.0011556051904335618
435 0.0011221925960853696
436 0.001089493278414011
437 0.001059117610566318
438 0.0010290341451764107
439 0.0009989581303671002
440 0.0009694222826510668
441 0.000942164333537221
442 0.0009168302640318871
443 0.0008900275570340455
444 0.0008654451812617481

### PyTorch:optim
这一次不再手动更新模型的weights，而是使用optim这个包来帮助更新参数。optim这个package提供了各种不同的模型优化方法，包括SGD+momentum，RMSProp，Adam等等。

In [34]:
import torch.nn as nn

N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and random regression targets.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# The hand-managed w1 / w2 tensors are replaced by nn layers.
# Unlike the hand-written version, nn.Linear also carries a bias: x @ W^T + b.
model = nn.Sequential(
    nn.Linear(D_in, H),
    nn.ReLU(),
    nn.Linear(H, D_out),
)

# The layers keep their default initialisation here. To re-draw the weights
# from a normal distribution instead:
#   nn.init.normal_(model[0].weight)
#   nn.init.normal_(model[2].weight)

# To train on a GPU instead: model = model.cuda()

# Squared error summed over all entries (reduction='sum'), not averaged.
loss_fn = nn.MSELoss(reduction="sum")

# Adam performs the parameter updates. A plain-SGD alternative would be:
#   learning_rate = 1e-6
#   optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

for step in range(500):
    # Forward pass: calling the module runs model.forward().
    y_pred = model(x)

    loss = loss_fn(y_pred, y)
    print(step, loss.item())

    # Clear old gradients, backpropagate, then let the optimizer
    # update the model parameters.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
  
           
    
    

0 601.9918212890625
1 586.1619873046875
2 570.7959594726562
3 555.9359741210938
4 541.5560302734375
5 527.5195922851562
6 513.9478759765625
7 500.760009765625
8 487.91400146484375
9 475.3715515136719
10 463.17547607421875
11 451.382080078125
12 439.9604797363281
13 428.8838195800781
14 418.17474365234375
15 407.7995300292969
16 397.735595703125
17 387.96380615234375
18 378.4560546875
19 369.2335510253906
20 360.23272705078125
21 351.57049560546875
22 343.2032775878906
23 335.0404968261719
24 327.08282470703125
25 319.32073974609375
26 311.69281005859375
27 304.21368408203125
28 296.87890625
29 289.68829345703125
30 282.68133544921875
31 275.8154296875
32 269.08209228515625
33 262.51495361328125
34 256.1270751953125
35 249.90283203125
36 243.82626342773438
37 237.86724853515625
38 232.04115295410156
39 226.3566436767578
40 220.81224060058594
41 215.44618225097656
42 210.20545959472656
43 205.0906524658203
44 200.0883331298828
45 195.19497680664062
46 190.401611328125
47 185.689056396484

399 1.987215546250809e-05
400 1.8725626432569697e-05
401 1.764424996508751e-05
402 1.662215800024569e-05
403 1.565839738759678e-05
404 1.4749213733011857e-05
405 1.388866439810954e-05
406 1.3077921721560415e-05
407 1.2315726962697227e-05
408 1.1594378520385362e-05
409 1.091469130187761e-05
410 1.027327925839927e-05
411 9.669935025158338e-06
412 9.099820090341382e-06
413 8.560909009247553e-06
414 8.055465514189564e-06
415 7.578225904580904e-06
416 7.128672677936265e-06
417 6.703733106405707e-06
418 6.3044894886843394e-06
419 5.927178790443577e-06
420 5.573602265940281e-06
421 5.2393734222278e-06
422 4.924682343698805e-06
423 4.628108854376478e-06
424 4.349361915956251e-06
425 4.086623903276632e-06
426 3.8393882277887315e-06
427 3.6061694572708802e-06
428 3.3879136935865972e-06
429 3.1814697649679147e-06
430 2.9874033771193353e-06
431 2.8053268579242285e-06
432 2.632699761306867e-06
433 2.4719533939787652e-06
434 2.3201819203677587e-06
435 2.1769933482573833e-06
436 2.042602091023582e-06

### PyTorch:自定义 nn Modules 
我们可以定义一个模型，这个模型继承自nn.Module类。如果需要定义一个比Sequential模型更加复杂的模型，就需要定义nn.Module模型。

In [36]:
import torch.nn as nn

N, D_in, H, D_out = 64, 1000, 100, 10

# 1. Define the data: random inputs and random regression targets.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# 2. Define the model
class TwoLayerNet(torch.nn.Module):
    """Two bias-free linear layers with a ReLU between them."""

    def __init__(self, D_in, H,  D_out):
        """Define the model architecture.

        Args:
            D_in: number of input features of the first linear layer.
            H: number of hidden units (output of the first layer, input
                of the second).
            D_out: number of output features of the second linear layer.
        """
        super().__init__()
        # Every layer that owns learnable parameters is created here and
        # stored as an attribute.
        self.linear1 = torch.nn.Linear(D_in, H, bias=False)
        self.linear2 = torch.nn.Linear(H, D_out, bias=False)

    def forward(self, x):
        """Return the prediction linear2(relu(linear1(x)))."""
        hidden = self.linear1(x)
        activated = hidden.clamp(min=0)  # ReLU by clamping from below at 0
        return self.linear2(activated)

# Instantiate the model.
model = TwoLayerNet(D_in, H, D_out)


# 3. Define the loss function; parameter updates are left to the optimizer.
# reduction='sum' adds up the squared errors instead of averaging them.
loss_fn = nn.MSELoss(reduction="sum")
learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# 4. Training loop
for step in range(500):
    # Forward pass: calling the module runs model.forward().
    y_pred = model(x)

    loss = loss_fn(y_pred, y)
    print(step, loss.item())

    # Clear old gradients, backpropagate, then let the optimizer
    # update the model parameters.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
  
           
    
    

0 660.399658203125
1 643.1188354492188
2 626.3074951171875
3 609.9195556640625
4 593.9554443359375
5 578.5216064453125
6 563.569091796875
7 549.1375122070312
8 535.1300659179688
9 521.5925903320312
10 508.5079650878906
11 495.83197021484375
12 483.507080078125
13 471.51397705078125
14 459.89495849609375
15 448.5950927734375
16 437.57049560546875
17 426.8580017089844
18 416.4840393066406
19 406.3536376953125
20 396.48455810546875
21 386.8900451660156
22 377.5679931640625
23 368.58819580078125
24 359.8080749511719
25 351.255615234375
26 342.97784423828125
27 334.9315490722656
28 327.0703125
29 319.395751953125
30 311.9219665527344
31 304.6548156738281
32 297.5523681640625
33 290.6865234375
34 284.0282897949219
35 277.56781005859375
36 271.26934814453125
37 265.1204528808594
38 259.1248779296875
39 253.26185607910156
40 247.53334045410156
41 241.92942810058594
42 236.44509887695312
43 231.08233642578125
44 225.83352661132812
45 220.70526123046875
46 215.687255859375
47 210.7628936767578
4

375 0.00032172934152185917
376 0.0003047820064239204
377 0.0002886963775381446
378 0.0002734338631853461
379 0.0002589457726571709
380 0.00024519750149920583
381 0.0002321633801329881
382 0.00021978707809466869
383 0.00020805919484701008
384 0.00019692955538630486
385 0.000186375473276712
386 0.00017636900884099305
387 0.00016688044706825167
388 0.00015788899327162653
389 0.00014936244406271726
390 0.0001412812853232026
391 0.00013362579920794815
392 0.00012636568862944841
393 0.00011949287727475166
394 0.00011297731543891132
395 0.00010680414561647922
396 0.00010095945617649704
397 9.542109910398722e-05
398 9.017507545650005e-05
399 8.521922427462414e-05
400 8.051305485423654e-05
401 7.60631519369781e-05
402 7.185195136116818e-05
403 6.786543235648423e-05
404 6.409161142073572e-05
405 6.052327807992697e-05
406 5.7142235164064914e-05
407 5.394559775595553e-05
408 5.092329593026079e-05
409 4.806375000043772e-05
410 4.5357071940088645e-05
411 4.280217399355024e-05
412 4.038283077534288e-