# 神经网络的实现

首先利用numpy来实现一个原生的全连接网络，采用ReLU激活函数，没有bias。一个隐藏层，使用L2 Loss。
- $h = W_1X$
- $a = max(0, h)$
- $y_{hat} = W_2a$

整体包含3个过程，前向传播，loss，反向传播

In [1]:
import numpy as np
import torch
import torch.nn as nn

In [2]:
# 定义相关的参量
N = 64              # 样本的数量
D_in = 1000         # 每个样本的特征维数
H = 100             # 隐藏层神经元的数量
D_out = 10          # 输出神经元的个数
Iter = 500          # 迭代次数

# 随机创建一些训练数据
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6         # 学习率如果设置的太大，可能会产生nan
for i in range(Iter):
    # Forward
    h = x.dot(w1)    # h.shape = (N, H)
    h_relu = np.maximum(h, 0)  # shape = (N, H)
    y_pred = h_relu.dot(w2)    # y_pred.shape = (N, D_out)
    
    # compute loss
    loss = np.square(y_pred - y).sum()
    print(i, loss)
    
    # Backward
    # compute gradient
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.T.dot(grad_y_pred)
    grad_h_relu = grad_y_pred.dot(w2.T)
    grad_h = grad_h_relu.copy()
    grad_h[h<0] = 0
    grad_w1 = x.T.dot(grad_h)
    
    # update w1 and w2
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 46301206.34740269
1 50660745.92380345
2 52802248.54713954
3 41284574.46260713
4 21769465.78882839
5 8747742.095558362
6 3767485.854665929
7 2188426.8572103716
8 1586556.5451915266
9 1264854.7156105381
10 1044830.1984777033
11 877054.4854607668
12 743504.7847451086
13 635226.3799526416
14 546451.6843631574
15 472930.2464101938
16 411484.8241874729
17 359811.6751295226
18 316029.2188065357
19 278689.51593326055
20 246686.0296353033
21 219131.1745844049
22 195291.3607695868
23 174607.95356842875
24 156564.38373174693
25 140754.7621930501
26 126847.21389565626
27 114574.7995024859
28 103724.85735800347
29 94091.18716858578
30 85510.2889145597
31 77849.37153198457
32 71004.98605469274
33 64861.33780094885
34 59334.04471724319
35 54354.92650341091
36 49860.65489413153
37 45795.9987846331
38 42111.53841871215
39 38767.65128144702
40 35730.1379062406
41 32967.88324547911
42 30448.98474518735
43 28147.95565041511
44 26042.91295412345
45 24118.260917996868
46 22355.951912227654
47 20738.970360

370 0.001645090909411444
371 0.0015760961170311438
372 0.0015100189584570552
373 0.0014467282833127695
374 0.0013861156383803194
375 0.0013280639761139557
376 0.0012724619802030603
377 0.0012191986758692463
378 0.001168200334230879
379 0.0011193266151587088
380 0.001072524774339093
381 0.0010276812367155897
382 0.0009847402874453952
383 0.000943599101189186
384 0.0009041938233700839
385 0.0008664404199107261
386 0.000830275272785931
387 0.000795633550251695
388 0.0007624504194727625
389 0.0007306527750777919
390 0.0007001903140731315
391 0.0006710054646005381
392 0.0006430445236101825
393 0.0006162625856148873
394 0.0005905992843886506
395 0.0005660142795590948
396 0.0005424535793901847
397 0.0005198894774780893
398 0.0004982613103611351
399 0.00047754309818580327
400 0.0004576857747352049
401 0.00043866147302239677
402 0.000420433289474059
403 0.00040296524029435335
404 0.0003862299630551504
405 0.0003701922651295374
406 0.000354828692268727
407 0.0003401007942381143
408 0.00032598965

### PyTorch实现MLP

In [3]:
# Version one
N = 64
D_in = 1000
H = 100
D_out = 10
Iter = 500

# 随机创建一些训练数据
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

w1 = torch.randn(D_in, H, requires_grad=True)
w2 = torch.randn(H, D_out, requires_grad=True)

learning_rate = 1e-6
for i in range(Iter):
    # Forward
    y_pred = x.mm(w1).clamp(min=0).mm(w2)
    
    # Loss
    loss = (y_pred - y).pow(2).sum()
    print(i, loss.item())
    # Backward
    loss.backward()
    
    #update w1 and w2
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad
        # 梯度清零
        w1.grad.zero_()
        w2.grad.zero_()

0 29714992.0
1 23728528.0
2 20875798.0
3 18210188.0
4 14938837.0
5 11240504.0
6 7884606.5
7 5262013.5
8 3470574.0
9 2319167.0
10 1606633.0
11 1162706.0
12 880012.4375
13 692327.0
14 561947.6875
15 467064.0
16 395165.625
17 338730.34375
18 293153.09375
19 255752.90625
20 224556.046875
21 198227.375
22 175809.78125
23 156553.65625
24 139898.25
25 125431.9453125
26 112801.703125
27 101750.296875
28 92011.3359375
29 83397.703125
30 75755.4140625
31 68947.203125
32 62870.375
33 57431.69140625
34 52548.5625
35 48154.8046875
36 44193.00390625
37 40613.30859375
38 37373.1875
39 34433.66015625
40 31761.33984375
41 29330.515625
42 27121.392578125
43 25104.533203125
44 23261.328125
45 21575.001953125
46 20030.203125
47 18611.46875
48 17307.189453125
49 16107.0615234375
50 15000.912109375
51 13980.6904296875
52 13038.455078125
53 12168.326171875
54 11363.55859375
55 10618.2763671875
56 9928.900390625
57 9290.521484375
58 8697.4248046875
59 8146.568359375
60 7634.453125
61 7158.158203125
62 6714.98

401 0.006181095726788044
402 0.00598016194999218
403 0.00578291155397892
404 0.005594519432634115
405 0.005409725476056337
406 0.005235558841377497
407 0.005065726116299629
408 0.004901583772152662
409 0.004742864519357681
410 0.004590772558003664
411 0.004446687176823616
412 0.004302360117435455
413 0.0041665141470730305
414 0.004032018594443798
415 0.003908842336386442
416 0.0037822735030204058
417 0.0036659324541687965
418 0.003549690591171384
419 0.003440089989453554
420 0.0033327308483421803
421 0.003232377115637064
422 0.0031310117337852716
423 0.0030308379791677
424 0.0029407269321382046
425 0.0028534953016787767
426 0.0027675877790898085
427 0.002682950347661972
428 0.0026000954676419497
429 0.0025246490258723497
430 0.002448122948408127
431 0.0023753587156534195
432 0.0023055714555084705
433 0.002237270586192608
434 0.0021716130431741476
435 0.0021083271130919456
436 0.0020448737777769566
437 0.0019857960287481546
438 0.0019294330850243568
439 0.0018749467562884092
440 0.00182

In [4]:
# Version two, 使用nn模块

N = 64
D_in = 1000
H = 100
D_out = 10
Iter = 500

x = torch.randn(N, D_in)
y = torch.randn(N, D_out)
# 定义模型结构
model = nn.Sequential(
    nn.Linear(D_in, H, bias=False),  # w_1 * x
    nn.ReLU(),
    nn.Linear(H, D_out, bias=False)
)
# 使用高斯分布初始化模型的权重参数
nn.init.normal_(model[0].weight)    
nn.init.normal_(model[2].weight)

# 如果有GPU，可以在GPU上进行训练
# model = model.cuda()
# 定义Loss
loss_fn = nn.MSELoss(reduction="sum")
learning_rate = 1e-6

for i in range(Iter):
    # Forward
    y_pred = model(x)
    
    # compute loss
    loss = loss_fn(y_pred, y)
    print(i, loss.item())
    
    # Backward
    loss.backward()
    
    # update w1 and w2
    with torch.no_grad():
        for param in model.parameters():  # param(tensor, grad)
            param -= learning_rate * param.grad
            
    model.zero_grad()

0 34793928.0
1 32469724.0
2 34964708.0
3 35492360.0
4 29721394.0
5 19436932.0
6 10312277.0
7 5012261.5
8 2583600.75
9 1538248.625
10 1061477.375
11 811602.9375
12 657663.5
13 549580.5
14 467226.3125
15 401514.125
16 347662.03125
17 302888.40625
18 265326.625
19 233464.4375
20 206261.03125
21 182892.546875
22 162804.84375
23 145400.71875
24 130230.953125
25 116981.5625
26 105363.78125
27 95114.359375
28 86046.6875
29 77993.78125
30 70826.4296875
31 64431.27734375
32 58714.7265625
33 53591.515625
34 48983.30078125
35 44833.21484375
36 41091.32421875
37 37707.046875
38 34639.14453125
39 31860.51171875
40 29340.28125
41 27045.177734375
42 24953.18359375
43 23044.23828125
44 21297.81640625
45 19706.927734375
46 18250.9375
47 16915.216796875
48 15688.8037109375
49 14561.685546875
50 13524.58984375
51 12569.029296875
52 11687.4248046875
53 10873.73828125
54 10122.806640625
55 9428.3681640625
56 8786.1318359375
57 8191.51025390625
58 7640.62548828125
59 7130.4599609375
60 6657.5068359375
61 62

417 0.0003993602003902197
418 0.0003901775344274938
419 0.00038024759851396084
420 0.0003706316347233951
421 0.00036176221328787506
422 0.0003532076079864055
423 0.00034418440191075206
424 0.0003359907423146069
425 0.000327522458974272
426 0.000319427257636562
427 0.00031173659954220057
428 0.00030461899586953223
429 0.00029715412529185414
430 0.0002899194078054279
431 0.0002834092010743916
432 0.00027726913685910404
433 0.00027159310411661863
434 0.000265500188106671
435 0.00025990596623159945
436 0.00025403618928976357
437 0.00024834260693751276
438 0.000242074194829911
439 0.00023663483443669975
440 0.00023144404985941947
441 0.00022670543694403023
442 0.00022177926439326257
443 0.00021739484509453177
444 0.00021219608606770635
445 0.0002076402015518397
446 0.00020328929531387985
447 0.00019964590319432318
448 0.00019561007502488792
449 0.00019204408454243094
450 0.0001877832692116499
451 0.00018393289064988494
452 0.00017995810776483268
453 0.00017679849406704307
454 0.000173148117

In [5]:
model[0].weight

Parameter containing:
tensor([[ 0.4101, -0.6685,  0.2559,  ...,  0.7734, -1.1114,  0.6637],
        [-0.0583, -2.0673,  1.5641,  ..., -0.3305,  0.0815,  1.6297],
        [ 0.4037, -0.1332, -0.5856,  ...,  0.4857, -1.0795,  0.1113],
        ...,
        [-0.4176,  0.4822,  0.2580,  ..., -2.2000, -0.7753, -0.1594],
        [-0.7678, -0.1887,  0.6217,  ..., -0.9976,  0.5586, -0.8643],
        [-0.5049,  0.8828, -0.1795,  ...,  0.7155, -1.0759, -0.1418]],
       requires_grad=True)

In [6]:
# Version three, 使用optim模块

N = 64
D_in = 1000
H = 100
D_out = 10
Iter = 500

x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

model = nn.Sequential(
    nn.Linear(D_in, H, bias=False),
    nn.ReLU(),
    nn.Linear(H, D_out, bias=False)
)

nn.init.normal_(model[0].weight)
nn.init.normal_(model[2].weight)

# model = model.cuda()

loss_fn = nn.MSELoss(reduction="sum")

learning_rate = 1e-6
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

for i in range(Iter):
    # Forward
    y_pred = model(x)
    
    # compute loss
    loss = loss_fn(y_pred, y)
    print(i, loss.item())
    
    # Backward
    optimizer.zero_grad()
    loss.backward()
    
    # update model parameters
    optimizer.step()

0 31232964.0
1 27242166.0
2 33871872.0
3 46833196.0
4 57171588.0
5 50000944.0
6 27443558.0
7 9723530.0
8 3224443.75
9 1528157.625
10 1038035.0
11 824475.875
12 688402.5
13 585196.3125
14 502125.03125
15 433812.75
16 377039.75
17 329412.0625
18 289164.90625
19 254976.375
20 225757.1875
21 200652.9375
22 178966.390625
23 160151.265625
24 143767.546875
25 129423.546875
26 116823.0
27 105712.3125
28 95891.0546875
29 87192.4453125
30 79447.84375
31 72532.5546875
32 66339.6796875
33 60781.73828125
34 55781.7890625
35 51272.171875
36 47195.1484375
37 43504.46484375
38 40153.53125
39 37109.5703125
40 34337.8203125
41 31802.0625
42 29486.46484375
43 27367.65625
44 25427.2890625
45 23646.27734375
46 22010.99609375
47 20508.6796875
48 19127.1171875
49 17853.19921875
50 16676.8828125
51 15589.19140625
52 14582.52734375
53 13650.267578125
54 12785.3466796875
55 11982.9189453125
56 11237.94921875
57 10545.1650390625
58 9900.5751953125
59 9300.490234375
60 8741.345703125
61 8219.7001953125
62 7732.96

426 0.0019782311283051968
427 0.001918699825182557
428 0.0018583389464765787
429 0.001800093799829483
430 0.001746550784446299
431 0.00169252825435251
432 0.0016423955094069242
433 0.001591627486050129
434 0.0015442075673490763
435 0.0014998877886682749
436 0.0014574842061847448
437 0.0014131511561572552
438 0.001371210440993309
439 0.0013336802367120981
440 0.0012962276814505458
441 0.0012588540557771921
442 0.0012229669373482466
443 0.0011881941463798285
444 0.0011527680326253176
445 0.001122436486184597
446 0.0010916702449321747
447 0.001061107381246984
448 0.0010334659600630403
449 0.0010047612013295293
450 0.0009775960352271795
451 0.000955044524744153
452 0.0009274031035602093
453 0.000903188600204885
454 0.0008789582643657923
455 0.0008578784763813019
456 0.0008350412826985121
457 0.0008141141734085977
458 0.0007934772875159979
459 0.0007731662481091917
460 0.0007533160387538373
461 0.0007349546649493277
462 0.0007160467212088406
463 0.0006972590927034616
464 0.00068060617195442

In [8]:
# Version four, 自定义模型(继承nn.Module)

N = 64
D_in = 1000
H = 100
D_out = 10
Iter = 500

x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

class TwoLayerMLP(nn.Module):
    def __init__(self, D_in, H, D_out):
        super(TwoLayerMLP, self).__init__()
        
        # 定义模型结构
        self.linear1 = nn.Linear(D_in, H, bias=False)
        self.linear2 = nn.Linear(H, D_out, bias=False)
    
    def forward(self, x):
        y_pred = self.linear2(self.linear1(x).clamp(min=0))
        return y_pred

model = TwoLayerMLP(D_in, H, D_out)
loss_fn = nn.MSELoss(reduction="sum")
learning_rate = 1e-6
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

for i in range(Iter):
    # Forward
    y_pred = model(x)
    
    # Compute loss
    loss = loss_fn(y_pred, y)
    print(i, loss.item())
    
    # Backward
    optimizer.zero_grad()
    loss.backward()
    
    # update model parameters
    optimizer.step()


0 722.8117065429688
1 722.2970581054688
2 721.7830200195312
3 721.269775390625
4 720.7575073242188
5 720.245849609375
6 719.7344970703125
7 719.22412109375
8 718.714111328125
9 718.2044677734375
10 717.6958618164062
11 717.1892700195312
12 716.6828002929688
13 716.1768798828125
14 715.671630859375
15 715.1669311523438
16 714.662841796875
17 714.1594848632812
18 713.6566162109375
19 713.1544189453125
20 712.6526489257812
21 712.1513061523438
22 711.6507568359375
23 711.1510620117188
24 710.6527099609375
25 710.1544799804688
26 709.6569213867188
27 709.1598510742188
28 708.6632080078125
29 708.167236328125
30 707.671875
31 707.177001953125
32 706.6829223632812
33 706.1893310546875
34 705.696044921875
35 705.2035522460938
36 704.7117309570312
37 704.220458984375
38 703.7298583984375
39 703.240478515625
40 702.7521362304688
41 702.2640991210938
42 701.77734375
43 701.2909545898438
44 700.80517578125
45 700.3198852539062
46 699.835205078125
47 699.3508911132812
48 698.8671875
49 698.3842773

387 562.7058715820312
388 562.3734130859375
389 562.041259765625
390 561.70947265625
391 561.3778076171875
392 561.0469970703125
393 560.7164306640625
394 560.3861694335938
395 560.0562744140625
396 559.7269287109375
397 559.3978881835938
398 559.0690307617188
399 558.740478515625
400 558.4122314453125
401 558.0839233398438
402 557.7562255859375
403 557.4287719726562
404 557.1014404296875
405 556.7744140625
406 556.44775390625
407 556.121337890625
408 555.7950439453125
409 555.4689331054688
410 555.1430053710938
411 554.8173217773438
412 554.4920043945312
413 554.1668701171875
414 553.841796875
415 553.5170288085938
416 553.1923217773438
417 552.8679809570312
418 552.5438842773438
419 552.2196655273438
420 551.8956909179688
421 551.5720825195312
422 551.2486572265625
423 550.9252319335938
424 550.6021118164062
425 550.2794189453125
426 549.9571533203125
427 549.6348266601562
428 549.312744140625
429 548.9911499023438
430 548.6700439453125
431 548.3500366210938
432 548.0311279296875
433