## 通过案例学习pytorch

- 用numpy来构建3层（输入层、一个隐藏层、输出层）的全连接神经网络
- 用pytorch来构建全连接网络，介绍pytorch的反向传播，计算图等

### 先来试试numpy建立nn

In [7]:
import numpy as np

# Fit random data with a fully connected ReLU network (one hidden layer),
# using plain NumPy and hand-derived gradients.
# N: batch size, D_in: input width, H: hidden width, D_out: output width.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and random regression targets.
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# Randomly initialised weights of the two linear maps (no bias terms).
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU -> linear.
    h = np.dot(x, w1)
    h_relu = np.maximum(h, 0)      # N x H
    y_pred = np.dot(h_relu, w2)    # N x D_out

    # Sum-of-squares loss over every element of the batch.
    residual = y_pred - y
    loss = (residual ** 2).sum()
    print(t, loss)

    # Backward pass: apply the chain rule from the loss back to each weight.
    grad_y_pred = 2.0 * residual                 # derivative of the square, N x D_out
    grad_w2 = np.dot(h_relu.T, grad_y_pred)      # H x D_out
    grad_h_relu = np.dot(grad_y_pred, w2.T)
    # ReLU blocks the gradient wherever its input was negative.
    grad_h = np.where(h < 0, 0.0, grad_h_relu)
    grad_w1 = np.dot(x.T, grad_h)

    # Gradient-descent step, updating the weight arrays in place.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

(0, 32459329.22309231)
(1, 31567545.801274665)
(2, 31742065.71399916)
(3, 28706674.014970787)
(4, 21719214.150050543)
(5, 13684639.141976327)
(6, 7643123.722749564)
(7, 4183990.136878989)
(8, 2453487.30485572)
(9, 1605294.726497449)
(10, 1164074.4627873772)
(11, 908770.1215817849)
(12, 742528.9325136293)
(13, 622943.9010670949)
(14, 530884.340545196)
(15, 457024.2128255093)
(16, 396281.00733926287)
(17, 345521.5946256011)
(18, 302700.2041884255)
(19, 266284.9768751493)
(20, 235130.45397458796)
(21, 208343.82881388266)
(22, 185223.34295249498)
(23, 165199.97999555213)
(24, 147791.33450581518)
(25, 132596.36893013498)
(26, 119253.29475768711)
(27, 107511.98602936302)
(28, 97139.87326552426)
(29, 87959.84899038312)
(30, 79803.29544177017)
(31, 72545.08882929408)
(32, 66060.00644450552)
(33, 60257.93585447466)
(34, 55051.09454804825)
(35, 50369.795882571765)
(36, 46151.16157750768)
(37, 42342.25116492754)
(38, 38898.71634021314)
(39, 35778.69951558348)
(40, 32949.084530814434)
(41, 30379.1

(346, 0.010401815959674321)
(347, 0.009968552099366203)
(348, 0.009553725382352503)
(349, 0.009155859110815259)
(350, 0.008774881529851047)
(351, 0.008409617672637785)
(352, 0.0080597767061008)
(353, 0.007724560385560076)
(354, 0.007403221709924791)
(355, 0.007095267007599042)
(356, 0.006800262958153107)
(357, 0.006517555852287528)
(358, 0.006246620993715927)
(359, 0.005986907901221977)
(360, 0.005738096280770029)
(361, 0.005499686430695766)
(362, 0.0052711634783328045)
(363, 0.0050523000235793685)
(364, 0.004842376340998301)
(365, 0.004641259101810654)
(366, 0.00444852153573788)
(367, 0.004263790077415274)
(368, 0.004086779012470134)
(369, 0.0039171507001010825)
(370, 0.003754574894385474)
(371, 0.0035987763290967196)
(372, 0.003449397762449945)
(373, 0.003306306483363391)
(374, 0.0031691221207536583)
(375, 0.003037747449681001)
(376, 0.002911763012584526)
(377, 0.0027909875684287476)
(378, 0.0026752633271696236)
(379, 0.0025643290511049395)
(380, 0.0024580336301736057)
(381, 0.002356

### 使用pytorch

pytorch的tensor可以放在GPU上运算，其运算速度可比CPU快上数十倍。此处用pytorch的tensor构建和上述一样的神经网络来拟合随机数据；注意下面的代码使用的是 `torch.device("cpu")`，如需在GPU上运行，可改为 `torch.device("cuda")`。

In [6]:
import torch

# Same network as the NumPy version, expressed with torch tensors. The
# gradients are still written out by hand; autograd is not used here.
dtype = torch.float
device = torch.device("cpu")   # every tensor below is created on the CPU

# Batch size, input width, hidden width, output width.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and random regression targets.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Randomly initialised weights of the two linear maps (no bias terms).
w1 = torch.randn(D_in, H, device=device, dtype=dtype)
w2 = torch.randn(H, D_out, device=device, dtype=dtype)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: matrix product, clamp negatives to 0 (ReLU), matrix product.
    h = torch.mm(x, w1)
    h_relu = torch.clamp(h, min=0)
    y_pred = torch.mm(h_relu, w2)

    # Sum-of-squares loss, converted to a Python number for printing.
    residual = y_pred - y
    loss = residual.pow(2).sum().item()
    print(t, loss)

    # Backward pass, mirroring the forward pass in reverse order.
    grad_y_pred = 2.0 * residual
    grad_w2 = torch.mm(h_relu.t(), grad_y_pred)
    grad_h_relu = torch.mm(grad_y_pred, w2.t())
    # ReLU blocks the gradient wherever its input was negative.
    grad_h = grad_h_relu.masked_fill(h < 0, 0)
    grad_w1 = torch.mm(x.t(), grad_h)

    # Gradient-descent step, updating the weight tensors in place.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

(0, 28097968.0)
(1, 24741478.0)
(2, 28736492.0)
(3, 36814076.0)
(4, 43120656.0)
(5, 40412568.0)
(6, 27200838.0)
(7, 13396490.0)
(8, 5519128.5)
(9, 2457090.25)
(10, 1377107.75)
(11, 961606.5)
(12, 760143.0625)
(13, 635044.0)
(14, 543171.5)
(15, 470012.40625)
(16, 409504.34375)
(17, 358565.3125)
(18, 315249.53125)
(19, 278158.21875)
(20, 246180.53125)
(21, 218550.203125)
(22, 194621.921875)
(23, 173789.28125)
(24, 155568.28125)
(25, 139577.921875)
(26, 125488.8203125)
(27, 113041.484375)
(28, 102024.859375)
(29, 92243.3828125)
(30, 83548.9453125)
(31, 75802.359375)
(32, 68883.6875)
(33, 62687.421875)
(34, 57123.96484375)
(35, 52120.97265625)
(36, 47611.59765625)
(37, 43548.52734375)
(38, 39884.0)
(39, 36575.140625)
(40, 33582.75)
(41, 30869.38671875)
(42, 28403.2109375)
(43, 26158.427734375)
(44, 24113.78515625)
(45, 22248.771484375)
(46, 20545.072265625)
(47, 18988.34375)
(48, 17564.96484375)
(49, 16261.3603515625)
(50, 15066.3076171875)
(51, 13969.265625)
(52, 12961.6923828125)
(53, 12

### pytorch的autograd
1. 使用pytorch对tensor的自动求导功能，进行快捷的网络设计
2. 当tensor 的 requires_grad=True时，调用 loss.backward() 之后，该tensor的grad属性会储存loss对它的梯度；梯度在多次backward之间是累加的，因此每次更新权重后需要手动清零

In [9]:
# Same network again, but the gradients now come from autograd instead of
# hand-written formulas. N, D_in, H, D_out, device and dtype are not defined
# here; they are the values assigned by the earlier tensor cell.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)
# requires_grad=True asks autograd to track operations on the weights so that
# loss.backward() can fill in w1.grad and w2.grad.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

# Set the step size explicitly so this cell does not depend on whichever value
# another cell last assigned to learning_rate (a later cell rebinds it to 1e-4).
learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU (clamp at 0) -> linear.
    y_pred = x.mm(w1).clamp(min=0).mm(w2)
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())
    # Backpropagate: computes d(loss)/d(tensor) for every tensor created with
    # requires_grad=True and stores the result in w1.grad and w2.grad.
    loss.backward()
    # The weight update itself must not be recorded by autograd.
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad
        # backward() adds to .grad rather than overwriting it, so reset the
        # gradients before the next iteration.
        w1.grad.zero_()
        w2.grad.zero_()

(0, 34067984.0)
(1, 28800384.0)
(2, 24794114.0)
(3, 19579816.0)
(4, 13751772.0)
(5, 8779938.0)
(6, 5377044.5)
(7, 3347737.75)
(8, 2207181.5)
(9, 1562810.125)
(10, 1180464.25)
(11, 936940.6875)
(12, 769322.4375)
(13, 646166.25)
(14, 550905.375)
(15, 474636.34375)
(16, 412035.03125)
(17, 359914.4375)
(18, 315952.0625)
(19, 278564.9375)
(20, 246573.21875)
(21, 219001.671875)
(22, 195110.890625)
(23, 174335.734375)
(24, 156176.25)
(25, 140269.65625)
(26, 126293.75)
(27, 113958.2890625)
(28, 103033.4921875)
(29, 93331.0625)
(30, 84691.390625)
(31, 76985.109375)
(32, 70103.9375)
(33, 63945.3828125)
(34, 58423.79296875)
(35, 53453.140625)
(36, 48972.75)
(37, 44929.48046875)
(38, 41271.4921875)
(39, 37959.23046875)
(40, 34957.42578125)
(41, 32230.6484375)
(42, 29747.416015625)
(43, 27485.04296875)
(44, 25422.853515625)
(45, 23538.474609375)
(46, 21814.21484375)
(47, 20234.169921875)
(48, 18785.349609375)
(49, 17455.666015625)
(50, 16233.85546875)
(51, 15111.6328125)
(52, 14080.181640625)
(53, 

### torch.autograd.Function
- Function就像计算图中的边，实现Variable的计算，并输出新的Variable
- 使用 torch.autograd.Function 来自定义 **前向** 与 **后向** 计算函数
- Function一般只定义一个操作，因为其无法保存参数，因此适用于激活函数、pooling等操作
- Function需要定义两个静态方法（用 `@staticmethod` 装饰）：`forward` 与 `backward`（需要自己写求导公式），并通过 `apply` 来调用；按下面这种写法不需要定义 `__init__`

In [12]:
class MyReLU(torch.autograd.Function):
    """ReLU written as a custom autograd Function with explicit forward/backward.

    Call it through ``MyReLU.apply(tensor)``.
    """

    @staticmethod
    def forward(ctx, input):
        """Return ``input`` with every negative entry replaced by 0.

        ``ctx`` is the context object shared with ``backward``. The input is
        stashed on it because the gradient depends on the sign of the input.
        """
        ctx.save_for_backward(input)
        return torch.clamp(input, min=0)

    @staticmethod
    def backward(ctx, grad_output):
        """Return the gradient of the loss with respect to ``input``.

        ``grad_output`` is the gradient arriving from the operation that
        consumed the forward result. It is passed through where the saved
        input is >= 0 and set to 0 where the saved input is negative.
        """
        (saved_input,) = ctx.saved_tensors
        # masked_fill returns a new tensor, so grad_output is left untouched.
        return grad_output.masked_fill(saved_input < 0, 0)

# Train the same network, using the custom MyReLU Function as the activation.
# N, D_in, H, D_out, device and dtype are the values assigned by the earlier
# tensor cell; MyReLU is the class defined just above.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

# Set the step size explicitly so this cell does not depend on whichever value
# another cell last assigned to learning_rate (a later cell rebinds it to 1e-4).
learning_rate = 1e-6

# A custom Function is invoked through its apply method; bind it to a short
# alias once, outside the loop, since it does not change between iterations.
relu = MyReLU.apply

for t in range(500):
    # Forward pass: linear -> custom ReLU -> linear.
    y_pred = relu(x.mm(w1)).mm(w2)
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())
    # Autograd calls MyReLU.backward as part of this backward pass.
    loss.backward()
    # The weight update itself must not be recorded by autograd.
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad
        # backward() adds to .grad rather than overwriting it, so reset the
        # gradients before the next iteration.
        w1.grad.zero_()
        w2.grad.zero_()

(0, 29268360.0)
(1, 21931706.0)
(2, 19136682.0)
(3, 17621526.0)
(4, 15810654.0)
(5, 13380708.0)
(6, 10439852.0)
(7, 7640316.5)
(8, 5305949.0)
(9, 3615355.5)
(10, 2464091.25)
(11, 1717796.125)
(12, 1235907.875)
(13, 923455.8125)
(14, 715624.625)
(15, 573117.875)
(16, 471472.46875)
(17, 396230.75)
(18, 338506.875)
(19, 292852.9375)
(20, 255759.4375)
(21, 224969.5625)
(22, 199051.515625)
(23, 176971.59375)
(24, 157978.640625)
(25, 141513.96875)
(26, 127158.8984375)
(27, 114560.984375)
(28, 103464.875)
(29, 93647.71875)
(30, 84937.3515625)
(31, 77181.1796875)
(32, 70255.828125)
(33, 64058.109375)
(34, 58501.375)
(35, 53506.5546875)
(36, 49009.33203125)
(37, 44953.4609375)
(38, 41285.15234375)
(39, 37962.40625)
(40, 34947.07421875)
(41, 32209.86328125)
(42, 29725.82421875)
(43, 27461.865234375)
(44, 25397.3046875)
(45, 23511.30078125)
(46, 21781.8203125)
(47, 20198.279296875)
(48, 18745.689453125)
(49, 17412.1328125)
(50, 16186.6083984375)
(51, 15058.49609375)
(52, 14019.4091796875)
(53, 13

(372, 0.014785748906433582)
(373, 0.014259674586355686)
(374, 0.013745839707553387)
(375, 0.013255981728434563)
(376, 0.0127794798463583)
(377, 0.012331723235547543)
(378, 0.01188892312347889)
(379, 0.011467594653367996)
(380, 0.011060263961553574)
(381, 0.010671904310584068)
(382, 0.010292337276041508)
(383, 0.009932709857821465)
(384, 0.00958422850817442)
(385, 0.009248074144124985)
(386, 0.008921870030462742)
(387, 0.008605279959738255)
(388, 0.008305802941322327)
(389, 0.008021264337003231)
(390, 0.007738915737718344)
(391, 0.007472663652151823)
(392, 0.0072093745693564415)
(393, 0.006960285361856222)
(394, 0.006717656273394823)
(395, 0.0064885178580880165)
(396, 0.006270130164921284)
(397, 0.006052128970623016)
(398, 0.005841345991939306)
(399, 0.005640821065753698)
(400, 0.005445709452033043)
(401, 0.005259121302515268)
(402, 0.0050836713053286076)
(403, 0.004910196643322706)
(404, 0.004741100128740072)
(405, 0.004588237963616848)
(406, 0.004433200694620609)
(407, 0.0042849830351

### 使用nn模块

#### 模块
- torch.nn.Module：所有网络的基础类，可以定义神经网络的层，通过它，无需定义后向传播过程，其操作更为便捷。nn.Module的子类需要有 **__init__, forward**
- torch.nn.Sequential(*args)： 时序容器，可以包含Modules或卷积层、池化层等
- torch.nn.ModuleList(modules=None) Modules组成的列表
- torch.nn.ModuleDict(modules=None)

#### 卷积层
- Conv1d： 1维卷积层，参数有in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1，bias=True
- Conv2d：2维卷积层，输入维度为(N,C,H,W)
- Conv3d：3维卷积层，输入维度为(N,C,D,H,W)

#### 池化层
- torch.nn.MaxPool1d：1维下的最大化池化操作，相应还有2维、3维
- torch.nn.MaxUnpool1d：MaxPool1d的逆过程，但之前有信息丢失，故非完全逆向
- torch.nn.AvgPool1d：对信号提供1维平均池化，维度主要与输入信息的维度有关
- torch.nn.FractionalMaxPool2d：2维的分数最大化池化操作
- torch.nn.LPPool2d：2维的幂平均池化操作
- torch.nn.AdaptiveMaxPool1d：1维的自适应最大池化操作，还有2维、平均池化等变化

#### 非线性激活函数
- torch.nn.ELU
- torch.nn.Hardshrink
- torch.nn.Hardtanh
- torch.nn.LeakyReLU
- torch.nn.LogSigmoid
- torch.nn.PReLU
- torch.nn.ReLU
- torch.nn.ReLU6
- torch.nn.RReLU
- torch.nn.CELU
- torch.nn.Sigmoid
- torch.nn.Softplus
- torch.nn.Softshrink
- torch.nn.Softsign
- torch.nn.Tanh
- torch.nn.Tanhshrink
- torch.nn.Threshold

#### 标准化（Normalization）层
- torch.nn.BatchNorm1d：对小批量(mini-batch)的2d或3d输入进行批标准化(Batch Normalization)操作，在训练时，该层计算每次输入的均值与方差，并进行移动平均。移动平均默认的动量值为0.1。参数：num_features, eps=1e-05, momentum=0.1, affine=True

#### 循环网络层
- torch.nn.RNN：将一个多层的 Elman RNN，激活函数为tanh或者ReLU，用于输入序列。
- torch.nn.LSTM：将一个多层的 (LSTM) 应用到输入序列。
- torch.nn.GRU：将一个多层的GRU用于输入序列。

#### 线性网络层
- torch.nn.Linear：参数为in_features, out_features, bias=True

#### dropout层
- torch.nn.Dropout：随机将输入张量中部分元素设置为0。对于每次前向调用，被置0的元素都是随机的。参数：p=0.5, inplace=False

#### Sparse Layers
- torch.nn.Embedding：一个保存了固定字典和大小的简单查找表。这个模块常用来保存词嵌入和用下标检索它们。模块的输入是一个下标的列表，输出是对应的词嵌入。
- torch.nn.PairwiseDistance：按批计算向量v1, v2之间的距离

#### Loss函数
- torch.nn.L1Loss：L1损失
- torch.nn.MSELoss：平方误差
- torch.nn.CrossEntropyLoss：将LogSoftMax与NLLLoss集成到一起
- torch.nn.NLLLoss：负的log likelihood loss损失
- torch.nn.KLDivLoss：计算KL散度损失
- torch.nn.BCELoss：计算目标与output之间的二进制交叉熵
- torch.nn.MarginRankingLoss：排序损失。给定两个输入x1、x2以及取值为1或-1的标签y，计算 max(0, -y*(x1-x2) + margin)
- torch.nn.HingeEmbeddingLoss：合页损失
- torch.nn.MultiLabelMarginLoss：计算多标签的合页损失<br>
..等等

#### Vision Layers
- torch.nn.PixelShuffle
- torch.nn.UpsamplingNearest2d：对于多channel 输入 进行 2-D 最近邻上采样。
- torch.nn.UpsamplingBilinear2d：对于多channel 输入 进行 2-D bilinear 上采样。
- torch.nn.DataParallel：在模块级别上实现数据并行。

In [19]:
# Same regression task, now built from torch.nn building blocks.
# N, D_in, H and D_out are the sizes assigned by an earlier cell.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Linear -> ReLU -> Linear, chained in order by Sequential.
layers = [
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
]
model = torch.nn.Sequential(*layers)

# reduction='sum' adds up the squared errors instead of averaging them.
loss_fn = torch.nn.MSELoss(reduction='sum')

learning_rate = 1e-4
for t in range(500):
    # Forward pass and loss.
    y_pred = model(x)
    loss = loss_fn(y_pred, y)
    print(t, loss.item())

    # Clear the gradients left over from the previous iteration, then
    # backpropagate to fill in .grad on every model parameter.
    model.zero_grad()
    loss.backward()

    # Plain gradient-descent step on each parameter, not tracked by autograd.
    with torch.no_grad():
        for param in model.parameters():
            param.sub_(learning_rate * param.grad)

(0, 607.5924682617188)
(1, 559.6580200195312)
(2, 519.0784912109375)
(3, 483.9808044433594)
(4, 452.7442321777344)
(5, 424.6859130859375)
(6, 399.11767578125)
(7, 375.80706787109375)
(8, 354.3562927246094)
(9, 334.5290222167969)
(10, 316.1512756347656)
(11, 298.9085998535156)
(12, 282.65289306640625)
(13, 267.3879089355469)
(14, 252.93507385253906)
(15, 239.23861694335938)
(16, 226.3023681640625)
(17, 214.0489959716797)
(18, 202.41104125976562)
(19, 191.22979736328125)
(20, 180.5787353515625)
(21, 170.47311401367188)
(22, 160.8202362060547)
(23, 151.65887451171875)
(24, 142.95823669433594)
(25, 134.69967651367188)
(26, 126.85818481445312)
(27, 119.42988586425781)
(28, 112.39962005615234)
(29, 105.73837280273438)
(30, 99.45510864257812)
(31, 93.52886962890625)
(32, 87.93603515625)
(33, 82.65250396728516)
(34, 77.6696548461914)
(35, 72.98558044433594)
(36, 68.57122039794922)
(37, 64.4217529296875)
(38, 60.53288269042969)
(39, 56.87421417236328)
(40, 53.433067321777344)
(41, 50.1987571716

(427, 1.354108280793298e-05)
(428, 1.3112523447489366e-05)
(429, 1.2698705177172087e-05)
(430, 1.229605004482437e-05)
(431, 1.1906315194210038e-05)
(432, 1.1528824870765675e-05)
(433, 1.1164324860146735e-05)
(434, 1.0811748325068038e-05)
(435, 1.0470437700860202e-05)
(436, 1.0140810445591342e-05)
(437, 9.819507795327809e-06)
(438, 9.509845767752267e-06)
(439, 9.211528777086642e-06)
(440, 8.920235814002808e-06)
(441, 8.640018677397165e-06)
(442, 8.367852387891617e-06)
(443, 8.105225788312964e-06)
(444, 7.850066140235867e-06)
(445, 7.6030255513614975e-06)
(446, 7.364602424786426e-06)
(447, 7.132147402444389e-06)
(448, 6.908840987307485e-06)
(449, 6.691444468742702e-06)
(450, 6.481627224275144e-06)
(451, 6.279164608713472e-06)
(452, 6.080289949750295e-06)
(453, 5.89155843044864e-06)
(454, 5.706905540137086e-06)
(455, 5.527912435354665e-06)
(456, 5.3560261221718974e-06)
(457, 5.187777333048871e-06)
(458, 5.025291102356277e-06)
(459, 4.867763891525101e-06)
(460, 4.716578587249387e-06)
(461,

### optim包定义优化策略

torch.optim是一个实现了各种优化算法的库。大部分常用的方法得到支持，并且接口具备足够的通用性，使得未来能够集成更加复杂的方法。<br>
为了使用torch.optim，你需要构建一个optimizer对象。这个对象能够保持当前参数状态并基于计算得到的梯度进行参数更新。<br>

- torch.optim.Optimizer：重要参数是params（Variable或者dict的iterable），指定了什么参数应当被优化，通常是模型的参数，可以指定整个模型参数，也可以是具体一些层的参数。step()可以进行单次参数更新，通常在backward之后调用。
- torch.optim.Adadelta：实现Adadelta算法
- torch.optim.Adamax：实现Adamax算法（Adam的一种基于无穷范数的变种）
- torch.optim.RMSprop：实现RMSprop算法
- torch.optim.Rprop：实现弹性反向传播算法

等等，此处使用Adam算法来更新

In [21]:
# Same model as before, with the manual parameter update replaced by an
# optimizer from torch.optim. N, D_in, H and D_out come from an earlier cell.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

layers = [
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
]
model = torch.nn.Sequential(*layers)

# reduction='sum' adds up the squared errors instead of averaging them.
loss_fn = torch.nn.MSELoss(reduction='sum')

learning_rate = 1e-4
# The optimizer is handed the parameters it is responsible for updating.
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

for t in range(500):
    # Clear the gradients from the previous iteration. The forward pass does
    # not read them, so doing this first gives the same result.
    optimizer.zero_grad()

    y_pred = model(x)
    loss = loss_fn(y_pred, y)
    print(t, loss.item())

    # Compute fresh gradients, then let Adam apply one update step.
    loss.backward()
    optimizer.step()

(0, 693.2561645507812)
(1, 676.301025390625)
(2, 659.9093627929688)
(3, 643.9142456054688)
(4, 628.4351196289062)
(5, 613.4736938476562)
(6, 598.9464721679688)
(7, 584.7763671875)
(8, 570.9635620117188)
(9, 557.4812622070312)
(10, 544.3178100585938)
(11, 531.50830078125)
(12, 519.1080932617188)
(13, 507.12701416015625)
(14, 495.4449768066406)
(15, 484.12237548828125)
(16, 473.08941650390625)
(17, 462.396728515625)
(18, 451.9190368652344)
(19, 441.6812438964844)
(20, 431.6579284667969)
(21, 421.8376770019531)
(22, 412.24456787109375)
(23, 402.842041015625)
(24, 393.632080078125)
(25, 384.62481689453125)
(26, 375.8165588378906)
(27, 367.2746887207031)
(28, 358.95501708984375)
(29, 350.8160400390625)
(30, 342.8268127441406)
(31, 335.01080322265625)
(32, 327.3464050292969)
(33, 319.838134765625)
(34, 312.4763488769531)
(35, 305.266845703125)
(36, 298.21368408203125)
(37, 291.3105773925781)
(38, 284.52703857421875)
(39, 277.8581237792969)
(40, 271.3272705078125)
(41, 264.9181823730469)
(42,

(440, 4.3147642259100394e-07)
(441, 3.9890366565487057e-07)
(442, 3.6887391274831316e-07)
(443, 3.4093017120540026e-07)
(444, 3.1500709951615136e-07)
(445, 2.910160219471436e-07)
(446, 2.687118296762492e-07)
(447, 2.480229284174129e-07)
(448, 2.2878755601141165e-07)
(449, 2.1122014004504308e-07)
(450, 1.9496731340495899e-07)
(451, 1.797945543557944e-07)
(452, 1.659382888874461e-07)
(453, 1.5298103051009093e-07)
(454, 1.410066658991127e-07)
(455, 1.3005718813019485e-07)
(456, 1.1971846447522694e-07)
(457, 1.1027552915265915e-07)
(458, 1.0164236385890035e-07)
(459, 9.349645324618905e-08)
(460, 8.610346213799858e-08)
(461, 7.91731054050615e-08)
(462, 7.279030000972853e-08)
(463, 6.693814924574326e-08)
(464, 6.156706433557702e-08)
(465, 5.6618794985752174e-08)
(466, 5.1996266847709194e-08)
(467, 4.777919215825932e-08)
(468, 4.3911857261491605e-08)
(469, 4.0405197410109395e-08)
(470, 3.7043875522613234e-08)
(471, 3.405727255767488e-08)
(472, 3.12542454139475e-08)
(473, 2.868213222484428e-08

### 定义Modules

通过定义 nn.Module 的子类，可以构建更为复杂的网络，此处将上述网络定义为新类。

In [25]:
class TwoLayerNet(torch.nn.Module):
    """Linear -> ReLU -> Linear network packaged as a reusable module.

    Args:
        D_in: number of input features.
        H: number of hidden units.
        D_out: number of output features.
    """

    def __init__(self, D_in, H, D_out):
        super(TwoLayerNet, self).__init__()
        # Assigning sub-modules as attributes registers their parameters
        # with this module.
        self.linear1 = torch.nn.Linear(D_in, H)
        self.relu1 = torch.nn.ReLU()
        self.linear2 = torch.nn.Linear(H, D_out)

    def forward(self, x):
        """Return the network's prediction for the input batch ``x``."""
        hidden = self.relu1(self.linear1(x))
        return self.linear2(hidden)

# Train the TwoLayerNet module defined above with SGD.
# N, D_in, H and D_out are the sizes assigned by an earlier cell.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

model = TwoLayerNet(D_in, H, D_out)

# Summed (not averaged) squared error.
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)

for t in range(500):
    # Clear the gradients from the previous iteration. The forward pass does
    # not read them, so doing this first gives the same result.
    optimizer.zero_grad()

    y_pred = model(x)
    loss = criterion(y_pred, y)
    print(t, loss.item())

    # Compute fresh gradients, then apply one SGD update.
    loss.backward()
    optimizer.step()

(0, 688.0287475585938)
(1, 634.0516967773438)
(2, 588.2632446289062)
(3, 548.7922973632812)
(4, 514.0750732421875)
(5, 483.1347961425781)
(6, 454.9927673339844)
(7, 429.060791015625)
(8, 405.0951843261719)
(9, 382.8861389160156)
(10, 362.1589660644531)
(11, 342.839111328125)
(12, 324.59991455078125)
(13, 307.3724365234375)
(14, 291.01446533203125)
(15, 275.4429626464844)
(16, 260.6875)
(17, 246.61404418945312)
(18, 233.19895935058594)
(19, 220.40716552734375)
(20, 208.21075439453125)
(21, 196.6345977783203)
(22, 185.5901641845703)
(23, 175.03526306152344)
(24, 164.9927215576172)
(25, 155.4248504638672)
(26, 146.34072875976562)
(27, 137.6408233642578)
(28, 129.40782165527344)
(29, 121.62106323242188)
(30, 114.25202178955078)
(31, 107.28170776367188)
(32, 100.69052124023438)
(33, 94.49908447265625)
(34, 88.67680358886719)
(35, 83.2049789428711)
(36, 78.06139373779297)
(37, 73.21747589111328)
(38, 68.67789459228516)
(39, 64.41874694824219)
(40, 60.426109313964844)
(41, 56.69112014770508)


### 控制流与共享权重

在隐藏层使用全连接的ReLU网络：每次前向传播时随机选取0到3之间的一个整数，把同一个隐藏层（middle_linear）重复应用这么多次，每次之后接ReLU；由于重复使用的是同一层，这些隐藏层共享相同的权重。

In [27]:
import random
class DynamicNet(torch.nn.Module):
    """Fully connected ReLU network whose depth is chosen at random per call.

    Each forward pass applies the same middle layer between 0 and 3 times,
    so every repetition shares one set of weights.

    Args:
        D_in: number of input features.
        H: number of hidden units.
        D_out: number of output features.
    """

    def __init__(self, D_in, H, D_out):
        super(DynamicNet, self).__init__()
        self.input_linear = torch.nn.Linear(D_in, H)
        self.middle_linear = torch.nn.Linear(H, H)
        self.output_linear = torch.nn.Linear(H, D_out)

    def forward(self, x):
        """Return the prediction for ``x`` using a randomly chosen depth."""
        hidden = torch.clamp(self.input_linear(x), min=0)
        # Draw the number of repetitions once per call; ordinary Python
        # control flow decides how many times the shared layer is applied.
        remaining = random.randint(0, 3)
        while remaining > 0:
            hidden = torch.clamp(self.middle_linear(hidden), min=0)
            remaining -= 1
        return self.output_linear(hidden)

# Train the DynamicNet module defined above with SGD plus momentum.
# N, D_in, H and D_out are the sizes assigned by an earlier cell.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

model = DynamicNet(D_in, H, D_out)

# Summed (not averaged) squared error.
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4, momentum=0.9)

for t in range(500):
    # Clear the gradients from the previous iteration. The forward pass does
    # not read them, so doing this first gives the same result.
    optimizer.zero_grad()

    # Each call may run a different number of hidden layers.
    y_pred = model(x)
    loss = criterion(y_pred, y)
    print(t, loss.item())

    # Compute fresh gradients, then apply one SGD update.
    loss.backward()
    optimizer.step()

(0, 624.7078857421875)
(1, 609.1409912109375)
(2, 598.5230102539062)
(3, 511.28759765625)
(4, 597.7745361328125)
(5, 590.5433349609375)
(6, 586.991943359375)
(7, 575.1552124023438)
(8, 565.94091796875)
(9, 588.4013061523438)
(10, 539.8123779296875)
(11, 584.8789672851562)
(12, 510.9237365722656)
(13, 564.3084716796875)
(14, 317.03509521484375)
(15, 468.78619384765625)
(16, 551.4328002929688)
(17, 436.1291809082031)
(18, 570.3226318359375)
(19, 565.9698486328125)
(20, 559.8622436523438)
(21, 368.4220275878906)
(22, 198.57281494140625)
(23, 489.00775146484375)
(24, 527.20068359375)
(25, 145.640625)
(26, 504.0824890136719)
(27, 431.8714904785156)
(28, 474.06964111328125)
(29, 104.58412170410156)
(30, 372.68267822265625)
(31, 414.4101257324219)
(32, 212.83352661132812)
(33, 201.08192443847656)
(34, 187.3670196533203)
(35, 170.89808654785156)
(36, 313.154541015625)
(37, 135.0020751953125)
(38, 119.27690887451172)
(39, 104.86109924316406)
(40, 261.2212219238281)
(41, 225.95266723632812)
(42,

(452, 0.7259885668754578)
(453, 0.4380108416080475)
(454, 0.2810386121273041)
(455, 0.4059736728668213)
(456, 0.26417553424835205)
(457, 0.21655601263046265)
(458, 0.3044850528240204)
(459, 0.2996559739112854)
(460, 0.2187299132347107)
(461, 0.13700313866138458)
(462, 0.11894004791975021)
(463, 1.766518473625183)
(464, 0.08997821062803268)
(465, 0.5602712631225586)
(466, 0.3065895140171051)
(467, 0.4141919016838074)
(468, 0.3486825227737427)
(469, 0.1490900069475174)
(470, 0.34380286931991577)
(471, 0.1844639778137207)
(472, 0.8831683397293091)
(473, 0.17374736070632935)
(474, 0.3683702051639557)
(475, 0.11626024544239044)
(476, 1.2072663307189941)
(477, 0.2625638246536255)
(478, 0.28723645210266113)
(479, 0.6873461604118347)
(480, 0.30999109148979187)
(481, 0.7514258623123169)
(482, 0.32041749358177185)
(483, 0.16704538464546204)
(484, 0.9736873507499695)
(485, 1.0837223529815674)
(486, 0.18592804670333862)
(487, 2.076011896133423)
(488, 1.0855780839920044)
(489, 0.2603071630001068)
(