In [1]:
import torch

## 定义

In [3]:
# 1. 构建一个未初始化的 5x3 的矩阵
x = torch.empty(5, 3)
x

tensor([[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]])

In [5]:
# 2. 构建一个随机初始化的 5x3 的矩阵
x = torch.rand(5, 3) 
x  # 0-1 之间

tensor([[0.9660, 0.4438, 0.1004],
        [0.2004, 0.7474, 0.7545],
        [0.9972, 0.0395, 0.8132],
        [0.7423, 0.4470, 0.4382],
        [0.8610, 0.8010, 0.1924]])

In [6]:
# 3. 构建一个全部为0，类型为 long 的 5x3 的矩阵
x = torch.zeros(5, 3, dtype=torch.long)
x

tensor([[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]])

In [7]:
x.dtype

torch.int64

In [8]:
# 方式二
x = torch.zeros(5, 3).long()
x.dtype

torch.int64

In [10]:
# 4. 从数据直接构建 tensor
x = torch.tensor([5.5, 3])
x

tensor([5.5000, 3.0000])

In [12]:
# 5. 从一个已有的tensor构建另一个tensor。。
# 这些方法会重用原来的tensor的特征，例如，数据类型，除非提供新的数据

x_new = x.new_ones(5, 3)
x_new

tensor([[1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.]])

从 x_new 的结果可以看出，x_new 和之前的 x 的数据类型相同

In [14]:
# Build the random tensor from x_new (the 5x3 tensor created above), not from x:
# at this point x is still the 2-element tensor([5.5, 3]), so randn_like(x, ...)
# would not reproduce the 5x3 result shown below on a fresh kernel.
# dtype=torch.double overrides the dtype that would otherwise be taken from x_new.
x = torch.randn_like(x_new, dtype=torch.double)
x

tensor([[ 4.4445e-01,  1.4479e-01, -6.2074e-01],
        [ 9.5716e-01,  1.2673e+00,  9.0667e-01],
        [-8.2146e-04,  2.9741e-01,  1.2585e+00],
        [-1.0800e+00, -6.0612e-01,  1.5349e+00],
        [-1.1568e+00, -6.0676e-01,  3.0532e+00]], dtype=torch.float64)

In [15]:
# 6. 得到 tensor 的形状
x.shape

torch.Size([5, 3])

In [18]:
x.size()

torch.Size([5, 3])

注意：x.size() 和 x.shape 返回的都是 torch.Size 对象，它是 tuple（元组）的子类，因此支持所有元组操作

## 运算

### 1. 加法

In [19]:
y = torch.rand(5, 3)
y

tensor([[0.7051, 0.2710, 0.5037],
        [0.4359, 0.1115, 0.3186],
        [0.4780, 0.5766, 0.9534],
        [0.4985, 0.3286, 0.2202],
        [0.5064, 0.1130, 0.4838]])

In [22]:
x = torch.rand(5, 3)
x

tensor([[0.6424, 0.5232, 0.4694],
        [0.4968, 0.1627, 0.8041],
        [0.8081, 0.2958, 0.3686],
        [0.3775, 0.1159, 0.0780],
        [0.6673, 0.4262, 0.2349]])

In [23]:
x + y

tensor([[1.3475, 0.7942, 0.9731],
        [0.9327, 0.2742, 1.1228],
        [1.2861, 0.8724, 1.3220],
        [0.8759, 0.4446, 0.2982],
        [1.1737, 0.5392, 0.7186]])

In [24]:
torch.add(x, y)

tensor([[1.3475, 0.7942, 0.9731],
        [0.9327, 0.2742, 1.1228],
        [1.2861, 0.8724, 1.3220],
        [0.8759, 0.4446, 0.2982],
        [1.1737, 0.5392, 0.7186]])

In [25]:
result = torch.empty(5, 3)
torch.add(x, y, out=result)
result

tensor([[1.3475, 0.7942, 0.9731],
        [0.9327, 0.2742, 1.1228],
        [1.2861, 0.8724, 1.3220],
        [0.8759, 0.4446, 0.2982],
        [1.1737, 0.5392, 0.7186]])

In [26]:
result = x + y
result

tensor([[1.3475, 0.7942, 0.9731],
        [0.9327, 0.2742, 1.1228],
        [1.2861, 0.8724, 1.3220],
        [0.8759, 0.4446, 0.2982],
        [1.1737, 0.5392, 0.7186]])

In [27]:
# 直接将相加的结果保存在 y 上
y.add_(x)
y

tensor([[1.3475, 0.7942, 0.9731],
        [0.9327, 0.2742, 1.1228],
        [1.2861, 0.8724, 1.3220],
        [0.8759, 0.4446, 0.2982],
        [1.1737, 0.5392, 0.7186]])

注意：任何 in-place 的运算都会以 `_` 结尾。举例来说：x.copy_(y)，x.t_()会改变 x

### 2. 各种类似 NumPy 的 indexing 都可以在 Pytorch tensor 上使用。

In [28]:
x

tensor([[0.6424, 0.5232, 0.4694],
        [0.4968, 0.1627, 0.8041],
        [0.8081, 0.2958, 0.3686],
        [0.3775, 0.1159, 0.0780],
        [0.6673, 0.4262, 0.2349]])

In [29]:
x[:, 1:] # 取出每一行中第1列及之后的项

tensor([[0.5232, 0.4694],
        [0.1627, 0.8041],
        [0.2958, 0.3686],
        [0.1159, 0.0780],
        [0.4262, 0.2349]])

### 3. Resizing
 如果你希望 resize/reshape 一个 tensor，可以使用 Tensor.view 方法（例如 x.view(16)）；传入 -1 的那一维会根据其余维度自动推断

In [30]:
x = torch.rand(4, 4)
y = x.view(16)
y

tensor([0.9699, 0.5858, 0.0461, 0.6555, 0.6411, 0.6974, 0.2339, 0.7755, 0.8295,
        0.1821, 0.9792, 0.8485, 0.2084, 0.5697, 0.1400, 0.3183])

In [31]:
z = x.view(2, 8)
z

tensor([[0.9699, 0.5858, 0.0461, 0.6555, 0.6411, 0.6974, 0.2339, 0.7755],
        [0.8295, 0.1821, 0.9792, 0.8485, 0.2084, 0.5697, 0.1400, 0.3183]])

In [32]:
w = x.view(-1, 2)

In [33]:
w

tensor([[0.9699, 0.5858],
        [0.0461, 0.6555],
        [0.6411, 0.6974],
        [0.2339, 0.7755],
        [0.8295, 0.1821],
        [0.9792, 0.8485],
        [0.2084, 0.5697],
        [0.1400, 0.3183]])

In [35]:
# 如果你只有一个元素的 tensor，使用 .item() 方法可以把里面的 value 变成 Python 数值
x = torch.rand(1)
x

tensor([0.3390])

In [36]:
dir(x)

['T',
 '__abs__',
 '__add__',
 '__and__',
 '__array__',
 '__array_priority__',
 '__array_wrap__',
 '__bool__',
 '__class__',
 '__contains__',
 '__deepcopy__',
 '__delattr__',
 '__delitem__',
 '__dict__',
 '__dir__',
 '__div__',
 '__doc__',
 '__eq__',
 '__float__',
 '__floordiv__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__iadd__',
 '__iand__',
 '__idiv__',
 '__ifloordiv__',
 '__ilshift__',
 '__imul__',
 '__index__',
 '__init__',
 '__init_subclass__',
 '__int__',
 '__invert__',
 '__ior__',
 '__ipow__',
 '__irshift__',
 '__isub__',
 '__iter__',
 '__itruediv__',
 '__ixor__',
 '__le__',
 '__len__',
 '__long__',
 '__lshift__',
 '__lt__',
 '__matmul__',
 '__mod__',
 '__module__',
 '__mul__',
 '__ne__',
 '__neg__',
 '__new__',
 '__nonzero__',
 '__or__',
 '__pow__',
 '__radd__',
 '__rdiv__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__reversed__',
 '__rfloordiv__',
 '__rmul__',
 '__rpow__',
 '__rshift__',
 '__rsub__',
 '__rtruediv__',
 '__

In [37]:
x.data

tensor([0.3390])

In [38]:
x.grad

In [39]:
# 由 tensor 转 数值
x.item()

0.3389529585838318

In [40]:
z.transpose(1, 0) # 将z从 2x8 的矩阵变成 8x2 的矩阵

tensor([[0.9699, 0.8295],
        [0.5858, 0.1821],
        [0.0461, 0.9792],
        [0.6555, 0.8485],
        [0.6411, 0.2084],
        [0.6974, 0.5697],
        [0.2339, 0.1400],
        [0.7755, 0.3183]])

## Numpy和Tensor之间的转化

1. 在 Torch Tensor 和 Numpy array 之间相互转化非常容易
2. Torch Tensor 和 Numpy array 会共享内存，所以改变其中一项也会改变另一项

In [41]:
# 把 Tensor 转变成 Array
a = torch.ones(5)
a

tensor([1., 1., 1., 1., 1.])

In [42]:
b = a.numpy()
b

array([1., 1., 1., 1., 1.], dtype=float32)

In [43]:
# 改变 numpy array 里面的值
b[1] = 10
b

array([ 1., 10.,  1.,  1.,  1.], dtype=float32)

In [44]:
a # a 和 b 共享内存

tensor([ 1., 10.,  1.,  1.,  1.])

In [45]:
import numpy as np
# 把 Array 变成 Tensor
a = np.ones(5)
b = torch.from_numpy(a)
np.add(a, 1, out=a)
a

array([2., 2., 2., 2., 2.])

In [46]:
b

tensor([2., 2., 2., 2., 2.], dtype=torch.float64)

In [47]:
a = a + 1

In [48]:
a

array([3., 3., 3., 3., 3.])

In [49]:
b

tensor([2., 2., 2., 2., 2.], dtype=torch.float64)

所有CPU上的 Tensor 都支持转成 numpy 或者从 numpy 转成 Tensor

## CUDA Tensors
使用 .to 方法，Tensor可以被移动到别的 device 上

In [50]:
# 查看自己电脑是否支持GPU
torch.cuda.is_available()

True

In [51]:
# Run the addition on the GPU, but only when CUDA is available on this machine.
if torch.cuda.is_available():
    device = torch.device("cuda")
    # Create y directly on the GPU with the same shape/dtype as x.
    y = torch.ones_like(x, device=device)
    # Move x to the GPU so both operands live on the same device.
    x = x.to(device)
    z = x + y
    print(z)
    # .to("cpu") returns the result as a CPU tensor.
    print(z.to("cpu"))

tensor([1.3390], device='cuda:0')
tensor([1.3390])


In [52]:
# 如果一个 tensor 在 GPU 上，必须先转成在 CPU 上才能转成 numpy Array
y.to("cpu").data.numpy()
y.cpu().data.numpy()

array([1.], dtype=float32)

In [None]:
# Move a model to the GPU.
# NOTE(review): no `model` is defined in any earlier cell shown here and this cell
# has no execution count, so it would raise NameError on Restart & Run All.
# Treat it as an illustrative snippet, or define a model before running it.
model = model.cuda()


## 热身：用 numpy 实现两层神经网络

一个全连接ReLU神经网络，一个隐藏层，没有bias，用来从x预测y，使用 L2 Loss
- $h = XW_1$
- $a = \max(0, h)$
- $\hat{y} = aW_2$

这一实现完全使用 numpy 来计算前向神经网络，loss 和 反向传播
- forward pass
- loss
- backward pass

numpy ndarray 是一个普通的 n 维 array。它不知道任何关于深度学习或者梯度（gradient）的知识，也不知道计算图（computation graph），只是一种用来计算数学运算的数据结构。

In [53]:
# N: number of input samples
# D_in: input dimension
# H: hidden-layer dimension
# D_out: output dimension
N, D_in, H, D_out = 64, 1000, 100, 10

In [55]:
# Create random training data: inputs x (N x D_in) and targets y (N x D_out).
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# w1 maps the D_in-dimensional input to the H-dimensional hidden layer.
# w2 maps the H-dimensional hidden layer to the D_out-dimensional output.
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6
for t in range(500):
    # forward pass: linear -> ReLU -> linear
    h = x.dot(w1) # N x H
    h_relu = np.maximum(h, 0)
    y_pred = h_relu.dot(w2) # N x D_out
    
    # compute loss
    loss = np.square(y_pred - y).sum() # sum of squared errors (summed, not averaged)
    print(t, loss)
    
    # backward pass
    # compute the gradient of the loss w.r.t. w1 and w2 by hand (chain rule)
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.T.dot(grad_y_pred)
    grad_h_relu = grad_y_pred.dot(w2.T)
    # Copy first so that masking below does not modify grad_h_relu.
    grad_h = grad_h_relu.copy()
    # ReLU passes no gradient where its input was negative.
    grad_h[h<0] = 0
    grad_w1 = x.T.dot(grad_h)
    
    # update weights of w1 and w2 (plain gradient descent, in place)
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 32154083.752617557
1 27680781.722330175
2 25642233.763676543
3 22682261.23966396
4 17870773.215894267
5 12461383.589884918
6 7880580.684430635
7 4792020.629589669
8 2957284.7518131877
9 1931403.4309312934
10 1353240.0459642257
11 1013315.0433397911
12 799082.1297799157
13 653595.6386179761
14 547681.9440090879
15 466534.8053837627
16 401987.9792872119
17 349164.15977930225
18 305131.80941200105
19 267950.7870323984
20 236273.855611176
21 209093.10844511288
22 185648.89366977167
23 165312.4412920772
24 147600.75448625293
25 132116.37201167308
26 118556.82127127961
27 106636.01813109814
28 96110.34568169169
29 86779.31940798246
30 78495.84201592582
31 71113.39839969261
32 64521.304833876675
33 58625.06328220706
34 53342.6145345744
35 48600.712131374086
36 44334.108800863665
37 40490.786537879496
38 37019.20504813708
39 33881.40562612939
40 31041.871056378837
41 28467.10909128632
42 26130.937185106253
43 24010.090709223245
44 22079.566689975138
45 20320.12665005056
46 18714.903384863628

364 0.0004047516936757023
365 0.0003852451019479443
366 0.00036667481519493265
367 0.00034900436295370006
368 0.00033218672302676427
369 0.0003161816392162161
370 0.00030095108908238924
371 0.0002864570787991708
372 0.0002726616951811925
373 0.0002595313515623522
374 0.00024703478037077113
375 0.00023514277153574516
376 0.00022382438860504942
377 0.00021305328449600734
378 0.0002028001081613189
379 0.0001930416617931484
380 0.00018375463920808615
381 0.00017491385900181034
382 0.00016650263742532505
383 0.0001584947846371541
384 0.00015087311571001602
385 0.00014361836050592118
386 0.00013671357453276357
387 0.00013014242321956348
388 0.00012388815411078986
389 0.00011793393832878866
390 0.00011226681357042246
391 0.00010687208710132614
392 0.00010173762289279972
393 9.685030638954816e-05
394 9.219896992440041e-05
395 8.777067985417641e-05
396 8.355519901863788e-05
397 7.954349859536398e-05
398 7.572379226572813e-05
399 7.208925755297174e-05
400 6.86286214626864e-05
401 6.5334778800496

In [56]:
x

array([[-1.14267794, -0.25195498, -0.28560354, ...,  0.81855897,
         0.72424924,  0.27520538],
       [ 0.58254285, -0.38010554,  0.20232598, ..., -1.12462832,
         0.4252977 , -0.21248669],
       [ 0.47352914, -0.40063836,  2.95729554, ..., -0.47003545,
         0.89748504, -0.23577653],
       ...,
       [-0.00665696, -0.35828693,  1.35190195, ..., -0.40885774,
         0.3664065 , -2.14870709],
       [-0.49035084, -0.60208692,  0.26182786, ..., -0.0173768 ,
        -0.23954234,  1.35623013],
       [-0.87843118, -1.39489342, -0.67651579, ...,  0.92584722,
        -0.3529211 , -0.11596917]])

In [58]:
h = x.dot(w1) # N x H
h_relu = np.maximum(h, 0)
y_pred = h_relu.dot(w2) # N x D_out

In [60]:
y_pred - y

array([[-4.76592601e-05, -2.09588250e-05, -1.62466261e-05,
        -5.12615758e-06, -1.43640649e-05, -3.66733414e-06,
         4.49841033e-06, -1.15207951e-05,  9.18217967e-06,
        -1.19300317e-05],
       [-4.74961856e-05, -1.99902946e-05, -1.74922925e-05,
        -4.91887768e-06, -1.36610270e-05, -4.03254527e-06,
         2.90951644e-06, -1.70429165e-05,  2.02722843e-05,
        -1.79449234e-05],
       [ 4.49160686e-05,  2.22305803e-05,  2.61837160e-05,
         2.53096691e-06,  1.84100060e-05,  1.72802424e-05,
        -1.61222325e-05,  3.16028988e-05, -1.89429190e-05,
         1.53351089e-05],
       [-1.94327608e-05, -1.32398795e-05, -1.75245133e-05,
         6.93938032e-07, -3.84171377e-06, -1.63626919e-05,
         6.75744306e-06, -1.79219937e-05,  1.14158902e-05,
        -1.20749728e-05],
       [-1.40556438e-05, -1.99318282e-06,  7.47486818e-06,
         9.66656680e-06,  9.88377205e-06,  8.93091485e-06,
        -3.07465693e-07, -5.65260462e-07, -1.76428116e-05,
        -1.

可以观察到 y_pred 和 y 之间差值非常小

## Pytorch：Tensors

这次我们使用 Pytorch tensors 来创建神经网络，计算损失，以及反向传播

一个 Pytorch Tensor 很像一个 numpy 的 ndarray。但是它和 numpy ndarray 最大的区别是 Pytorch Tensor 可以在 CPU 或者 GPU 上运算。如果想要在 GPU 上运算，就需要把 Tensor 换成 cuda 类型。

In [66]:
# Create random training data: inputs x (N x D_in) and targets y (N x D_out).
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# w1 maps the D_in-dimensional input to the H-dimensional hidden layer.
# w2 maps the H-dimensional hidden layer to the D_out-dimensional output.
w1 = torch.randn(D_in, H)
w2 = torch.randn(H, D_out)

learning_rate = 1e-6
for t in range(500):
    # forward pass: linear -> ReLU (clamp at 0) -> linear
    h = x.mm(w1) # N x H
    h_relu = h.clamp(min=0)
    y_pred = h_relu.mm(w2) # N x D_out
    
    # compute loss
    loss = (y_pred - y).pow(2).sum().item() # sum of squared errors, converted to a Python float
    print(t, loss)
    
    # backward pass
    # compute the gradient of the loss w.r.t. w1 and w2 by hand (chain rule)
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.t().mm(grad_y_pred)
    grad_h_relu = grad_y_pred.mm(w2.t())
    # Clone first so that masking below does not modify grad_h_relu.
    grad_h = grad_h_relu.clone()
    # ReLU passes no gradient where its input was negative.
    grad_h[h<0] = 0
    grad_w1 = x.t().mm(grad_h)
    
    # update weights of w1 and w2 (plain gradient descent, in place)
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 34690796.0
1 35367900.0
2 39591868.0
3 40119632.0
4 32654910.0
5 20096512.0
6 10038669.0
7 4726069.5
8 2503067.0
9 1588548.125
10 1166028.75
11 928990.0
12 770657.3125
13 652379.375
14 558615.6875
15 482221.5
16 418823.71875
17 365625.03125
18 320780.625
19 282588.09375
20 249855.40625
21 221784.40625
22 197529.625
23 176505.15625
24 158141.640625
25 142029.078125
26 127850.0078125
27 115350.9921875
28 104290.8828125
29 94478.7734375
30 85742.1171875
31 77941.90625
32 70968.0703125
33 64717.94921875
34 59105.44140625
35 54049.0859375
36 49490.46484375
37 45374.18359375
38 41649.65625
39 38272.06640625
40 35206.01953125
41 32417.509765625
42 29877.0859375
43 27561.15234375
44 25447.158203125
45 23516.728515625
46 21748.833984375
47 20128.419921875
48 18643.36328125
49 17278.53515625
50 16025.0556640625
51 14872.998046875
52 13813.03515625
53 12836.6279296875
54 11936.96875
55 11107.076171875
56 10340.08984375
57 9632.4130859375
58 8978.5751953125
59 8373.62890625
60 7813.3876953125
61

432 0.0003790771879721433
433 0.0003706691204570234
434 0.0003619310155045241
435 0.000353808980435133
436 0.0003445990150794387
437 0.00033697806065902114
438 0.0003292524488642812
439 0.0003218846977688372
440 0.00031508493702858686
441 0.0003084251075051725
442 0.000301888445392251
443 0.00029509016894735396
444 0.0002891148906201124
445 0.0002825990377459675
446 0.0002767245168797672
447 0.00027102974127046764
448 0.0002651491085998714
449 0.00025980937061831355
450 0.0002546229225117713
451 0.00024903586017899215
452 0.00024313047470059246
453 0.00023828845587559044
454 0.00023443384270649403
455 0.00022998142230790108
456 0.0002249870594823733
457 0.000220668560359627
458 0.00021586618095170707
459 0.00021193030988797545
460 0.00020766313537023962
461 0.00020334350119810551
462 0.00019976859039161354
463 0.00019589363364502788
464 0.00019209230958949775
465 0.0001890591229312122
466 0.00018537335563451052
467 0.00018150289542973042
468 0.00017886435671243817
469 0.000175015331478

其实 Pytorch 可以自动计算梯度

In [69]:
# Minimal autograd example: y = w * x + b with scalar tensors.
x = torch.tensor(1., requires_grad=True)
w = torch.tensor(2., requires_grad=True)
b = torch.tensor(3., requires_grad=True)

y = w * x + b

# Backpropagate from y; this populates .grad on x, w and b.
y.backward()

print(w.grad) # gradient w.r.t. w: dy / dw = x = 1
print(x.grad) # gradient w.r.t. x: dy / dx = w = 2
print(b.grad) # gradient w.r.t. b: dy / db = 1

tensor(1.)
tensor(2.)
tensor(1.)


In [83]:
# Simplified version: let autograd compute the gradients.
# Create random training data: inputs x (N x D_in) and targets y (N x D_out).
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# w1 maps the D_in-dimensional input to the H-dimensional hidden layer.
# w2 maps the H-dimensional hidden layer to the D_out-dimensional output.
# requires_grad=True makes autograd track operations on the weights.
w1 = torch.randn(D_in, H, requires_grad=True)
w2 = torch.randn(H, D_out, requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # forward pass: linear -> ReLU (clamp at 0) -> linear
    y_pred = x.mm(w1).clamp(min=0).mm(w2)
    
    # compute loss (kept as a tensor so that backward() can be called on it)
    loss = (y_pred - y).pow(2).sum() # sum of squared errors
    print(t, loss.item())
    
    # backward pass
    # compute the gradient: fills w1.grad and w2.grad
    loss.backward()
    
    # update weights of w1 and w2
    # no_grad: the in-place updates themselves must not be tracked by autograd
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad
        # Reset the gradients, otherwise the next backward() adds to them.
        w1.grad.zero_()
        w2.grad.zero_()

0 48911456.0
1 55363304.0
2 55236096.0
3 39115284.0
4 18464604.0
5 7057713.5
6 3202821.5
7 1995316.75
8 1497643.875
9 1207181.5
10 999631.8125
11 838668.0625
12 709829.3125
13 605285.8125
14 519428.40625
15 448301.875
16 389097.78125
17 339327.0625
18 297233.90625
19 261415.828125
20 230759.0
21 204477.234375
22 181754.328125
23 162026.671875
24 144843.640625
25 129806.1875
26 116596.171875
27 104962.96875
28 94734.6484375
29 85667.9375
30 77622.5546875
31 70458.0625
32 64060.34375
33 58329.0546875
34 53182.50390625
35 48549.21484375
36 44381.7109375
37 40618.53515625
38 37218.6015625
39 34143.52734375
40 31351.828125
41 28815.162109375
42 26507.052734375
43 24404.75
44 22488.02734375
45 20737.8828125
46 19138.26953125
47 17674.517578125
48 16334.203125
49 15104.5830078125
50 13976.1396484375
51 12940.3623046875
52 11988.451171875
53 11112.62890625
54 10306.2861328125
55 9563.236328125
56 8877.7099609375
57 8245.5
58 7661.84130859375
59 7122.4208984375
60 6623.4619140625
61 6161.633789

440 8.566951146349311e-05
441 8.3954117144458e-05
442 8.233678090618923e-05
443 8.089837501756847e-05
444 7.917120092315599e-05
445 7.783747423673049e-05
446 7.65235599828884e-05
447 7.517508493037894e-05
448 7.360079325735569e-05
449 7.249392365338281e-05
450 7.104156975401565e-05
451 6.982620107010007e-05
452 6.842311267973855e-05
453 6.719511293340474e-05
454 6.619602208957076e-05
455 6.489616498583928e-05
456 6.399124686140567e-05
457 6.297961226664484e-05
458 6.196441972861066e-05
459 6.094037598813884e-05
460 5.9800750022986904e-05
461 5.873729242011905e-05
462 5.7686287618707865e-05
463 5.677005901816301e-05
464 5.606118793366477e-05
465 5.513564246939495e-05
466 5.408435390563682e-05
467 5.318172770785168e-05
468 5.239284655544907e-05
469 5.156714178156108e-05
470 5.076464731246233e-05
471 5.0157905207015574e-05
472 4.93003535666503e-05
473 4.8419176891911775e-05
474 4.765769699588418e-05
475 4.6825916797388345e-05
476 4.619027458829805e-05
477 4.5484375732485205e-05
478 4.4727

In [75]:
w1.grad

tensor([[ 1.3243e+02, -1.0529e+03,  4.8376e+03,  ..., -8.5295e+03,
          5.7894e+03,  3.2994e+03],
        [-6.5432e+03, -1.0922e+04, -3.2600e+03,  ...,  2.7307e+03,
         -1.8815e+04, -2.1141e+02],
        [ 6.8676e+03,  2.1944e+03,  4.2264e+03,  ..., -2.3906e+03,
          1.0564e+03,  8.1033e+03],
        ...,
        [-3.1829e+03, -8.8897e+03, -1.3519e+04,  ..., -1.5641e+03,
         -7.3520e+03, -5.4847e+03],
        [ 1.2900e+04,  2.2896e+04,  3.8999e+03,  ..., -9.3005e+03,
          1.9128e+03,  4.3263e+03],
        [-8.8287e+03,  8.2122e+02, -7.3067e+03,  ..., -1.0198e+04,
         -1.3085e+04,  6.7728e+00]])

## Pytorch: nn

这次我们使用 Pytorch 中 nn 这个库来构建网络。用 Pytorch autograd 来构建计算图和计算 gradients，然后 Pytorch 会帮我们自动计算 gradient

In [84]:
import torch.nn as nn

In [90]:
# Create random training data: inputs x (N x D_in) and targets y (N x D_out).
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H), # w_1 * x + b
    torch.nn.ReLU(),          # ReLU activation on the hidden layer
    torch.nn.Linear(H, D_out) # hidden layer -> output
)


# reduction='sum' sums the squared errors instead of averaging them.
loss_fn = nn.MSELoss(reduction='sum')

learning_rate = 1e-6
for t in range(500):
    y_pred = model(x) # model.forward()
    
    # compute loss
    loss = loss_fn(y_pred, y)
    print(t, loss.item())
    
    # Clear gradients left over from the previous iteration.
    model.zero_grad()
    
    # backward pass
    # compute the gradient
    loss.backward()
    
    # update every model parameter by plain gradient descent
    # no_grad: the in-place updates themselves must not be tracked by autograd
    with torch.no_grad():
        for param in model.parameters():
            param -= learning_rate * param.grad

0 701.0899047851562
1 700.5424194335938
2 699.9956665039062
3 699.4500732421875
4 698.9052734375
5 698.3622436523438
6 697.8206176757812
7 697.2799072265625
8 696.7398681640625
9 696.2010498046875
10 695.6636352539062
11 695.1293334960938
12 694.5961303710938
13 694.0634765625
14 693.5320434570312
15 693.00146484375
16 692.4715576171875
17 691.9423217773438
18 691.413818359375
19 690.8861083984375
20 690.359619140625
21 689.833984375
22 689.3089599609375
23 688.7844848632812
24 688.2608032226562
25 687.7391357421875
26 687.2184448242188
27 686.69873046875
28 686.179443359375
29 685.6610717773438
30 685.1436157226562
31 684.627197265625
32 684.1117553710938
33 683.5975341796875
34 683.0841064453125
35 682.571533203125
36 682.0595703125
37 681.5482177734375
38 681.0374755859375
39 680.527099609375
40 680.0178833007812
41 679.5092163085938
42 679.001220703125
43 678.4942626953125
44 677.98828125
45 677.4834594726562
46 676.979248046875
47 676.4756469726562
48 675.97265625
49 675.470458984

可以看到 loss 下降得非常慢，训练效果不好。原因可能有很多，其中之一是 w1 和 w2 的初始化方式：这里用的是 nn.Linear 的默认初始化，而前面手动实现时用的是标准正态分布。下面改用 normal_ 重新初始化两层的权重再试一次。

In [91]:
# Create random training data: inputs x (N x D_in) and targets y (N x D_out).
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H), # w_1 * x + b
    torch.nn.ReLU(),          # ReLU activation on the hidden layer
    torch.nn.Linear(H, D_out) # hidden layer -> output
)

# Re-initialise the weights of both Linear layers in place with
# torch.nn.init.normal_ (called with its default arguments).
torch.nn.init.normal_(model[0].weight)
torch.nn.init.normal_(model[2].weight)

# reduction='sum' sums the squared errors instead of averaging them.
loss_fn = nn.MSELoss(reduction='sum')

learning_rate = 1e-6
for t in range(500):
    y_pred = model(x) # model.forward()
    
    # compute loss
    loss = loss_fn(y_pred, y)
    print(t, loss.item())
    
    # Clear gradients left over from the previous iteration.
    model.zero_grad()
    
    # backward pass
    # compute the gradient
    loss.backward()
    
    # update every model parameter by plain gradient descent
    # no_grad: the in-place updates themselves must not be tracked by autograd
    with torch.no_grad():
        for param in model.parameters():
            param -= learning_rate * param.grad


0 34666388.0
1 34054268.0
2 38349480.0
3 39379440.0
4 31949888.0
5 19258994.0
6 9192084.0
7 4119950.5
8 2093694.125
9 1300833.75
10 946659.375
11 751654.1875
12 622526.8125
13 526480.25
14 450514.90625
15 388542.8125
16 337077.78125
17 293911.25
18 257487.90625
19 226526.21875
20 200012.609375
21 177240.046875
22 157570.28125
23 140461.015625
24 125536.71875
25 112477.9296875
26 101016.046875
27 90915.28125
28 82010.8828125
29 74145.9140625
30 67176.2734375
31 60974.5703125
32 55436.171875
33 50479.1171875
34 46034.71484375
35 42041.16015625
36 38450.33984375
37 35213.5625
38 32288.60546875
39 29639.6640625
40 27241.00390625
41 25066.51953125
42 23089.44140625
43 21290.90625
44 19647.5
45 18146.322265625
46 16773.673828125
47 15517.5458984375
48 14366.078125
49 13309.9169921875
50 12339.6806640625
51 11447.546875
52 10627.5712890625
53 9871.4931640625
54 9174.0087890625
55 8530.7080078125
56 7936.88623046875
57 7388.166015625
58 6880.658203125
59 6411.04248046875
60 5976.2109375
61 557

## Pytorch：optim

这一次我们不再手动更新模型的 weights，而是使用 optim 这个包来帮助我们更新参数。optim 这个包提供了各种不同的模型优化方法，包括 SGD + momentum，RMSProp，Adam 等等。

In [94]:
# Create random training data: inputs x (N x D_in) and targets y (N x D_out).
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),  # w_1 * x + b
    torch.nn.ReLU(),           # ReLU activation on the hidden layer
    torch.nn.Linear(H, D_out)  # hidden layer -> output
)

# reduction='sum' sums the squared errors instead of averaging them.
loss_fn = nn.MSELoss(reduction='sum')

# Set the learning rate BEFORE building the optimizer. Previously it was assigned
# after Adam had been constructed, so on a fresh kernel Adam silently picked up
# the stale 1e-6 left over from the previous cell instead of 1e-4.
learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

for t in range(500):
    y_pred = model(x)  # model.forward()

    # compute loss
    loss = loss_fn(y_pred, y)
    print(t, loss.item())

    # Clear gradients left over from the previous iteration.
    optimizer.zero_grad()

    # backward pass: compute the gradient of the loss w.r.t. every parameter
    loss.backward()

    # Let the optimizer update all model parameters.
    optimizer.step()

0 719.6692504882812
1 701.6485595703125
2 684.0859985351562
3 667.0234375
4 650.425537109375
5 634.3677978515625
6 618.7904052734375
7 603.669921875
8 588.9337768554688
9 574.54931640625
10 560.5349731445312
11 547.0121459960938
12 533.9134521484375
13 521.1456909179688
14 508.72296142578125
15 496.5946960449219
16 484.7381591796875
17 473.1896057128906
18 461.9356689453125
19 450.92730712890625
20 440.1361389160156
21 429.594482421875
22 419.3294982910156
23 409.3489685058594
24 399.6282653808594
25 390.2083740234375
26 381.0108337402344
27 372.02423095703125
28 363.2749938964844
29 354.7665100097656
30 346.4493713378906
31 338.33465576171875
32 330.4557189941406
33 322.8043212890625
34 315.3151550292969
35 307.9793395996094
36 300.7936096191406
37 293.7535400390625
38 286.8562316894531
39 280.10174560546875
40 273.47601318359375
41 266.97027587890625
42 260.5986633300781
43 254.37966918945312
44 248.28021240234375
45 242.314697265625
46 236.47042846679688
47 230.73944091796875
48 225

365 0.00012700061779469252
366 0.00011996183457085863
367 0.00011330063716741279
368 0.00010699810809455812
369 0.00010104339162353426
370 9.540974133415148e-05
371 9.007669723359868e-05
372 8.503429853590205e-05
373 8.026452997000888e-05
374 7.576192729175091e-05
375 7.150163582991809e-05
376 6.747210136381909e-05
377 6.366395973600447e-05
378 6.006384501233697e-05
379 5.666174547513947e-05
380 5.344698729459196e-05
381 5.0408732931828126e-05
382 4.7538393118884414e-05
383 4.4825552322436124e-05
384 4.226459714118391e-05
385 3.984350769314915e-05
386 3.755690704565495e-05
387 3.540004036040045e-05
388 3.336090230732225e-05
389 3.143542562611401e-05
390 2.9615373932756484e-05
391 2.7899739507120103e-05
392 2.628324909892399e-05
393 2.4752835088293068e-05
394 2.331308860448189e-05
395 2.1951831513433717e-05
396 2.0668539946200326e-05
397 1.9457462258287705e-05
398 1.831329063861631e-05
399 1.7236670828424394e-05
400 1.622199488338083e-05
401 1.526519554317929e-05
402 1.4360337445395999e

## Pytorch：自定义 nn Modules

我们可以定义一个模型，这个模型继承自 nn.Module 类。如果需要定义一个比 Sequential 模型更加复杂的模型，就需要定义 nn.Module 模型

In [96]:
# 随机创建训练数据
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

class TwoLayerNet(nn.Module):
    """Two-layer fully connected network: Linear -> clamp at 0 (ReLU) -> Linear.

    Both linear layers are created without a bias term.

    Args:
        D_in: size of each input sample.
        H: size of the hidden layer.
        D_out: size of each output sample.
    """

    def __init__(self, D_in, H, D_out):
        super().__init__()
        self.linear1 = nn.Linear(D_in, H, bias=False)
        self.linear2 = nn.Linear(H, D_out, bias=False)

    def forward(self, x):
        """Return the network output for ``x``."""
        hidden = self.linear1(x)
        # Clamping at 0 zeroes the negative hidden activations.
        activated = torch.clamp(hidden, min=0)
        return self.linear2(activated)
    
# Build the model from the shared dimension constants so it always matches the
# shape of x (N x D_in) and y (N x D_out) instead of repeating 1000/100/10.
model = TwoLayerNet(D_in, H, D_out)

# reduction='sum' sums the squared errors instead of averaging them.
loss_fn = nn.MSELoss(reduction='sum')

# Set the learning rate BEFORE building the optimizer. Previously it was assigned
# after Adam had been constructed, so the optimizer used whatever value an
# earlier cell had left in `learning_rate`.
learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

for t in range(500):
    y_pred = model(x)  # model.forward()

    # compute loss
    loss = loss_fn(y_pred, y)
    print(t, loss.item())

    # Clear gradients left over from the previous iteration.
    optimizer.zero_grad()

    # backward pass: compute the gradient of the loss w.r.t. every parameter
    loss.backward()

    # Let the optimizer update all model parameters.
    optimizer.step()

0 696.19287109375
1 678.7493286132812
2 661.8256225585938
3 645.3683471679688
4 629.4194946289062
5 613.9087524414062
6 598.88720703125
7 584.2584838867188
8 569.9920654296875
9 556.0720825195312
10 542.5088500976562
11 529.3421630859375
12 516.5916748046875
13 504.1796569824219
14 492.17822265625
15 480.5501708984375
16 469.320068359375
17 458.34814453125
18 447.59674072265625
19 437.1996154785156
20 427.12396240234375
21 417.32855224609375
22 407.79248046875
23 398.49383544921875
24 389.443115234375
25 380.6329040527344
26 372.0736083984375
27 363.68768310546875
28 355.54998779296875
29 347.58892822265625
30 339.82574462890625
31 332.2952575683594
32 324.8900146484375
33 317.6469421386719
34 310.56256103515625
35 303.6239929199219
36 296.8334655761719
37 290.1995849609375
38 283.7002258300781
39 277.3330383300781
40 271.10986328125
41 264.9996337890625
42 258.990478515625
43 253.08883666992188
44 247.28079223632812
45 241.58148193359375
46 235.9968719482422
47 230.5221710205078
48 22

491 1.948529515516384e-08
492 1.791413062335323e-08
493 1.6536485958340563e-08
494 1.5281054643878633e-08
495 1.4093267886039484e-08
496 1.297871321526145e-08
497 1.199905685922431e-08
498 1.1037456282281255e-08
499 1.0169795672254622e-08
