In [1]:
import torch
from torch import nn

# A small MLP: 4 input features -> 8 hidden units (ReLU) -> 1 output.
net = nn.Sequential(
    nn.Linear(in_features=4, out_features=8),
    nn.ReLU(),
    nn.Linear(in_features=8, out_features=1),
)
# A random batch of 2 samples with 4 features each.
X = torch.rand(2, 4)
net(X)

tensor([[-0.3563],
        [-0.3214]], grad_fn=<AddmmBackward0>)

In [None]:
# state_dict() holds a layer's parameters. Walk the layers from last to first;
# the ReLU at index 1 has no parameters, so its dict prints empty.
for idx in (2, 1, 0):
    print(net[idx].state_dict())

OrderedDict([('weight', tensor([[ 0.2293, -0.1961,  0.2891,  0.2259,  0.0646,  0.1259,  0.0614, -0.3081]])), ('bias', tensor([-0.3219]))])
OrderedDict()
OrderedDict([('weight', tensor([[-0.4844,  0.2396, -0.4030, -0.2306],
        [-0.4321, -0.1478,  0.3670,  0.0073],
        [-0.0825, -0.1266, -0.0156,  0.0557],
        [ 0.3837,  0.2700, -0.4984, -0.3788],
        [ 0.1061, -0.3389,  0.3818,  0.3653],
        [-0.3568, -0.3782,  0.4664, -0.2084],
        [ 0.3057, -0.0709,  0.4851, -0.2989],
        [-0.4614,  0.3137, -0.0012, -0.3199]])), ('bias', tensor([-0.2491,  0.4824, -0.2615, -0.3999, -0.3563,  0.2592,  0.2448, -0.4722]))])


In [5]:
# Look at the output layer's bias three ways: its class, the Parameter
# itself, and the underlying tensor exposed through .data.
out_bias = net[2].bias
print(type(out_bias))
print(out_bias)
print(out_bias.data)

<class 'torch.nn.parameter.Parameter'>
Parameter containing:
tensor([-0.3219], requires_grad=True)
tensor([-0.3219])


In [None]:
# Access all parameters at once: first those of the first layer only,
# then those of the whole network (names get a layer-index prefix).
for module in (net[0], net):
    for name, param in module.named_parameters():
        print(name, param.shape)

weight torch.Size([8, 4])
bias torch.Size([8])
0.weight torch.Size([8, 4])
0.bias torch.Size([8])
2.weight torch.Size([1, 8])
2.bias torch.Size([1])


In [12]:
# Build nested blocks

def block1():
    """Return a new block: Linear(4, 8) -> ReLU -> Linear(8, 4) -> ReLU.

    Input and output widths are both 4, so copies of this block can be
    chained one after another.
    """
    layers = [
        nn.Linear(4, 8),
        nn.ReLU(),
        nn.Linear(8, 4),
        nn.ReLU(),
    ]
    return nn.Sequential(*layers)

def block2(num_blocks=4):
    """Chain ``num_blocks`` separately constructed ``block1`` modules.

    Args:
        num_blocks: How many ``block1`` instances to stack. Defaults to 4,
            which reproduces the original fixed-size network.

    Returns:
        An ``nn.Sequential`` whose children are registered under the names
        ``block0`` ... ``block{num_blocks - 1}``.
    """
    # Named ``stacked`` rather than ``net`` so it does not shadow the
    # notebook-level ``net``.
    stacked = nn.Sequential()
    for i in range(num_blocks):
        # block1() constructs new layers on every call, so the stacked
        # blocks do not share parameters.
        stacked.add_module(f'block{i}', block1())
    return stacked

# Nested body from block2() followed by a 4 -> 1 linear output layer.
rgnet = nn.Sequential(
    block2(),
    nn.Linear(4, 1),
)
rgnet(X)


tensor([[-0.0734],
        [-0.0734]], grad_fn=<AddmmBackward0>)

In [14]:
# Printing the module shows its nested submodule hierarchy.
print(rgnet)

Sequential(
  (0): Sequential(
    (block0): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block1): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block2): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block3): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
  )
  (1): Linear(in_features=4, out_features=1, bias=True)
)


In [20]:
# Index through the nesting like nested lists: rgnet[0] is the block2()
# container, [1] its second sub-block, [0] that sub-block's first Linear layer.
print(rgnet[0][1][0].bias)

Parameter containing:
tensor([ 0.4231, -0.0880,  0.2142, -0.3011, -0.1656, -0.0278, -0.3497,  0.1424],
       requires_grad=True)


In [21]:
# Custom initialization. By default, weight and bias matrices are initialized uniformly within a range.

def init_normal(m):
    """Initialize a linear layer with small Gaussian weights and a zero bias.

    Intended to be passed to ``Module.apply``; any module that is not an
    ``nn.Linear`` is left untouched.

    Args:
        m: The module to (possibly) initialize in place.
    """
    # isinstance (rather than an exact type comparison) also covers
    # subclasses of nn.Linear.
    if isinstance(m, nn.Linear):
        nn.init.normal_(m.weight, mean=0, std=0.01)
        # Layers built with bias=False have ``bias is None``; skip them
        # instead of failing inside zeros_.
        if m.bias is not None:
            nn.init.zeros_(m.bias)

# Run the initializer over the network, then show the first layer's
# weights and bias.
net.apply(init_normal)
first_layer = net[0]
print(first_layer.weight.data)
print(first_layer.bias.data)

tensor([[ 0.0111, -0.0153,  0.0028,  0.0124],
        [ 0.0061,  0.0262, -0.0028, -0.0061],
        [ 0.0089,  0.0075, -0.0096, -0.0083],
        [ 0.0257, -0.0066,  0.0051, -0.0047],
        [ 0.0194,  0.0068, -0.0062, -0.0047],
        [ 0.0100,  0.0030, -0.0015, -0.0057],
        [-0.0071, -0.0182, -0.0086,  0.0163],
        [ 0.0251, -0.0058, -0.0041, -0.0012]])
tensor([0., 0., 0., 0., 0., 0., 0., 0.])


In [22]:
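# We can also apply different initializers to different blocks.
# For example, below we initialize the first layer (net[0]) with Xavier initialization and set the weights of the third module (net[2]) to the constant 42.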

def init_xavier(m):
    """Apply Xavier-uniform initialization to a linear layer's weight.

    The bias is not modified, and any module that is not an ``nn.Linear``
    is left untouched.

    Args:
        m: The module to (possibly) initialize in place.
    """
    # isinstance (rather than an exact type comparison) also covers
    # subclasses of nn.Linear.
    if isinstance(m, nn.Linear):
        nn.init.xavier_uniform_(m.weight)
def init_42(m):
    """Set every weight of a linear layer to the constant 42.

    The bias is not modified, and any module that is not an ``nn.Linear``
    is left untouched.

    Args:
        m: The module to (possibly) initialize in place.
    """
    # isinstance (rather than an exact type comparison) also covers
    # subclasses of nn.Linear.
    if isinstance(m, nn.Linear):
        nn.init.constant_(m.weight, 42)

# Different initializer per layer: Xavier for the first, constant 42 for the last.
first_layer, last_layer = net[0], net[2]
first_layer.apply(init_xavier)
last_layer.apply(init_42)
print(first_layer.weight.data[0])
print(last_layer.weight.data)

tensor([ 0.0478, -0.4777,  0.0498, -0.0253])
tensor([[42., 42., 42., 42., 42., 42., 42., 42.]])


In [23]:
# Parameters can also be set directly.
# .data shares storage with the parameter, so the in-place edits below
# change the layer's weights.
w0 = net[0].weight.data
w0 += 1
w0[0, 0] = 42
w0[0]

tensor([42.0000,  0.5223,  1.0498,  0.9747])

In [24]:
# Give the shared layer a name so that its parameters can be referenced.
shared = nn.Linear(8, 8)
net = nn.Sequential(
    nn.Linear(4, 8), nn.ReLU(),
    shared, nn.ReLU(),
    shared, nn.ReLU(),
    nn.Linear(8, 1),
)
net(X)

# net[2] and net[4] are the same layer.
# Check that their parameters are equal.
w2, w4 = net[2].weight.data, net[4].weight.data
print(w2[0] == w4[0])
w2[0, 0] = 100
# Still equal after editing one of them: they are the same underlying
# object, not merely two tensors holding equal values.
print(w2[0] == w4[0])

tensor([True, True, True, True, True, True, True, True])
tensor([True, True, True, True, True, True, True, True])
