# 参数管理
---

## 单隐藏层的多层感知机

In [1]:
import torch
from torch import nn

# A multilayer perceptron with a single hidden layer:
# 4 input features -> 8 hidden units (ReLU) -> 1 output.
net = nn.Sequential(
    nn.Linear(4, 8),
    nn.ReLU(),
    nn.Linear(8, 1),
)
# A random mini-batch of 2 samples with 4 features each.
X = torch.rand(2, 4)
net(X)

tensor([[0.2536],
        [0.2321]], grad_fn=<AddmmBackward0>)

## 参数访问

In [4]:
# The Sequential container `net` holds three layers:
#   [0] nn.Linear(4, 8)
#   [1] nn.ReLU()        (no parameters, so its state_dict is empty)
#   [2] nn.Linear(8, 1)
print(X)
print(net[0].state_dict())
print(net[1].state_dict())
# The last layer maps 8 features to 1 output; its parameters are `weight` and `bias`.
print(net[2].state_dict())

# Note: the weight printed for nn.Linear(4, 8) has shape 8 x 4, i.e. it is laid
# out as (out_features, in_features) -- the transpose of the "4 -> 8" notation.
# (This note is a comment rather than a bare string literal so that it is not
# evaluated and echoed as the cell's result.)

tensor([[0.0585, 0.6048, 0.1910, 0.0395],
        [0.8710, 0.8803, 0.0514, 0.5969]])
OrderedDict([('weight', tensor([[ 0.2548,  0.1707,  0.1765, -0.1320],
        [-0.2513,  0.1520, -0.0352, -0.2049],
        [-0.4581, -0.2734, -0.4541,  0.3457],
        [ 0.4383, -0.2384,  0.1046, -0.2610],
        [ 0.0458,  0.1905, -0.4578, -0.0808],
        [ 0.4440, -0.2033, -0.1551,  0.1538],
        [-0.1463, -0.1766,  0.3476, -0.2831],
        [ 0.3537, -0.4390,  0.3516, -0.1943]])), ('bias', tensor([-0.2663, -0.4552, -0.3195,  0.0531,  0.0972, -0.4190,  0.1240, -0.1059]))])
OrderedDict()
OrderedDict([('weight', tensor([[-0.0378, -0.1843, -0.1461,  0.0017, -0.2036,  0.1632, -0.0296, -0.3493]])), ('bias', tensor([0.2808]))])


## 目标参数

In [5]:
# Inspect the bias of the output layer in three ways, one per printed line:
#   1. its Python type (a Parameter),
#   2. the Parameter itself (value plus the requires_grad flag),
#   3. `.data`, which gives access to the underlying value tensor.
print(type(net[2].bias), net[2].bias, net[2].bias.data, sep='\n')

<class 'torch.nn.parameter.Parameter'>
Parameter containing:
tensor([0.2808], requires_grad=True)
tensor([0.2808])


In [6]:
# No backward pass has been run on this network yet, so no gradient has been
# stored and `.grad` is still None.
# `is None` is the idiomatic identity test; `== None` would dispatch to the
# left operand's __eq__ once `.grad` holds a tensor.
net[2].weight.grad is None

True

## 一次性访问所有参数


In [9]:
# Parameters of the first layer only: names are relative to that layer.
print(*((name, param.shape) for name, param in net[0].named_parameters()))
# Parameters of the whole network: names are prefixed with the layer index (0, 1, 2).
# Index 1 is the ReLU layer; it has no parameters, so nothing is listed for it.
print(*((name, param.shape) for name, param in net.named_parameters()))

('weight', torch.Size([8, 4])) ('bias', torch.Size([8]))
('0.weight', torch.Size([8, 4])) ('0.bias', torch.Size([8])) ('2.weight', torch.Size([1, 8])) ('2.bias', torch.Size([1]))


In [10]:
# state_dict() keys each parameter as "<layer index>.<parameter name>",
# so the bias of the output layer (index 2) is stored under '2.bias'.
net.state_dict()['2.bias'].data

tensor([0.2808])

## 从嵌套块收集参数


In [11]:
def block1():
    """Build one small MLP block: Linear(4 -> 8), ReLU, Linear(8 -> 4).

    Input and output widths are both 4, so these blocks can be chained.
    """
    layers = [nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 4)]
    return nn.Sequential(*layers)

def block2():
    """Stack four freshly built `block1()` sub-networks in one Sequential.

    Each child is registered under the name 'block <i>' rather than the
    default numeric index.
    """
    stack = nn.Sequential()
    for i in range(4):
        # add_module only attaches the child under an explicit name.
        child = block1()
        stack.add_module(f'block {i}', child)
    return stack

# Nest the four-block stack from block2() inside another Sequential and finish
# with a Linear(4, 1) layer, so the network yields one value per sample.
rgnet = nn.Sequential(block2(), nn.Linear(4, 1))
rgnet(X)


tensor([[0.3835],
        [0.3832]], grad_fn=<AddmmBackward0>)

In [12]:
# Printing the module shows the nesting: the named sub-blocks sit inside rgnet[0].
print(rgnet)

Sequential(
  (0): Sequential(
    (block 0): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
    )
    (block 1): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
    )
    (block 2): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
    )
    (block 3): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
    )
  )
  (1): Linear(in_features=4, out_features=1, bias=True)
)


## 内置初始化参数

In [14]:
def init_normal(m):
    """Initialize a linear layer with small Gaussian weights and a zero bias.

    Written to be passed to `Module.apply`.

    Args:
        m: Any module. Modules that are not `nn.Linear` (or a subclass of it)
            are left untouched.
    """
    # isinstance (rather than an exact type comparison) also covers subclasses
    # of nn.Linear.
    if isinstance(m, nn.Linear):
        # nn.init functions with a trailing underscore modify the tensor in place.
        nn.init.normal_(m.weight, mean=0, std=0.01)
        # A layer created with bias=False has `bias is None`; skip it instead of
        # failing inside zeros_.
        if m.bias is not None:
            nn.init.zeros_(m.bias)

# Run init_normal over the layers of net via Module.apply.
net.apply(init_normal)
# Show the first weight row and the first bias entry of layer 0 to check the result.
net[0].weight.data[0],net[0].bias.data[0]

(tensor([ 0.0012,  0.0106,  0.0006, -0.0044]), tensor(0.))

In [15]:
def init_constant(m):
    """Initialize a linear layer with all weights equal to 1 and a zero bias.

    Written to be passed to `Module.apply`.

    Args:
        m: Any module. Modules that are not `nn.Linear` (or a subclass of it)
            are left untouched.
    """
    # isinstance (rather than an exact type comparison) also covers subclasses
    # of nn.Linear.
    if isinstance(m, nn.Linear):
        nn.init.constant_(m.weight, 1)
        # A layer created with bias=False has `bias is None`; skip it instead of
        # failing inside zeros_.
        if m.bias is not None:
            nn.init.zeros_(m.bias)

# Run init_constant over the layers of net via Module.apply.
net.apply(init_constant)
# Show the first weight row and the first bias entry of layer 0 to check the result.
net[0].weight.data[0],net[0].bias.data[0]

(tensor([1., 1., 1., 1.]), tensor(0.))

## 对某些块应用不同的初始化方法

In [17]:
def xavier(m):
    """Apply Xavier (Glorot) uniform initialization to a linear layer's weight.

    The bias is not modified.

    Args:
        m: Any module. Modules that are not `nn.Linear` (or a subclass of it)
            are left untouched.
    """
    # isinstance (rather than an exact type comparison) also covers subclasses
    # of nn.Linear.
    if isinstance(m, nn.Linear):
        nn.init.xavier_uniform_(m.weight)
        
def init_42(m):
    """Set every weight of a linear layer to the constant 42.

    The bias is not modified.

    Args:
        m: Any module. Modules that are not `nn.Linear` (or a subclass of it)
            are left untouched.
    """
    # isinstance (rather than an exact type comparison) also covers subclasses
    # of nn.Linear.
    if isinstance(m, nn.Linear):
        nn.init.constant_(m.weight, 42)

# apply() can also be called on individual layers, so each layer gets its own
# initializer: Xavier for the first layer, the constant 42 for the last one.
net[0].apply(xavier)
net[2].apply(init_42)
print(net[0].weight.data[0])
print(net[2].weight.data)

tensor([-0.1739,  0.0886, -0.0584,  0.1051])
tensor([[42., 42., 42., 42., 42., 42., 42., 42.]])


## 自定义初始化

In [20]:
def my_init(m):
    """Custom initializer: uniform weights in [-10, 10] with small values zeroed.

    For a linear layer this prints 'Init <name> <shape>' for the layer's first
    parameter, draws the weight from a uniform distribution on [-10, 10], and
    then sets every weight whose absolute value is below 5 to zero. The bias
    is not modified.

    Args:
        m: Any module. Modules that are not `nn.Linear` (or a subclass of it)
            are left untouched.
    """
    # isinstance (rather than an exact type comparison) also covers subclasses
    # of nn.Linear.
    if isinstance(m, nn.Linear):
        # Only the first parameter is reported, so take it directly instead of
        # materializing the full list.
        first_name, first_param = next(iter(m.named_parameters()))
        print('Init', first_name, first_param.shape)
        nn.init.uniform_(m.weight, -10, 10)
        # The right-hand side is evaluated first and yields a boolean mask;
        # multiplying by it keeps entries with |w| >= 5 and zeroes the rest.
        m.weight.data *= m.weight.data.abs() >= 5
        
# Run the custom initializer over net, then display the first two weight rows of layer 0.
net.apply(my_init)
net[0].weight[:2]
        
        
        

Init weight torch.Size([8, 4])
Init weight torch.Size([1, 8])


tensor([[-8.2723, -9.4219,  7.5869,  0.0000],
        [-0.0000,  5.7715,  0.0000, -0.0000]], grad_fn=<SliceBackward0>)

## 直接访问到值做替换

In [22]:
# Parameter values can also be overwritten directly through `.data`.
net[0].weight.data[:] += 1  # add 1 to every weight of layer 0, in place
net[0].weight.data[0, 0] = 42  # overwrite a single entry
net[0].weight.data[0]

tensor([42.0000, -7.4219,  9.5869,  2.0000])

## 共享权重 -- 参数绑定
如何在同一个网络的多个层之间共享权重（把同一个层对象放在网络中的多个位置）

In [23]:
# Create one Linear layer and place the SAME object at two positions (index 2
# and index 4) of the network.
shared = nn.Linear(8, 8)
net = nn.Sequential(nn.Linear(4, 8),nn.ReLU(),shared, nn.ReLU(),shared,
                   nn.ReLU(), nn.Linear(8, 1))
net(X)
print(net[2].weight.data[0] == net[4].weight.data[0])
# Changing the weight through one position changes the other as well, because
# both positions refer to the same layer object.
net[2].weight.data[0, 0] = 100
print(net[2].weight.data[0] == net[4].weight.data[0])


tensor([True, True, True, True, True, True, True, True])
tensor([True, True, True, True, True, True, True, True])
