In [2]:
import torch 
from torch import nn

net = nn.Sequential(nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 1))
x = torch.rand(size=(2,4))

net(x)

tensor([[0.1221],
        [0.1204]], grad_fn=<AddmmBackward0>)

5.2.1 参数访问

In [5]:
net[2].state_dict()

OrderedDict([('weight',
              tensor([[ 0.3251,  0.1283,  0.3447,  0.0892, -0.1654, -0.1196,  0.0841,  0.0194]])),
             ('bias', tensor([0.1701]))])

In [4]:
print(net[2].state_dict())

OrderedDict([('weight', tensor([[ 0.3251,  0.1283,  0.3447,  0.0892, -0.1654, -0.1196,  0.0841,  0.0194]])), ('bias', tensor([0.1701]))])


5.2.1.1 目标参数

In [9]:
print(type(net[2].bias))
print(net[2].bias)
print(net[2].bias.data)

<class 'torch.nn.parameter.Parameter'>
Parameter containing:
tensor([0.1701], requires_grad=True)
tensor([0.1701])


In [10]:
net[2].weight.grad == None

True

5.2.1.2 一次访问所有参数

In [13]:
print( *[ (name, param.shape ) for name , param in net[0].named_parameters()])
print( *[( name, param.shape) for name, param in net.named_parameters()])

('weight', torch.Size([8, 4])) ('bias', torch.Size([8]))
('0.weight', torch.Size([8, 4])) ('0.bias', torch.Size([8])) ('2.weight', torch.Size([1, 8])) ('2.bias', torch.Size([1]))


In [16]:
net.state_dict()['0.weight'].data

tensor([[ 0.2733,  0.0121, -0.1099, -0.2745],
        [-0.4005, -0.1729,  0.4330,  0.0158],
        [-0.1043, -0.2450, -0.3777, -0.4437],
        [-0.1279, -0.2932,  0.2201,  0.4769],
        [-0.2456,  0.1078, -0.2021,  0.4287],
        [ 0.0922, -0.2131, -0.1299, -0.1490],
        [ 0.3665, -0.1360,  0.0209, -0.4523],
        [-0.4057, -0.2283,  0.3948, -0.3990]])

5.2.1.3 从嵌套块收集参数

In [19]:
def block1():
    return nn.Sequential(nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 4), nn.ReLU())

def block2():
    net = nn.Sequential()
    for i in range(4):
        net.add_module(f'block{i}', block1())
    return net

rgnet = nn.Sequential(block2(), nn.Linear(4, 1))
rgnet(x)


tensor([[0.1680],
        [0.1680]], grad_fn=<AddmmBackward0>)

In [20]:
print(rgnet)

Sequential(
  (0): Sequential(
    (block0): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block1): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block2): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block3): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
  )
  (1): Linear(in_features=4, out_features=1, bias=True)
)


In [22]:
rgnet[0][1][0].bias.data

tensor([ 0.4285, -0.1629,  0.3416,  0.4154,  0.2267, -0.2720, -0.3419, -0.2041])

5.2.2 参数初始化

5.2.2.1 内置初始化

In [27]:
def init_nomal(m):
    if type(m) == nn.Module:
        nn.init.normal_(m.weight , mean= 0 , std=0.01)
        nn.init.zeros_(m.bias)
net.apply(init_nomal)
net[0].weight[0], net[0].bias.data[0]

(tensor([ 0.2733,  0.0121, -0.1099, -0.2745], grad_fn=<SelectBackward0>),
 tensor(0.0183))

In [29]:
def init_xavier(m):
    if type(m) == nn.Module:
        nn.init.xavier_uniform_(m.weight)
       # nn.init.zeros_(m.bias)
def init_42(m):
    if type(m) == nn.Module:
        nn.init.constant_(m.weight  , 42)

net[0].apply(init_xavier)
net[2].apply(init_42)
print( net[0].weight.data[0])
print(net[2].weight.data)

tensor([ 0.2733,  0.0121, -0.1099, -0.2745])
tensor([[ 0.3251,  0.1283,  0.3447,  0.0892, -0.1654, -0.1196,  0.0841,  0.0194]])


5.2.2.2 自定义初始化

In [39]:
def my_init(m):
    if type(m)==nn.Linear:
        print('Init',  *[(name, param) for name, param in m.named_parameters()][0])
        nn.init.uniform_(m.weight, -10, 10)
        m.weight.data *= m.weight.data.abs() >= 5

net.apply(my_init)
net[0].weight[:2]

Init weight Parameter containing:
tensor([[-8.8776, -5.1126,  5.1079,  0.0000],
        [ 0.0000, -0.0000, -0.0000, -6.4835],
        [ 0.0000,  8.9062, -7.2179, -0.0000],
        [ 0.0000, -0.0000,  0.0000,  0.0000],
        [-7.1414,  8.5990,  9.3102, -0.0000],
        [-0.0000, -5.5701,  7.5147,  5.0123],
        [ 0.0000,  0.0000,  0.0000, -0.0000],
        [-0.0000, -0.0000, -0.0000,  6.6403]], requires_grad=True)
Init weight Parameter containing:
tensor([[-7.7063, -0.0000, -9.1408,  6.6859,  7.0303, -9.0110,  7.2340, -7.3622]],
       requires_grad=True)


tensor([[-0.0000,  0.0000, -0.0000, -5.0943],
        [-8.2552,  6.3007,  5.9903,  0.0000]], grad_fn=<SliceBackward0>)

In [42]:
net[0].weight.data[:] +=1
net[0].weight.data[0, 0] = 42
net[0].weight.data[0]

tensor([42.0000,  2.0000,  2.0000, -3.0943])

5.2.3 参数绑定

In [47]:
shared =nn.Linear(8 ,8)
net = nn.Sequential(nn.Linear(4, 8), nn.ReLU(),
                    shared, nn.ReLU(),
                    shared, nn.ReLU(),
                    nn.Linear(8, 1))

net(x)
print(net[2].weight.data[0] == net[4].weight.data[0])
net[2].weight.data[0, 0]  = 100 
print(net[2].weight.data[0] == net[4].weight.data[0])

tensor([True, True, True, True, True, True, True, True])
tensor([True, True, True, True, True, True, True, True])
