In [15]:
import torch
from torch import nn

# Small MLP: 4 input features -> 8 hidden units (ReLU) -> 1 output.
net = nn.Sequential(nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 1))
# A batch of 2 samples with 4 features each, sampled uniformly at random.
X = torch.rand(size=(2, 4))
# Show the input next to the network's output for it.
X, net(X)

(tensor([[0.4488, 0.2528, 0.3504, 0.3837],
         [0.3695, 0.7989, 0.0139, 0.2202]]),
 tensor([[0.2271],
         [0.1677]], grad_fn=<AddmmBackward0>))

In [28]:
# Index 2 of the Sequential is the final Linear(8, 1) layer.
print(net[2])
# Each layer's parameters can be retrieved through state_dict().
print('*'*20)
print(net[2].state_dict())
print('*'*20)
# A specific parameter can also be accessed directly as an attribute.
print(net[0].bias)
print('*'*20)
# .data holds the underlying values of the parameter.
print(net[0].bias.data)
# .grad prints None in the recorded output: no backward pass has been run
# on this net in the cells shown so far.
print(net[0].bias.grad)
print(net[0].weight)


Linear(in_features=8, out_features=1, bias=True)
********************
OrderedDict([('weight', tensor([[-0.3484, -0.2972,  0.2717, -0.1989,  0.3406,  0.1706, -0.2302,  0.2613]])), ('bias', tensor([0.1526]))])
********************
Parameter containing:
tensor([-0.4490, -0.0276, -0.3377,  0.4743, -0.3311, -0.3657,  0.0907,  0.1862],
       requires_grad=True)
********************
tensor([-0.4490, -0.0276, -0.3377,  0.4743, -0.3311, -0.3657,  0.0907,  0.1862])
None
Parameter containing:
tensor([[-0.2409, -0.1206,  0.4617,  0.0669],
        [-0.1086, -0.1963,  0.4870, -0.2469],
        [-0.4596,  0.0336,  0.3299, -0.1872],
        [-0.0791,  0.1684, -0.4801,  0.0148],
        [-0.0777,  0.2580,  0.3393, -0.3673],
        [-0.1824,  0.0909, -0.1518, -0.3604],
        [-0.4659, -0.1496,  0.1045,  0.0360],
        [ 0.4959,  0.0968,  0.0249,  0.2232]], requires_grad=True)


In [31]:
# Access all parameters through named_parameters():
# first for a single layer, then for the whole network (names are then
# prefixed with the layer's index inside the Sequential).
print(*[(name, param.shape) for name, param in net[0].named_parameters()])
print(*[(name, param.shape) for name, param in net.named_parameters()])

('weight', torch.Size([8, 4])) ('bias', torch.Size([8]))
('0.weight', torch.Size([8, 4])) ('0.bias', torch.Size([8])) ('2.weight', torch.Size([1, 8])) ('2.bias', torch.Size([1]))


In [32]:
# Look a parameter up by its state_dict key: '<layer index>.<parameter name>'.
net.state_dict()['2.bias'].data

tensor([0.1526])

In [33]:
def init_normal(m):
    """Initialize an ``nn.Linear`` layer with small Gaussian weights and zero bias.

    Meant to be passed to ``net.apply``. Modules whose type is not exactly
    ``nn.Linear`` are left untouched.

    Args:
        m: The module to (possibly) initialize in place.
    """
    # Exact type match on purpose: only plain nn.Linear layers are handled.
    if type(m) is not nn.Linear:
        return
    # Draw the weights from a normal distribution with mean 0 and std 0.01.
    # (This is initialization, not regularization.)
    nn.init.normal_(m.weight, mean=0, std=0.01)
    # A layer built with bias=False has m.bias set to None; zeroing None
    # would raise, so only clear the bias when there is one.
    if m.bias is not None:
        nn.init.zeros_(m.bias)

# apply() calls the given function on every layer (submodule) inside net.
net.apply(init_normal)
# Show the first weight row and first bias entry of layer 0 after initialization.
net[0].weight.data[0], net[0].bias.data[0]

(tensor([-0.0030,  0.0016,  0.0002,  0.0104]), tensor(0.))

In [36]:
def init_constant(m):
    """Initialize an ``nn.Linear`` layer with all-ones weights and zero bias.

    Meant to be passed to ``net.apply``. Modules whose type is not exactly
    ``nn.Linear`` are left untouched.

    Args:
        m: The module to (possibly) initialize in place.
    """
    # Exact type match on purpose: only plain nn.Linear layers are handled.
    if type(m) is not nn.Linear:
        return
    # Set every weight to the constant 1.
    nn.init.constant_(m.weight, 1)
    # A layer built with bias=False has m.bias set to None; zeroing None
    # would raise, so only clear the bias when there is one.
    if m.bias is not None:
        nn.init.zeros_(m.bias)

# Re-initialize every layer of net with the constant scheme.
net.apply(init_constant)
# Show the first weight row and first bias entry of layer 0 after initialization.
net[0].weight.data[0], net[0].bias.data[0]

(tensor([1., 1., 1., 1.]), tensor(0.))

In [51]:
# 自定义初始化
def my_init(m):
    """Custom initializer: uniform weights in [-10, 10], small ones zeroed.

    For modules whose type is exactly ``nn.Linear``, prints the name and
    shape of the module's first parameter, fills the weights uniformly from
    [-10, 10], then sets every weight with absolute value below 5 to zero.
    Other modules are left untouched.

    Args:
        m: The module to (possibly) initialize in place.
    """
    if type(m) != nn.Linear:
        return
    # Report the first registered parameter (name, then shape).
    first_name, first_param = next(iter(m.named_parameters()))
    print("Init", first_name, first_param.shape)
    nn.init.uniform_(m.weight, -10, 10)
    # Keep weights whose absolute value is at least 5; zero out the rest.
    keep_mask = m.weight.data.abs() >= 5
    m.weight.data *= keep_mask

# Run the custom initializer on every layer of net.
net.apply(my_init)
# Show the first two weight rows of layer 0.
net[0].weight[:2]

Init weight torch.Size([8, 4])
Init weight torch.Size([1, 8])


tensor([[-8.5947, -0.0000, -0.0000, -8.3880],
        [-9.0148,  0.0000, -7.6616,  0.0000]], grad_fn=<SliceBackward0>)

In [52]:
# Direct assignment: parameters can be modified in place through .data.
# NOTE: the += below is not idempotent; re-running this cell adds 1 again.
net[0].weight.data[:] += 1
net[0].weight.data[0, 0] = 42
net[0].weight.data[0]

tensor([42.0000,  1.0000,  1.0000, -7.3880])

In [None]:
# We need to give the shared layer a name so that we can refer to its parameters.
shared = nn.Linear(8, 8)
# The same module object is placed at index 2 and index 4 of the Sequential.
net = nn.Sequential(nn.Linear(4, 8), nn.ReLU(),
                    shared, nn.ReLU(),
                    shared, nn.ReLU(),
                    nn.Linear(8, 1))
# Forward pass; the result is not displayed because it is not the cell's last expression.
net(X)
# Check whether the parameters are equal.
print(net[2].weight.data[0] == net[4].weight.data[0])
net[2].weight.data[0, 0] = 100
# Make sure they are actually the same object, not merely equal in value.
print(net[2].weight.data[0] == net[4].weight.data[0])

### 自定义一个层

In [53]:
import torch
import torch.nn.functional as F
from torch import nn

class CenteredLayer(nn.Module):
    """Parameter-free layer that shifts its input so that its mean is zero."""

    def __init__(self):
        super().__init__()

    def forward(self, X):
        """Subtract the mean over all elements of ``X`` from every element."""
        overall_mean = torch.mean(X)
        return torch.sub(X, overall_mean)

# Instantiate the custom layer and run it on a small example vector.
layer = CenteredLayer()
layer(torch.FloatTensor([1, 2, 3, 4, 5]))

tensor([-2., -1.,  0.,  1.,  2.])

In [54]:
# Use the custom layer as a component inside a more complex model.
net = nn.Sequential(nn.Linear(8, 128), CenteredLayer())

Y = net(torch.rand(4, 8))
# The mean of the centered output; the recorded value is ~0 up to floating-point error.
Y.mean()

tensor(-4.6566e-10, grad_fn=<MeanBackward0>)

In [55]:
class MyLinear(nn.Module):
    """Fully connected layer followed by ReLU, with hand-declared parameters.

    Computes ``relu(X @ weight + bias)``, where ``weight`` has shape
    ``(in_units, units)`` and ``bias`` has shape ``(units,)``.

    Args:
        in_units: Number of input features (rows of ``weight``).
        units: Number of output units (columns of ``weight``).
    """

    def __init__(self, in_units, units):
        super().__init__()
        # Note: randn samples from the standard normal distribution,
        # whereas rand samples from a uniform distribution.
        self.weight = nn.Parameter(torch.randn(in_units, units))
        self.bias = nn.Parameter(torch.randn(units,))

    def forward(self, X):
        """Return ``relu(X @ weight + bias)``."""
        # Use the Parameters themselves, not their ``.data``. ``.data`` is
        # detached from autograd, so going through it would stop gradients
        # from ever reaching weight/bias and the layer could not be trained.
        linear = torch.matmul(X, self.weight) + self.bias
        return F.relu(linear)

In [56]:
# Instantiate the custom layer (5 inputs, 3 outputs) and inspect its weight parameter.
linear = MyLinear(5, 3)
linear.weight

Parameter containing:
tensor([[-0.4707,  0.6910,  0.3719],
        [ 0.5611, -1.5109, -0.2516],
        [ 0.5316, -1.3115,  1.4843],
        [ 1.1378, -0.4583,  0.0696],
        [-0.4197, -0.4140,  0.8970]], requires_grad=True)