In [2]:
import torch
from torch import nn
from torch.nn import functional as F  #F中包含某些函数

# nn.Sequential is a special kind of Module that runs its children in order
net = nn.Sequential(
    nn.Linear(20, 256),
    nn.ReLU(),
    nn.Linear(256, 10),
)

# Random mini-batch: 2 samples with 20 features each
X = torch.rand((2, 20))
net(X)

tensor([[ 0.0199,  0.0859,  0.0805,  0.0636,  0.2853,  0.0070,  0.0440,  0.1719,
         -0.0551,  0.1071],
        [ 0.0773,  0.0597,  0.1111,  0.0722,  0.3561, -0.0290, -0.0867,  0.1059,
         -0.0101,  0.0305]], grad_fn=<AddmmBackward0>)

In [4]:
# Every layer or model is a subclass of nn.Module
# Custom block
class MLP(nn.Module):
    """Two-layer perceptron: Linear(20, 256) -> ReLU -> Linear(256, 10)."""

    def __init__(self):
        """Create the hidden and output linear layers."""
        super().__init__()
        self.hidden = nn.Linear(20, 256)
        self.out = nn.Linear(256, 10)

    def forward(self, X):
        """Forward pass: hidden layer, ReLU activation, then the output layer."""
        hidden_activations = F.relu(self.hidden(X))
        return self.out(hidden_activations)
        

In [5]:
# Instantiate the custom MLP block and run it on the batch X
net = MLP()
net(X)

tensor([[-0.0416, -0.0436, -0.1163, -0.0126,  0.0314, -0.0760,  0.1339, -0.2667,
         -0.2251, -0.0911],
        [-0.0050, -0.0641, -0.1535, -0.1142,  0.0314, -0.0857,  0.1241, -0.2103,
         -0.1421, -0.2013]], grad_fn=<AddmmBackward0>)

In [7]:
# Sequential block
# A minimal re-implementation of nn.Sequential
class MySequential(nn.Module):
    """Container that applies the given blocks one after another.

    Args:
        *args: the blocks (normally nn.Module instances) to chain, in the
            order they should be applied.
    """

    def __init__(self, *args):
        super().__init__()
        for idx, block in enumerate(args):
            # `_modules` is the ordered dict nn.Module uses to track children.
            # Its keys must be strings: state_dict(), parameters() and repr()
            # build dotted names such as "0.weight" by string concatenation,
            # so keying by the module object itself makes them raise TypeError.
            self._modules[str(idx)] = block

    def forward(self, X):
        """Feed X through every registered block in insertion order."""
        for block in self._modules.values():
            X = block(X)

        return X
# Chain Linear -> ReLU -> Linear with the custom container and run it on X
net = MySequential(nn.Linear(20, 256), nn.ReLU(), nn.Linear(256, 10))
net(X)

tensor([[ 0.0441, -0.0544,  0.1165, -0.0323,  0.0518, -0.1778,  0.1482,  0.0517,
         -0.0127,  0.0451],
        [ 0.0239,  0.0446, -0.0300,  0.0323,  0.1246, -0.1814,  0.1419,  0.0187,
         -0.0467, -0.0883]], grad_fn=<AddmmBackward0>)

In [8]:
import torch
from torch import nn
from torch.nn import functional as F

# Parameter management
net = nn.Sequential(
    nn.Linear(4, 8),
    nn.ReLU(),
    nn.Linear(8, 1),
)
# Random mini-batch: 2 samples with 4 features each
X = torch.rand((2, 4))
net(X)


tensor([[-0.2240],
        [-0.0613]], grad_fn=<AddmmBackward0>)

In [11]:
# Parameter access
# state_dict() of the layer at index 2 maps its parameter names to their tensors
print(net[2].state_dict())

OrderedDict([('weight', tensor([[-0.2381,  0.2476, -0.2581,  0.1101,  0.3518,  0.1128, -0.3495, -0.1401]])), ('bias', tensor([-0.2717]))])


In [13]:
# .data is the underlying tensor of the bias Parameter
print(net[2].bias.data)

tensor([-0.2717])


In [22]:
# .grad stays None until a backward pass populates it.
# Test with `is None`: identity is the idiomatic None check and, unlike `==`,
# never dispatches to the tensor's element-wise comparison once .grad is set.
net[2].weight.grad is None

True

In [25]:
# Access all parameters at once: first for the layer at index 0, then for the whole net
print(*((name, param.shape) for name, param in net[0].named_parameters()))
print(*((name, param.shape) for name, param in net.named_parameters()))

('weight', torch.Size([8, 4])) ('bias', torch.Size([8]))
('0.weight', torch.Size([8, 4])) ('0.bias', torch.Size([8])) ('2.weight', torch.Size([1, 8])) ('2.bias', torch.Size([1]))


In [28]:
# Collect parameters from nested blocks
def block1():
    """Return a new Linear(4, 8) -> ReLU -> Linear(8, 4) -> ReLU stack."""
    layers = (nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 4), nn.ReLU())
    return nn.Sequential(*layers)

def block2():
    """Return a Sequential holding four block1() instances named block0..block3."""
    stacked = nn.Sequential()
    i = 0
    while i < 4:
        # Each call to block1() creates an independent set of layers
        stacked.add_module(f'block{i}', block1())
        i += 1
    return stacked

# Nest the stacked blocks inside another Sequential and finish with a Linear(4, 1) head
rgnet = nn.Sequential(block2(), nn.Linear(4, 1))
rgnet(X)

tensor([[0.1324],
        [0.1324]], grad_fn=<AddmmBackward0>)

In [30]:
# Keys of nested modules are dotted paths such as '0.block0.0.weight'
print(rgnet.state_dict())

OrderedDict([('0.block0.0.weight', tensor([[-0.4139, -0.1733, -0.1008,  0.2700],
        [-0.4440, -0.4950,  0.3225, -0.0973],
        [ 0.3770,  0.3515,  0.4467,  0.4983],
        [-0.0425,  0.3096, -0.3905, -0.1388],
        [-0.0783,  0.3223, -0.2267, -0.1775],
        [ 0.1401,  0.3578, -0.3475,  0.2529],
        [-0.0528, -0.2848, -0.4339, -0.1182],
        [ 0.1815, -0.2446, -0.3395,  0.4993]])), ('0.block0.0.bias', tensor([ 0.3385, -0.2558, -0.0212, -0.1810,  0.1562,  0.3892,  0.1448,  0.3892])), ('0.block0.2.weight', tensor([[-0.1468, -0.2373,  0.2386, -0.0532, -0.0891,  0.3176,  0.0764, -0.3014],
        [-0.1070, -0.1822,  0.3144,  0.1067, -0.1592,  0.0057, -0.2801,  0.0234],
        [ 0.2233,  0.3373, -0.3056, -0.2418, -0.3501,  0.2865,  0.0562, -0.2195],
        [-0.0515, -0.2046,  0.1331, -0.0102,  0.2777, -0.2994, -0.0595, -0.2698]])), ('0.block0.2.bias', tensor([ 0.3284, -0.0683, -0.0955,  0.3027])), ('0.block1.0.weight', tensor([[-0.2246, -0.4991, -0.3088, -0.0052],
   

In [37]:
# Built-in initialization
def init_normal(m):
    """Initialise an nn.Linear layer: weights ~ N(0, 0.01^2), bias = 0.

    Intended to be passed to ``Module.apply``. Modules whose type is not
    exactly ``nn.Linear`` are left untouched.

    Args:
        m: the module to (possibly) initialise, modified in place.
    """
    if type(m) is nn.Linear:
        nn.init.normal_(m.weight, mean=0, std=0.01)
        # nn.Linear(..., bias=False) stores bias as None; skip it instead of crashing
        if m.bias is not None:
            nn.init.zeros_(m.bias)
# Run init_normal over the modules of net, then show the first layer's weight and bias
net.apply(init_normal)
net[0].weight.data, net[0].bias.data

(tensor([[-0.0088,  0.0085,  0.0156,  0.0060],
         [ 0.0065, -0.0007,  0.0010, -0.0139],
         [-0.0133,  0.0129, -0.0038, -0.0106],
         [-0.0156,  0.0112, -0.0107,  0.0063],
         [-0.0011, -0.0002,  0.0183,  0.0141],
         [ 0.0117, -0.0051, -0.0027,  0.0022],
         [-0.0172, -0.0016,  0.0130,  0.0037],
         [ 0.0027, -0.0084, -0.0093,  0.0095]]),
 tensor([0., 0., 0., 0., 0., 0., 0., 0.]))

In [38]:
def init_constant(m):
    """Initialise an nn.Linear layer: every weight = 1, bias = 0.

    Intended to be passed to ``Module.apply``. Modules whose type is not
    exactly ``nn.Linear`` are left untouched.

    Args:
        m: the module to (possibly) initialise, modified in place.
    """
    if type(m) is nn.Linear:
        # The trailing underscore marks an in-place initialiser
        nn.init.constant_(m.weight, 1)
        # nn.Linear(..., bias=False) stores bias as None; skip it instead of crashing
        if m.bias is not None:
            nn.init.zeros_(m.bias)

# Run init_constant over the modules of net, then show the first layer's weight
net.apply(init_constant)
net[0].weight.data

tensor([[1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.]])

In [40]:
# Apply different initialisation methods to different layers
def xavier(m):
    """Xavier-uniform initialise the weight of an nn.Linear; ignore other modules."""
    if type(m) != nn.Linear:
        return
    nn.init.xavier_uniform_(m.weight)
# Layer 0 gets Xavier-uniform weights, layer 2 gets constant weights
net[0].apply(xavier)
net[2].apply(init_constant)
# Print both weight tensors to compare the two initialisations
print(net[0].weight.data)
print(net[2].weight.data)

tensor([[ 0.1480,  0.3335, -0.6427,  0.0460],
        [-0.6883,  0.2626, -0.2473,  0.5905],
        [-0.3318,  0.6704,  0.1874,  0.2660],
        [ 0.3826, -0.4660,  0.3049, -0.1582],
        [ 0.5091,  0.2734, -0.0277,  0.5033],
        [-0.3510, -0.1518,  0.4473,  0.5855],
        [ 0.5858, -0.2351,  0.6523,  0.0558],
        [-0.2519, -0.2418, -0.4922, -0.0753]])
tensor([[1., 1., 1., 1., 1., 1., 1., 1.]])


In [41]:
# Display the module structure of net
net

Sequential(
  (0): Linear(in_features=4, out_features=8, bias=True)
  (1): ReLU()
  (2): Linear(in_features=8, out_features=1, bias=True)
)

In [43]:
# Parameter tying
# Two stages of the data flow share the weights of a single layer
shared = nn.Linear(8, 8)
# The same `shared` object is inserted twice, so both positions use identical weights
net = nn.Sequential(
    nn.Linear(4, 8),
    nn.ReLU(),
    shared,
    nn.ReLU(),
    shared,
    nn.ReLU(),
    nn.Linear(8, 1),
)
net(X)

tensor([[0.1938],
        [0.1952]], grad_fn=<AddmmBackward0>)

In [44]:
# Display the module structure of net
net

Sequential(
  (0): Linear(in_features=4, out_features=8, bias=True)
  (1): ReLU()
  (2): Linear(in_features=8, out_features=8, bias=True)
  (3): ReLU()
  (4): Linear(in_features=8, out_features=8, bias=True)
  (5): ReLU()
  (6): Linear(in_features=8, out_features=1, bias=True)
)

In [None]:
# nn.Sequential has no `weight` attribute of its own, so `net.weight` raises
# AttributeError; parameters live on the child layers and are reached by index.
# Positions 2 and 4 hold the same `shared` layer, so this checks that they
# expose the very same Parameter object.
net[2].weight is net[4].weight