In [2]:
from torch import nn
import torch
import torch.nn.functional as F

网络定义

In [3]:
# Build a small feed-forward network: Linear(20 -> 256), ReLU, Linear(256 -> 10).
net = nn.Sequential(nn.Linear(20, 256), nn.ReLU(), nn.Linear(256, 10))

# Random input batch: 2 samples with 20 features each.
X = torch.randn(2, 20)

# Forward pass; the bare name on the last line makes the notebook display the result.
Y = net(X)
Y

tensor([[-8.9527e-02, -3.9810e-01, -2.8717e-01, -6.5926e-02, -7.8916e-03,
         -2.7812e-02, -6.2529e-02,  2.9098e-01,  2.1217e-01,  3.4255e-01],
        [-1.4179e-01,  8.4178e-02,  3.3168e-02, -1.2896e-01,  3.2975e-04,
          5.2847e-02,  7.5521e-02,  4.3950e-01,  1.8505e-01, -2.9670e-01]],
       grad_fn=<AddmmBackward0>)

In [4]:
# Read the parameters of a single layer: index 2 is the second Linear layer of `net`.
output_layer_state = net[2].state_dict()
print(output_layer_state)
print(output_layer_state.keys())


OrderedDict([('weight', tensor([[ 0.0224, -0.0422, -0.0221,  ...,  0.0318,  0.0442, -0.0433],
        [ 0.0162, -0.0274, -0.0318,  ...,  0.0157, -0.0268,  0.0252],
        [-0.0011,  0.0555,  0.0173,  ..., -0.0137, -0.0404,  0.0202],
        ...,
        [-0.0443, -0.0138,  0.0579,  ...,  0.0388,  0.0218, -0.0308],
        [ 0.0417, -0.0319, -0.0313,  ..., -0.0105,  0.0273, -0.0278],
        [-0.0102, -0.0049,  0.0214,  ..., -0.0180, -0.0239,  0.0377]])), ('bias', tensor([-0.0233, -0.0504,  0.0531, -0.0508, -0.0476,  0.0120, -0.0242,  0.0313,
         0.0293, -0.0021]))])
odict_keys(['weight', 'bias'])


In [5]:
# Collect (name, shape) for every parameter reported by the network, then show the list.
param_shapes = [(pname, p.shape) for pname, p in net.named_parameters()]
print(param_shapes)

[('0.weight', torch.Size([256, 20])), ('0.bias', torch.Size([256])), ('2.weight', torch.Size([10, 256])), ('2.bias', torch.Size([10]))]


参数初始化

In [6]:
def init_normal(m):
    """Initialise a plain ``nn.Linear`` layer with small Gaussian weights.

    Meant to be passed to ``Module.apply``. Modules whose type is not exactly
    ``nn.Linear`` are left untouched.

    Args:
        m: The module to (possibly) initialise in place.
    """
    if type(m) is nn.Linear:
        nn.init.normal_(m.weight, mean=0, std=0.01)
        # Linear(..., bias=False) stores ``bias`` as None; skip it instead of crashing.
        if m.bias is not None:
            nn.init.zeros_(m.bias)
def init_42(m):
    """Fill a plain ``nn.Linear`` layer's weight and bias with the constant 42.

    Meant to be passed to ``Module.apply``. Modules whose type is not exactly
    ``nn.Linear`` are left untouched.

    Args:
        m: The module to (possibly) initialise in place.
    """
    if type(m) is nn.Linear:
        nn.init.constant_(m.weight, 42)
        # Linear(..., bias=False) stores ``bias`` as None; skip it instead of crashing.
        if m.bias is not None:
            nn.init.constant_(m.bias, 42)
# `apply` takes a function and runs it on each element of the module; the
# function does not have to be an initializer.
# To initialise every layer of the network the same way instead:
#     net.apply(init_normal)

# Initialise the layers individually.
first_linear = net[0]
last_linear = net[2]   # net[1] is the ReLU layer, which has no weight parameters

first_linear.apply(init_normal)  # layer 0: Gaussian weights, zero bias
last_linear.apply(init_42)       # layer 2: every weight and bias set to 42

# Show one weight row and one bias entry from each initialised layer.
print(first_linear.weight.data[0], '\n', first_linear.bias.data[0])
print(last_linear.weight.data[2], '\n', last_linear.bias.data[2])


tensor([ 0.0067,  0.0079,  0.0044,  0.0300,  0.0138,  0.0046,  0.0068, -0.0063,
         0.0058,  0.0102,  0.0049, -0.0015, -0.0073,  0.0130,  0.0083, -0.0104,
         0.0035, -0.0023,  0.0042, -0.0122]) 
 tensor(0.)
tensor([42., 42., 42., 42., 42., 42., 42., 42., 42., 42., 42., 42., 42., 42.,
        42., 42., 42., 42., 42., 42., 42., 42., 42., 42., 42., 42., 42., 42.,
        42., 42., 42., 42., 42., 42., 42., 42., 42., 42., 42., 42., 42., 42.,
        42., 42., 42., 42., 42., 42., 42., 42., 42., 42., 42., 42., 42., 42.,
        42., 42., 42., 42., 42., 42., 42., 42., 42., 42., 42., 42., 42., 42.,
        42., 42., 42., 42., 42., 42., 42., 42., 42., 42., 42., 42., 42., 42.,
        42., 42., 42., 42., 42., 42., 42., 42., 42., 42., 42., 42., 42., 42.,
        42., 42., 42., 42., 42., 42., 42., 42., 42., 42., 42., 42., 42., 42.,
        42., 42., 42., 42., 42., 42., 42., 42., 42., 42., 42., 42., 42., 42.,
        42., 42., 42., 42., 42., 42., 42., 42., 42., 42., 42., 42., 42., 42.,
  

#### 参数绑定

对于某些层，可以通过参数绑定（参数共享）让网络中两个不同位置的层使用同一组参数：这样既能减少参数量，通常也仍能保持较好的性能。

In [7]:
# The layer whose parameters will be shared: the same object is placed at two positions.
shared = nn.Linear(8, 8)

# Network definition; `shared` appears at index 2 and index 4.
net = nn.Sequential(
    nn.Linear(4, 8), nn.ReLU(),
    shared, nn.ReLU(),
    shared, nn.ReLU(),
    nn.Linear(8, 4), nn.ReLU(),
)

X = torch.randn(2, 4)
net(X)

first_use, second_use = net[2], net[4]

# Before the edit: compare the first weight row at both positions.
print(first_use.weight.data[0] == second_use.weight.data[0])
print(first_use.weight.data[0, 0])

# Overwrite one weight through the first position only.
first_use.weight.data[0, 0] = 100

# After the edit the rows still match, so both positions hold the same parameters.
print(first_use.weight.data[0] == second_use.weight.data[0])
print(first_use.weight.data[0, 0])

tensor([True, True, True, True, True, True, True, True])
tensor(-0.0156)
tensor([True, True, True, True, True, True, True, True])
tensor(100.)


`torch.nn.Parameter()` 将一个普通的（不可训练的）tensor 包装成可训练的 Parameter 类型；当它被赋值为 module 的属性时，会自动注册（绑定）到这个 module 的参数列表中。也就是说，在定义网络时这个 tensor 就已经是一个可以训练的参数了。使用它的目的，是让这些变量在学习过程中不断地被更新，以达到最优。注意：在 `forward` 中应直接使用参数本身，而不是它的 `.data`，否则梯度无法回传到该参数。

In [10]:
import torch.nn as nn
class MyLiner(nn.Module):
    """Custom fully-connected layer followed by ReLU, with trainable parameters.

    Computes ``relu(x @ weight + bias)``. ``weight`` has shape
    ``(in_units, units)`` and ``bias`` has shape ``(units,)``; both are drawn
    from ``torch.randn`` at construction time.

    Args:
        in_units: Size of the last dimension of the input.
        units: Size of the last dimension of the output.
    """

    def __init__(self, in_units, units):
        super().__init__()
        # Wrapping the tensors in nn.Parameter makes them parameters of this module.
        self.weight = nn.Parameter(torch.randn(in_units, units))
        self.bias = nn.Parameter(torch.randn(units))

    def forward(self, x):
        """Apply the affine map and ReLU to ``x`` and return the result."""
        # Use the parameters themselves, not ``.data``: going through ``.data``
        # bypasses autograd, so weight and bias would never receive gradients.
        linear = torch.matmul(x, self.weight) + self.bias
        return torch.relu(linear)

# Instantiate the custom layer (5 inputs -> 3 outputs) and run it on a random batch of 2 samples.
dense = MyLiner(5, 3)
x = torch.randn(2, 5)
dense_out = dense(x)
print(dense_out, dense_out.shape)


tensor([[2.9782, 0.6109, 3.8294],
        [2.6414, 0.0000, 2.7664]]) torch.Size([2, 3])


保存结构和权重

In [11]:
# Serialise the whole module object (structure and parameters) to disk.
torch.save(obj=dense, f='dense.pt')

In [15]:
# Load the whole pickled module (structure and parameters) back from disk.
# PyTorch >= 2.6 defaults to weights_only=True, which refuses to unpickle a
# custom class such as MyLiner, so full unpickling is requested explicitly.
# SECURITY: weights_only=False runs pickle and can execute arbitrary code;
# only load files you created yourself or otherwise trust.
dense2 = torch.load('dense.pt', weights_only=False)
dense2_out = dense2(x)
print(dense2_out, dense2_out.shape)
# The reloaded module should reproduce the original module's output on the same input.
print(dense2(x) == dense(x))

tensor([[2.9782, 0.6109, 3.8294],
        [2.6414, 0.0000, 2.7664]]) torch.Size([2, 3])
tensor([[True, True, True],
        [True, True, True]])


仅保存模型参数,不保存模型结构

In [16]:
# Save only the parameters (the state_dict), not the module structure.
params_path = 'dense.params'
torch.save(dense.state_dict(), params_path)

# The structure is not in the file, so build a fresh module in code first,
# then copy the saved parameters into it.
dense3 = MyLiner(5, 3)
saved_state = torch.load(params_path)
dense3.load_state_dict(saved_state)

dense3_out = dense3(x)
print(dense3_out, dense3_out.shape)
# With the loaded parameters, the new module should match the original's output.
print(dense3(x) == dense(x))


tensor([[2.9782, 0.6109, 3.8294],
        [2.6414, 0.0000, 2.7664]]) torch.Size([2, 3])
tensor([[True, True, True],
        [True, True, True]])
