In [2]:
import torch
from torch import nn
from torch.nn import functional as F

# 深度学习: 层和块

In [8]:
# Custom layers and blocks
class MySequential(nn.Module):
    """Minimal re-implementation of ``nn.Sequential``.

    Args:
        *args: Callables (normally ``nn.Module`` instances) to chain. They are
            registered as children named ``'0'``, ``'1'``, ... in the order given.
    """

    def __init__(self, *args):
        super().__init__()
        for idx, module in enumerate(args):
            # Child names must be strings: nn.Module builds qualified names such as
            # '0.weight' by string concatenation, so an int key makes
            # named_parameters(), parameters(), state_dict() and repr() raise TypeError.
            self._modules[str(idx)] = module

    def forward(self, X):
        """Feed ``X`` through every registered child in insertion order and return the result."""
        for block in self._modules.values():
            X = block(X)
        return X

In [10]:
# Sanity check: run a 20 -> 256 -> 1 MLP built with MySequential on a (2, 20) input
# holding the values 0..39.
X = torch.arange(40.).reshape(2, 20)
net1 = MySequential(nn.Linear(20, 256), nn.ReLU(), nn.Linear(256, 1))
net1(X)

tensor([[-3.2080],
        [-8.5769]], grad_fn=<AddmmBackward0>)

In [23]:
# Arbitrary computation between layers
class FixedHiddenMLP(nn.Module):
    """MLP that mixes a trainable layer with a fixed random projection and Python control flow.

    Args:
        device: Device to place the module on. ``None`` (the default) selects
            ``'cuda:0'`` when CUDA is available and falls back to ``'cpu'`` otherwise.
    """

    def __init__(self, device=None):
        super().__init__()
        if device is None:
            device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
        # Constant random weights, never trained. Registering them as a buffer lets
        # .to()/.cuda()/.cpu() move them together with the parameters; persistent=False
        # keeps them out of state_dict, exactly as when they were a plain attribute.
        self.register_buffer('rand_weight', torch.rand((20, 20)), persistent=False)
        self.linear = nn.Linear(20, 20)
        self.to(device)

    def forward(self, X):
        """Return a scalar tensor computed from ``X`` (expects 20 features in the last dim of a 2-D input)."""
        X = self.linear(X)
        # Fixed (non-trainable) projection plus a constant offset.
        X = F.relu(torch.mm(X, self.rand_weight) + 1)
        # The same Linear layer is applied a second time, i.e. its parameters are shared.
        X = self.linear(X)
        # Data-dependent control flow: halve until the L1 norm is at most 1.
        while X.abs().sum() > 1:
            X /= 2
        return X.sum()

In [25]:
# Build the model first, then move the input to the device its parameters live on,
# so this cell does not hard-code a CUDA device of its own.
net2 = FixedHiddenMLP()
Y = X.to(device=net2.linear.weight.device)
net2(Y)

tensor(0.2316, device='cuda:0', grad_fn=<SumBackward0>)

# 深度学习: 参数管理

In [28]:
# Small MLP used by the parameter-access examples.
# net[1] is the ReLU (it has no parameters, so its state_dict is empty);
# net[2] is the final Linear layer with a weight and a bias.
net = nn.Sequential(nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 1))
net[1].state_dict(), net[2].state_dict()

(OrderedDict(),
 OrderedDict([('weight',
               tensor([[ 0.3138, -0.2902,  0.2307, -0.0750, -0.1311,  0.1874,  0.2137,  0.2285]])),
              ('bias', tensor([0.0582]))]))

In [33]:
# The Parameter itself, its value tensor via `.data`, and its gradient via `.grad`.
# `.grad` stays None until a backward pass has populated it.
net[2].bias, net[2].bias.data, net[2].bias.grad

(Parameter containing:
 tensor([0.0582], requires_grad=True),
 tensor([0.0582]),
 None)

In [37]:
# Iterate over every parameter together with its qualified name (e.g. '0.weight')
for name, param in net.named_parameters():
    print(name, param)

0.weight Parameter containing:
tensor([[ 0.0319, -0.0596,  0.2901, -0.4875],
        [ 0.4223, -0.0529,  0.2397,  0.4300],
        [-0.1960,  0.3595, -0.2569,  0.2774],
        [-0.4284, -0.3111,  0.3983,  0.0225],
        [ 0.2094,  0.4215,  0.3690, -0.3983],
        [ 0.1965, -0.3996, -0.1056,  0.4662],
        [-0.4894,  0.4502,  0.4694,  0.2860],
        [ 0.4911, -0.2089,  0.2014,  0.2183]], requires_grad=True)
0.bias Parameter containing:
tensor([-0.4934, -0.1745,  0.2160,  0.4809, -0.1130,  0.0008,  0.4922,  0.0673],
       requires_grad=True)
2.weight Parameter containing:
tensor([[ 0.3138, -0.2902,  0.2307, -0.0750, -0.1311,  0.1874,  0.2137,  0.2285]],
       requires_grad=True)
2.bias Parameter containing:
tensor([0.0582], requires_grad=True)


In [38]:
# Show the module tree: each child's name and its layer configuration.
print(net)

Sequential(
  (0): Linear(in_features=4, out_features=8, bias=True)
  (1): ReLU()
  (2): Linear(in_features=8, out_features=1, bias=True)
)


### 参数初始化

In [39]:
# Gaussian initialisation
def init_normal(m):
    """Initialise an ``nn.Linear`` in place: weights ~ N(0, 0.01^2), bias set to zero.

    Intended for ``Module.apply``; modules whose exact type is not ``nn.Linear``
    are left untouched.

    Args:
        m: The module to initialise.
    """
    if type(m) is nn.Linear:
        nn.init.normal_(m.weight, mean=0, std=0.01)
        # A layer built with bias=False has m.bias set to None; zeroing it would raise.
        if m.bias is not None:
            nn.init.zeros_(m.bias)

# Module.apply runs init_normal on `net` and each of its submodules;
# then display the last layer's parameters to see the effect.
net.apply(init_normal)
net[2].weight, net[2].bias

(Parameter containing:
 tensor([[ 0.0092,  0.0034,  0.0167, -0.0002,  0.0012,  0.0100,  0.0071, -0.0074]],
        requires_grad=True),
 Parameter containing:
 tensor([0.], requires_grad=True))

In [40]:
# Xavier (Glorot) uniform initialisation
def init_xavier(m):
    """Re-draw the weight of an ``nn.Linear`` in place with Xavier-uniform values.

    Intended for ``Module.apply``. Modules whose exact type is not ``nn.Linear``
    are skipped, and the bias is never modified.
    """
    if type(m) != nn.Linear:
        return
    nn.init.xavier_uniform_(m.weight)

# Re-initialise every Linear weight in `net` with init_xavier (it does not modify biases),
# then display the last layer's parameters.
net.apply(init_xavier)
net[2].weight, net[2].bias

(Parameter containing:
 tensor([[ 0.3048,  0.6879,  0.6552, -0.5771, -0.2568,  0.6503, -0.1584, -0.7819]],
        requires_grad=True),
 Parameter containing:
 tensor([0.], requires_grad=True))

In [44]:
# Parameter tying: the same layer object is placed at two positions, so both use one set of weights.
# Pick the device once instead of hard-coding 'cuda:0', so the cell also runs without a GPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
shared = nn.Linear(8, 8)
net_shared = nn.Sequential(shared, nn.ReLU(), shared, nn.ReLU(), nn.Linear(8, 1)).to(device=device)
net_shared.apply(init_xavier)

# Single example with 8 features (values 0..7), placed on the same device as the model.
Z = torch.arange(8.).reshape(1, -1).to(device=device)
net_shared(Z)

tensor([[0.8311]], device='cuda:0', grad_fn=<AddmmBackward0>)

In [45]:
# Elementwise comparison of the weights at positions 0 and 2. Both positions hold the same
# `shared` layer object, so every entry compares equal.
# NOTE(review): `==` only shows that the values match; the expression
# `net_shared[0].weight is net_shared[2].weight` would confirm it is one Parameter object.
net_shared[0].weight == net_shared[2].weight

tensor([[True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True]], device='cuda:0')

# 深度学习: 自定义层

In [48]:
class MyLinear(nn.Module):
    """Fully connected layer with a built-in ReLU: ``relu(X @ weight + bias)``.

    Args:
        in_units: Number of input features (rows of ``weight``).
        units: Number of output features (columns of ``weight``, length of ``bias``).
    """

    def __init__(self, in_units, units):
        super().__init__()
        # Both parameters are drawn uniformly from [0, 1) by torch.rand.
        self.weight = nn.Parameter(torch.rand(in_units, units))
        self.bias = nn.Parameter(torch.rand(units,))

    def forward(self, X):
        """Apply the affine map followed by ReLU to ``X`` (last dim must equal ``in_units``)."""
        # Use the Parameters themselves, not `.data`: `.data` bypasses autograd, so the
        # output would carry no graph and weight/bias could never receive gradients.
        linear = torch.matmul(X, self.weight) + self.bias
        return F.relu(linear)

In [49]:
# Instantiate the custom layer with 5 inputs and 3 outputs and inspect its (5, 3) weight.
linear = MyLinear(5, 3)
linear.weight

Parameter containing:
tensor([[0.0343, 0.8382, 0.4125],
        [0.4901, 0.3376, 0.6394],
        [0.0098, 0.1140, 0.1437],
        [0.0440, 0.5933, 0.5201],
        [0.7661, 0.1908, 0.3035]], requires_grad=True)

# 深度学习: 读写文件

In [3]:
# Saving parameters
class MLP(nn.Module):
    """Two-layer perceptron: 8 inputs -> 20 hidden units (ReLU) -> 4 outputs."""

    def __init__(self):
        super().__init__()
        # Layers are created in the order hidden, output; this fixes both the
        # order of random initialisation and the order of state_dict keys.
        self.hidden = nn.Linear(8, 20)
        self.output = nn.Linear(20, 4)

    def forward(self, X):
        """Return the output-layer result for ``X`` (last dim must be 8)."""
        hidden_out = self.hidden(X)
        activated = F.relu(hidden_out)
        return self.output(activated)

# Build a model and record its output on a random (2, 8) batch, for comparison after reloading.
# NOTE(review): this rebinds `net`, `X` and `Y`, which earlier cells used for other objects.
net = MLP()
X = torch.randn(size=(2, 8))
Y = net(X)

In [4]:
# Persist only the parameters (the state_dict), not the module object itself,
# to the file 'mlp.params' in the current working directory.
torch.save(net.state_dict(), 'mlp.params')

In [5]:
# Loading parameters
# Re-create the architecture, restore the saved weights into it, then switch to eval mode.
# NOTE(review): torch.load unpickles the file, so only load files from a trusted source.
# Newer PyTorch releases accept weights_only=True — confirm for the installed version.
clone = MLP()
clone.load_state_dict(torch.load('mlp.params'))
clone.eval()

MLP(
  (hidden): Linear(in_features=8, out_features=20, bias=True)
  (output): Linear(in_features=20, out_features=4, bias=True)
)

In [6]:
# Feed the same input through the restored model and compare elementwise with the
# original model's output.
Y_clone = clone(X)
Y_clone == Y

tensor([[True, True, True, True],
        [True, True, True, True]])