In [1]:
%matplotlib inline
import torch
import torch.nn as nn
import torch.optim as optim
from IPython import display
from matplotlib import pyplot as plt
import numpy as np
import random
import torch.utils.data as Data
from torch.nn import init
from collections import OrderedDict
import pandas as pd
from d2l import torch as d2l

# 模型构造

## 继承Module来构造模型

In [2]:
class MLP(nn.Module):
    #声明带有模型参数的层
    def __init__(self, **kwargs):
        # 调用MLP父类Module的构造函数来进行必要的初始化。这样在构造实例时还可以指定其他函数
        super(MLP, self).__init__(**kwargs)
        self.hidden = nn.Linear(784, 256)
        self.act = nn.ReLU()
        self.output = nn.Linear(256,10)
    
    def forward(self, x):
        a = self.act(self.hidden(x))
        return self.output(a)

In [3]:
X = torch.rand(2, 784)
net = MLP()
print(net)
net(X)

MLP(
  (hidden): Linear(in_features=784, out_features=256, bias=True)
  (act): ReLU()
  (output): Linear(in_features=256, out_features=10, bias=True)
)


tensor([[ 0.3396,  0.3062, -0.1798,  0.0784, -0.0508,  0.0865,  0.1480, -0.0902,
          0.0463, -0.0685],
        [ 0.4098,  0.3183, -0.1443,  0.2088, -0.1780,  0.1172,  0.1159, -0.0100,
          0.0499, -0.0294]], grad_fn=<AddmmBackward>)

## Module的子类

### Sequential类
下面我们实现一个与Sequential类有相同功能的MySequential类。这或许可以帮助读者更加清晰地理解Sequential类的工作机制。

In [4]:
class MySequential(nn.Module):
    from collections import OrderedDict
    def __init__(self, *args):
        super(MySequential, self).__init__()
        if len(args) == 1 and isinstance(args[0], OrderedDict):
            # 如果传入的是一个OrderedDict
            for key, module in args[0].items():
                self.add_module(key, module)
        else:
            for idx, module in enumerate(args):
                self.add_module(str(idx), module)
    
    def forward(self, input):
        # self._modules返回一个 OrderedDict，保证会按照成员添加时的顺序遍历成员
        for module in self._modules.values():
            input = module(input)
        return input
        

In [5]:
net = MySequential(
            nn.Linear(784, 256),
            nn.ReLU(),
            nn.Linear(256, 10)
        )
print(net)
net(X)

MySequential(
  (0): Linear(in_features=784, out_features=256, bias=True)
  (1): ReLU()
  (2): Linear(in_features=256, out_features=10, bias=True)
)


tensor([[-0.1304, -0.1117, -0.0094, -0.0383,  0.2676, -0.0940,  0.2368, -0.1800,
          0.1204,  0.0252],
        [-0.1715, -0.1236,  0.0498,  0.0687,  0.2851,  0.0249,  0.1377, -0.3202,
          0.1323,  0.1335]], grad_fn=<AddmmBackward>)

### ModuleList类
ModuleList接收一个子模块的列表作为输入，然后也可以类似List那样进行append和extend操作

In [6]:
net = nn.ModuleList([nn.Linear(784,256), nn.ReLU()])
net.append(nn.Linear(256, 10))
print(net[-1])
print(net)
# net(torch.zeros(1, 784)) # 会报NotImplementedError

Linear(in_features=256, out_features=10, bias=True)
ModuleList(
  (0): Linear(in_features=784, out_features=256, bias=True)
  (1): ReLU()
  (2): Linear(in_features=256, out_features=10, bias=True)
)


In [7]:
#ModuleList仅仅是一个储存各种模块的列表，
#这些模块之间没有联系也没有顺序（所以不用保证相邻层的输入输出维度匹配），
#而且没有实现forward功能需要自己实现，
#所以上面执行net(torch.zeros(1, 784))会报NotImplementedError；
#而Sequential内的模块需要按照顺序排列，要保证相邻层的输入输出大小相匹配，
#内部forward功能已经实现。

In [8]:
class MyModule(nn.Module):
    def __init__(self):
        super(MyModule, self).__init__()
        self.linears = nn.ModuleLost([nn.Linear(10,10) for i in range(10)])
    
    def forward(self, x):
        for i, l in enumerate(self.linears):
            x = self.linears[i//2](x) + l(x)
        return x

In [9]:
#ModuleList不同于一般的Python的list
#加入到ModuleList里面的所有模块的参数会被自动添加到整个网络中
class Module_ModuleList(nn.Module):
    def __init__(self):
        super(Module_ModuleList, self).__init__()
        self.linears = nn.ModuleList([nn.Linear(10,10)])

class Module_List(nn.Module):
    def __init__(self):
        super(Module_List, self).__init__()
        self.linears = [nn.Linear(10,10)]
net1 = Module_ModuleList()
net2 = Module_List()

print("net1:")
for p in net1.parameters():
    print(p.size())

print("net2:")
for p in net2.parameters():
    print(p)

net1:
torch.Size([10, 10])
torch.Size([10])
net2:


### ModuleDict
ModuleDict接收一个子模块的字典作为输入, 然后也可以类似字典那样进行添加访问操作

In [10]:
net = nn.ModuleDict({
    'linear':nn.Linear(784, 256),
    'act': nn.ReLU(),
})

net['output'] = nn.Linear(256,10)
print(net['linear'])
print(net.output)
print(net)

Linear(in_features=784, out_features=256, bias=True)
Linear(in_features=256, out_features=10, bias=True)
ModuleDict(
  (linear): Linear(in_features=784, out_features=256, bias=True)
  (act): ReLU()
  (output): Linear(in_features=256, out_features=10, bias=True)
)


## 构造复杂的模型

In [13]:
class FancyMLP(nn.Module):
    def __init__(self, **kwargs):
        super(FancyMLP, self).__init__(**kwargs)
        self.rand_weight = torch.rand((20, 20), requires_grad=False)
        self.linear = nn.Linear(20, 20)
    
    def forward(self, x):
        x = self.linear(x)
        x = nn.functional.relu(torch.mm(x, self.rand_weight.data) + 1)
        x = self.linear(x)
        #控制流
        while x.norm().item() > 1:
            x /= 2
        if x.norm().item() < 0.8:
            x *= 10
        return x.sum()

In [14]:
X = torch.rand(2, 20)
net = FancyMLP()
print(net)
net(X)

FancyMLP(
  (linear): Linear(in_features=20, out_features=20, bias=True)
)


tensor(0.8240, grad_fn=<SumBackward0>)

In [22]:
#因为FancyMLP和Sequential类都是Module类的子类，所以我们可以嵌套调用它们。
class NestMLP(nn.Module):
    def __init__(self, **kwargs):
        super(NestMLP, self).__init__(**kwargs)
        self.net = nn.Sequential(nn.Linear(40, 30),
                                nn.ReLU())
    def forward(self, x):
        return self.net(x)

net = nn.Sequential(NestMLP(), nn.Linear(30, 20), FancyMLP())
X = torch.rand(2, 40)
print(net)
net(X)

Sequential(
  (0): NestMLP(
    (net): Sequential(
      (0): Linear(in_features=40, out_features=30, bias=True)
      (1): ReLU()
    )
  )
  (1): Linear(in_features=30, out_features=20, bias=True)
  (2): FancyMLP(
    (linear): Linear(in_features=20, out_features=20, bias=True)
  )
)


tensor(20.7287, grad_fn=<SumBackward0>)

# 模型参数的访问、初始化和共享

In [23]:
#使用默认方式初始化参数
net = nn.Sequential(nn.Linear(4,3), nn.ReLU(), nn.Linear(3,1))
print(net)
X = torch.rand(2, 4)
Y = net(X).sum()

Sequential(
  (0): Linear(in_features=4, out_features=3, bias=True)
  (1): ReLU()
  (2): Linear(in_features=3, out_features=1, bias=True)
)


## 访问模型参数

In [24]:
#我们可以通过Module类的parameters()或者named_parameters方法来访问所有参数（以迭代器的形式返回）
#后者除了返回参数Tensor外还会返回其名字
print(type(net.named_parameters()))
for name, param in net.named_parameters():
    print(name, param.size())

<class 'generator'>
0.weight torch.Size([3, 4])
0.bias torch.Size([3])
2.weight torch.Size([1, 3])
2.bias torch.Size([1])


In [25]:
for name, param in net[0].named_parameters():
    print(name, param.size(), type(param))

weight torch.Size([3, 4]) <class 'torch.nn.parameter.Parameter'>
bias torch.Size([3]) <class 'torch.nn.parameter.Parameter'>


In [27]:
#如果一个Tensor是Parameter，那么它会自动被添加到模型的参数列表里
class MyModel(nn.Module):
    def __init__(self, **kwargs):
        super(MyModel, self).__init__(**kwargs)
        self.weight1 = nn.Parameter(torch.rand(20, 20))
        self.weight2 = torch.rand(20, 20)
    
    def forward(self, x):
        pass

#weight1在参数列表中但是weight2却没在参数列表中。
n = MyModel()
for name, param in n.named_parameters():
    print(name)

weight1


## 初始化模型参数
PyTorch的init模块里提供了多种预设的初始化方法

In [28]:
for name, param in net.named_parameters():
    if 'weight' in name:
        init.normal_(param, mean=0, std=0.01)
        print(name, param.data)

0.weight tensor([[-0.0051, -0.0069,  0.0190,  0.0027],
        [-0.0056,  0.0079,  0.0113,  0.0088],
        [ 0.0129,  0.0207,  0.0124, -0.0024]])
2.weight tensor([[-0.0041, -0.0113,  0.0051]])


In [29]:
for name, param in net.named_parameters():
    if 'bias' in name:
        init.constant_(param, val=0)
        print(name, param.data)

0.bias tensor([0., 0., 0.])
2.bias tensor([0.])


## 自定义初始化方法
我们令权重有一半概率初始化为0，有另一半概率初始化为[−10,−5]和[5,10]两个区间里均匀分布的随机数。

In [30]:
def init_weight_(tensor):
    with torch.no_grad():
        tensor.uniform_(-10, 10)
        tensor *= (tensor.abs() >= 5).float()

for name, param in net.named_parameters():
    if 'weight' in name:
        init_weight_(param)
        print(name, param.data)

0.weight tensor([[-0.0000,  8.1128, -0.0000, -0.0000],
        [-8.4758,  6.2336, -6.7908, -9.9704],
        [-0.0000,  6.9162, -0.0000, -9.5517]])
2.weight tensor([[-5.7665,  0.0000, -0.0000]])


##  共享模型参数

In [31]:
linear = nn.Linear(1, 1, bias=False)
net = nn.Sequential(linear, linear)
print(net)
for name, param in net.named_parameters():
    init.constant_(param, val=3)
    print(name, param.data)

Sequential(
  (0): Linear(in_features=1, out_features=1, bias=False)
  (1): Linear(in_features=1, out_features=1, bias=False)
)
0.weight tensor([[3.]])


In [32]:
print(id(net[0]) == id(net[1]))
print(id(net[0].weight) == id(net[1].weight))


True
True


In [34]:
#因为模型参数里包含了梯度，所以在反向传播计算时，这些共享的参数的梯度是累加的
x = torch.ones(1, 1)
y = net(x).sum()
print(y)
y.backward()
print(net[0].weight.grad)

tensor(9., grad_fn=<SumBackward0>)
tensor([[6.]])


In [36]:
torch.ones(2, 2)

tensor([[1., 1.],
        [1., 1.]])

# 自定义层

## 不含模型参数的自定义层

In [2]:
class CenteredLayer(nn.Module):
    def __init__(self, **kwargs):
        super(CenteredLayer, self).__init__(**kwargs)
        
    def forward(self, x):
        return x - x.mean()

In [3]:
#做前向计算
layer = CenteredLayer()
layer(torch.tensor([1, 2, 3, 4, 5], dtype=torch.float))

tensor([-2., -1.,  0.,  1.,  2.])

In [11]:
#构造更复杂的模型
net = nn.Sequential(nn.Linear(8, 128), CenteredLayer())
y = net(torch.rand(4,8))
y.shape, y.mean().item()

(torch.Size([4, 128]), 1.862645149230957e-09)

## 含模型参数的自定义层
如果一个Tensor是Parameter，那么它会自动被添加到模型的参数列表里。所以在自定义含模型参数的层时，我们应该将参数定义成Parameter，除了像4.2.1节那样直接定义成Parameter类外，还可以使用ParameterList和ParameterDict分别定义参数的列表和字典。

ParameterList接收一个Parameter实例的列表作为输入然后得到一个参数列表，使用的时候可以用索引来访问某个参数，另外也可以使用append和extend在列表后面新增参数。

In [21]:
class MyListDense(nn.Module):
    def __init__(self):
        super(MyListDense, self).__init__()
        self.params = nn.ParameterList([nn.Parameter(torch.randn(4,4)) for i in range(3)])
        self.params.append(nn.Parameter(torch.randn(4,1)))
    
    def forward(self, x):
        for i in range(len(self.params)):
            x = torch.mm(x, self.params[i])
        return x

net = MyListDense()
print(net)

MyListDense(
  (params): ParameterList(
      (0): Parameter containing: [torch.FloatTensor of size 4x4]
      (1): Parameter containing: [torch.FloatTensor of size 4x4]
      (2): Parameter containing: [torch.FloatTensor of size 4x4]
      (3): Parameter containing: [torch.FloatTensor of size 4x1]
  )
)


ParameterDict接收一个Parameter实例的字典作为输入然后得到一个参数字典，然后可以按照字典的规则使用了。例如使用update()新增参数，使用keys()返回所有键值，使用items()返回所有键值对等等，可参考官方文档。

In [22]:
class MyDictDense(nn.Module):
    def __init__(self):
        super(MyDictDense, self).__init__()
        self.params = nn.ParameterDict({
            'linear1': nn.Parameter(torch.randn(4,4)),
            'linear2': nn.Parameter(torch.randn(4,1))
        })
        self.params.update({'linear3': nn.Parameter(torch.randn(4, 2))})
    
    def forward(self, x, choice='linear1'):
        return torch.mm(x, self.params[choice])

net = MyDictDense()
print(net)

MyDictDense(
  (params): ParameterDict(
      (linear1): Parameter containing: [torch.FloatTensor of size 4x4]
      (linear2): Parameter containing: [torch.FloatTensor of size 4x1]
      (linear3): Parameter containing: [torch.FloatTensor of size 4x2]
  )
)


In [23]:
#可以根据传入的键值来进行不同的前向传播
x = torch.ones(1, 4)
print(net(x, 'linear1'))
print(net(x, 'linear2'))
print(net(x, 'linear3'))

tensor([[-0.1201, -0.8272, -4.1857,  1.2931]], grad_fn=<MmBackward>)
tensor([[0.1342]], grad_fn=<MmBackward>)
tensor([[1.5546, 1.8816]], grad_fn=<MmBackward>)


In [24]:
#组合
net = nn.Sequential(
    MyDictDense(),
    MyListDense(),
)
print(net)
print(net(x))

Sequential(
  (0): MyDictDense(
    (params): ParameterDict(
        (linear1): Parameter containing: [torch.FloatTensor of size 4x4]
        (linear2): Parameter containing: [torch.FloatTensor of size 4x1]
        (linear3): Parameter containing: [torch.FloatTensor of size 4x2]
    )
  )
  (1): MyListDense(
    (params): ParameterList(
        (0): Parameter containing: [torch.FloatTensor of size 4x4]
        (1): Parameter containing: [torch.FloatTensor of size 4x4]
        (2): Parameter containing: [torch.FloatTensor of size 4x4]
        (3): Parameter containing: [torch.FloatTensor of size 4x1]
    )
  )
)
tensor([[21.4086]], grad_fn=<MmBackward>)


# 读取和存储

## 读写Tensor
save使用Python的pickle实用程序将对象进行序列化，然后将序列化的对象保存到disk，使用save可以保存各种对象,包括模型、张量和字典等。而load使用pickle unpickle工具将pickle的对象文件反序列化为内存。

In [25]:
#save
x = torch.ones(3)
torch.save(x, 'x.pt')

In [26]:
#load
x2 = torch.load('x.pt')
x2

tensor([1., 1., 1.])

In [28]:
#Tensor列表存储
y = torch.zeros(4)
torch.save([x,y], 'xy.pt')
xy_list = torch.load('xy.pt')
xy_list

[tensor([1., 1., 1.]), tensor([0., 0., 0., 0.])]

In [29]:
#dict存储
torch.save({'x': x, 'y': y}, 'xy_dict.pt')
xy = torch.load('xy_dict.pt')
xy

{'x': tensor([1., 1., 1.]), 'y': tensor([0., 0., 0., 0.])}

## 读写模型

### state_dict

In [30]:
class MLP(nn.Module):
    def __init__(self):
        super(MLP, self).__init__()
        self.hidden = nn.Linear(3, 2)
        self.act = nn.ReLU()
        self.output = nn.Linear(2, 1)

    def forward(self, x):
        a = self.act(self.hidden(x))
        return self.output(a)

net = MLP()
net.state_dict()

OrderedDict([('hidden.weight',
              tensor([[-0.2123,  0.4566, -0.3201],
                      [-0.1952, -0.2500, -0.3745]])),
             ('hidden.bias', tensor([-0.3915, -0.4543])),
             ('output.weight', tensor([[-0.2032,  0.3158]])),
             ('output.bias', tensor([0.1327]))])

In [31]:
#只有具有可学习参数的层(卷积层、线性层等)才有state_dict中的条目。优化器(optim)也有一个state_dict
#其中包含关于优化器状态以及所使用的超参数的信息。
optimizer = torch.optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
optimizer.state_dict()

{'state': {},
 'param_groups': [{'lr': 0.001,
   'momentum': 0.9,
   'dampening': 0,
   'weight_decay': 0,
   'nesterov': False,
   'params': [0, 1, 2, 3]}]}

In [32]:
X = torch.randn(2, 3)
Y = net(X)

PATH = "./net.pt"
torch.save(net.state_dict(), PATH)

net2 = MLP()
net2.load_state_dict(torch.load(PATH))
Y2 = net2(X)
Y2 == Y

tensor([[True],
        [True]])

# GPU计算

## GPU设备

In [36]:
!nvidia-smi  -h# 对Linux/macOS用户有效

Invalid combination of input arguments. Please run 'nvidia-smi -h' for help.



In [37]:
torch.cuda.is_available()

True

In [39]:
#GPU数量
torch.cuda.device_count()

1

In [40]:
#查看当前GPU索引号，索引号从0开始
torch.cuda.current_device()

0

In [41]:
#根据索引号查看GPU名字
torch.cuda.get_device_name(0)

'GeForce GTX 1050 Ti'

## Tensor的GPU计算

In [42]:
x = torch.tensor([1, 2, 3])
x

tensor([1, 2, 3])

In [43]:
#复制到gpu上
x = x.cuda(0)
x

tensor([1, 2, 3], device='cuda:0')

In [44]:
#直接在创建的时候就指定设备
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
x = torch.tensor([1,2,3], device=device)
x

tensor([1, 2, 3], device='cuda:0')