torch.nn中提供了搭建神经网络所需的模块，而这些模块的具体运算大多在torch.nn.functional中实现
- Layer函数,包括:线性变换、卷积变换、池化、批归一化(Batch Normalization)、Dropout等
- 激活函数:ReLU、Sigmoid、Tanh、LeakyReLU、Softmax等
- 损失函数:均方误差、交叉熵、负对数似然、Hinge等

In [314]:
import torch 
import torch.nn as nn

全连接层

In [315]:
# Fully connected layer mapping 2 input features to 3 output features.
net = nn.Linear(in_features=2, out_features=3)
# Batch of 7 samples with 2 features each.
x = torch.rand(7, 2)
y = net(x)  # forward pass, one 3-vector per sample

print(y.shape)
print(y)


torch.Size([7, 3])
tensor([[-0.1603,  0.7083, -0.0668],
        [-0.0050,  1.0916, -0.3441],
        [-0.2729,  0.9737, -0.1249],
        [-0.2514,  1.2877, -0.2878],
        [-0.1480,  1.4313, -0.4192],
        [-0.2719,  1.0551, -0.1643],
        [-0.1481,  0.8691, -0.1509]], grad_fn=<AddmmBackward0>)


In [316]:
# Walk the layer's parameters together with the attribute name of each one.
for name, para in net.named_parameters():
    print(name, para,'\n')


weight Parameter containing:
tensor([[-0.4011,  0.5458],
        [ 0.5547,  0.5251],
        [-0.0207, -0.5824]], requires_grad=True) 

bias Parameter containing:
tensor([-0.2882,  0.5677,  0.0781], requires_grad=True) 



In [317]:
# Iterate over the layer's parameters without their names.
for para in net.parameters():
    print(para,'\n')

Parameter containing:
tensor([[-0.4011,  0.5458],
        [ 0.5547,  0.5251],
        [-0.0207, -0.5824]], requires_grad=True) 

Parameter containing:
tensor([-0.2882,  0.5677,  0.0781], requires_grad=True) 



In [318]:
# Materialise the parameter iterator into a list so it can be printed as a whole.
paras = list(net.parameters())
print(paras)

[Parameter containing:
tensor([[-0.4011,  0.5458],
        [ 0.5547,  0.5251],
        [-0.0207, -0.5824]], requires_grad=True), Parameter containing:
tensor([-0.2882,  0.5677,  0.0781], requires_grad=True)]


In [319]:
# Show the Python type of the layer's weight attribute.
print(type(net.weight))

<class 'torch.nn.parameter.Parameter'>


In [320]:
# Overwrite the parameters through nn.init instead of rebinding `.data`:
# the init helpers fill the existing tensors in place with autograd recording
# disabled, so the parameters keep their shape and stay registered as they were.
nn.init.zeros_(net.weight)
print(net.weight,'\n')

nn.init.ones_(net.bias)
print(net.bias,'\n')

# With all-zero weights and an all-one bias every output entry equals 1.
y = net(x)
print(y,'\n')

Parameter containing:
tensor([[0., 0.],
        [0., 0.],
        [0., 0.]], requires_grad=True) 

Parameter containing:
tensor([1., 1., 1.], requires_grad=True) 

tensor([[1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.]], grad_fn=<AddmmBackward0>) 



In [321]:
# Fresh batch of 2 samples; only the forward pass has to run with autograd
# recording switched off, so the result carries no grad_fn.
x = torch.rand(2, 2)
with torch.no_grad():
    y = net(x)
print(y, '\n')

tensor([[1., 1., 1.],
        [1., 1., 1.]]) 



In [322]:
# x is a plain constant (requires_grad defaults to False);
# y is a leaf tensor tracked by autograd.
x = torch.rand(2)
y = torch.rand(2).requires_grad_(True)
# Dot product of the two vectors: a scalar that depends on y.
z = x.dot(y)
print(z, '\n')

tensor(0.5320, grad_fn=<DotBackward0>) 



In [323]:
# Backpropagate from the scalar z = dot(x, y).
# y requires grad, so y.grad is filled in (dz/dy = x);
# x was created with requires_grad=False, so x.grad stays None.
z.backward()
print(y.grad,'\n')
print(x.grad)

tensor([0.6670, 0.2001]) 

None


卷积层

In [324]:
# 3x3 convolution, 5 -> 10 channels; padding=1 keeps the 28x28 spatial size.
net = nn.Conv2d(in_channels=5, out_channels=10, kernel_size=3, padding=1)
x = torch.rand(7, 5, 28, 28)  # (batch, channels, height, width)
y = net(x)

print(y.shape, '\n')

torch.Size([7, 10, 28, 28]) 



In [325]:
# Same 3x3 convolution, but stride=2 halves the spatial size: 28x28 -> 14x14.
net = nn.Conv2d(in_channels=5, out_channels=10, kernel_size=3, stride=2, padding=1)
x = torch.rand(7, 5, 28, 28)  # (batch, channels, height, width)
y = net(x)

print(y.shape, '\n')

torch.Size([7, 10, 14, 14]) 



In [326]:
# 5x5 convolution, 2 -> 5 channels; padding=2 with stride=1 keeps 28x28.
net = nn.Conv2d(in_channels=2, out_channels=5, kernel_size=5, stride=1, padding=2)
x = torch.rand(7, 2, 28, 28)  # (batch, channels, height, width)
y = net(x)

print(y.shape, '\n')

torch.Size([7, 5, 28, 28]) 



In [327]:
# Print only the shape of each named parameter of the layer.
for name, para in net.named_parameters():
    print(name, para.shape,'\n')

weight torch.Size([5, 2, 5, 5]) 

bias torch.Size([5]) 



池化层

In [328]:
# 9x9 feature maps pooled with a 3x3 window (stride defaults to the kernel
# size), giving 3x3 outputs.
x = torch.rand(2, 3, 9, 9)
avg_pool = nn.AvgPool2d(kernel_size=3)
max_pool = nn.MaxPool2d(kernel_size=3)

y1, y2 = avg_pool(x), max_pool(x)

print(y1.shape, '\n')
print(y2.shape, '\n')

torch.Size([2, 3, 3, 3]) 

torch.Size([2, 3, 3, 3]) 



In [329]:
# Overlapping 3x3 windows moved with stride 2: 9x9 -> 4x4.
x = torch.rand(2, 3, 9, 9)
avg_pool = nn.AvgPool2d(kernel_size=3, stride=2)
max_pool = nn.MaxPool2d(kernel_size=3, stride=2)

y1, y2 = avg_pool(x), max_pool(x)

print(y1.shape, '\n')
print(y2.shape, '\n')

torch.Size([2, 3, 4, 4]) 

torch.Size([2, 3, 4, 4]) 



激活函数

In [330]:
# In-place ReLU: the negative entries of the input tensor itself are
# overwritten with zero and that same tensor is returned.
relu1 = nn.ReLU(inplace=True)
x = torch.randn(3, 3)
y = relu1(x)
print(y, '\n')

tensor([[0.0000, 0.8172, 0.9634],
        [0.0000, 0.5376, 0.7961],
        [0.0000, 1.1626, 0.0000]]) 



In [331]:
# `y is x` tests object identity: True means the activation returned the very
# tensor it was given, so x itself now holds the rectified values.
print(x, y is x)

tensor([[0.0000, 0.8172, 0.9634],
        [0.0000, 0.5376, 0.7961],
        [0.0000, 1.1626, 0.0000]]) True


In [332]:
# Default ReLU (inplace=False): the result is a new tensor and the input
# keeps its negative entries.
relu2 = nn.ReLU()
x = torch.randn(3, 3)
y = relu2(x)

print(y, '\n')
print(x, y is x)

tensor([[0.6273, 0.2712, 0.0999],
        [0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000]]) 

tensor([[ 0.6273,  0.2712,  0.0999],
        [-1.1771, -0.8340, -0.5153],
        [-0.2561, -0.2042, -0.2384]]) False


In [333]:
# Each ReLU module stores the inplace flag it was constructed with.
print(relu1.inplace,relu2.inplace)

True False


In [334]:
# Inputs are scaled up to [0, 1000), so most of them land deep in the
# saturated region of tanh where the output is numerically 1.
tanh = nn.Tanh()
x = 1000 * torch.rand(3, 3)

y = tanh(x)
print(y)

tensor([[1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.]])


损失函数

In [335]:
# Batch size and feature dimension used by the loss examples.
B, d = 4, 20

# Two constant batches: every entry of x1 is +1, every entry of x2 is -1.
x1 = torch.ones(B, d)
x2 = -x1

In [336]:
# Default reduction: average of the squared differences over all elements.
# Every element differs by 1 - (-1) = 2, so each squared error is 4 and so is the mean.
loss = nn.MSELoss()
z = loss(x1,x2)
print(z)

tensor(4.)


In [337]:
# reduction='sum': add up the squared differences instead of averaging them.
# 4 * 20 elements, each contributing 4, gives 320.
loss = nn.MSELoss(reduction='sum')
z = loss(x1, x2)
print(z)

tensor(320.)


In [338]:
# reduction='none': no aggregation; the result keeps the input shape and
# holds the squared difference of every element.
loss = nn.MSELoss(reduction='none')
z = loss(x1, x2)
print(z)

tensor([[4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4.,
         4., 4.],
        [4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4.,
         4., 4.],
        [4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4.,
         4., 4.],
        [4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4.,
         4., 4.]])


In [339]:
# Evaluating the class object (it is not instantiated here) displays its
# fully qualified name.
nn.CrossEntropyLoss

torch.nn.modules.loss.CrossEntropyLoss

nn.Module 
- torch.nn中的每个模块都是继承自nn.Module。譬如下面代码片段实现全连接层
- 自定义模块时只需实现forward()函数来定义前向传播；反向传播由autograd根据前向计算过程自动完成，无需手动编写backward()函数
```python
class Linear(nn.Module):
    """Fully connected layer evaluated as ``F.linear(input, weight, bias)``.

    Args:
        in_features: size of each input sample.
        out_features: size of each output sample.
        bias: when False no bias parameter is created and ``self.bias`` is None.
    """

    def __init__(self, in_features, out_features, bias=True):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        # Storage is allocated here and filled in by reset_parameters() below.
        self.weight = Parameter(torch.Tensor(out_features, in_features))
        if not bias:
            # Register the name so that ``self.bias`` exists and is None.
            self.register_parameter('bias', None)
        else:
            self.bias = Parameter(torch.Tensor(out_features))
        self.reset_parameters()

    def reset_parameters(self):
        """Fill weight (and bias, if any) uniformly within +/- 1/sqrt(in_features)."""
        bound = 1. / math.sqrt(self.weight.size(1))
        self.weight.data.uniform_(-bound, bound)
        if self.bias is None:
            return
        self.bias.data.uniform_(-bound, bound)

    def forward(self, input):
        """Apply ``F.linear`` to ``input`` with this layer's weight and bias."""
        return F.linear(input, self.weight, self.bias)
```
继承:
- 子类需要实现forward()函数来定义前向传播；反向传播由autograd自动完成，nn.Module的子类并不需要实现backward()函数

- zero_grad()函数用于梯度清零

- parameters()函数用于返回模型参数

- to()函数用于将模型的参数和缓冲区转移到指定设备(如GPU)上，也可用于转换数据类型

- state_dict()函数用于返回模型参数字典

- load_state_dict()函数用于加载模型参数字典
  

In [340]:
class QuadActivationFunc(nn.Module):
    """Quadratic activation ``y = a * x * x`` with a learnable scalar ``a``.

    ``a`` is a float32 scalar parameter initialised to 1.
    """

    def __init__(self):
        super().__init__()
        # Wrapping the scalar in nn.Parameter registers it with the module.
        self.a = nn.Parameter(torch.tensor(1, dtype=torch.float32))

    def forward(self, x):
        """Return ``a * x * x`` computed element-wise."""
        return self.a * x * x


In [341]:
# With a = 1 and x = [1, 2]:  S = a * (1**2 + 2**2) = 5.
acfun = QuadActivationFunc()
x = torch.tensor([1.,2.],requires_grad=True)
y = acfun(x)
S = y.sum()
# Backpropagate from the scalar S; dS/dx = 2 * a * x = [2, 4].
S.backward()

print(x.grad)
print(S.item())


tensor([2., 4.])
5.0


In [342]:
# Gradient with respect to the learnable scalar: dS/da = sum(x * x) = 1 + 4 = 5.
acfun.a.grad

tensor(5.)

nn.functional
- nn中还有一个很常用的模块:nn.functional。nn中的大多数layer,在functional中都能找到与之对应的函数。nn.Module与nn.functional的区别在于:nn.Module实现的layer是一个特殊的类，会自动提取可学习参数nn.Parameter;而nn.functional是一个函数库，其中的函数由def直接定义。nn.Module中的layer都是类的形式，需要实例化后才能使用，而nn.functional中的都是普通函数，可以直接调用。当不存在可学习的参数时(如ReLU、池化)，nn.Module与nn.functional的功能是一样的；但是当存在可学习参数时(如线性层、卷积层)，nn.Module更加方便，因为它能够自动创建并管理可学习参数，而使用nn.functional时必须自己创建参数并在每次调用时手动传入(如F.linear(x, weight, bias))。

In [343]:
import torch.nn.functional as F

In [344]:
# Module form: the layer owns its weight and bias.
f = nn.Linear(in_features=2, out_features=3)
x = torch.rand(2)
output1 = f(x)

# Functional form: the same computation with the parameters passed explicitly.
output2 = F.linear(x, weight=f.weight, bias=f.bias)

for out in (output1, output2):
    print(out, '\n')


tensor([-0.6710, -0.6160, -0.4166], grad_fn=<ViewBackward0>) 

tensor([-0.6710, -0.6160, -0.4166], grad_fn=<ViewBackward0>) 



In [345]:
import math
class Linear(nn.Module):
    """Minimal fully connected layer computing ``x.matmul(W.t()) + b``.

    Args:
        m: number of input features (columns of ``W``).
        n: number of output features (rows of ``W`` and length of ``b``).
    """

    def __init__(self, m, n):
        super().__init__()
        # Wrapping the tensors in nn.Parameter registers them with the module.
        self.W = nn.Parameter(torch.randn(n, m))
        self.b = nn.Parameter(torch.randn(n))
        # Plain tensor attribute: it is neither a Parameter nor a registered
        # buffer, so it does not show up in parameters() or state_dict().
        self.c = torch.zeros(3)

    def initalize(self, a, b):
        """Re-draw ``W`` uniformly between ``a`` and ``b`` and zero the bias.

        The in-place updates run under ``torch.no_grad()``: modifying a leaf
        tensor that requires grad in place while autograd is recording raises
        ``RuntimeError``.
        """
        with torch.no_grad():
            self.W.uniform_(a, b)
            self.b.zero_()

    # Correctly spelled alias; the original name is kept for existing callers.
    initialize = initalize

    def forward(self, x):
        """Return ``x.matmul(W.t()) + b`` for an input whose last dimension is ``m``."""
        return x.matmul(self.W.t()) + self.b

In [346]:
# 3 input features, 2 output features.
f = Linear(3,2)

In [347]:
# List the parameters that the custom module registered, with their names.
for name, para in f.named_parameters():
    print(name, para,'\n')

W Parameter containing:
tensor([[ 0.3538, -0.4064, -0.4798],
        [-0.6161, -0.2097, -1.9453]], requires_grad=True) 

b Parameter containing:
tensor([-0.1072, -0.2490], requires_grad=True) 



In [348]:
# Compare calling the module object with calling forward() directly on the
# same input; an all-zero difference means both give the same result.
x = torch.rand(4,3)
y = f(x)
y1 = f.forward(x)

print(y1-y,'\n')

tensor([[0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.]], grad_fn=<SubBackward0>) 



In [349]:
# Scalar objective e = sum of squared outputs; backward() fills in f.W.grad.
x = torch.rand(2,3)
y = f(x)
e = y.pow(2).sum()
e.backward()
print(f.W.grad,'\n')

tensor([[-1.1756, -0.9457, -1.1510],
        [-6.0586, -4.8742, -5.9472]]) 



In [350]:
# Clear the gradients of the module's parameters. In the PyTorch version that
# produced the output below, the cleared gradient reads back as None.
f.zero_grad()
print(f.W.grad,'\n')

None 



In [351]:
# Build a fresh layer and show its state dict: a mapping from parameter
# names to their tensors.
f = Linear(3,2)
f.state_dict()

OrderedDict([('W',
              tensor([[-0.9096, -0.5388,  1.2883],
                      [ 0.1372, -0.5839, -0.0594]])),
             ('b', tensor([-0.2127,  0.0490]))])

In [352]:
# Serialise only the state dict (not the module object) to 'linear.pt'.
torch.save(f.state_dict(),'linear.pt')

In [353]:
ls

1Tensor.ipynb             optim.ipynb               y_new.pkl
2Autograd.ipynb           x.pkl                     神经网络模块.ipynb
linear.pt                 y.pkl


In [355]:
# A new layer with the same architecture receives the saved parameters.
g = Linear(3,2)
# torch.load unpickles the file; weights_only=True restricts unpickling to
# tensors and plain containers, which is all a state dict needs, so a
# tampered checkpoint cannot execute arbitrary code.
g.load_state_dict(torch.load('linear.pt', weights_only=True))

<All keys matched successfully>

In [356]:
# Element-wise comparison of the loaded weights with the saved ones;
# all True means every entry matches.
g.W == f.W

tensor([[True, True, True],
        [True, True, True]])

In [357]:
import torch.nn as nn
import torch.nn.init as init

# Linear layer with 10 input features and 5 output features.
linear_layer = nn.Linear(in_features=10, out_features=5)

# Re-initialise the weight matrix with kaiming_normal_ (fan_out mode, ReLU gain).
init.kaiming_normal_(
    linear_layer.weight,
    mode='fan_out',
    nonlinearity='relu',
)

# Set every bias entry to zero.
init.constant_(linear_layer.bias, val=0)


Parameter containing:
tensor([0., 0., 0., 0., 0.], requires_grad=True)