In [1]:
import torch
import torch.nn as nn
import torchvision.models as models
import torch.nn.functional as F
import numpy as np
import torch.optim as optim


### 搭建模型

In [3]:
class LinearBNAC(nn.Module):
    """A fully connected layer followed by a role-dependent tail.

    The layers are stored in ``self.linear`` (an ``nn.Sequential``):

    * hidden block (``is_output`` falsy):
      ``Linear -> Dropout(dropout) -> BatchNorm1d -> LeakyReLU``
    * output block with ``out_channels == 1``: ``Linear -> Sigmoid``
    * any other output block: ``Linear -> Softmax(dim=1)``

    Args:
        in_channels: number of input features of the linear layer.
        out_channels: number of output features of the linear layer.
        bias: forwarded to ``nn.Linear``.
        dropout: drop probability; only used by hidden blocks.
        is_output: build an output block instead of a hidden block.
    """

    def __init__(self, in_channels, out_channels, bias=True, dropout=0.3, is_output=False):
        super().__init__()
        # The affine layer always sits at index 0 of the Sequential.
        stages = [nn.Linear(in_channels, out_channels, bias=bias)]
        if not is_output:
            stages += [
                nn.Dropout(dropout),
                nn.BatchNorm1d(out_channels),
                nn.LeakyReLU(inplace=True),
            ]
        elif out_channels == 1:
            # Single output unit: squash to (0, 1).
            stages.append(nn.Sigmoid())
        else:
            # Several output units: normalise across the feature axis.
            stages.append(nn.Softmax(dim=1))
        self.linear = nn.Sequential(*stages)

    def forward(self, x):
        """Run ``x`` through the layer stack and return the result."""
        return self.linear(x)

In [4]:
class Model(nn.Module):
    """Feed-forward network: three hidden LinearBNAC blocks and an output block.

    Feature widths are ``input_dimention -> 128 -> 64 -> 32 -> output_classes``.
    The last block is created with ``is_output=True``.

    Args:
        input_dimention: number of input features (spelling kept because it
            is part of the public signature).
        output_classes: number of output units of the final block.
    """

    def __init__(self, input_dimention, output_classes=1):
        super().__init__()
        # Registration order determines the order of named_parameters().
        self.layer1 = LinearBNAC(input_dimention, 128)
        self.layer2 = LinearBNAC(128, 64)
        self.layer3 = LinearBNAC(64, 32)
        self.output = LinearBNAC(32, output_classes, is_output=True)

    def forward(self, x):
        """Pass ``x`` through the four blocks in order and return the result."""
        for stage in (self.layer1, self.layer2, self.layer3, self.output):
            x = stage(x)
        return x
        

### 準備輸入資料、優化器、標籤資料、模型輸出

In [7]:
# Build a 10-output classifier over 256-dimensional inputs.
model = Model(input_dimention=256, output_classes=10)

# List every trainable tensor with its shape. The loop variable is named
# `param` rather than `_`: `_` conventionally marks an unused value, and
# rebinding it in an IPython session shadows the last-output shortcut.
for name, param in model.named_parameters():
    print(name, param.shape)

# Adam over all model parameters, with weight decay.
optimizer = optim.Adam(params=model.parameters(), lr=1e-3, weight_decay=1e-3)

layer1.linear.0.weight torch.Size([128, 256])
layer1.linear.0.bias torch.Size([128])
layer1.linear.2.weight torch.Size([128])
layer1.linear.2.bias torch.Size([128])
layer2.linear.0.weight torch.Size([64, 128])
layer2.linear.0.bias torch.Size([64])
layer2.linear.2.weight torch.Size([64])
layer2.linear.2.bias torch.Size([64])
layer3.linear.0.weight torch.Size([32, 64])
layer3.linear.0.bias torch.Size([32])
layer3.linear.2.weight torch.Size([32])
layer3.linear.2.bias torch.Size([32])
output.linear.0.weight torch.Size([10, 32])
output.linear.0.bias torch.Size([10])


In [8]:
# Synthetic mini-batch: 4 samples of 256 standard-normal features each.
batch_size, input_features = 4, 256
dummy_input = torch.randn(batch_size, input_features)

# One integer class index per sample, stored as int64 (long); this is the
# target passed to the loss further down.
# Random alternative: torch.empty(4, dtype=torch.float).random_(10)
target = torch.tensor([9, 5, 4, 4], dtype=torch.long)

In [9]:
# Forward pass on the dummy batch. The model's final block ends in
# Softmax(dim=1), so every row of `output` sums to 1.
# NOTE(review): the model has not been switched to eval(), so Dropout is
# presumably active and repeated runs give different values.
output = model(dummy_input)
print(output)

tensor([[0.0732, 0.0695, 0.0893, 0.1008, 0.1946, 0.0768, 0.0655, 0.1188, 0.1479,
         0.0636],
        [0.1322, 0.0411, 0.0939, 0.0853, 0.0918, 0.0350, 0.1213, 0.2270, 0.0640,
         0.1084],
        [0.0502, 0.0553, 0.1446, 0.1051, 0.1027, 0.0761, 0.0707, 0.1983, 0.1294,
         0.0675],
        [0.0711, 0.0631, 0.1073, 0.1859, 0.0479, 0.1022, 0.0838, 0.1675, 0.0730,
         0.0981]], grad_fn=<SoftmaxBackward>)


### 計算 CrossEntropy Loss
* 請注意哪一個 Loss 最適合：模型的輸出層已經使用 softmax，因此應先對輸出取 log，再搭配 NLLLoss；CrossEntropyLoss 內部會再做一次 log-softmax，不適合直接用在已經 softmax 的輸出上
* 因為我們有使用 dropout，並隨機產生 dummy_input，所以各位學員得到的值會與解答不同，然而步驟原理需要相同

In [10]:
from torch.nn import NLLLoss, LogSoftmax, CrossEntropyLoss

In [11]:
# Negative log-likelihood loss with default settings; it is fed
# log-probabilities, so the softmax output must be passed through log first.
criterion = NLLLoss()

In [13]:
# `output` already went through Softmax, so its log gives the
# log-probabilities NLLLoss consumes (together this is cross-entropy).
# Clamp to the smallest positive normal float before the log: a probability
# that underflowed to exactly 0 would otherwise yield log(0) = -inf, i.e. an
# infinite loss and NaN gradients. Ordinary probabilities are unaffected.
log_probs = torch.log(output.clamp_min(torch.finfo(output.dtype).tiny))
loss = criterion(log_probs, target)
loss

tensor(2.8557, grad_fn=<NllLossBackward>)

### 完成back propagation並更新梯度

In [14]:
# Backpropagate the loss; this fills the `.grad` attribute of the model
# parameters (inspected in the next cell).
loss.backward()

In [15]:
# Inspect the first Linear layer after backward(): weights and their gradient.
first_linear = model.layer1.linear[0]
print('weight : {}'.format(first_linear.weight))
print('\n')
print('grad : {}'.format(first_linear.weight.grad))

weight : Parameter containing:
tensor([[-0.0146, -0.0108,  0.0383,  ..., -0.0347,  0.0600,  0.0457],
        [-0.0041,  0.0202, -0.0016,  ...,  0.0518, -0.0606, -0.0143],
        [ 0.0560,  0.0217,  0.0281,  ...,  0.0024,  0.0020, -0.0584],
        ...,
        [ 0.0014,  0.0066,  0.0551,  ..., -0.0307,  0.0087, -0.0120],
        [-0.0222, -0.0508, -0.0438,  ..., -0.0590, -0.0033, -0.0108],
        [ 0.0174,  0.0035,  0.0487,  ..., -0.0505, -0.0571, -0.0440]],
       requires_grad=True)


grad : tensor([[ 9.8680e-03,  3.9795e-03,  8.4930e-03,  ...,  9.4039e-03,
         -5.0339e-03,  1.0946e-03],
        [ 1.6801e-04,  1.1900e-04, -5.7875e-05,  ..., -1.9559e-04,
         -1.7801e-05, -1.5935e-04],
        [ 3.6892e-02,  2.5865e-02,  4.1308e-02,  ..., -1.8446e-02,
          4.5747e-02, -2.2071e-02],
        ...,
        [ 9.8881e-03, -8.2424e-03, -2.4849e-03,  ...,  9.7274e-04,
          5.7963e-03,  9.8830e-03],
        [ 1.6518e-02,  1.0791e-03,  2.0551e-02,  ...,  3.1294e-02,
       

In [16]:
# Apply one Adam update to the parameters using the gradients computed by
# loss.backward().
optimizer.step()

In [17]:
# Inspect the first Linear layer after optimizer.step(): the weights have
# moved, while the stored gradient is still the one from backward().
first_linear = model.layer1.linear[0]
print('weight : {}'.format(first_linear.weight))
print('\n')
print('grad : {}'.format(first_linear.weight.grad))

weight : Parameter containing:
tensor([[-0.0156, -0.0118,  0.0373,  ..., -0.0357,  0.0610,  0.0447],
        [-0.0051,  0.0192, -0.0006,  ...,  0.0528, -0.0596, -0.0133],
        [ 0.0550,  0.0207,  0.0271,  ...,  0.0034,  0.0010, -0.0574],
        ...,
        [ 0.0004,  0.0076,  0.0561,  ..., -0.0317,  0.0077, -0.0130],
        [-0.0232, -0.0518, -0.0448,  ..., -0.0600, -0.0023, -0.0118],
        [ 0.0164,  0.0045,  0.0477,  ..., -0.0515, -0.0581, -0.0450]],
       requires_grad=True)


grad : tensor([[ 9.8680e-03,  3.9795e-03,  8.4930e-03,  ...,  9.4039e-03,
         -5.0339e-03,  1.0946e-03],
        [ 1.6801e-04,  1.1900e-04, -5.7875e-05,  ..., -1.9559e-04,
         -1.7801e-05, -1.5935e-04],
        [ 3.6892e-02,  2.5865e-02,  4.1308e-02,  ..., -1.8446e-02,
          4.5747e-02, -2.2071e-02],
        ...,
        [ 9.8881e-03, -8.2424e-03, -2.4849e-03,  ...,  9.7274e-04,
          5.7963e-03,  9.8830e-03],
        [ 1.6518e-02,  1.0791e-03,  2.0551e-02,  ...,  3.1294e-02,
       

### 清空 gradient

In [18]:
# Reset the accumulated gradients before the next iteration.
# `set_to_none=False` keeps each `.grad` as a zero-filled tensor. Recent
# PyTorch releases default to `set_to_none=True`, which would leave `.grad`
# as None and make the inspection cell below print "grad : None" instead of
# the zeros shown in this notebook.
optimizer.zero_grad(set_to_none=False)

In [19]:
# Inspect the first Linear layer after zero_grad(): weights are unchanged,
# the gradient has been cleared.
first_linear = model.layer1.linear[0]
print('weight : {}'.format(first_linear.weight))
print('\n')
print('grad : {}'.format(first_linear.weight.grad))

weight : Parameter containing:
tensor([[-0.0156, -0.0118,  0.0373,  ..., -0.0357,  0.0610,  0.0447],
        [-0.0051,  0.0192, -0.0006,  ...,  0.0528, -0.0596, -0.0133],
        [ 0.0550,  0.0207,  0.0271,  ...,  0.0034,  0.0010, -0.0574],
        ...,
        [ 0.0004,  0.0076,  0.0561,  ..., -0.0317,  0.0077, -0.0130],
        [-0.0232, -0.0518, -0.0448,  ..., -0.0600, -0.0023, -0.0118],
        [ 0.0164,  0.0045,  0.0477,  ..., -0.0515, -0.0581, -0.0450]],
       requires_grad=True)


grad : tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])
