In [1]:
import torch
import numpy as np
import torch.nn as nn

# Single source of truth for the RNG seed, so both libraries are always seeded
# with the same value and the runs below are reproducible.
SEED = 1

torch.manual_seed(SEED)  # seed PyTorch's random number generator
np.random.seed(SEED)  # seed NumPy's global random number generator

In [2]:
class MLP(nn.Module):
    """Stack of equal-width, bias-free linear layers for watching activation scale.

    Every layer computes ``relu(bn(linear(x)))`` when ``do_bn`` is true and
    ``relu(linear(x))`` otherwise. ``forward`` prints the standard deviation
    of the activations after each layer, so the effect of the weight
    initialization and of batch normalization can be read from the log.

    Args:
        neural_num: Input and output feature count of every linear layer.
        layers: Number of linear layers (and of ``BatchNorm1d`` layers) to stack.
        do_bn: If true, apply the matching ``BatchNorm1d`` after each linear layer.
    """

    def __init__(self, neural_num, layers=100, do_bn=False):
        super().__init__()
        self.linears = nn.ModuleList(
            [nn.Linear(neural_num, neural_num, bias=False) for _ in range(layers)]
        )
        # The BN layers are always created; forward() only applies them when
        # do_bn is true.
        self.bns = nn.ModuleList([nn.BatchNorm1d(neural_num) for _ in range(layers)])
        self.neural_num = neural_num
        self.do_bn = do_bn

    def forward(self, x):
        """Run ``x`` through the stack, printing the activation std per layer.

        Stops at the first layer whose output std is NaN, prints which layer
        that was, and returns that layer's activations.

        Args:
            x: Input tensor whose last dimension matches ``neural_num``.

        Returns:
            The activations after the last layer that was evaluated.
        """
        for i, (linear, bn) in enumerate(zip(self.linears, self.bns)):
            x = linear(x)
            if self.do_bn:
                x = bn(x)
            x = torch.relu(x)

            # Compute the statistic once and reuse it for the check and the log.
            std = x.std()
            if torch.isnan(std):  # the values have grown too large
                print("output is nan in {} layers".format(i))
                break
            print("layers:{}, std:{}".format(i, std.item()))

        return x

    def initialize(self, mode, std_init=1):
        """Re-initialize the weight of every ``nn.Linear`` layer in place.

        Args:
            mode: ``"normal"`` draws weights from a zero-mean normal
                distribution with standard deviation ``std_init``;
                ``"kaiming"`` uses ``nn.init.kaiming_normal_`` with its
                default arguments.
            std_init: Standard deviation used by the ``"normal"`` mode.

        An unsupported ``mode`` is reported once on stdout and leaves all
        weights untouched.
        """
        # Validate once up front instead of once per layer, so an unsupported
        # mode yields a single message rather than one per linear layer.
        if mode not in ("normal", "kaiming"):
            print("不支持{}输入".format(mode))
            return

        for m in self.modules():
            if not isinstance(m, nn.Linear):
                continue
            # nn.init functions modify the parameter in place without
            # recording gradients, so going through `.data` is unnecessary.
            if mode == "normal":
                # method 1: normal distribution, mean=0, std=std_init
                nn.init.normal_(m.weight, std=std_init)
            else:
                # method 2: kaiming
                nn.init.kaiming_normal_(m.weight)

In [8]:
# Experiment driver: build one MLP configuration, feed it a random batch and
# let forward() print the per-layer activation std. Toggle the commented-out
# lines below to switch between the numbered cases.
if __name__ == "__main__":

    neural_nums = 256  # input/output feature count of every linear layer
    layer_nums = 100  # number of stacked linear layers
    batch_size = 16  # number of rows in the random input batch

#     Compare the combinations of "BN applied or not" and weight initialization. When BN layers are used, the choice of weight initialization has little effect; with BN applied, the initialization does not need to be tuned carefully.

    net = MLP(neural_nums, layer_nums, do_bn=False)      # cases without BN: 1. no explicit init; 2. normal_ init; 3. kaiming init
#     net = MLP(neural_nums, layer_nums, do_bn=True)        # cases with BN: 4. BN + no explicit init; 5. BN + normal; 6. BN + kaiming; 7. BN + normal with std_init=10000
#     net.initialize("normal", std_init=1)
#     net.initialize("normal", std_init=10000)
#     net.initialize("kaiming")

    inputs = torch.randn((batch_size, neural_nums))  # normal: mean=0, std=1

    output = net(inputs)
    print(output)  # activations returned by the network

layers:0, std:0.8165372014045715
layers:1, std:0.8045248985290527
layers:2, std:0.7726029753684998
layers:3, std:0.767861545085907
layers:4, std:0.8131486773490906
layers:5, std:0.8373168706893921
layers:6, std:0.880279541015625
layers:7, std:0.7769911289215088
layers:8, std:0.7530372142791748
layers:9, std:0.7334368824958801
layers:10, std:0.7272238731384277
layers:11, std:0.7110223174095154
layers:12, std:0.8030721545219421
layers:13, std:0.7797623872756958
layers:14, std:0.840962827205658
layers:15, std:0.8629758954048157
layers:16, std:0.7960706353187561
layers:17, std:0.7957866787910461
layers:18, std:0.8553910851478577
layers:19, std:0.7684149742126465
layers:20, std:0.683224081993103
layers:21, std:0.6878578066825867
layers:22, std:0.6844690442085266
layers:23, std:0.6568384766578674
layers:24, std:0.629176676273346
layers:25, std:0.5879842638969421
layers:26, std:0.5388146638870239
layers:27, std:0.5111389756202698
layers:28, std:0.48382148146629333
layers:29, std:0.49749252200

## 观察神经网络神经元数据尺度变化

<font  size=12 face="黑体">
    
有无BN层 | 无显式初始化（PyTorch 默认） | normal（std=1） | Kaiming 初始化 | normal（std=10000）
:-: | :-: | :-: | :-: | :-:
无BN层 | 1e-40 | NaN in 35 layers | 0.4 | NaN in 8 layers
有BN层 | 0.57 | 0.57 | 0.57 | 0.57
    
</font>