In [1]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2
# 多行输出
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all" 

In [2]:
import torch
from torch import nn, optim
from torchvision.models import resnet18

## Layerwise Sequential Unit Variance (LSUV)
- [1511.06422.pdf](https://arxiv.org/pdf/1511.06422.pdf)
- The method consists of the two steps. First, pre-initialize weights of each convolution or inner-product layer with orthonormal matrices. Second, proceed from the first to the final layer, normalizing the variance of the output of each layer to be equal to one
- 我们用通常的技术初始化我们的神经网络，然后我们通过模型传递一个批次的数据并检查线性和卷积层的输出。 然后我们可以根据我们在激活时观察到的实际方差来重新调整权重，并从初始偏差中减去我们观察到的平均值。 这样我们就会保持标准化的激活。我们重复这个过程，直到我们对我们观察到的均值/方差感到满意为止。

- 获取一个batch的数据
- 训练一个batch的数据，并获取线性层和卷积层的激活值
- 设置 hook获取激活的均值和标准差
- 不断调整激活值的均值方差直到0， 1（1e-3为阈值）$m.bias -= h.mean， m.weight.data /= h.std$
- 对所有满足条件的卷积层和线性层执行上述步骤

In [3]:
def find_modules(m, cond):
    if cond(m): return [m]
    return sum([find_modules(o, cond) for o in m.children()], [])

def is_lin_layer(l):
    lin_layers = (nn.Conv1d, nn.Conv2d, nn.Conv3d, nn.Linear, nn.ReLU)
    return isinstance(l, lin_layers)

In [4]:
model = resnet18()
mods = find_modules(model, is_lin_layer)

In [5]:
mods

[Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False),
 ReLU(inplace),
 Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False),
 ReLU(inplace),
 Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False),
 Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False),
 ReLU(inplace),
 Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False),
 Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False),
 ReLU(inplace),
 Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False),
 Conv2d(64, 128, kernel_size=(1, 1), stride=(2, 2), bias=False),
 Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False),
 ReLU(inplace),
 Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False),
 Conv2d(128, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False),
 ReLU(inplace),
 Conv2d(256, 256, kernel_size=(

In [24]:
sum??

[0;31mSignature:[0m [0msum[0m[0;34m([0m[0miterable[0m[0;34m,[0m [0mstart[0m[0;34m=[0m[0;36m0[0m[0;34m,[0m [0;34m/[0m[0;34m)[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Return the sum of a 'start' value (default: 0) plus an iterable of numbers

When the iterable is empty, return the start value.
This function is intended specifically for use with numeric values and may
reject non-numeric types.
[0;31mType:[0m      builtin_function_or_method


In [26]:
sum([[1, 2, 3], [2, 3, 4]], [1, 4, 5])   # 起到一个连接的作用, 即 [1, 4, 5] + ([1, 2, 3] + [2, 3, 4])
sum([[1, 2, 3], [2, 3, 4]], [])  # [] + ([1, 2, 3] + [2, 3, 4])

[1, 4, 5, 1, 2, 3, 2, 3, 4]

[1, 2, 3, 2, 3, 4]

In [32]:
sum([['abc'], ['bed']], [])
sum ([['abc'], ['bed']], ['def'])

['abc', 'bed']

['def', 'abc', 'bed']

- 设置 hook

In [33]:
def append_stat(hook, mod, inp, outp):
    d = outp.data
    hook.mean,hook.std = d.mean().item(),d.std().item()

In [34]:
def lsuv_module(m, xb):
    h = Hook(m, append_stat)

    while mdl(xb) is not None and abs(h.mean)  > 1e-3: m.bias -= h.mean
    while mdl(xb) is not None and abs(h.std-1) > 1e-3: m.weight.data /= h.std

    h.remove()
    return h.mean,h.std

- lsuv对于复杂的和更深的网络有很好的效果