In [1]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2
# Show the value of every top-level expression in a cell, not only the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all" 

- hook
- PyTorch 的 hook 分为 forward hook 和 backward hook，回调函数都接收三个参数：forward hook 为 module、input、output（当前模块、当前模块的输入、当前模块的输出）；backward hook 为 module、grad_input、grad_output

In [127]:
import torch
from torch import nn, optim
from functools import partial
import matplotlib.pyplot as plt
import numpy as np

In [77]:
data = torch.randn(10, 26)

In [78]:
model = nn.Sequential(
        nn.Linear(26, 10),
        nn.Linear(10, 5),
        nn.ReLU(inplace=True),
        nn.Linear(5, 3)
    )

In [79]:
model(data)

tensor([[-0.3957, -0.4312, -0.2336],
        [-0.4296, -0.3792, -0.2514],
        [-0.4377, -0.3563, -0.2609],
        [-0.4352, -0.5905, -0.0555],
        [-0.4199, -0.3801, -0.2870],
        [-0.4816, -0.4856, -0.2321],
        [-0.4555, -0.4081, -0.3971],
        [-0.4387, -0.3615, -0.2483],
        [-0.3789, -0.5624, -0.0711],
        [-0.5028, -0.6129, -0.3769]], grad_fn=<AddmmBackward>)

In [80]:
act_means = [[] for _ in model]
act_stds  = [[] for _ in model]

In [81]:
def append_stats(i, mod, inp, outp):
    """Forward hook that logs the mean and std of one layer's output.

    Meant to be bound with `partial(append_stats, i)` so that PyTorch supplies the
    remaining `(mod, inp, outp)` arguments. The statistics are appended to the
    module-level lists `act_means[i]` and `act_stds[i]`, and only while
    `mod.training` is true; `inp` is unused.
    """
    if not mod.training:
        return
    activations = outp.data  # values only, not tracked by autograd
    act_means[i].append(activations.mean())
    act_stds[i].append(activations.std())

In [82]:
# Register one stats hook per layer and keep the returned handles, so each hook
# can later be detached again with `handle.remove()`.
hook_handles = [m.register_forward_hook(partial(append_stats, i))
                for i, m in enumerate(model)]

<torch.utils.hooks.RemovableHandle at 0x7faf65882f28>

<torch.utils.hooks.RemovableHandle at 0x7faf65882c88>

<torch.utils.hooks.RemovableHandle at 0x7faf65882a90>

<torch.utils.hooks.RemovableHandle at 0x7faf658824e0>

In [83]:
model(data)

tensor([[-0.3957, -0.4312, -0.2336],
        [-0.4296, -0.3792, -0.2514],
        [-0.4377, -0.3563, -0.2609],
        [-0.4352, -0.5905, -0.0555],
        [-0.4199, -0.3801, -0.2870],
        [-0.4816, -0.4856, -0.2321],
        [-0.4555, -0.4081, -0.3971],
        [-0.4387, -0.3615, -0.2483],
        [-0.3789, -0.5624, -0.0711],
        [-0.5028, -0.6129, -0.3769]], grad_fn=<AddmmBackward>)

In [84]:
act_means
act_stds

[[tensor(-0.0291)], [tensor(0.0751)], [tensor(0.1723)], [tensor(-0.3786)]]

[[tensor(0.5740)], [tensor(0.3283)], [tensor(0.2056)], [tensor(0.1304)]]

- hook 在不使用的时候应该删除，否则内存会受不了

In [87]:
def children(m):
    "Return the objects yielded by `m.children()` collected into a list."
    return [child for child in m.children()]

class Hook():
    """Owns a single forward hook on module `m` and guarantees it can be removed.

    `f` is called as `f(hook, module, input, output)`: the `Hook` instance is bound
    as the first argument so the callback can keep per-hook state on it.
    The hook is removed by `remove()`, on leaving a `with` block, or when the
    object is garbage collected.
    """
    def __init__(self, m, f):
        # Set before registering so `remove`/`__del__` stay safe if registration raises.
        self.removed = False
        self.hook = m.register_forward_hook(partial(f, self))

    def remove(self):
        "Detach the hook; calling this more than once is a no-op."
        handle = getattr(self, 'hook', None)  # missing when `__init__` failed part-way
        if handle is None or getattr(self, 'removed', False): return
        handle.remove()
        self.removed = True

    def __enter__(self): return self
    # Returns None, so an exception raised inside the `with` block still propagates.
    def __exit__(self, *args): self.remove()
    def __del__(self): self.remove()

def append_stats(hook, mod, inp, outp):
    """Forward-hook callback that logs output mean/std on the `hook` object itself.

    `hook.stats` is created on first use as a pair of lists `(means, stds)`.
    One entry is appended to each list per call while `mod.training` is true;
    in eval mode nothing is recorded. `inp` is unused.
    """
    if not hasattr(hook, 'stats'):
        hook.stats = ([], [])
    mean_log, std_log = hook.stats
    if not mod.training:
        return
    activations = outp.data  # values only, not tracked by autograd
    mean_log.append(activations.mean())
    std_log.append(activations.std())

In [89]:
hooks = [Hook(l, append_stats) for l in children(model)]

In [90]:
model(data)

tensor([[-0.3957, -0.4312, -0.2336],
        [-0.4296, -0.3792, -0.2514],
        [-0.4377, -0.3563, -0.2609],
        [-0.4352, -0.5905, -0.0555],
        [-0.4199, -0.3801, -0.2870],
        [-0.4816, -0.4856, -0.2321],
        [-0.4555, -0.4081, -0.3971],
        [-0.4387, -0.3615, -0.2483],
        [-0.3789, -0.5624, -0.0711],
        [-0.5028, -0.6129, -0.3769]], grad_fn=<AddmmBackward>)

In [95]:
# Stats recorded by the hook attached to the first layer.
hooks[0].stats

([tensor(-0.0291)], [tensor(0.5740)])

**给我们的Hooks类一个`__enter__` 和 `__exit__` 方法后，我们可以将它用作上下文管理器。这样可以确保一旦我们离开with块，所有的hook都被移除了。**

In [96]:
torch.log1p??

Docstring:
log1p(input, out=None) -> Tensor

Returns a new tensor with the natural logarithm of (1 + :attr:`input`).

.. math::
    y_i = \log_{e} (x_i + 1)

.. note:: This function is more accurate than :func:`torch.log` for small
          values of :attr:`input`

Args:
    input (Tensor): the input tensor
    out (Tensor, optional): the output tensor

Example::

    >>> a = torch.randn(5)
    >>> a
    tensor([-1.0090, -0.9923,  1.0249, -0.5372,  0.2492])
    >>> torch.log1p(a)
    tensor([    nan, -4.8653,  0.7055, -0.7705,  0.2225])
Type:      builtin_function_or_method


## dropout
**dropout 就是一个 mask 的过程**

In [101]:
# `sz` is annotated with a string so the definition does not need `Collection` in scope.
def dropout_mask(x:torch.Tensor, sz:"Collection[int]", p:float) -> torch.Tensor:
    """Return a dropout mask of the same type as `x`, size `sz`, with probability `p` to cancel an element.

    Kept entries hold `1/(1-p)` and cancelled entries hold `0`, so multiplying by
    the mask preserves the expected value. Only the dtype/device of `x` are used;
    its values are never read. With `p == 1` every element is cancelled and an
    all-zero mask is returned.
    """
    shape = tuple(sz)
    # Rescaling by 1/(1-p) would compute 0/0 here and fill the mask with NaN.
    if p == 1.: return x.new_zeros(shape)
    return x.new_empty(shape).bernoulli_(1-p).div_(1-p)

class RNNDropout(nn.Module):
    """Dropout with probability `p` that is consistent on the seq_len dimension.

    A single mask of shape `(x.size(0), 1, x.size(2))` is drawn per call and
    broadcast over dim 1, so the same entries are dropped at every position along
    that dim. Requires a 3-dim input.
    NOTE(review): presumably `x` is (batch, seq_len, features) - confirm with callers.
    """

    def __init__(self, p:float=0.5):
        super().__init__()
        self.p=p

    def forward(self, x:torch.Tensor)->torch.Tensor:
        # Pass the input through untouched in eval mode or when dropout is disabled.
        if not self.training or self.p == 0.: return x
        m = dropout_mask(x.data, (x.size(0), 1, x.size(2)), self.p)
        return x * m

In [140]:
a = torch.randn(3, 4, 3)
b = a.clone()  # must be a real copy: plain `b = a` would only bind a second name to the same tensor
a.bernoulli_(0.8).div_(0.8) * b

tensor([[[-1.1863,  0.9396, -0.0000],
         [-0.8098,  0.7910,  2.2062],
         [-2.2199,  0.2210, -0.8520],
         [ 0.0000, -0.0000, -0.5371]],

        [[ 1.3177, -0.0863, -0.5836],
         [-0.0000,  0.0000, -2.0321],
         [-0.0000,  0.5223,  0.8653],
         [ 0.8988, -1.0106, -0.3684]],

        [[-0.1882,  0.7382, -1.1006],
         [-1.7342,  0.0592,  0.0756],
         [-1.0711, -0.0000, -1.6763],
         [ 0.1145, -0.5760,  1.1264]]])

In [141]:
m = RNNDropout(0.2)
m(b)

tensor([[[-1.1863,  0.9396, -2.3814],
         [-0.8098,  0.7910,  2.2062],
         [-2.2199,  0.2210, -0.8520],
         [ 1.0485, -0.6884, -0.5371]],

        [[ 1.3177, -0.0863, -0.5836],
         [-0.2189,  0.9301, -2.0321],
         [-1.3592,  0.5223,  0.8653],
         [ 0.8988, -1.0106, -0.3684]],

        [[-0.1882,  0.7382, -1.1006],
         [-1.7342,  0.0592,  0.0756],
         [-1.0711, -0.5491, -1.6763],
         [ 0.1145, -0.5760,  1.1264]]])

- 线性插值
$$\text{out}_i = \text{start}_i + \text{weight}_i \times (\text{end}_i - \text{start}_i)$$

In [146]:
torch.lerp??

Docstring:
lerp(start, end, weight, out=None)

Does a linear interpolation of two tensors :attr:`start` and :attr:`end` based
on a scalar or tensor :attr:`weight` and returns the resulting :attr:`out` tensor.

.. math::
    \text{out}_i = \text{start}_i + \text{weight}_i \times (\text{end}_i - \text{start}_i)

The shapes of :attr:`start` and :attr:`end` must be
:ref:`broadcastable <broadcasting-semantics>`. If :attr:`weight` is a tensor, then
the shapes of :attr:`start`, :attr:`end` must be :ref:`broadcastable <broadcasting-semantics>`.

Args:
    start (Tensor): the tensor with the starting points
    end (Tensor): the tensor with the ending points
    weight (float or tensor): the weight for the interpolation formula
    out (Tensor, optional): the output tensor

Example::

    >>> start = torch.arange(1., 5.)
    >>> end = torch.empty(4).fill_(10)
    >>> start
    tensor([ 1.,  2.,  3.,  4.])
    >>> end
    tensor([ 10.,  10.,  10.,  10.])
    >>> torch.lerp(start, end,

- BatchNorm 实现

In [147]:
class BatchNorm(nn.Module):
    """Hand-written batch normalisation for 4-dim input with `nf` entries along dim 1.

    Training: normalises with the current batch's mean/variance (reduced over
    dims 0, 2, 3) and folds them into the running `means`/`vars` buffers.
    Eval: normalises with the running buffers instead.
    The result is scaled by `mults` and shifted by `adds`, one value per channel.
    """

    def __init__(self, nf, mom=0.1, eps=1e-5):
        super().__init__()
        # `mom` is the lerp weight given to the *new* batch statistic:
        # running <- running + mom * (batch - running).
        self.mom = mom
        self.eps = eps
        self.mults = nn.Parameter(torch.ones(nf, 1, 1))
        self.adds = nn.Parameter(torch.zeros(nf, 1, 1))
        self.register_buffer('vars', torch.ones(1, nf, 1, 1))
        self.register_buffer('means', torch.zeros(1, nf, 1, 1))

    def update_stats(self, x):
        "Update the running buffers in place and return this batch's `(mean, var)`."
        reduce_dims = (0, 2, 3)
        batch_mean = x.mean(reduce_dims, keepdim=True)
        batch_var = x.var(reduce_dims, keepdim=True)
        self.means.lerp_(batch_mean, self.mom)
        self.vars.lerp_(batch_var, self.mom)
        return batch_mean, batch_var

    def forward(self, x):
        if self.training:
            # Statistics are computed without autograd tracking, so no gradient
            # flows back through the batch mean/variance.
            with torch.no_grad():
                mean, var = self.update_stats(x)
        else:
            mean, var = self.means, self.vars
        normed = (x - mean) / (var + self.eps).sqrt()
        return normed * self.mults + self.adds

$$y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta$$

In [149]:
class LayerNorm(nn.Module):
    """Layer normalisation for 4-dim input with a single scalar scale and shift.

    Every item along dim 0 is normalised with its own mean/variance, reduced over
    dims 1, 2 and 3. The result is multiplied by the scalar `mult` and offset by
    the scalar `add` (both learnable).
    """
    __constants__ = ['eps']
    def __init__(self, eps=1e-5):
        super().__init__()
        self.eps = eps
        # `torch.tensor` is spelled out: the notebook never imports a bare `tensor`.
        self.mult = nn.Parameter(torch.tensor(1.))
        self.add  = nn.Parameter(torch.tensor(0.))

    def forward(self, x):
        m = x.mean((1,2,3), keepdim=True)
        v = x.var ((1,2,3), keepdim=True)
        x = (x-m) / ((v+self.eps).sqrt())
        return x*self.mult + self.add

In [150]:
class InstanceNorm(nn.Module):
    """Instance normalisation for 4-dim input with `nf` entries along dim 1.

    Each (dim-0, dim-1) slice is normalised with its own mean/variance, reduced
    over dims 2 and 3, then scaled by `mults` and shifted by `adds`.
    NOTE(review): the default `eps=1e-0` is 1.0, far larger than the 1e-5 used by
    the other norm layers in this notebook - confirm this is intentional.
    """
    __constants__ = ['eps']

    def __init__(self, nf, eps=1e-0):
        super().__init__()
        self.eps = eps
        self.mults = nn.Parameter(torch.ones(nf, 1, 1))
        self.adds = nn.Parameter(torch.zeros(nf, 1, 1))

    def forward(self, x):
        reduce_dims = (2, 3)
        mean = x.mean(reduce_dims, keepdim=True)
        var = x.var(reduce_dims, keepdim=True)
        normed = (x - mean) / torch.sqrt(var + self.eps)
        return normed * self.mults + self.adds

- BatchNorm 会受到 batch size 的影响：当 bs 非常小的时候，单个 batch 的方差可能接近 0，归一化时除以它会导致数值不稳定
- 使用 Running BatchNorm（对统计量做滑动平均的“平滑”BN）解决

In [151]:
class RunningBatchNorm(nn.Module):
    """Batch norm that normalises with smoothed running totals instead of per-batch statistics.

    Keeps lerp-smoothed running values of the per-channel sum, sum of squares and
    element count, and derives mean/variance from them on every forward pass
    (training and eval alike). Expects 4-dim input with `nf` entries along dim 1;
    statistics are reduced over dims 0, 2 and 3.
    """
    def __init__(self, nf, mom=0.1, eps=1e-5):
        super().__init__()
        self.mom,self.eps = mom,eps
        self.mults = nn.Parameter(torch.ones (nf,1,1))
        self.adds = nn.Parameter(torch.zeros(nf,1,1))
        self.register_buffer('sums', torch.zeros(1,nf,1,1))
        self.register_buffer('sqrs', torch.zeros(1,nf,1,1))
        # `torch.tensor` is spelled out: the notebook never imports a bare `tensor`.
        self.register_buffer('batch', torch.tensor(0.))   # samples seen so far
        self.register_buffer('count', torch.tensor(0.))   # smoothed elements-per-channel
        self.register_buffer('step', torch.tensor(0.))    # training steps taken
        self.register_buffer('dbias', torch.tensor(0.))   # debiasing factor for the zero-initialised totals

    def update_stats(self, x):
        "Fold the current batch into the running totals (called once per training step)."
        import math  # local import: `math` is not imported anywhere in the notebook
        bs,nc,*_ = x.shape
        # Drop any autograd history the running totals still carry.
        self.sums.detach_()
        self.sqrs.detach_()
        dims = (0,2,3)
        s = x.sum(dims, keepdim=True)
        ss = (x*x).sum(dims, keepdim=True)
        c = self.count.new_tensor(x.numel()/nc)
        # Momentum grows with batch size (equals `mom` at bs=2). `max(..., 1)` keeps
        # bs=1 from dividing by zero; it then uses the same momentum as bs=2.
        mom1 = 1 - (1-self.mom)/math.sqrt(max(bs-1, 1))
        self.mom1 = self.dbias.new_tensor(mom1)
        self.sums.lerp_(s, self.mom1)
        self.sqrs.lerp_(ss, self.mom1)
        self.count.lerp_(c, self.mom1)
        # Same lerp as the totals, but towards 1.
        self.dbias = self.dbias*(1-self.mom1) + self.mom1
        self.batch += bs
        self.step += 1

    def forward(self, x):
        if self.training: self.update_stats(x)
        sums = self.sums
        sqrs = self.sqrs
        c = self.count
        # For the first 100 steps, divide out the bias from starting the totals at zero.
        if self.step<100:
            sums = sums / self.dbias
            sqrs = sqrs / self.dbias
            c    = c    / self.dbias
        means = sums/c
        varns = (sqrs/c).sub_(means*means)   # E[x^2] - E[x]^2
        # Until 20 samples have been seen, floor the variance at 0.01.
        if bool(self.batch < 20): varns.clamp_min_(0.01)
        x = (x-means).div_((varns.add_(self.eps)).sqrt())
        return x.mul_(self.mults).add_(self.adds)

### Simplified RunningBatchNorm

In [152]:
class RunningBatchNorm(nn.Module):
    """Simplified running batch norm: the forward pass is a single `x*factor + offset`.

    Each training step folds the batch into lerp-smoothed running totals (sum,
    sum of squares, element count) and recomputes `factor`/`offset` from them;
    eval reuses the values from the last training step. Expects 4-dim input with
    `nf` entries along dim 1; statistics are reduced over dims 0, 2 and 3.
    """
    def __init__(self, nf, mom=0.1, eps=1e-5):
        super().__init__()
        self.mom, self.eps = mom, eps
        self.mults = nn.Parameter(torch.ones (nf,1,1))
        self.adds  = nn.Parameter(torch.zeros(nf,1,1))
        self.register_buffer('sums', torch.zeros(1,nf,1,1))
        self.register_buffer('sqrs', torch.zeros(1,nf,1,1))
        # `torch.tensor` is spelled out: the notebook never imports a bare `tensor`.
        self.register_buffer('count', torch.tensor(0.))
        # Registered with the shape `update_stats` assigns, so a trained
        # `state_dict` loads into a freshly built module.
        self.register_buffer('factor', torch.zeros(1,nf,1,1))
        self.register_buffer('offset', torch.zeros(1,nf,1,1))
        self.batch = 0   # samples seen so far; a plain attribute, not a buffer

    def update_stats(self, x):
        "Fold the current batch into the running totals and refresh `factor`/`offset`."
        import math  # local import: `math` is not imported anywhere in the notebook
        bs,nc,*_ = x.shape
        # Drop any autograd history the running totals still carry.
        self.sums.detach_()
        self.sqrs.detach_()
        dims = (0,2,3)
        s    = x    .sum(dims, keepdim=True)
        ss   = (x*x).sum(dims, keepdim=True)
        c    = s.new_tensor(x.numel()/nc)
        # Momentum grows with batch size (equals `mom` at bs=2). `max(..., 1)` keeps
        # bs=1 from dividing by zero; it then uses the same momentum as bs=2.
        mom1 = s.new_tensor(1 - (1-self.mom)/math.sqrt(max(bs-1, 1)))
        self.sums .lerp_(s , mom1)
        self.sqrs .lerp_(ss, mom1)
        self.count.lerp_(c , mom1)
        self.batch += bs
        means = self.sums/self.count
        varns = (self.sqrs/self.count).sub_(means*means)   # E[x^2] - E[x]^2
        # Until 20 samples have been seen, floor the variance at 0.01.
        if bool(self.batch < 20): varns.clamp_min_(0.01)
        # Fold normalisation and the affine transform into one multiply-add.
        self.factor = self.mults / (varns+self.eps).sqrt()
        self.offset = self.adds - means*self.factor

    def forward(self, x):
        if self.training: self.update_stats(x)
        return x*self.factor + self.offset