In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [2]:
#export 
from exp.nb_07 import *

# Layerwise Sequential Unit Variance (LSUV)

In [3]:
# Load the train/validation inputs and labels (helper brought in by the `exp.nb_07` star import).
x_train, y_train, x_valid, y_valid = get_data()

# NOTE(review): presumably normalizes both input sets with training-set statistics — confirm in the helper.
x_train, x_valid = normalize_to(x_train, x_valid)
train_ds, valid_ds = Dataset(x_train, y_train), Dataset(x_valid, y_valid)

# Of these two, only `bs` is used in the cells visible here (it is passed to get_dls below).
nh, bs = 50, 512
# Largest training label plus one; this is the class count if labels are 0-based and contiguous.
c = y_train.max().item()+1
loss_func = F.cross_entropy

# Bundle the dataloaders returned by get_dls together with `c`.
data = DataBunch(*get_dls(train_ds, valid_ds, bs), c)

In [4]:
# NOTE(review): presumably reshapes each input batch to 1x28x28 images — confirm in view_tfm.
mnist_view = view_tfm(1, 28, 28)
# Callback classes/partials handed to get_learn_run through `cbs=cbfs` in later cells.
cbfs = [Recorder,
        partial(AvgStatsCallback, accuracy),
        CudaCallback,
        partial(BatchTransformXCallback, mnist_view)
]

In [5]:
# First argument to get_learn_run; these match the output-channel counts of the ConvLayers printed further down.
nfs = [8,16,32,64,64]

In [6]:
class ConvLayer(nn.Module):
    """A `Conv2d` followed by a `GeneralRelu`, exposing `bias` and `weight` handles.

    `bias` is not the convolution's own bias parameter: it reads and writes the
    activation's `sub` attribute with the sign flipped. `weight` is the
    convolution's weight tensor.
    """

    def __init__(self, ni, nf, ks=3, stride=2, sub=0., **kwargs):
        super().__init__()
        conv_opts = dict(padding=ks//2, stride=stride, bias=True)
        self.conv = nn.Conv2d(ni, nf, ks, **conv_opts)
        # Any extra keyword arguments are forwarded untouched to GeneralRelu.
        self.relu = GeneralRelu(sub=sub, **kwargs)

    def forward(self, x):
        """Apply the convolution, then the activation."""
        conv_out = self.conv(x)
        return self.relu(conv_out)

    @property
    def weight(self):
        """Weight tensor of the wrapped convolution."""
        return self.conv.weight

    @property
    def bias(self):
        """Negative of the activation's `sub` attribute."""
        return -self.relu.sub

    @bias.setter
    def bias(self, v):
        # Stored negated, so `layer.bias -= d` increases `relu.sub` by `d`.
        self.relu.sub = -v

In [7]:
# Build the learner/runner pair used for the baseline fit; NOTE(review): 0.6 is presumably the learning rate — confirm against get_learn_run.
learn, run = get_learn_run(nfs, data, 0.6, ConvLayer, cbs=cbfs)

Now we're going to look at the paper All You Need is a Good Init, which introduces Layer-wise Sequential Unit-Variance (LSUV). We initialize our neural net with the usual technique, then we pass a batch through the model and check the outputs of the linear and convolutional layers. We can then rescale the weights according to the actual variance we observe on the activations, and subtract the mean we observe from the initial bias. That way we will have activations that stay normalized.

We repeat this process until we are satisfied with the mean/variance we observe.

Let's start by looking at a baseline:

In [8]:
# Baseline run without LSUV; two train/valid result pairs are printed below.
run.fit(2, learn)

train : [1.560818125, tensor(0.4727, device='cuda:0')]
valid : [0.6745693359375, tensor(0.7932, device='cuda:0')]
train : [0.1749662109375, tensor(0.9472, device='cuda:0')]
valid : [0.10702642822265625, tensor(0.9690, device='cuda:0')]


In [9]:
# Rebind `learn` and `run` with the same arguments as the baseline, so the LSUV experiment does not reuse the objects fitted above.
learn, run = get_learn_run(nfs, data, 0.6, ConvLayer, cbs=cbfs)

In [10]:
#export
def get_batch(dl, run):
    """Load the first batch of `dl` into `run` and fire its `begin_batch` event.

    The batch is stored on `run.xb` / `run.yb`, every callback in `run.cbs` is
    attached to `run` via `set_runner`, and then `run('begin_batch')` is called.
    The returned pair is read back from `run` afterwards, so it reflects any
    changes made to `run.xb` / `run.yb` during `begin_batch`.
    """
    first_x, first_y = next(iter(dl))
    run.xb, run.yb = first_x, first_y
    for callback in run.cbs:
        callback.set_runner(run)
    run('begin_batch')
    return run.xb, run.yb

In [11]:
# Grab one training batch after the runner's `begin_batch` callbacks have processed it; it is reused for all statistics below.
xb, yb = get_batch(data.train_dl, run)

We only want the outputs of convolutional or linear layers. To find them, we need a recursive function. We can use `sum(list, [])` to concatenate the lists the function finds (`sum` applies the `+` operator between the elements of the list you pass it, starting from the initial value given as the second argument).

In [16]:
#export
def find_modules(m, cond):
    """Collect, depth-first, the modules under `m` for which `cond` is true.

    `cond` is any one-argument predicate. When `cond(m)` holds, `m` itself is
    the only result and its children are not searched. Otherwise the results
    found under each child (from `m.children()`) are concatenated in order.
    """
    if not cond(m):
        per_child = [find_modules(child, cond) for child in m.children()]
        # sum(..., []) joins the per-child lists with `+`, starting from [].
        return sum(per_child, [])
    return [m]

def is_lin_layer(l):
    """Return True if `l` is an instance of one of the layer classes checked below."""
    # NOTE(review): nn.ReLU is accepted even though it is not a linear layer — confirm this is intended.
    return isinstance(l, (nn.Conv1d, nn.Conv2d, nn.Conv3d, nn.Linear, nn.ReLU))

In [17]:
# Collect every ConvLayer in the model; find_modules does not search inside a module that already matches.
mods = find_modules(learn.model, lambda o: isinstance(o, ConvLayer))
mods

[ConvLayer(
   (conv): Conv2d(1, 8, kernel_size=(5, 5), stride=(2, 2), padding=(2, 2))
   (relu): GeneralRelu()
 ), ConvLayer(
   (conv): Conv2d(8, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
   (relu): GeneralRelu()
 ), ConvLayer(
   (conv): Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
   (relu): GeneralRelu()
 ), ConvLayer(
   (conv): Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
   (relu): GeneralRelu()
 ), ConvLayer(
   (conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
   (relu): GeneralRelu()
 )]

In [18]:
#export
def append_stat(hook, mod, inp, outp):
    """Hook callback: record the mean and std of `outp` on `hook`.

    Exported because the exported `lsuv_module` refers to this function by name.

    Args:
        hook: object that receives the statistics as `hook.mean` and `hook.std`.
        mod: the hooked module (unused).
        inp: the module's input (unused).
        outp: the module's output; its `.data` is reduced over all elements.

    Despite the name, nothing is appended: each call overwrites the previous
    values with plain Python floats.
    """
    d = outp.data
    hook.mean, hook.std = d.mean().item(), d.std().item()

In [19]:
# Notebook-level handle to the model after calling .cuda(); lsuv_module (defined below) runs its forward passes through this global name.
mdl = learn.model.cuda()

So now we can look at the mean and std of the output of each `ConvLayer` in our model.

In [20]:
# Hook every ConvLayer, run one forward pass on the batch, then print the mean/std each hook recorded.
with Hooks(mods, append_stat) as hooks:
    # The return value is discarded; the call is only needed to trigger the hooks.
    mdl(xb)
    for hook in hooks: print(hook.mean, hook.std)

0.5165866017341614 0.8526816964149475
0.5158305168151855 0.8562607169151306
0.4825553297996521 0.8044730424880981
0.4346008598804474 0.7186169028282166
0.3289625346660614 0.48979833722114563


We first adjust the bias terms to make the means 0, then we rescale the weights to make the stds 1 (each to within a threshold of 1e-3). The `mdl(xb) is not None` clause is only there to pass `xb` through `mdl` and compute all the activations, so that the hook's statistics are refreshed before each check.

In [21]:
#export
def lsuv_module(m, xb, model=None, tol=1e-3):
    """Apply LSUV to one layer: shift its bias, then rescale its weights, using batch `xb`.

    Args:
        m: layer to initialize. It must expose an assignable `bias` and a
            `weight` with a `.data` that supports in-place division
            (e.g. `ConvLayer`).
        xb: input batch pushed through the whole model to measure `m`'s output.
        model: callable model containing `m`. When omitted, the notebook-level
            global `mdl` is used, which keeps the two-argument call working.
            Pass it explicitly when calling from the exported module, where no
            such global exists.
        tol: tolerance used by both stopping tests (`|mean|` and `|std - 1|`).

    Returns:
        `(mean, std)` of `m`'s output as recorded on the last forward pass.
    """
    if model is None: model = mdl  # backward-compatible fallback to the notebook global
    h = Hook(m, append_stat)
    try:
        # `model(xb) is not None` is expected to always hold; it is there only to run
        # a forward pass so the hook refreshes h.mean / h.std before each test.
        while model(xb) is not None and abs(h.mean) > tol: m.bias -= h.mean
        while model(xb) is not None and abs(h.std-1) > tol: m.weight.data /= h.std
    finally:
        # Detach the hook even if a forward pass raises, so it does not stay registered.
        h.remove()
    return h.mean, h.std

We execute the initialization on all the conv layers in order:

In [22]:
# Initialize the layers one at a time, in the order find_modules returned them, printing each layer's final (mean, std).
for m in mods: print(lsuv_module(m, xb))

(0.08925101906061172, 1.0)
(0.05479612946510315, 1.0)
(0.12100870907306671, 0.9999999403953552)
(0.1376228928565979, 1.0)
(0.29763686656951904, 0.9999998211860657)


Note that the means are no longer exactly 0: we rescale the weights (to fix the std) after we have adjusted the bias, and that shifts the mean again.
Training now starts from a better footing.

In [23]:
# Same fit call as the baseline, now on the LSUV-initialized model; %time reports how long it takes.
%time run.fit(2, learn)

train : [0.4851798828125, tensor(0.8433, device='cuda:0')]
valid : [0.1190930419921875, tensor(0.9620, device='cuda:0')]
train : [0.097158623046875, tensor(0.9705, device='cuda:0')]
valid : [0.09382681884765624, tensor(0.9698, device='cuda:0')]
CPU times: user 2.31 s, sys: 439 ms, total: 2.75 s
Wall time: 2.76 s


# Export

In [24]:
# Run the export script on this notebook; the output below reports that it was written to exp/nb_07a.py.
!python notebook2script.py 07a_lsuv.ipynb

Converted 07a_lsuv.ipynb to exp/nb_07a.py
