# Normalization in PyTorch

[link](https://ut.philkr.net/deeplearning/residuals_and_normalizations/normalizations_in_pytorch/)

In [136]:
import torch
import torch.nn as nn

In [137]:
class MLPNoBIas(nn.Module):
    """Plain ReLU multilayer perceptron whose linear layers have no bias.

    The model is ``nn.Flatten`` followed by one bias-free ``nn.Linear`` +
    ``nn.ReLU`` pair per entry of ``hidden_dim`` and a final bias-free
    ``nn.Linear`` projecting to ``output_dim``.

    Args:
        input_dim: Number of input features of the first linear layer.
        hidden_dim: Iterable with the width of each hidden layer; when it is
            empty the model is a single linear projection.
        output_dim: Number of output features.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        # widths[k] -> widths[k + 1] describes the k-th hidden linear layer.
        widths = [input_dim, *hidden_dim]
        modules = [nn.Flatten()]
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            modules += [nn.Linear(fan_in, fan_out, bias=False), nn.ReLU()]
        modules.append(nn.Linear(widths[-1], output_dim, bias=False))
        self.network = nn.Sequential(*modules)

    def forward(self, x):
        """Run ``x`` through the sequential stack and return the result."""
        return self.network(x)

## Vanishing activations

In [138]:
# Feed one random batch through freshly initialized bias-free MLPs with
# 0..9 hidden layers of width 512 and print the norm of each output.
x = torch.randn(5, 28, 28)
for num_hidden in range(10):
    net = MLPNoBIas(
        input_dim=28 * 28,
        hidden_dim=[512] * num_hidden,
        output_dim=100,
    )
    out_norm = net(x).norm().item()
    print(f'{out_norm = }')

out_norm = 12.411030769348145
out_norm = 5.221368789672852
out_norm = 2.2383782863616943
out_norm = 0.8727157711982727
out_norm = 0.3667830526828766
out_norm = 0.14457768201828003
out_norm = 0.0638008713722229
out_norm = 0.02657708339393139
out_norm = 0.010049083270132542
out_norm = 0.00452427938580513


## Batch Normalization

In [139]:
class MLPBNPre(nn.Module):
    """MLP that batch-normalizes the input of every hidden linear layer.

    After ``nn.Flatten``, each entry of ``hidden_dim`` contributes
    ``nn.BatchNorm1d(affine=False)`` -> ``nn.Linear`` -> ``nn.ReLU``. The
    stack ends with a bias-free ``nn.Linear`` projecting to ``output_dim``.

    Args:
        input_dim: Number of input features of the first hidden block.
        hidden_dim: Iterable with the width of each hidden layer.
        output_dim: Number of output features.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        stack = [nn.Flatten()]
        width = input_dim
        for next_width in hidden_dim:
            stack.extend(
                (
                    # No affine parameters here: the bias is learned by the
                    # linear layer that follows the normalization.
                    nn.BatchNorm1d(width, affine=False),
                    nn.Linear(width, next_width),
                    nn.ReLU(),
                )
            )
            width = next_width
        stack.append(nn.Linear(width, output_dim, bias=False))
        self.network = nn.Sequential(*stack)

    def forward(self, x):
        """Run ``x`` through the sequential stack and return the result."""
        return self.network(x)

**Note:** We should always learn a bias before the ReLU. Normalization makes the pre-activations zero-mean, so without a learned bias the ReLU sets roughly half of the activations to zero, which limits the expressiveness of the model.

In [140]:
# Feed one random batch through freshly initialized MLPBNPre models with
# 0..29 hidden layers of width 512 and print the norm of each output.
x = torch.randn(5, 28, 28)
for num_hidden in range(30):
    net = MLPBNPre(
        input_dim=28 * 28,
        hidden_dim=[512] * num_hidden,
        output_dim=100,
    )
    out_norm = net(x).norm().item()
    print(f'{out_norm = }')

out_norm = 12.024511337280273
out_norm = 5.024309158325195
out_norm = 5.271420955657959
out_norm = 5.451053142547607
out_norm = 5.185364246368408
out_norm = 5.233152866363525
out_norm = 5.526966094970703
out_norm = 5.2882866859436035
out_norm = 5.509188175201416
out_norm = 5.597162246704102
out_norm = 5.479552268981934
out_norm = 5.394810676574707
out_norm = 5.179454326629639
out_norm = 5.039365768432617
out_norm = 5.30059814453125
out_norm = 5.2813286781311035
out_norm = 5.495495796203613
out_norm = 5.530455112457275
out_norm = 5.437389850616455
out_norm = 5.115573883056641
out_norm = 5.191061973571777
out_norm = 4.9538421630859375
out_norm = 5.318709850311279
out_norm = 5.090663909912109
out_norm = 5.411695957183838
out_norm = 5.124027252197266
out_norm = 5.2969279289245605
out_norm = 5.216276168823242
out_norm = 5.327692985534668
out_norm = 5.260517597198486


In [141]:
class MLPBNPos(nn.Module):
    """MLP that batch-normalizes the output of every hidden linear layer.

    After ``nn.Flatten``, each entry of ``hidden_dim`` contributes a
    bias-free ``nn.Linear`` -> ``nn.BatchNorm1d`` -> ``nn.ReLU``. The stack
    ends with a bias-free ``nn.Linear`` projecting to ``output_dim``.

    Args:
        input_dim: Number of input features of the first linear layer.
        hidden_dim: Iterable with the width of each hidden layer.
        output_dim: Number of output features.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        dims = [input_dim, *hidden_dim]
        # One (Linear, BatchNorm1d, ReLU) triple per consecutive width pair.
        hidden_blocks = [
            layer
            for d_in, d_out in zip(dims, dims[1:])
            for layer in (
                nn.Linear(d_in, d_out, bias=False),
                nn.BatchNorm1d(d_out),
                nn.ReLU(),
            )
        ]
        head = nn.Linear(dims[-1], output_dim, bias=False)
        self.network = nn.Sequential(nn.Flatten(), *hidden_blocks, head)

    def forward(self, x):
        """Run ``x`` through the sequential stack and return the result."""
        return self.network(x)

In [142]:
# Feed one random batch through freshly initialized post-normalization
# BatchNorm MLPs with 0..29 hidden layers of width 512 and print the norm
# of each output.
x = torch.randn(5, 28, 28)
for i in range(30):
    # The class defined for this experiment is MLPBNPos; the undefined name
    # MLPBN raised NameError when the notebook was run from a clean kernel.
    net = MLPBNPos(input_dim=28*28, hidden_dim=i*[512], output_dim=100)
    out_norm = net(x).norm().item()
    print(f'{out_norm = }')

out_norm = 13.410526275634766
out_norm = 9.821581840515137
out_norm = 9.03606128692627
out_norm = 9.867572784423828
out_norm = 8.556086540222168
out_norm = 9.152222633361816
out_norm = 9.228806495666504
out_norm = 9.490620613098145
out_norm = 9.340707778930664
out_norm = 8.71877670288086
out_norm = 9.14144515991211
out_norm = 8.685832023620605
out_norm = 9.378507614135742
out_norm = 9.54654312133789
out_norm = 8.792848587036133
out_norm = 9.000468254089355
out_norm = 9.063599586486816
out_norm = 9.271416664123535
out_norm = 8.872846603393555
out_norm = 8.996878623962402
out_norm = 9.420639991760254
out_norm = 9.194003105163574
out_norm = 9.413265228271484
out_norm = 9.407005310058594
out_norm = 9.761667251586914
out_norm = 9.082772254943848
out_norm = 9.130640983581543
out_norm = 8.916101455688477
out_norm = 8.82054615020752
out_norm = 9.423371315002441


## Layer Normalization

In [143]:
class MLPBNPre(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        layers = [nn.Flatten()]
        in_dim = input_dim
        for h_dim in hidden_dim:
            # BatchNorm matches current input dimension
            layers.append(nn.LayerNorm(in_dim, bias=False)) # bias learned by LinearLayer
            layers.append(nn.Linear(in_dim, h_dim))
            layers.append(nn.ReLU())
            in_dim = h_dim
        layers.append(nn.Linear(in_dim, output_dim, bias=False))
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        return self.network(x)

In [144]:
# Feed one random batch through freshly initialized pre-normalization
# LayerNorm MLPs with 0..29 hidden layers of width 512 and print the norm
# of each output.
x = torch.randn(5, 28, 28)
for num_hidden in range(30):
    net = MLPBNPre(
        input_dim=28 * 28,
        hidden_dim=[512] * num_hidden,
        output_dim=100,
    )
    out_norm = net(x).norm().item()
    print(f'{out_norm = }')

out_norm = 12.980302810668945
out_norm = 4.991347312927246
out_norm = 4.997653961181641
out_norm = 5.150710582733154
out_norm = 5.36508321762085
out_norm = 5.078702449798584
out_norm = 5.298841953277588
out_norm = 5.6470866203308105
out_norm = 5.031192302703857
out_norm = 5.031366348266602
out_norm = 5.002532958984375
out_norm = 4.955268859863281
out_norm = 5.230772495269775
out_norm = 5.224359512329102
out_norm = 5.633170127868652
out_norm = 5.3707275390625
out_norm = 4.847537994384766
out_norm = 4.637307643890381
out_norm = 5.503502368927002
out_norm = 5.319293022155762
out_norm = 5.238644599914551
out_norm = 5.7760090827941895
out_norm = 5.5079474449157715
out_norm = 5.622540473937988
out_norm = 5.416176795959473
out_norm = 5.0370965003967285
out_norm = 5.123276233673096
out_norm = 5.376845359802246
out_norm = 5.7942423820495605
out_norm = 5.010921001434326


In [145]:
class MLPLNPos(nn.Module):
    """MLP that layer-normalizes the output of every hidden linear layer.

    After ``nn.Flatten``, each entry of ``hidden_dim`` contributes a
    bias-free ``nn.Linear`` -> ``nn.LayerNorm`` -> ``nn.ReLU``. The stack
    ends with a bias-free ``nn.Linear`` projecting to ``output_dim``.

    Args:
        input_dim: Number of input features of the first linear layer.
        hidden_dim: Iterable with the width of each hidden layer.
        output_dim: Number of output features.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()

        def build_layers():
            """Yield the modules that follow the flatten step, in order."""
            fan_in = input_dim
            for fan_out in hidden_dim:
                # The linear layer has no bias; the bias is learned by LayerNorm.
                yield nn.Linear(fan_in, fan_out, bias=False)
                yield nn.LayerNorm(fan_out)
                yield nn.ReLU()
                fan_in = fan_out
            yield nn.Linear(fan_in, output_dim, bias=False)

        self.network = nn.Sequential(nn.Flatten(), *build_layers())

    def forward(self, x):
        """Run ``x`` through the sequential stack and return the result."""
        return self.network(x)

In [146]:
# Feed one random batch through freshly initialized post-normalization
# LayerNorm MLPs with 0..29 hidden layers of width 512 and print the norm
# of each output.
x = torch.randn(5, 28, 28)
for num_hidden in range(30):
    net = MLPLNPos(
        input_dim=28 * 28,
        hidden_dim=[512] * num_hidden,
        output_dim=100,
    )
    out_norm = net(x).norm().item()
    print(f'{out_norm = }')

out_norm = 12.144548416137695
out_norm = 9.377350807189941
out_norm = 8.982717514038086
out_norm = 9.0455961227417
out_norm = 9.14870548248291
out_norm = 9.405213356018066
out_norm = 10.759411811828613
out_norm = 9.215036392211914
out_norm = 9.327682495117188
out_norm = 9.275083541870117
out_norm = 9.386228561401367
out_norm = 8.449091911315918
out_norm = 9.29554271697998
out_norm = 8.787675857543945
out_norm = 8.455885887145996
out_norm = 8.547210693359375
out_norm = 8.376311302185059
out_norm = 9.38184642791748
out_norm = 8.61821460723877
out_norm = 9.655498504638672
out_norm = 9.376791000366211
out_norm = 7.882428169250488
out_norm = 9.666019439697266
out_norm = 8.73833179473877
out_norm = 9.05306339263916
out_norm = 8.358280181884766
out_norm = 8.523652076721191
out_norm = 10.625301361083984
out_norm = 9.722755432128906
out_norm = 9.749566078186035
