# Normalization

[link](https://ut.philkr.net/deeplearning/residuals_and_normalizations/normalizations_in_pytorch/)

In [12]:
import torch
import torch.nn as nn

In [13]:
class MLPNoBIas(nn.Module):
    """Plain ReLU MLP in which every ``nn.Linear`` is built with ``bias=False``.

    Args:
        input_dim: Number of features per sample once the input is flattened.
        hidden_dim: Iterable of hidden-layer widths; each entry adds one
            Linear + ReLU pair (an empty iterable leaves only the output layer).
        output_dim: Number of output features.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        # widths[k] is the fan-in of layer k and widths[k + 1] its fan-out.
        widths = [input_dim, *hidden_dim]
        modules = [nn.Flatten()]
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            modules += [nn.Linear(fan_in, fan_out, bias=False), nn.ReLU()]
        # Output projection: no activation afterwards.
        modules.append(nn.Linear(widths[-1], output_dim, bias=False))
        self.network = nn.Sequential(*modules)

    def forward(self, x):
        """Flatten ``x`` and pass it through the layer stack."""
        out = self.network(x)
        return out

## Vanishing activations

In [14]:
# Push one random batch through freshly initialised bias-free MLPs with
# 0 to 9 hidden layers of width 512 and print the norm of each output.
x = torch.randn(5, 28, 28)
for i in range(10):
    net = MLPNoBIas(28 * 28, [512] * i, 100)
    out_norm = net(x).norm().item()
    print(f'{out_norm = }')

out_norm = 13.235665321350098
out_norm = 5.109479904174805
out_norm = 2.2417287826538086
out_norm = 0.8878068923950195
out_norm = 0.3714502453804016
out_norm = 0.1554199755191803
out_norm = 0.05945925787091255
out_norm = 0.024814074859023094
out_norm = 0.009948276914656162
out_norm = 0.00434709619730711


## Batch Normalization

In [15]:
class MLPBNPre(nn.Module):
    """ReLU MLP with a non-affine ``BatchNorm1d`` in front of each hidden Linear.

    Args:
        input_dim: Number of features per sample once the input is flattened.
        hidden_dim: Iterable of hidden-layer widths; each entry adds one
            BatchNorm1d + Linear + ReLU group.
        output_dim: Number of output features.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        widths = [input_dim, *hidden_dim]
        modules = [nn.Flatten()]
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            modules += [
                # No scale/shift here: the Linear that follows learns the bias.
                nn.BatchNorm1d(fan_in, affine=False),
                nn.Linear(fan_in, fan_out),
                nn.ReLU(),
            ]
        # Output projection is bias-free and not normalised.
        modules.append(nn.Linear(widths[-1], output_dim, bias=False))
        self.network = nn.Sequential(*modules)

    def forward(self, x):
        """Flatten ``x`` and pass it through the layer stack."""
        out = self.network(x)
        return out

**Note:** We should always learn a bias before the ReLU. Otherwise the normalized (zero-mean) pre-activations are always split at zero, so the ReLU sets roughly half of them to zero, which limits the expressiveness of the model.

In [16]:
# Same depth sweep with BatchNorm placed before each hidden Linear layer:
# 0 to 29 hidden layers of width 512, printing the norm of each output.
x = torch.randn(5, 28, 28)
for i in range(30):
    net = MLPBNPre(28 * 28, [512] * i, 100)
    out_norm = net(x).norm().item()
    print(f'{out_norm = }')

out_norm = 12.1582612991333
out_norm = 5.250041961669922
out_norm = 5.146596908569336
out_norm = 5.144865989685059
out_norm = 5.4364705085754395
out_norm = 5.265069484710693
out_norm = 5.387322425842285
out_norm = 5.271307468414307
out_norm = 5.063490867614746
out_norm = 5.191336154937744
out_norm = 5.492993354797363
out_norm = 5.130956172943115
out_norm = 5.244418144226074
out_norm = 5.2606587409973145
out_norm = 5.013553142547607
out_norm = 5.314823627471924
out_norm = 5.065823078155518
out_norm = 5.035289764404297
out_norm = 5.108409404754639
out_norm = 5.232163429260254
out_norm = 5.492886543273926
out_norm = 5.6458210945129395
out_norm = 5.216387748718262
out_norm = 5.3886566162109375
out_norm = 5.405171871185303
out_norm = 5.305245876312256
out_norm = 5.2440714836120605
out_norm = 5.205931186676025
out_norm = 5.564276218414307
out_norm = 5.314305782318115


In [17]:
class MLPBNPos(nn.Module):
    """ReLU MLP with an affine ``BatchNorm1d`` after each hidden Linear layer.

    Args:
        input_dim: Number of features per sample once the input is flattened.
        hidden_dim: Iterable of hidden-layer widths; each entry adds one
            Linear + BatchNorm1d + ReLU group.
        output_dim: Number of output features.
    """

    @staticmethod
    def _hidden_group(fan_in, fan_out):
        """Return the three modules that make up one hidden layer."""
        return [
            nn.Linear(fan_in, fan_out, bias=False),
            nn.BatchNorm1d(fan_out),
            nn.ReLU(),
        ]

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        modules = [nn.Flatten()]
        width = input_dim
        for next_width in hidden_dim:
            modules.extend(self._hidden_group(width, next_width))
            width = next_width
        # Output projection is bias-free and not normalised.
        modules.append(nn.Linear(width, output_dim, bias=False))
        self.network = nn.Sequential(*modules)

    def forward(self, x):
        """Flatten ``x`` and pass it through the layer stack."""
        out = self.network(x)
        return out

In [18]:
# Depth sweep with BatchNorm placed after each hidden Linear layer:
# 0 to 29 hidden layers of width 512, printing the norm of each output.
x = torch.randn(5, 28, 28)
for i in range(30):
    net = MLPBNPos(28 * 28, [512] * i, 100)
    out_norm = net(x).norm().item()
    print(f'{out_norm = }')

out_norm = 14.170045852661133
out_norm = 8.724059104919434
out_norm = 8.843928337097168
out_norm = 9.060585021972656
out_norm = 8.993568420410156
out_norm = 8.748834609985352
out_norm = 8.898076057434082
out_norm = 9.477777481079102
out_norm = 8.1068754196167
out_norm = 9.740224838256836
out_norm = 8.90272045135498
out_norm = 9.097138404846191
out_norm = 9.12496280670166
out_norm = 8.948826789855957
out_norm = 8.199789047241211
out_norm = 8.707714080810547
out_norm = 8.901817321777344
out_norm = 9.005202293395996
out_norm = 8.998388290405273
out_norm = 9.329340934753418
out_norm = 9.193537712097168
out_norm = 9.306073188781738
out_norm = 8.786357879638672
out_norm = 9.39171314239502
out_norm = 9.443123817443848
out_norm = 9.122742652893066
out_norm = 9.19918155670166
out_norm = 8.831351280212402
out_norm = 9.312227249145508
out_norm = 9.15225601196289


## Layer Normalization

In [19]:
class MLPLNPre(nn.Module):
    """ReLU MLP with a non-affine ``LayerNorm`` in front of each hidden Linear.

    This is the LayerNorm counterpart of the BatchNorm pre-norm network, so it
    is named ``MLPLNPre`` (parallel to ``MLPLNPos``).

    Args:
        input_dim: Number of features per sample once the input is flattened.
        hidden_dim: Iterable of hidden-layer widths; each entry adds one
            LayerNorm + Linear + ReLU group.
        output_dim: Number of output features.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        layers = [nn.Flatten()]
        in_dim = input_dim
        for h_dim in hidden_dim:
            # LayerNorm is sized to the current input width; it carries no
            # scale/shift because the Linear that follows learns the bias.
            layers.append(nn.LayerNorm(in_dim, elementwise_affine=False))
            layers.append(nn.Linear(in_dim, h_dim))
            layers.append(nn.ReLU())
            in_dim = h_dim
        # Output projection is bias-free and not normalised.
        layers.append(nn.Linear(in_dim, output_dim, bias=False))
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """Flatten ``x`` and pass it through the layer stack."""
        return self.network(x)


# Backward-compatible alias: this class was originally (mis)named MLPBNPre,
# and code that follows still refers to it by that name.
MLPBNPre = MLPLNPre

In [20]:
# Depth sweep with LayerNorm placed before each hidden Linear layer.
# NOTE: MLPBNPre here is the LayerNorm network defined in the cell just above,
# which reuses the name of the earlier BatchNorm class.
x = torch.randn(5, 28, 28)
for i in range(30):
    net = MLPBNPre(28 * 28, [512] * i, 100)
    out_norm = net(x).norm().item()
    print(f'{out_norm = }')

out_norm = 13.84544849395752
out_norm = 5.214603900909424
out_norm = 5.113377094268799
out_norm = 5.348901748657227
out_norm = 4.925804138183594
out_norm = 5.6150617599487305
out_norm = 5.550417900085449
out_norm = 5.169072151184082
out_norm = 5.2616376876831055
out_norm = 4.960304260253906
out_norm = 5.183526039123535
out_norm = 5.386806488037109
out_norm = 5.098806858062744
out_norm = 5.266600608825684
out_norm = 5.357975006103516
out_norm = 5.1884307861328125
out_norm = 5.391501426696777
out_norm = 5.506072521209717
out_norm = 5.424647331237793
out_norm = 5.29259729385376
out_norm = 5.61790132522583
out_norm = 5.114840507507324
out_norm = 5.161526203155518
out_norm = 5.420399188995361
out_norm = 5.113193988800049
out_norm = 5.6424784660339355
out_norm = 5.430173873901367
out_norm = 5.270264625549316
out_norm = 4.936638832092285
out_norm = 5.396730899810791


In [21]:
class MLPLNPos(nn.Module):
    """ReLU MLP with an affine ``LayerNorm`` after each hidden Linear layer.

    Args:
        input_dim: Number of features per sample once the input is flattened.
        hidden_dim: Iterable of hidden-layer widths; each entry adds one
            Linear + LayerNorm + ReLU group.
        output_dim: Number of output features.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        widths = [input_dim, *hidden_dim]
        hidden_modules = [
            module
            for fan_in, fan_out in zip(widths[:-1], widths[1:])
            for module in (
                # Hidden Linear layers skip the bias; LayerNorm learns it.
                nn.Linear(fan_in, fan_out, bias=False),
                nn.LayerNorm(fan_out),
                nn.ReLU(),
            )
        ]
        self.network = nn.Sequential(
            nn.Flatten(),
            *hidden_modules,
            # Output projection is bias-free and not normalised.
            nn.Linear(widths[-1], output_dim, bias=False),
        )

    def forward(self, x):
        """Flatten ``x`` and pass it through the layer stack."""
        out = self.network(x)
        return out

In [22]:
# Depth sweep with LayerNorm placed after each hidden Linear layer:
# 0 to 29 hidden layers of width 512, printing the norm of each output.
x = torch.randn(5, 28, 28)
for i in range(30):
    net = MLPLNPos(28 * 28, [512] * i, 100)
    out_norm = net(x).norm().item()
    print(f'{out_norm = }')

out_norm = 13.674946784973145
out_norm = 9.108837127685547
out_norm = 8.960722923278809
out_norm = 8.448254585266113
out_norm = 9.74319076538086
out_norm = 8.173799514770508
out_norm = 9.221722602844238
out_norm = 9.078075408935547
out_norm = 8.76062297821045
out_norm = 8.310615539550781
out_norm = 10.636826515197754
out_norm = 9.499361991882324
out_norm = 8.590950965881348
out_norm = 10.054750442504883
out_norm = 8.244994163513184
out_norm = 9.755012512207031
out_norm = 8.789600372314453
out_norm = 9.30587387084961
out_norm = 10.1161527633667
out_norm = 8.145820617675781
out_norm = 8.818049430847168
out_norm = 8.688199043273926
out_norm = 9.163427352905273
out_norm = 8.954636573791504
out_norm = 8.770222663879395
out_norm = 8.566849708557129
out_norm = 8.197632789611816
out_norm = 9.962224006652832
out_norm = 9.507499694824219
out_norm = 9.523523330688477
