# Parameter Management

In [37]:
import torch
import torch.nn as nn

In [38]:
class MLP(nn.Module):
    def __init__(self):
        super(MLP, self).__init__()
        self.net = nn.Sequential(
                    nn.Linear(20, 5),
                    nn.ReLU(),
                    nn.Linear(5, 10))
        
    def forward(self, data):
        return self.net(data)

In [39]:
def init_weight(layer):
    if isinstance(layer, nn.Linear):
        nn.init.xavier_uniform_(layer.weight)

In [40]:
net= MLP()

In [41]:
net.apply(init_weight)

MLP(
  (net): Sequential(
    (0): Linear(in_features=20, out_features=5, bias=True)
    (1): ReLU()
    (2): Linear(in_features=5, out_features=10, bias=True)
  )
)

# Parameter Access

In [42]:
print(net.net[0].parameters)

<bound method Module.parameters of Linear(in_features=20, out_features=5, bias=True)>


### Targeted Parameters

In [43]:
net.net[0].weight

Parameter containing:
tensor([[-3.1164e-01,  3.8008e-01, -2.8069e-01,  3.5592e-01,  5.7188e-02,
          4.6611e-01, -2.1368e-01,  3.2678e-01,  4.0443e-01, -1.5698e-01,
         -3.5363e-02,  4.0694e-01, -1.4859e-01, -3.2126e-01,  3.0339e-01,
         -1.6407e-01, -3.7212e-01,  2.6404e-01, -2.2538e-02,  3.1528e-01],
        [-4.5709e-01, -2.8391e-01, -2.6809e-01,  8.6438e-02, -4.7524e-01,
         -4.0493e-01, -3.9734e-01,  2.3249e-01, -3.5679e-01, -3.8669e-01,
          1.8968e-01,  2.4396e-01,  3.4076e-01, -4.3200e-01,  3.8828e-01,
          3.8227e-04, -2.3610e-02, -4.7618e-01, -4.7481e-01,  4.4778e-01],
        [ 2.7669e-01, -4.3521e-02, -1.4967e-01, -2.2915e-01,  2.4365e-01,
         -2.1098e-01,  2.3438e-01,  3.2370e-01, -2.2168e-01, -2.0208e-01,
          1.6742e-01, -4.8065e-01,  2.5656e-02, -4.8058e-01,  2.6327e-01,
          1.1949e-01, -3.7686e-01,  1.3671e-01,  4.6907e-01, -4.3738e-01],
        [-2.0126e-01, -3.1665e-01,  4.5516e-01,  1.1283e-03,  1.0312e-01,
          5.6

In [44]:
net.net[0].bias, net.net[0].weight.grad

(Parameter containing:
 tensor([ 0.1658, -0.2065,  0.2058,  0.1433,  0.1419], requires_grad=True),
 None)

### All parameters at once

In [45]:
net.net[0].state_dict()

OrderedDict([('weight',
              tensor([[-3.1164e-01,  3.8008e-01, -2.8069e-01,  3.5592e-01,  5.7188e-02,
                        4.6611e-01, -2.1368e-01,  3.2678e-01,  4.0443e-01, -1.5698e-01,
                       -3.5363e-02,  4.0694e-01, -1.4859e-01, -3.2126e-01,  3.0339e-01,
                       -1.6407e-01, -3.7212e-01,  2.6404e-01, -2.2538e-02,  3.1528e-01],
                      [-4.5709e-01, -2.8391e-01, -2.6809e-01,  8.6438e-02, -4.7524e-01,
                       -4.0493e-01, -3.9734e-01,  2.3249e-01, -3.5679e-01, -3.8669e-01,
                        1.8968e-01,  2.4396e-01,  3.4076e-01, -4.3200e-01,  3.8828e-01,
                        3.8227e-04, -2.3610e-02, -4.7618e-01, -4.7481e-01,  4.4778e-01],
                      [ 2.7669e-01, -4.3521e-02, -1.4967e-01, -2.2915e-01,  2.4365e-01,
                       -2.1098e-01,  2.3438e-01,  3.2370e-01, -2.2168e-01, -2.0208e-01,
                        1.6742e-01, -4.8065e-01,  2.5656e-02, -4.8058e-01,  2.6327e-01,
      

In [46]:
net.net.state_dict()

OrderedDict([('0.weight',
              tensor([[-3.1164e-01,  3.8008e-01, -2.8069e-01,  3.5592e-01,  5.7188e-02,
                        4.6611e-01, -2.1368e-01,  3.2678e-01,  4.0443e-01, -1.5698e-01,
                       -3.5363e-02,  4.0694e-01, -1.4859e-01, -3.2126e-01,  3.0339e-01,
                       -1.6407e-01, -3.7212e-01,  2.6404e-01, -2.2538e-02,  3.1528e-01],
                      [-4.5709e-01, -2.8391e-01, -2.6809e-01,  8.6438e-02, -4.7524e-01,
                       -4.0493e-01, -3.9734e-01,  2.3249e-01, -3.5679e-01, -3.8669e-01,
                        1.8968e-01,  2.4396e-01,  3.4076e-01, -4.3200e-01,  3.8828e-01,
                        3.8227e-04, -2.3610e-02, -4.7618e-01, -4.7481e-01,  4.4778e-01],
                      [ 2.7669e-01, -4.3521e-02, -1.4967e-01, -2.2915e-01,  2.4365e-01,
                       -2.1098e-01,  2.3438e-01,  3.2370e-01, -2.2168e-01, -2.0208e-01,
                        1.6742e-01, -4.8065e-01,  2.5656e-02, -4.8058e-01,  2.6327e-01,
    

In [47]:
net.net.state_dict()['2.bias']

tensor([ 3.9324e-02,  4.0156e-01,  4.3964e-01,  3.6864e-01,  8.4072e-05,
        -2.6539e-04, -3.5723e-01,  2.8756e-01,  3.5925e-01,  2.5525e-01])

### Rube Goldberg strikes again

In [49]:
def block1():
    net = nn.Sequential(nn.Linear(16, 32),
                        nn.ReLU(),
                        nn.Linear(32, 16),
                        nn.ReLU())
    return net

def block2():
    net = nn.Sequential()
    for i in range(4):
        net.add_module('block'+str(i), block1())
    return net    
        
rgnet = nn.Sequential()
rgnet.add_module('model',block2())
rgnet.add_module('Last_linear_layer', nn.Linear(16,10))
rgnet.apply(init_weight)
x = torch.randn(2,16)
rgnet(x) # forward computation

tensor([[ 0.0406,  0.0170,  0.0238, -0.0192, -0.2002, -0.0145,  0.0344,  0.0773,
         -0.2521,  0.0359],
        [ 0.0153,  0.0602,  0.0733, -0.0275, -0.1942, -0.0384, -0.0503,  0.0246,
         -0.2964,  0.0327]], grad_fn=<AddmmBackward>)

In [51]:
print(rgnet)
for param in rgnet.parameters():
    print(param.size(), param.dtype) 

Sequential(
  (model): Sequential(
    (block0): Sequential(
      (0): Linear(in_features=16, out_features=32, bias=True)
      (1): ReLU()
      (2): Linear(in_features=32, out_features=16, bias=True)
      (3): ReLU()
    )
    (block1): Sequential(
      (0): Linear(in_features=16, out_features=32, bias=True)
      (1): ReLU()
      (2): Linear(in_features=32, out_features=16, bias=True)
      (3): ReLU()
    )
    (block2): Sequential(
      (0): Linear(in_features=16, out_features=32, bias=True)
      (1): ReLU()
      (2): Linear(in_features=32, out_features=16, bias=True)
      (3): ReLU()
    )
    (block3): Sequential(
      (0): Linear(in_features=16, out_features=32, bias=True)
      (1): ReLU()
      (2): Linear(in_features=32, out_features=16, bias=True)
      (3): ReLU()
    )
  )
  (Last_linear_layer): Linear(in_features=16, out_features=10, bias=True)
)
torch.Size([32, 16]) torch.float32
torch.Size([32]) torch.float32
torch.Size([16, 32]) torch.float32
torch.Size([16])

In [61]:
rgnet.model.block1[0], rgnet.model.block1[0].bias.data

(Linear(in_features=16, out_features=32, bias=True),
 tensor([ 0.0926, -0.0968, -0.1169,  0.2213, -0.0335,  0.1375, -0.1123, -0.2301,
          0.1367, -0.0840, -0.0102, -0.2399, -0.1354,  0.0720, -0.0139, -0.2314,
          0.1297, -0.1476,  0.2278,  0.1509, -0.2259,  0.2499, -0.2277, -0.2167,
         -0.0845, -0.1833, -0.1025, -0.2463, -0.2133,  0.2205, -0.1539,  0.1072]))

In [65]:
rgnet.model.block1[0].named_parameters

<bound method Module.named_parameters of Linear(in_features=16, out_features=32, bias=True)>

### Built-in Initialization

In [67]:
def gaussian_normal(layer):
    if isinstance(layer, nn.Linear):
        nn.init.normal_(layer.weight)
        
net.apply(gaussian_normal)
net.net[0].weight

Parameter containing:
tensor([[-1.7215e+00,  7.8262e-01, -9.1423e-01, -1.7202e+00, -1.4108e+00,
          9.4831e-02,  1.9820e+00,  1.3737e+00, -7.9667e-01,  2.3070e-01,
         -2.8985e-01, -2.4998e+00,  3.5841e-01,  2.1261e+00, -2.4622e+00,
          6.6776e-01,  6.9462e-01,  2.4012e+00,  4.1587e-02, -1.2988e+00],
        [ 6.6969e-03, -7.5949e-01, -1.5559e+00, -1.1724e+00,  8.6252e-01,
          6.5788e-02, -2.0958e-01,  1.2275e-01,  6.9281e-01, -1.4128e+00,
         -2.9644e-01,  3.7062e-01, -6.9104e-01,  2.8148e-01, -7.6726e-01,
         -5.6980e-01, -8.3769e-01, -9.5940e-01,  7.8751e-01, -4.2404e-01],
        [ 1.9714e+00, -1.5152e+00, -7.4686e-01,  6.6779e-01, -4.6879e-01,
          1.8414e+00, -1.8092e-01,  9.5749e-01,  1.5418e+00, -1.4059e+00,
         -1.3374e+00, -1.6152e+00, -9.1020e-01,  3.1600e+00, -5.0846e-01,
         -2.8166e-01,  1.5207e-01,  1.1512e+00, -3.3227e-01, -7.0956e-01],
        [-8.7555e-01, -1.4496e+00, -6.7565e-01,  1.0525e+00, -9.0258e-01,
         -8.5

In [70]:
def constant(layer):
    if isinstance(layer, nn.Linear):
        nn.init.constant_(layer.weight, 1)
        
net.apply(constant)
net.net[0].weight

Parameter containing:
tensor([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1.]], requires_grad=True)

In [73]:
net.net[0].apply(init_weight)
net.net[2].apply(constant)
net.state_dict()

OrderedDict([('net.0.weight',
              tensor([[ 0.4531, -0.3169,  0.3724, -0.2730, -0.1535,  0.0365,  0.1853, -0.4300,
                       -0.4036, -0.1871, -0.4559,  0.3143, -0.3316,  0.3823, -0.3569,  0.0886,
                       -0.3918, -0.3276,  0.0171, -0.2166],
                      [-0.4489,  0.1880, -0.3085,  0.2780,  0.3876, -0.3468, -0.4844,  0.3453,
                       -0.3023, -0.3810,  0.1271, -0.4123, -0.3971, -0.4346, -0.2016, -0.1957,
                       -0.2678, -0.3830, -0.1250, -0.0430],
                      [-0.4742, -0.2531,  0.0715, -0.4427, -0.4681, -0.2460,  0.1948, -0.2795,
                       -0.0187,  0.4710, -0.4713,  0.4213, -0.2551, -0.3445, -0.1942, -0.0772,
                       -0.2978, -0.2765, -0.3335, -0.3475],
                      [ 0.3936,  0.3015, -0.2273, -0.3747, -0.1632, -0.1185,  0.0740, -0.4075,
                       -0.1124,  0.2076,  0.2868, -0.2014,  0.3940,  0.3117,  0.1188,  0.0535,
                        0.4276

# Custom Initialization

In [76]:
def custom_init(layer):
    if isinstance(layer, nn.Linear):
        nn.init.uniform_(layer.weight, -10, 10)
        layer.weight.data *=  (abs(layer.weight.data) >= 5)

In [77]:
net.apply(custom_init)
net.state_dict()

OrderedDict([('net.0.weight',
              tensor([[ 0.0000, -0.0000,  0.0000,  5.0879,  9.5724, -5.5078,  9.8945, -5.9418,
                        0.0000, -0.0000, -9.9102, -0.0000,  0.0000, -5.6833,  6.0102, -9.2037,
                        0.0000,  0.0000,  7.9078, -0.0000],
                      [ 0.0000, -0.0000,  7.4762,  5.4852, -0.0000, -0.0000, -0.0000,  0.0000,
                       -0.0000,  0.0000, -9.1704, -8.5867,  0.0000, -7.0091, -6.8361, -6.4241,
                       -0.0000, -9.0102, -0.0000,  0.0000],
                      [ 5.5506, -9.5957, -8.7366,  8.1898, -0.0000,  8.8721, -0.0000,  0.0000,
                        0.0000, -8.2533, -5.7364,  0.0000, -0.0000,  0.0000, -0.0000,  0.0000,
                        0.0000,  0.0000, -0.0000,  5.0262],
                      [-9.3613, -0.0000, -0.0000, -0.0000, -8.9586,  6.4050,  5.4792, -0.0000,
                        0.0000, -0.0000,  8.6332,  6.1598, -7.7537, -0.0000,  6.4686, -0.0000,
                       -0.0000

In [79]:
net.net[0].bias

Parameter containing:
tensor([ 0.1658, -0.2065,  0.2058,  0.1433,  0.1419], requires_grad=True)

In [82]:
net.net[0].bias[0] = 3

In [83]:
net.net[0].bias

Parameter containing:
tensor([ 3.0000, -0.2065,  0.2058,  0.1433,  0.1419], grad_fn=<CopySlices>)

# Tied Parameters & Exercise

In [86]:
class ShareModel(nn.Module):
    def __init__(self):
        super(ShareModel, self).__init__()
        shared = nn.Linear(3,3)
        self.net = nn.Sequential(
                nn.Linear(2,3),
                shared,
                shared,
                nn.Linear(3, 4))
        
    def forward(self, data):
        return self.net(data)

In [96]:
net = ShareModel()
x = torch.randn(3, 2)
out = net(x)
out.mean().backward()

In [97]:
net.state_dict()

OrderedDict([('net.0.weight',
              tensor([[-0.6898,  0.2031],
                      [ 0.1793,  0.0130],
                      [-0.1435,  0.1035]])),
             ('net.0.bias', tensor([-0.0479,  0.3590,  0.3052])),
             ('net.1.weight',
              tensor([[-0.1917,  0.1171, -0.5175],
                      [ 0.1423, -0.0581, -0.5330],
                      [ 0.3312,  0.3917, -0.4905]])),
             ('net.1.bias', tensor([ 0.2725,  0.0596, -0.2111])),
             ('net.2.weight',
              tensor([[-0.1917,  0.1171, -0.5175],
                      [ 0.1423, -0.0581, -0.5330],
                      [ 0.3312,  0.3917, -0.4905]])),
             ('net.2.bias', tensor([ 0.2725,  0.0596, -0.2111])),
             ('net.3.weight',
              tensor([[-0.3348,  0.4493, -0.3914],
                      [-0.0207, -0.0392,  0.5490],
                      [-0.3609,  0.5214,  0.1443],
                      [-0.5588,  0.3901, -0.0596]])),
             ('net.3.bias', tensor

In [101]:
net.net[1].weight.grad == net.net[2].weight.grad

tensor([[True, True, True],
        [True, True, True],
        [True, True, True]])