# Parameter Management

In [1]:
import torch
import torch.nn as nn

In [2]:
class MLP(nn.Module):
    def __init__(self):
        super(MLP, self).__init__()
        self.net = nn.Sequential(
                    nn.Linear(20, 5),
                    nn.ReLU(),
                    nn.Linear(5, 10))
        
    def forward(self, data):
        return self.net(data)

In [3]:
def init_weight(layer):
    if isinstance(layer, nn.Linear):
        nn.init.xavier_uniform_(layer.weight)

In [4]:
net= MLP()

In [5]:
net.apply(init_weight)

MLP(
  (net): Sequential(
    (0): Linear(in_features=20, out_features=5, bias=True)
    (1): ReLU()
    (2): Linear(in_features=5, out_features=10, bias=True)
  )
)

# Parameter Access

In [6]:
print(net.net[0].parameters)

<bound method Module.parameters of Linear(in_features=20, out_features=5, bias=True)>


### Targeted Parameters

In [7]:
net.net[0].weight

Parameter containing:
tensor([[-0.3090,  0.1840,  0.1129, -0.1278, -0.4424, -0.4689, -0.1007,  0.0250,
         -0.1548,  0.0808,  0.3053,  0.4417,  0.1190, -0.3757, -0.0012, -0.2100,
         -0.1744,  0.2273,  0.0871, -0.0366],
        [-0.1260,  0.1336, -0.3288,  0.0025,  0.1394,  0.3853, -0.0126,  0.0793,
          0.1218,  0.3870,  0.3727, -0.2385,  0.4494, -0.2669, -0.4154, -0.0372,
         -0.0702,  0.0094,  0.2522, -0.2270],
        [-0.1449, -0.1649, -0.3856, -0.4840, -0.3017,  0.2236, -0.2372,  0.1625,
         -0.3108, -0.4045,  0.3918,  0.4056,  0.3551,  0.4056, -0.4010, -0.2296,
          0.2823, -0.4041, -0.1722, -0.0367],
        [ 0.1311, -0.4556,  0.4553,  0.2848, -0.3743,  0.1516,  0.0170,  0.0415,
          0.3544,  0.2958, -0.4383, -0.1730,  0.0936,  0.0978,  0.2261,  0.0151,
         -0.0805,  0.4032, -0.4339, -0.1253],
        [-0.2712,  0.0958, -0.4718, -0.1405, -0.2407,  0.2463,  0.0369,  0.1692,
         -0.1643,  0.2274,  0.3384,  0.0259, -0.4640,  0.2036, -0

In [8]:
net.net[0].bias, net.net[0].weight.grad

(Parameter containing:
 tensor([ 0.2235,  0.0581,  0.1642, -0.1655, -0.1108], requires_grad=True),
 None)

### All parameters at once

In [9]:
net.net[0].state_dict()

OrderedDict([('weight',
              tensor([[-0.3090,  0.1840,  0.1129, -0.1278, -0.4424, -0.4689, -0.1007,  0.0250,
                       -0.1548,  0.0808,  0.3053,  0.4417,  0.1190, -0.3757, -0.0012, -0.2100,
                       -0.1744,  0.2273,  0.0871, -0.0366],
                      [-0.1260,  0.1336, -0.3288,  0.0025,  0.1394,  0.3853, -0.0126,  0.0793,
                        0.1218,  0.3870,  0.3727, -0.2385,  0.4494, -0.2669, -0.4154, -0.0372,
                       -0.0702,  0.0094,  0.2522, -0.2270],
                      [-0.1449, -0.1649, -0.3856, -0.4840, -0.3017,  0.2236, -0.2372,  0.1625,
                       -0.3108, -0.4045,  0.3918,  0.4056,  0.3551,  0.4056, -0.4010, -0.2296,
                        0.2823, -0.4041, -0.1722, -0.0367],
                      [ 0.1311, -0.4556,  0.4553,  0.2848, -0.3743,  0.1516,  0.0170,  0.0415,
                        0.3544,  0.2958, -0.4383, -0.1730,  0.0936,  0.0978,  0.2261,  0.0151,
                       -0.0805,  0.4

In [10]:
net.net.state_dict()

OrderedDict([('0.weight',
              tensor([[-0.3090,  0.1840,  0.1129, -0.1278, -0.4424, -0.4689, -0.1007,  0.0250,
                       -0.1548,  0.0808,  0.3053,  0.4417,  0.1190, -0.3757, -0.0012, -0.2100,
                       -0.1744,  0.2273,  0.0871, -0.0366],
                      [-0.1260,  0.1336, -0.3288,  0.0025,  0.1394,  0.3853, -0.0126,  0.0793,
                        0.1218,  0.3870,  0.3727, -0.2385,  0.4494, -0.2669, -0.4154, -0.0372,
                       -0.0702,  0.0094,  0.2522, -0.2270],
                      [-0.1449, -0.1649, -0.3856, -0.4840, -0.3017,  0.2236, -0.2372,  0.1625,
                       -0.3108, -0.4045,  0.3918,  0.4056,  0.3551,  0.4056, -0.4010, -0.2296,
                        0.2823, -0.4041, -0.1722, -0.0367],
                      [ 0.1311, -0.4556,  0.4553,  0.2848, -0.3743,  0.1516,  0.0170,  0.0415,
                        0.3544,  0.2958, -0.4383, -0.1730,  0.0936,  0.0978,  0.2261,  0.0151,
                       -0.0805,  0

In [11]:
net.net.state_dict()['2.bias']

tensor([-0.2939, -0.0975, -0.1125, -0.3357,  0.3381,  0.2480, -0.2858,  0.1832,
         0.2769, -0.0832])

### Rube Goldberg strikes again

In [12]:
def block1():
    net = nn.Sequential(nn.Linear(16, 32),
                        nn.ReLU(),
                        nn.Linear(32, 16),
                        nn.ReLU())
    return net

def block2():
    net = nn.Sequential()
    for i in range(4):
        net.add_module('block'+str(i), block1())
    return net    
        
rgnet = nn.Sequential()
rgnet.add_module('model',block2())
rgnet.add_module('Last_linear_layer', nn.Linear(16,10))
rgnet.apply(init_weight)
x = torch.randn(2,16)
rgnet(x) # forward computation

tensor([[-0.1921,  0.1614,  0.1013,  0.2589,  0.0398,  0.0767, -0.0027, -0.0177,
         -0.1136,  0.0944],
        [-0.1847,  0.1810,  0.0737,  0.2840,  0.0323,  0.1208, -0.0361,  0.0243,
         -0.1105,  0.1030]], grad_fn=<AddmmBackward>)

In [13]:
print(rgnet)
for param in rgnet.parameters():
    print(param.size(), param.dtype) 

Sequential(
  (model): Sequential(
    (block0): Sequential(
      (0): Linear(in_features=16, out_features=32, bias=True)
      (1): ReLU()
      (2): Linear(in_features=32, out_features=16, bias=True)
      (3): ReLU()
    )
    (block1): Sequential(
      (0): Linear(in_features=16, out_features=32, bias=True)
      (1): ReLU()
      (2): Linear(in_features=32, out_features=16, bias=True)
      (3): ReLU()
    )
    (block2): Sequential(
      (0): Linear(in_features=16, out_features=32, bias=True)
      (1): ReLU()
      (2): Linear(in_features=32, out_features=16, bias=True)
      (3): ReLU()
    )
    (block3): Sequential(
      (0): Linear(in_features=16, out_features=32, bias=True)
      (1): ReLU()
      (2): Linear(in_features=32, out_features=16, bias=True)
      (3): ReLU()
    )
  )
  (Last_linear_layer): Linear(in_features=16, out_features=10, bias=True)
)
torch.Size([32, 16]) torch.float32
torch.Size([32]) torch.float32
torch.Size([16, 32]) torch.float32
torch.Size([16])

In [14]:
rgnet.model.block1[0], rgnet.model.block1[0].bias.data

(Linear(in_features=16, out_features=32, bias=True),
 tensor([ 0.0373,  0.2288,  0.1572, -0.0583,  0.0834, -0.1277, -0.2312,  0.0233,
         -0.1151,  0.1939, -0.0004, -0.2498,  0.2016,  0.2131, -0.2196, -0.1807,
          0.2207, -0.2028, -0.0503,  0.1083, -0.0701, -0.2404, -0.1881, -0.1085,
          0.1271, -0.1441, -0.1190,  0.0082, -0.0284, -0.1205, -0.0510, -0.0670]))

In [15]:
rgnet.model.block1[0].named_parameters

<bound method Module.named_parameters of Linear(in_features=16, out_features=32, bias=True)>

### Built-in Initialization

In [16]:
def gaussian_normal(layer):
    if isinstance(layer, nn.Linear):
        nn.init.normal_(layer.weight)
        
net.apply(gaussian_normal)
net.net[0].weight

Parameter containing:
tensor([[-0.6830,  2.3053,  0.2357, -0.4555, -0.6635, -1.6496, -2.5893,  1.4305,
          0.6197,  0.9248, -0.3411, -0.5622,  0.1512,  0.4714, -0.6439, -0.2990,
          1.3127,  1.0076, -0.4822,  0.2196],
        [ 0.7603, -0.6114,  0.4463,  0.4289,  1.9079, -1.6336,  0.9814,  0.0333,
         -1.3480,  0.0652, -0.4120, -0.1600, -0.0437, -0.0521,  0.8929,  0.3791,
         -0.5071,  0.8040,  0.2205, -0.6248],
        [-0.2247,  1.0191, -1.0349, -1.4871,  2.2338,  0.9358,  0.9227,  0.6791,
          0.1376,  1.4454,  0.6344,  2.0592, -0.8373, -1.5749,  0.3182, -0.0400,
          1.4774,  0.9315, -1.5869, -0.0971],
        [ 0.5890, -0.3836, -1.0055, -0.2519,  1.1270,  0.4325, -1.6301, -1.3290,
         -1.2318, -1.4212, -1.2994,  0.2725,  1.0611, -0.6924,  1.7792,  1.1971,
          0.9034, -0.1926,  0.1419, -0.6016],
        [ 0.7491, -0.6367,  1.1025, -0.8062,  0.4488, -0.5828, -0.1888, -0.0301,
         -0.2525, -0.8576,  0.4532,  0.7267, -0.0554,  1.5241,  0

In [17]:
def constant(layer):
    if isinstance(layer, nn.Linear):
        nn.init.constant_(layer.weight, 1)
        
net.apply(constant)
net.net[0].weight

Parameter containing:
tensor([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1.]], requires_grad=True)

In [18]:
net.net[0].apply(init_weight)
net.net[2].apply(constant)
net.state_dict()

OrderedDict([('net.0.weight',
              tensor([[-0.0961,  0.3489, -0.3601,  0.4427, -0.2633, -0.3992,  0.1203, -0.1352,
                        0.3637,  0.0562,  0.1300, -0.2271, -0.3638,  0.1306, -0.0768,  0.0853,
                       -0.4180,  0.4230,  0.2922,  0.1488],
                      [ 0.3007,  0.1971,  0.3760,  0.4800, -0.2691,  0.1996,  0.1697,  0.1968,
                       -0.0361, -0.1425,  0.2101, -0.0302, -0.0837, -0.0276,  0.0220, -0.0472,
                        0.1260, -0.0682, -0.3743,  0.0044],
                      [ 0.1403,  0.0212,  0.4795, -0.3672,  0.3202,  0.2806, -0.3803,  0.1390,
                       -0.3348, -0.0678,  0.0860,  0.2064, -0.1096, -0.3903, -0.1879, -0.1237,
                        0.0391, -0.3663,  0.1962, -0.1123],
                      [ 0.4183, -0.0812, -0.1758,  0.0711,  0.0020, -0.2695,  0.3749,  0.2725,
                       -0.4294, -0.2714,  0.4444, -0.0824, -0.4879,  0.4694,  0.3395,  0.2755,
                       -0.4570

# Custom Initialization

In [19]:
def custom_init(layer):
    if isinstance(layer, nn.Linear):
        nn.init.uniform_(layer.weight, -10, 10)
        layer.weight.data *=  (abs(layer.weight.data) >= 5)

In [20]:
net.apply(custom_init)
net.state_dict()

OrderedDict([('net.0.weight',
              tensor([[ 0.0000, -7.6604,  5.0198,  8.9336, -0.0000,  8.1368, -0.0000,  0.0000,
                       -0.0000, -7.8911,  9.0172, -5.2221,  9.2773, -7.8020,  8.8087, -0.0000,
                       -0.0000,  0.0000, -8.4649, -7.4604],
                      [-8.0602,  0.0000, -6.0209, -0.0000,  0.0000,  8.7263,  8.1468,  8.2705,
                       -7.7412,  0.0000,  0.0000,  7.7357, -0.0000, -0.0000,  0.0000,  0.0000,
                        8.9293,  0.0000, -8.5831,  9.5831],
                      [ 0.0000, -0.0000,  0.0000, -6.9102,  0.0000,  8.2808,  5.0199, -5.9622,
                       -0.0000,  7.4637, -0.0000,  0.0000,  5.9623,  0.0000,  8.5386, -0.0000,
                       -9.9118,  7.7232,  9.1209,  0.0000],
                      [ 6.4823,  7.5916, -5.9966,  5.0652, -0.0000,  9.9320,  8.9006,  0.0000,
                        0.0000,  9.4490, -0.0000, -0.0000,  8.4782,  0.0000,  5.5494, -0.0000,
                       -5.3959

In [21]:
net.net[0].bias

Parameter containing:
tensor([ 0.2235,  0.0581,  0.1642, -0.1655, -0.1108], requires_grad=True)

In [22]:
net.net[0].bias[0] = 3

In [23]:
net.net[0].bias

Parameter containing:
tensor([ 3.0000,  0.0581,  0.1642, -0.1655, -0.1108], grad_fn=<CopySlices>)

# Tied Parameters & Exercise

In [24]:
class ShareModel(nn.Module):
    def __init__(self):
        super(ShareModel, self).__init__()
        shared = nn.Linear(3,3)
        self.net = nn.Sequential(
                nn.Linear(2,3),
                shared,
                shared,
                nn.Linear(3, 4))
        
    def forward(self, data):
        return self.net(data)

In [25]:
net = ShareModel()
x = torch.randn(3, 2)
out = net(x)
out.mean().backward()

In [29]:
net.net[0].parameters

<bound method Module.parameters of Linear(in_features=2, out_features=3, bias=True)>

In [26]:
net.state_dict()

OrderedDict([('net.0.weight',
              tensor([[ 0.0843, -0.3777],
                      [ 0.3331,  0.1068],
                      [ 0.4260,  0.1343]])),
             ('net.0.bias', tensor([0.5181, 0.5286, 0.3189])),
             ('net.1.weight',
              tensor([[-0.0312, -0.4668, -0.4777],
                      [ 0.0494,  0.1682, -0.4644],
                      [ 0.2081,  0.2456,  0.0224]])),
             ('net.1.bias', tensor([ 0.5437,  0.5618, -0.1748])),
             ('net.2.weight',
              tensor([[-0.0312, -0.4668, -0.4777],
                      [ 0.0494,  0.1682, -0.4644],
                      [ 0.2081,  0.2456,  0.0224]])),
             ('net.2.bias', tensor([ 0.5437,  0.5618, -0.1748])),
             ('net.3.weight',
              tensor([[-0.0721, -0.4978, -0.1776],
                      [ 0.5074, -0.4411, -0.3473],
                      [-0.4309,  0.5612, -0.5570],
                      [-0.4612,  0.1885,  0.3035]])),
             ('net.3.bias', tensor([-

In [27]:
net.net[1].weight.grad == net.net[2].weight.grad

tensor([[True, True, True],
        [True, True, True],
        [True, True, True]])