In [2]:
import torch
from torch import nn

In [14]:
# Two-layer MLP: 4 input features -> 8 hidden units (ReLU) -> 1 output.
net = nn.Sequential(
    nn.Linear(4, 8),
    nn.ReLU(),
    nn.Linear(8, 1),
)
# Random mini-batch: 2 examples with 4 features each.
X = torch.rand(2, 4)
net(X)

tensor([[0.0932],
        [0.0213]], grad_fn=<AddmmBackward0>)

### parameter access

In [17]:
# net[2] is the output layer, nn.Linear(8, 1); its state dict lists the
# layer's weight and bias tensors by name.
print(net[2].state_dict())

OrderedDict([('weight', tensor([[ 0.3091,  0.1239,  0.2878, -0.2662, -0.0991,  0.0820,  0.0656,  0.0760]])), ('bias', tensor([-0.0365]))])


In [21]:
# Show the output layer's bias three ways, one per line: its Python type,
# the Parameter object itself, and the raw tensor stored behind it.
print(type(net[2].bias), net[2].bias, net[2].bias.data, sep="\n")

<class 'torch.nn.parameter.Parameter'>
Parameter containing:
tensor([-0.0365], requires_grad=True)
tensor([-0.0365])


In [23]:
# No backward pass has been run on this network so far, so the gradient
# slot is still empty. Use an identity test for the None singleton:
# `== None` would be routed through Tensor.__eq__ once a gradient exists.
net[2].weight.grad is None

True

In [24]:
# List every parameter of the first layer as (name, shape) pairs.
print(*((label, tensor.shape) for label, tensor in net[0].named_parameters()))

('weight', torch.Size([8, 4])) ('bias', torch.Size([8]))


In [25]:
# Same listing for the whole network; names are prefixed with the layer index.
print(*((label, tensor.shape) for label, tensor in net.named_parameters()))

('0.weight', torch.Size([8, 4])) ('0.bias', torch.Size([8])) ('2.weight', torch.Size([1, 8])) ('2.bias', torch.Size([1]))


In [31]:
# Another way to reach a parameter: look it up by name in the network-wide
# state dict. The key '2.bias' refers to the bias of the layer at index 2.
net.state_dict()['2.bias'].data

tensor([-0.0365])

In [32]:
# access parameters from nested blocks
def block1():
    """Build one MLP stage: Linear(4, 8) -> ReLU -> Linear(8, 4) -> ReLU.

    Returns:
        A new `nn.Sequential` mapping 4 input features back to 4 outputs.
    """
    stages = [
        nn.Linear(4, 8),
        nn.ReLU(),
        nn.Linear(8, 4),
        nn.ReLU(),
    ]
    return nn.Sequential(*stages)

def block2():
    """Chain four `block1()` stages, registered as 'block 0' .. 'block 3'.

    Returns:
        An `nn.Sequential` whose children are four separately constructed
        `block1()` modules, added under explicit names.
    """
    stack = nn.Sequential()
    labels = [f'block {idx}' for idx in range(4)]
    for label in labels:
        # Each call to block1() constructs new layers, so no weights are shared.
        stack.add_module(label, block1())
    return stack

# Nested model: the four-stage stack followed by a final Linear(4, 1) head.
rgnet = nn.Sequential(
    block2(),
    nn.Linear(4, 1),
)
rgnet(X)

tensor([[-0.5787],
        [-0.5786]], grad_fn=<AddmmBackward0>)

In [33]:
# Printing the model shows how the named sub-blocks are nested.
print(rgnet)

Sequential(
  (0): Sequential(
    (block 0): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 1): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 2): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 3): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
  )
  (1): Linear(in_features=4, out_features=1, bias=True)
)


In [34]:
# Nested modules are indexed like nested lists: rgnet[0] is the stack built
# by block2(), [1] its second stage ('block 1'), and [0] that stage's first
# Linear(4, 8) layer, whose bias has 8 entries.
rgnet[0][1][0].bias.data

tensor([-0.4818, -0.0174, -0.0208, -0.4686,  0.2109,  0.0775, -0.1493,  0.4267])

### parameter initialization

In [36]:
# built-in initialization
def init_normal(m):
    """Re-initialize a linear layer with Gaussian weights and a zero bias.

    Meant to be passed to `Module.apply`. Only modules whose type is exactly
    `nn.Linear` are touched; every other module is left unchanged.

    Args:
        m: The module to (possibly) re-initialize in place.

    Weights are drawn from a normal distribution with mean 0 and standard
    deviation 0.01. The bias is zeroed when the layer has one; layers built
    with `bias=False` are handled without error.
    """
    if type(m) is not nn.Linear:
        return
    nn.init.normal_(m.weight, mean=0, std=0.01)
    # A layer created with bias=False stores None here; skip it instead of
    # handing None to the initializer.
    if m.bias is not None:
        nn.init.zeros_(m.bias)
# Run init_normal over the network's modules, then display the first
# layer's weight and bias to confirm the new values.
net.apply(init_normal)
net[0].weight.data, net[0].bias.data

(tensor([[ 0.0100,  0.0319, -0.0067,  0.0037],
         [ 0.0011,  0.0015, -0.0035, -0.0195],
         [ 0.0130,  0.0018,  0.0118,  0.0184],
         [-0.0087, -0.0050,  0.0055, -0.0111],
         [-0.0116,  0.0087, -0.0053,  0.0026],
         [-0.0075,  0.0052,  0.0076,  0.0145],
         [-0.0041,  0.0046, -0.0167,  0.0064],
         [ 0.0060, -0.0148,  0.0074, -0.0121]]),
 tensor([0., 0., 0., 0., 0., 0., 0., 0.]))

In [38]:
def init_constant(m):
    """Re-initialize a linear layer with all-ones weights and a zero bias.

    Meant to be passed to `Module.apply`. Only modules whose type is exactly
    `nn.Linear` are touched; every other module is left unchanged.

    Args:
        m: The module to (possibly) re-initialize in place.

    The bias is zeroed when the layer has one; layers built with
    `bias=False` are handled without error.
    """
    if type(m) is not nn.Linear:
        return
    nn.init.constant_(m.weight, 1)
    # A layer created with bias=False stores None here; skip it instead of
    # handing None to the initializer.
    if m.bias is not None:
        nn.init.zeros_(m.bias)
# Run init_constant over the network's modules, then display the first
# layer's weight and bias to confirm the new values.
net.apply(init_constant)
net[0].weight.data, net[0].bias.data

(tensor([[1., 1., 1., 1.],
         [1., 1., 1., 1.],
         [1., 1., 1., 1.],
         [1., 1., 1., 1.],
         [1., 1., 1., 1.],
         [1., 1., 1., 1.],
         [1., 1., 1., 1.],
         [1., 1., 1., 1.]]),
 tensor([0., 0., 0., 0., 0., 0., 0., 0.]))

In [40]:
# Xavier initialization
def init_xavier(m):
    """Apply Xavier (Glorot) uniform initialization to a linear layer's weight.

    Modules whose type is not exactly `nn.Linear` are skipped, and the bias
    is never modified.
    """
    if type(m) is not nn.Linear:
        return
    nn.init.xavier_uniform_(m.weight)
# Run init_xavier over the network's modules, then display the first
# layer's weight matrix.
net.apply(init_xavier)
net[0].weight.data

tensor([[-0.0643, -0.6283,  0.3593,  0.0774],
        [-0.5402,  0.5108,  0.6089,  0.1614],
        [-0.1648, -0.6341,  0.3132,  0.0545],
        [-0.3617,  0.3193, -0.5312,  0.5093],
        [ 0.2761,  0.4817, -0.5850,  0.1586],
        [-0.5534, -0.3259, -0.4372,  0.0915],
        [-0.4286,  0.4302,  0.3723, -0.1169],
        [ 0.2296, -0.3599, -0.2380,  0.3878]])

In [42]:
# custom initialization
# w ~ U(5, 10) with probability 1/4, 0 with probability 1/2, U(-10, -5) with probability 1/4
def my_init(m):
    """Custom initializer for modules whose type is exactly `nn.Linear`.

    Every weight is first drawn from U(-10, 10); entries with |w| < 5 are
    then multiplied by zero. Each weight therefore ends up in U(5, 10) with
    probability 1/4, in U(-10, -5) with probability 1/4, and at 0 with
    probability 1/2. The bias is not modified.

    Prints "Init" followed by the name and shape of the layer's first
    registered parameter.
    """
    if type(m) is not nn.Linear:
        return
    first_name, first_param = next(iter(m.named_parameters()))
    print("Init", first_name, first_param.shape)
    nn.init.uniform_(m.weight, -10, 10)
    # Multiplying by the boolean mask keeps |w| >= 5 and zeroes everything else.
    keep = m.weight.data.abs() >= 5
    m.weight.data *= keep
# Run my_init over the network's modules, then display the first layer's
# weight matrix.
net.apply(my_init)
net[0].weight.data

Init ('weight', torch.Size([8, 4])) ('bias', torch.Size([8]))
Init ('weight', torch.Size([1, 8])) ('bias', torch.Size([1]))


tensor([[-5.4130, -0.0000, -0.0000, -0.0000],
        [-0.0000,  0.0000,  0.0000,  0.0000],
        [ 5.9090,  8.1580,  5.1993,  0.0000],
        [-0.0000, -7.6020, -0.0000, -8.8065],
        [-9.2583,  0.0000,  8.3476,  6.2357],
        [ 0.0000, -0.0000, -9.8175,  9.8928],
        [ 7.5587,  0.0000, -0.0000, -0.0000],
        [-0.0000, -0.0000, -0.0000,  0.0000]])

In [43]:
# Parameters can also be overwritten by hand through `.data`, which shares
# storage with the layer's weight.
first_weight = net[0].weight.data
first_weight.add_(1)        # shift every entry by one, in place
first_weight[0, 0] = 42     # then pin a single entry
first_weight[0]

tensor([42.,  1.,  1.,  1.])

### parameter binding

In [46]:
# Bind the layer to a name first so the very same object can be placed at
# several positions in the network.
shared = nn.Linear(8, 8)
net = nn.Sequential(
    nn.Linear(4, 8),
    nn.ReLU(),
    shared,
    nn.ReLU(),
    shared,
    nn.ReLU(),
    nn.Linear(8, 1),
)
net(X)

tensor([[0.3121],
        [0.3062]], grad_fn=<AddmmBackward0>)

In [47]:
# Display the module tree; the shared layer sits at indices 2 and 4.
net

Sequential(
  (0): Linear(in_features=4, out_features=8, bias=True)
  (1): ReLU()
  (2): Linear(in_features=8, out_features=8, bias=True)
  (3): ReLU()
  (4): Linear(in_features=8, out_features=8, bias=True)
  (5): ReLU()
  (6): Linear(in_features=8, out_features=1, bias=True)
)

In [48]:
# Element-wise comparison of the first weight row at positions 2 and 4.
print(net[2].weight.data[0].eq(net[4].weight.data[0]))

tensor([True, True, True, True, True, True, True, True])


In [49]:
# Write through position 2 and compare again: positions 2 and 4 hold the
# same nn.Linear object, so the change is visible through both.
net[2].weight.data[0, 0] = 100
print(net[2].weight.data[0].eq(net[4].weight.data[0]))

tensor([True, True, True, True, True, True, True, True])
