# CHAPTER 5 - D2L.ai
This notebook was run by Arsyi Syarief Aziz (H071191003) for the Introduction to Deep Learning course (Unhas) taught by Risman Adnan.

In [1]:
import torch
from torch import nn
from torch.nn import functional as F

# 5.1 Layers and Blocks

**MLP**

In [2]:
net = nn.Sequential(nn.Linear(20, 256), nn.ReLU(), nn.Linear(256, 10))

X = torch.rand(2, 20)
net(X)

tensor([[ 0.0499, -0.1611,  0.0672, -0.0895,  0.1796,  0.3248, -0.0010, -0.0797,
          0.0503, -0.0436],
        [-0.0360, -0.1827,  0.0832, -0.1171,  0.1132,  0.1783, -0.0652, -0.1709,
          0.1219, -0.1799]], grad_fn=<AddmmBackward0>)

## 5.1.1 A Custom Block

In [5]:
class MLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.hidden = nn.Linear(20, 256)
        self.out = nn.Linear(256, 10)
    
    def forward(self, X):
        return self.out(F.relu(self.hidden(X)))

In [7]:
net = MLP()
net(X)

tensor([[ 0.0802, -0.0963, -0.3483, -0.0991, -0.0429,  0.0799,  0.2326, -0.0274,
         -0.0225,  0.0361],
        [ 0.0231, -0.0435, -0.3410, -0.1517, -0.1208,  0.1338,  0.2772, -0.1176,
         -0.0387,  0.1321]], grad_fn=<AddmmBackward0>)

## 5.1.2 The Sequential Block

In [48]:
class MySequential(nn.Module):
    def __init__(self, *args):
        super().__init__()
        for idx, module in enumerate(args):
            self._modules[str(idx)] = module
    
    def forward(self, X):
        for block in self._modules.values():
            X = block(X)
        return X

In [49]:
net = MySequential(nn.Linear(20, 256), nn.ReLU(), nn.Linear(256, 10))
net(X)

tensor([[ 0.1962, -0.0862,  0.2806,  0.0399,  0.2997, -0.1053,  0.0283,  0.0352,
         -0.0996, -0.1864],
        [ 0.1835, -0.1673,  0.2548, -0.0067,  0.1429, -0.0555,  0.0732, -0.0368,
         -0.0368, -0.2123]], grad_fn=<AddmmBackward0>)

## 5.1.3 Executing Code in the Forward Propagation Function

In [14]:
class FixedHiddenMLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.rand_weight = torch.rand((20, 20), requires_grad=False)
        self.linear = nn.Linear(20, 20)
        
    def forward(self, X):
        X = self.linear(X)
        X = F.relu(torch.mm(X, self.rand_weight) + 1)
        X = self.linear(X)
        while X.abs().sum() > 1:
            X /=2
        return X.sum()

In [15]:
net = FixedHiddenMLP()
net(X)

tensor(0.0751, grad_fn=<SumBackward0>)

In [18]:
class NestMLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(20, 64), nn.ReLU(),
                                 nn.Linear(64, 32), nn. ReLU())
        self.linear = nn.Linear(32, 16)
    
    def forward(self, X):
        return self.linear(self.net(X))

chimera = nn.Sequential(NestMLP(), nn.Linear(16, 20), FixedHiddenMLP())
chimera(X)

tensor(-0.0787, grad_fn=<SumBackward0>)

# 5.2 Parameter Management

## 5.2.1 Parameter Access

In [50]:
net = nn.Sequential(nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 1))
X = torch.rand(size=(2, 4))
net(X)

tensor([[-0.0836],
        [-0.0818]], grad_fn=<AddmmBackward0>)

In [66]:
print(net[2].state_dict())

OrderedDict([('weight', tensor([[ 0.0156,  0.0775, -0.0777, -0.0937,  0.0510, -0.1529,  0.1367, -0.1091]])), ('bias', tensor([-0.0659]))])


In [67]:
print(type(net[2].bias))
print(net[2].bias)
print(net[2].bias.data)

<class 'torch.nn.parameter.Parameter'>
Parameter containing:
tensor([-0.0659], requires_grad=True)
tensor([-0.0659])


In [68]:
net[2].weight.grad == None

True

In [69]:
print(*[(name, param.shape) for name, param in net[0].named_parameters()])
print(*[(name, param.shape) for name, param in net.named_parameters()])

('weight', torch.Size([8, 4])) ('bias', torch.Size([8]))
('0.weight', torch.Size([8, 4])) ('0.bias', torch.Size([8])) ('2.weight', torch.Size([1, 8])) ('2.bias', torch.Size([1]))


In [70]:
net.state_dict()['2.bias'].data

tensor([-0.0659])

In [71]:
def block1():
    return nn.Sequential(nn.Linear(4, 8), nn.ReLU(),
                         nn.Linear(8, 4), nn.ReLU())

def block2():
    net = nn.Sequential()
    for i in range(4):
        net.add_module(f'block {i}', block1())
    return net

rgnet = nn.Sequential(block2(), nn.Linear(4, 1))
rgnet(X)

tensor([[0.4249],
        [0.4249]], grad_fn=<AddmmBackward0>)

In [72]:
print(rgnet)

Sequential(
  (0): Sequential(
    (block 0): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 1): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 2): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 3): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
  )
  (1): Linear(in_features=4, out_features=1, bias=True)
)


In [74]:
rgnet[0][1][0].bias.data

tensor([-0.4772,  0.4219, -0.0511,  0.2918, -0.1627, -0.4710, -0.2698, -0.0410])

## 5.2.2 Parameter Initialization 

### 5.2.2.1 Built-in Initialization

In [79]:
def init_normal(m):
    if type(m) == nn.Linear:
        nn.init.normal_(m.weight, mean=0, std=0.01)
        nn.init.zeros_(m.bias)
net.apply(init_normal)
net[0].weight.data[0], net[0].bias.data[0]

(tensor([-0.0040,  0.0102,  0.0291, -0.0006]), tensor(0.))

In [80]:
def init_constant(m):
    if type(m) == nn.Linear:
        nn.init.constant_(m.weight, 1)
        nn.init.zeros_(m.bias)
net.apply(init_constant)
net[0].weight.data[0], net[0].bias.data[0]

(tensor([1., 1., 1., 1.]), tensor(0.))

In [84]:
def xavier(m):
    if type(m) == nn.Linear:
        nn.init.xavier_uniform_(m.weight)
def init_42(m):
    if type(m) == nn.Linear:
        nn.init.constant_(m.weight, 42)
        
net[0].apply(xavier)
net[2].apply(init_42)
print(net[0].weight.data[0])
print(net[2].weight.data)

tensor([ 0.1736, -0.3667, -0.4524, -0.1194])
tensor([[42., 42., 42., 42., 42., 42., 42., 42.]])


### 5.2.2.2 Custom Initialization

In [88]:
def my_init(m):
    if type(m) == nn.Linear:
        print("Init", *[(name, param.shape) for name, param in m.named_parameters()][0])
        nn.init.uniform_(m.weight, -10, 10)
        m.weight.data *= m.weight.data.abs() >=5
        
net.apply(my_init)
net[0].weight[:2]

Init weight torch.Size([8, 4])
Init weight torch.Size([1, 8])


tensor([[ 0.0000, -0.0000, -9.2657, -5.8867],
        [ 9.2226, -9.9599, -9.3391, -0.0000]], grad_fn=<SliceBackward0>)

In [89]:
net[0].weight.data[:] += 1
net[0].weight.data[0, 0] = 42
net[0].weight.data[0]

tensor([42.0000,  1.0000, -8.2657, -4.8867])

### 5.2.3 Tied Parameters

In [93]:
shared = nn.Linear(8, 8)
net = nn.Sequential(nn.Linear(4, 8), nn.ReLU(),
                    shared, nn.ReLU(),
                    shared, nn.ReLU(),
                    nn.Linear(8, 1))
net(X)

print(net[2].weight.data[0] == net[4].weight.data[0])
net[2].weight.data[0, 0] = 100
print(net[2].weight.data[0] == net[4].weight.data[0])

tensor([True, True, True, True, True, True, True, True])
tensor([True, True, True, True, True, True, True, True])


# 5.3 Deferred Initialization
Not relevant in PyTorch

# 5.4 Custom Layers

## 5.4.1 Layers without Parameters

In [95]:
class CenteredLayer(nn.Module):
    def __init__(self):
        super().__init__()
        
    def forward(self, X):
        return X - X.mean()

In [96]:
layer = CenteredLayer()
layer(torch.FloatTensor([1, 2, 3, 4, 5]))

tensor([-2., -1.,  0.,  1.,  2.])

In [97]:
net = nn.Sequential(nn.Linear(8, 128), CenteredLayer())

In [100]:
Y =net(torch.rand(4, 8))
Y.mean()

tensor(-5.5879e-09, grad_fn=<MeanBackward0>)

## 5.4.2 Layers with Parameters

In [102]:
class MyLinear(nn.Module):
    def __init__(self, in_units, units):
        super().__init__()
        self.weight = nn.Parameter(torch.randn(in_units, units))
        self.bias = nn.Parameter(torch.randn(units,))
        
    def forward(self, X):
        linear = torch.matmul(X, self.weight.data) + self.bias.data
        return F.relu(linear)

In [122]:
linear = MyLinear(5, 3)
linear.weight

Parameter containing:
tensor([[-1.5370, -0.8776, -1.3446],
        [ 1.2160, -0.0453,  0.0934],
        [-0.0412, -0.6012,  0.1455],
        [ 0.0086,  1.4191,  1.4225],
        [ 0.1555,  1.7418,  0.2478]], requires_grad=True)

In [125]:
linear(torch.rand(2, 5))

tensor([[0.8800, 3.5178, 2.4829],
        [0.7081, 1.5291, 2.0190]])

In [126]:
net = nn.Sequential(MyLinear(64, 8), MyLinear(8, 1))
net(torch.rand(2, 64))

tensor([[0.0000],
        [3.7206]])

# 5.5 File I/O

## 5.5.1 Loading and Saving Tensors

In [127]:
x = torch.arange(4)
torch.save(x, 'x-file')

In [129]:
x2 = torch.load('x-file')
x2

tensor([0, 1, 2, 3])

In [130]:
y = torch.zeros(4)
torch.save([x, y], 'x-files')
x2, y2 = torch.load('x-files')
(x2, y2)

(tensor([0, 1, 2, 3]), tensor([0., 0., 0., 0.]))

In [131]:
mydict = {'x': x, 'y': y}
torch.save(mydict, 'mydict')
mydict2 = torch.load('mydict')
mydict2

{'x': tensor([0, 1, 2, 3]), 'y': tensor([0., 0., 0., 0.])}

## 5.5.2 Loading and Saving Model Parameters

In [137]:
class MLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.hidden = nn.Linear(20, 256)
        self.output = nn.Linear(256, 10)
    def forward(self, x):
        return self.output(F.relu(self.hidden(x)))
    
net = MLP()
X = torch.randn(size=(2, 20))
Y = net(X)

In [138]:
torch.save(net.state_dict(), 'mlp.params')

In [139]:
clone = MLP()
clone.load_state_dict(torch.load('mlp.params'))
clone.eval()

MLP(
  (hidden): Linear(in_features=20, out_features=256, bias=True)
  (output): Linear(in_features=256, out_features=10, bias=True)
)

In [140]:
Y_clone = clone(X)
Y_clone == Y

tensor([[True, True, True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True, True, True]])

# 5.6 GPUs
Note: I don't have a dedicated GPU. All computations done in this subchapter will be done on my CPU. I ran the commands just to understand the functions related to GPU computation.

## 5.6.1 Computing Devices

In [148]:
torch.device('cpu'), torch.device('cuda')

(device(type='cpu'), device(type='cuda'))

In [149]:
torch.cuda.device_count()

0

In [146]:
def try_gpu(i=0):  #@save
    """Return gpu(i) if exists, otherwise return cpu()."""
    if torch.cuda.device_count() >= i + 1:
        return torch.device(f'cuda:{i}')
    return torch.device('cpu')

def try_all_gpus():  #@save
    """Return all available GPUs, or [cpu(),] if no GPU exists."""
    devices = [torch.device(f'cuda:{i}')
             for i in range(torch.cuda.device_count())]
    return devices if devices else [torch.device('cpu')]

try_gpu(), try_gpu(10), try_all_gpus()

(device(type='cpu'), device(type='cpu'), [device(type='cpu')])

## 5.6.2 Tensors and GPUs

In [150]:
x = torch.tensor([1, 2, 3])
x.device

device(type='cpu')

In [153]:
X = torch.ones(2, 3, device=try_gpu())
X

tensor([[1., 1., 1.],
        [1., 1., 1.]])

In [154]:
Y = torch.rand(2, 3, device=try_gpu(1))
Y

tensor([[0.8163, 0.4712, 0.8609],
        [0.2571, 0.2951, 0.1604]])

In [158]:
Z = X # Z = X.cuda(1)
print(X)
print(Z)

tensor([[1., 1., 1.],
        [1., 1., 1.]])
tensor([[1., 1., 1.],
        [1., 1., 1.]])


In [159]:
Y + Z


tensor([[1.8163, 1.4712, 1.8609],
        [1.2571, 1.2951, 1.1604]])

In [160]:
# Z.cuda(1) is Z

In [161]:
## 5.6.3 Neural Networks and GPUs

In [162]:
net = nn.Sequential(nn.Linear(3, 1))
net = net.to(device=try_gpu())

In [163]:
net(X)

tensor([[1.0144],
        [1.0144]], grad_fn=<AddmmBackward0>)

In [164]:
net[0].weight.data.device

device(type='cpu')