# Chapter 5 - The Mechanics of Learning

## Part III: Optimizers

In [1]:
import numpy as np
import torch
# Display-only setting: when a tensor is summarized show 2 items at each end
# of a dimension, and wrap printed tensors at 75 characters per line.
torch.set_printoptions(edgeitems=2, linewidth=75)

In [2]:
# Eleven paired measurements: t_c holds the targets the model is fit to,
# t_u holds the raw inputs fed to the model.
t_c = torch.tensor([
    0.5, 14.0, 15.0, 28.0, 11.0, 8.0, 3.0, -4.0, 6.0, 13.0, 21.0,
])
t_u = torch.tensor([
    35.7, 55.9, 58.2, 81.9, 56.3, 48.9, 33.9, 21.8, 48.4, 60.4, 68.4,
])

# Scaled-down copy of the inputs (one tenth of the raw values).
t_un = t_u * 0.1

In [3]:
def model(t_u, w, b):
    """Linear model: scale the input ``t_u`` by ``w`` and shift it by ``b``."""
    scaled = w * t_u
    return scaled + b

In [4]:
def loss_fn(t_p, t_c):
    """Mean squared error between predictions ``t_p`` and targets ``t_c``."""
    residuals = t_p - t_c
    return (residuals ** 2).mean()

In [5]:
# List the names exposed by torch.optim to see which optimizers are available.
dir(torch.optim)

['ASGD',
 'Adadelta',
 'Adagrad',
 'Adam',
 'AdamW',
 'Adamax',
 'LBFGS',
 'Optimizer',
 'RMSprop',
 'Rprop',
 'SGD',
 'SparseAdam',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 'lr_scheduler']

In [6]:
# Initial parameters, unpacked by model() as w = 1.0 and b = 0.0;
# requires_grad=True makes autograd track operations on them.
params = torch.tensor([1.0, 0.0], requires_grad=True)
lr = 1e-5
# Plain SGD over the single two-element parameter tensor.
optimizer = torch.optim.SGD([params], lr=lr)

In [7]:
# One manual optimization step on the raw (unscaled) inputs t_u.
t_p = model(t_u, *params)
loss = loss_fn(t_p, t_c)

# backward() accumulates into params.grad, so clear whatever an earlier
# execution of this cell left behind before computing the new gradient.
# On a first run there is nothing to clear and the result is unchanged.
optimizer.zero_grad()
loss.backward()

optimizer.step()

params

tensor([ 9.5483e-01, -8.2600e-04], requires_grad=True)

### Complete training loop

In [8]:
def training_loop(n_epochs, optimizer, params, t_u, t_c):
    """Run ``n_epochs`` full-batch optimization steps and return ``params``.

    Every epoch evaluates ``model`` on ``t_u`` with the current ``params``,
    scores the prediction against ``t_c`` with ``loss_fn``, backpropagates
    and calls ``optimizer.step()``. The loss is printed on every 500th epoch.

    The optimizer is expected to have been constructed over ``params``; the
    same ``params`` object that was passed in is returned.
    """
    for step in range(n_epochs):
        epoch = step + 1  # epochs are reported 1-based
        loss = loss_fn(model(t_u, *params), t_c)

        # Gradients accumulate across backward() calls, so reset them first.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if epoch % 500:
            continue
        print('Epoch: {}, Loss: {}'.format(epoch, float(loss)))

    return params

In [9]:
# Fresh starting point (w = 1.0, b = 0.0) and a larger step size for SGD.
lr = 1e-2
params = torch.tensor([1.0, 0.0], requires_grad=True)
optimizer = torch.optim.SGD([params], lr=lr)

In [10]:
# Train with the SGD optimizer on the scaled inputs t_un.
training_loop(
    n_epochs=5000,
    optimizer=optimizer,
    params=params,
    t_u=t_un,
    t_c=t_c)

Epoch: 500, Loss: 7.8601155281066895
Epoch: 1000, Loss: 3.828537940979004
Epoch: 1500, Loss: 3.092191219329834
Epoch: 2000, Loss: 2.9576973915100098
Epoch: 2500, Loss: 2.933133840560913
Epoch: 3000, Loss: 2.9286482334136963
Epoch: 3500, Loss: 2.9278297424316406
Epoch: 4000, Loss: 2.9276793003082275
Epoch: 4500, Loss: 2.92765212059021
Epoch: 5000, Loss: 2.9276468753814697


tensor([  5.3671, -17.3012], requires_grad=True)

### Try other optimizers

In [11]:
# Reset the parameters (w = 1.0, b = 0.0) and switch the optimizer to Adam.
lr = 1e-1
params = torch.tensor([1.0, 0.0], requires_grad=True)
optimizer = torch.optim.Adam([params], lr=lr)

In [12]:
# Train with Adam. Note that this run feeds the raw, unscaled t_u, whereas the
# SGD run earlier in the notebook used the scaled t_un.
training_loop(
    n_epochs=2000,
    optimizer=optimizer,
    params=params,
    t_u=t_u,
    t_c=t_c)

Epoch: 500, Loss: 7.612901210784912
Epoch: 1000, Loss: 3.0866997241973877
Epoch: 1500, Loss: 2.9285781383514404
Epoch: 2000, Loss: 2.9276459217071533


tensor([  0.5367, -17.3021], requires_grad=True)

### Training, validation, and overfitting

In [13]:
n_samples = t_u.shape[0]
# int() truncates, so at most 20% of the samples are held out for validation.
n_val = int(0.2 * n_samples)

# Random ordering of the sample indices. No seed is set in the visible cells,
# so the resulting split can differ from one run to the next.
shuffled_indices = torch.randperm(n_samples)

In [14]:
# Show the built-in documentation for torch.randperm.
help(torch.randperm)

Help on built-in function randperm:

randperm(...)
    randperm(n, out=None, dtype=torch.int64, layout=torch.strided, device=None, requires_grad=False) -> LongTensor
    
    Returns a random permutation of integers from ``0`` to ``n - 1``.
    
    Args:
        n (int): the upper bound (exclusive)
        out (Tensor, optional): the output tensor
        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
            Default: ``torch.int64``.
        layout (:class:`torch.layout`, optional): the desired layout of returned Tensor.
            Default: ``torch.strided``.
        device (:class:`torch.device`, optional): the desired device of returned tensor.
            Default: if ``None``, uses the current device for the default tensor type
            (see :func:`torch.set_default_tensor_type`). :attr:`device` will be the CPU
            for CPU tensor types and the current CUDA device for CUDA tensor types.
        requires_grad (bool, optional): If au

In [15]:
# Inspect the permutation that will define the train/validation split.
shuffled_indices

tensor([ 1, 10,  6,  7,  0,  8,  2,  4,  3,  5,  9])

In [16]:
# Split at an explicit boundary instead of with negative offsets: when n_val
# is 0, `[:-n_val]` would be empty and `[-n_val:]` would hold every index,
# leaving nothing to train on. The last n_val shuffled indices go to validation.
n_train = shuffled_indices.shape[0] - n_val
train_indices = shuffled_indices[:n_train]
val_indices = shuffled_indices[n_train:]

train_indices, val_indices

(tensor([ 1, 10,  6,  7,  0,  8,  2,  4,  3]), tensor([5, 9]))

In [17]:
# Gather the training inputs by indexing with the shuffled index tensor.
train_t_u = t_u[train_indices]
train_t_u

tensor([55.9000, 68.4000, 33.9000, 21.8000, 35.7000, 48.4000, 58.2000,
        56.3000, 81.9000])

In [18]:
# Matching training targets, selected with the same indices as train_t_u.
train_t_c = t_c[train_indices]
train_t_c

tensor([14.0000, 21.0000,  3.0000, -4.0000,  0.5000,  6.0000, 15.0000,
        11.0000, 28.0000])

In [19]:
# Held-out inputs and targets, selected with the validation indices.
val_t_u = t_u[val_indices]
val_t_c = t_c[val_indices]

In [20]:
# Apply the same 0.1 input scaling used for t_un to both splits.
train_t_un = 0.1 * train_t_u
val_t_un = 0.1 * val_t_u

In [22]:
def training_loop(n_epochs, optimizer, params, train_t_u, val_t_u, train_t_c, val_t_c):
    """Train on the training split while reporting loss on the validation split.

    Args:
        n_epochs: number of full-batch optimization steps to run.
        optimizer: optimizer whose ``zero_grad()`` / ``step()`` are called each
            epoch; expected to have been constructed over ``params``.
        params: parameter tensor unpacked into ``model`` as ``(w, b)``.
        train_t_u, train_t_c: training inputs and targets.
        val_t_u, val_t_c: validation inputs and targets (never trained on).

    Returns:
        The same ``params`` object that was passed in.

    Both losses are printed for the first three epochs and then on every
    500th epoch.
    """
    for epoch in range(1, n_epochs + 1):
        train_t_p = model(train_t_u, *params)
        train_loss = loss_fn(train_t_p, train_t_c)

        # The validation loss is only reported, never backpropagated, so
        # there is no reason to record an autograd graph for it.
        with torch.no_grad():
            val_t_p = model(val_t_u, *params)
            val_loss = loss_fn(val_t_p, val_t_c)

        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()

        if epoch <= 3 or epoch % 500 == 0:
            # Convert to plain Python floats for logging.
            print('Epoch: {}, Training Loss: {}, Validation Loss: {}'.format(
                epoch, float(train_loss), float(val_loss)))

    return params

In [23]:
# Reset to the starting point (w = 1.0, b = 0.0) before the train/validation run.
params = torch.tensor([1.0, 0.0], requires_grad=True)
lr = 1e-2

In [24]:
# The SGD optimizer is created inline over the freshly reset params; both
# splits are passed in their scaled (0.1 * t_u) form.
training_loop(
    n_epochs=3000,
    optimizer=torch.optim.SGD([params], lr=lr),
    params=params,
    train_t_u=train_t_un,
    val_t_u=val_t_un,
    train_t_c=train_t_c,
    val_t_c=val_t_c)

Epoch: 1, Training Loss: 91.76600646972656, Validation Loss: 29.056846618652344
Epoch: 2, Training Loss: 43.77659606933594, Validation Loss: 2.30248761177063
Epoch: 3, Training Loss: 36.09001159667969, Validation Loss: 3.5195436477661133
Epoch: 500, Training Loss: 7.092033386230469, Validation Loss: 4.611813068389893
Epoch: 1000, Training Loss: 3.411590337753296, Validation Loss: 4.0900797843933105
Epoch: 1500, Training Loss: 2.927306652069092, Validation Loss: 3.9970316886901855
Epoch: 2000, Training Loss: 2.8635826110839844, Validation Loss: 3.9759416580200195
Epoch: 2500, Training Loss: 2.8551971912384033, Validation Loss: 3.9699583053588867
Epoch: 3000, Training Loss: 2.8540937900543213, Validation Loss: 3.967996597290039


tensor([  5.4240, -17.2490], requires_grad=True)

In [25]:
def training_loop(n_epochs, optimizer, params, train_t_u, val_t_u,
                  train_t_c, val_t_c):
    """Run ``n_epochs`` training steps, evaluating validation without autograd.

    Each epoch computes the training loss with gradient tracking, computes the
    validation loss inside ``torch.no_grad()`` (asserting that the result does
    not require grad), then backpropagates the training loss and steps the
    optimizer. Nothing is printed.

    Returns the same ``params`` object that was passed in.
    """
    for _ in range(n_epochs):
        loss_train = loss_fn(model(train_t_u, *params), train_t_c)

        # Validation forward pass: gradient tracking is switched off here.
        with torch.no_grad():
            loss_val = loss_fn(model(val_t_u, *params), val_t_c)
            assert not loss_val.requires_grad

        optimizer.zero_grad()
        loss_train.backward()
        optimizer.step()

    return params

In [26]:
def calc_forward(t_u, t_c, is_train):
    """Return the loss of ``model`` on ``(t_u, t_c)``.

    Gradient tracking is enabled for the forward pass only when ``is_train``
    is true. Note that ``params`` is not an argument: it is read from the
    enclosing (notebook-level) scope at call time.
    """
    with torch.set_grad_enabled(is_train):
        return loss_fn(model(t_u, *params), t_c)