In [8]:
import torch
import numpy as np
import plotly.express as px
torch.set_printoptions(edgeitems=2, linewidth=75)

In [10]:
def model(t_u, w, b):
    return w*t_u + b

def loss_fn(t_p, t_c):
    squared_diffs = (t_p - t_c) ** 2
    return squared_diffs.mean()

In [13]:
params = torch.tensor([1.0, 0.0], requires_grad = True)
t_c = torch.tensor([0.5, 14.0, 15.0, 28.0, 11.0, 8.0,
                    3.0, -4.0, 6.0, 13.0, 21.0])
t_u = torch.tensor([35.7, 55.9, 58.2, 81.9, 56.3, 48.9,
                    33.9, 21.8, 48.4, 60.4, 68.4])
t_un = 0.1 * t_u

In [14]:
params.grad is None

True

In [15]:
# Tracks the entire family treeof tensors resultung on operations from params
# Amy tensor with params as an ancestor will have access to chain of functions that were called
# to get from params to that tensor

# The derivative of such functions can will be automatically calculated as a grad attribute of the params tensor
# Call the model, compute the loss and then call backward on the loss tensor

loss = loss_fn(model(t_u, *params), t_c)
loss.backward()

In [16]:
params.grad

tensor([4517.2969,   82.6000])

In [17]:
# Gradient accumulates at the leaf nodes, so we need to explicitly ero the gradient, using the inplace zero_ method
if params.grad is not None:
    params.grad.zero_()

In [20]:
# Comlpete code for training using autograd 
def training_loop(n_epochs, learning_rate, params, t_u, t_c):
    for epoch in range(1, n_epochs+1):
        if params.grad is not None:
            params.grad.zero_()
        
        t_p = model(t_u, *params)
        loss = loss_fn(t_p, t_c)
        loss.backward()
        
        with torch.no_grad(): # Here the autograd doesnt work, i.e doesnt add edges to forward graph
            params -= learning_rate* params.grad # Keeping the same params to use in optimizers registration
            
        if epoch % 500 == 0:
            print("Epoch %d, Loss %f" % (epoch, float(loss)))
            
    return params

In [21]:
training_loop(n_epochs= 5000,
             learning_rate=1e-2,
             params = torch.tensor([1.0, 0.0], requires_grad = True),
             t_u = t_un,
             t_c = t_c)

Epoch 500, Loss 7.860115
Epoch 1000, Loss 3.828538
Epoch 1500, Loss 3.092191
Epoch 2000, Loss 2.957698
Epoch 2500, Loss 2.933134
Epoch 3000, Loss 2.928648
Epoch 3500, Loss 2.927830
Epoch 4000, Loss 2.927679
Epoch 4500, Loss 2.927652
Epoch 5000, Loss 2.927647


tensor([  5.3671, -17.3012], requires_grad=True)

## Optimizers 

In [22]:
import torch.optim as optim
dir(optim)

['ASGD',
 'Adadelta',
 'Adagrad',
 'Adam',
 'AdamW',
 'Adamax',
 'LBFGS',
 'Optimizer',
 'RMSprop',
 'Rprop',
 'SGD',
 'SparseAdam',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '_multi_tensor',
 'functional',
 'lr_scheduler',
 'swa_utils']

In [23]:
params = torch.tensor([.0, 0.0], requires_grad = True)
learning_rate = 1e-5
optimizer = optim.SGD([params], lr= learning_rate)

In [25]:
# Trying it out
t_p = model(t_u, *params)
loss = loss_fn(t_p, t_c)
loss.backward()

optimizer.step()
params

tensor([0.0401, 0.0006], requires_grad=True)

In [26]:
# Putting it back with zero_grad to get the required logic and preventing accumulation
params = torch.tensor([.0, 0.0], requires_grad = True)
learning_rate = 1e-2
optimizer = optim.SGD([params], lr= learning_rate)

t_p = model(t_u, *params)
loss = loss_fn(t_p, t_c)

optimizer.zero_grad()
loss.backward()
optimizer.step()

params

tensor([13.6430,  0.2100], requires_grad=True)

In [28]:
# Training loop
def training_loop(n_epochs, optimizer, params, t_u, t_c):
    for epoch in range(1, n_epochs+1):
        if params.grad is not None:
            params.grad.zero_()
        
        t_p = model(t_u, *params)
        loss = loss_fn(t_p, t_c)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()  
        
        if epoch % 500 == 0:
            print("Epoch %d, Loss %f" % (epoch, float(loss)))
            
    return params

In [29]:
params = torch.tensor([1.0, 0.0], requires_grad = True)
learning_rate = 1e-2
optimizer = optim.SGD([params], lr= learning_rate)

training_loop(n_epochs=5000,
             optimizer=optimizer,
             params=params,
             t_u=t_un,
             t_c = t_c)

Epoch 500, Loss 7.860115
Epoch 1000, Loss 3.828538
Epoch 1500, Loss 3.092191
Epoch 2000, Loss 2.957698
Epoch 2500, Loss 2.933134
Epoch 3000, Loss 2.928648
Epoch 3500, Loss 2.927830
Epoch 4000, Loss 2.927679
Epoch 4500, Loss 2.927652
Epoch 5000, Loss 2.927647


tensor([  5.3671, -17.3012], requires_grad=True)

In [30]:
learning_rate = 1e-1
optimizer = optim.Adam([params], lr= learning_rate)
training_loop(n_epochs=5000,
             optimizer=optimizer,
             params=params,
             t_u=t_u, # Not normalized
             t_c = t_c)

# Adam optimizer is really sophisticated 

Epoch 500, Loss 4.661878
Epoch 1000, Loss 4.247613
Epoch 1500, Loss 3.804652
Epoch 2000, Loss 3.425622
Epoch 2500, Loss 3.161072
Epoch 3000, Loss 3.013728
Epoch 3500, Loss 2.951061
Epoch 4000, Loss 2.931960
Epoch 4500, Loss 2.928126
Epoch 5000, Loss 2.927674


tensor([  0.5371, -17.3226], requires_grad=True)

In [31]:
# Less sensitive to scaling of parameters
# Learning rate is set adaptively

# Splitting DataSet

In [32]:
n_samples = t_u.shape[0]
n_val = int(0.2 * n_samples)

shuffled_indices = torch.randperm(n_samples)

train_indices = shuffled_indices[:-n_val]
val_indices = shuffled_indices[-n_val:]

train_indices, val_indices

(tensor([6, 2, 5, 0, 8, 1, 3, 7, 4]), tensor([ 9, 10]))

In [33]:
train_t_u = t_u[train_indices]
train_t_c = t_c[train_indices]

val_t_u = t_u[val_indices]
val_t_c = t_c[val_indices]

train_t_un = 0.1 * train_t_u
val_t_un = 0.1 * val_t_u

In [47]:
def training_loop(n_epochs, optimizer, params, train_t_u, val_t_u, train_t_c, val_t_c):
    for epoch in range(1, n_epochs + 1):
        train_t_p = model(train_t_u, *params)
        train_loss = loss_fn(train_t_p, train_t_c)
        
        val_t_p = model(val_t_u, *params)
        val_loss = loss_fn(val_t_p, val_t_c)
        
        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()

        if epoch <= 3 or epoch % 500 == 0:
            print(f"Epoch {epoch}, Training loss {train_loss.item():.4f},"
                  f" Validation loss {val_loss.item():.4f}")
            
    return params

In [48]:
params = torch.tensor([1.0, 0.0], requires_grad=True)
learning_rate = 1e-2
optimizer = optim.SGD([params], lr=learning_rate)

training_loop(
    n_epochs = 3000, 
    optimizer = optimizer,
    params = params,
    train_t_u = train_t_un,  
    val_t_u = val_t_un,  
    train_t_c = train_t_c,
    val_t_c = val_t_c)

Epoch 1, Training loss 70.5623, Validation loss 124.4736
Epoch 2, Training loss 40.3757, Validation loss 51.5750
Epoch 3, Training loss 34.2207, Validation loss 30.3821
Epoch 500, Training loss 7.1204, Validation loss 6.4052
Epoch 1000, Training loss 3.4128, Validation loss 4.2236
Epoch 1500, Training loss 2.8828, Validation loss 3.7480
Epoch 2000, Training loss 2.8070, Validation loss 3.6181
Epoch 2500, Training loss 2.7962, Validation loss 3.5761
Epoch 3000, Training loss 2.7946, Validation loss 3.5612


tensor([  5.3306, -17.0593], requires_grad=True)

In [50]:
def training_loop(n_epochs, optimizer, params, train_t_u, val_t_u, train_t_c, val_t_c):
    for epoch in range(1, n_epochs + 1):
        train_t_p = model(train_t_u, *params)
        train_loss = loss_fn(train_t_p, train_t_c)
        
        with torch.no_grad():
            val_t_p = model(val_t_u, *params)
            val_loss = loss_fn(val_t_p, val_t_c)
            assert val_loss.requires.grad == False
        
        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()

        if epoch <= 3 or epoch % 500 == 0:
            print(f"Epoch {epoch}, Training loss {train_loss.item():.4f},"
                  f" Validation loss {val_loss.item():.4f}")
            
    return params

In [51]:
def calc_forward(t_u, t_c, is_train):
    with torch.set_grad_enabled(is_train):
        t_p = model(t_u, *params)
        loss = loss_fn(t_p, t_c)
    return loss