In [1]:
%matplotlib inline
import numpy as np
import torch
torch.set_printoptions(edgeitems=2, linewidth=75)

In [2]:
t_c = torch.tensor([0.5, 14.0, 15.0, 28.0, 11.0,
                    8.0, 3.0, -4.0, 6.0, 13.0, 21.0])
t_u = torch.tensor([35.7, 55.9, 58.2, 81.9, 56.3, 48.9,
                    33.9, 21.8, 48.4, 60.4, 68.4])
t_un = 0.1 * t_u

In [3]:
def model(t_u, w0, w1, b):
    return w0 * t_u**2 + w1 * t_u + b

In [4]:
def loss_fn(predicted, actual):
    squared_diffs = (predicted - actual)**2
    return squared_diffs.mean()

In [5]:
import torch.optim as optim

dir(optim)

['ASGD',
 'Adadelta',
 'Adagrad',
 'Adam',
 'AdamW',
 'Adamax',
 'LBFGS',
 'NAdam',
 'Optimizer',
 'RAdam',
 'RMSprop',
 'Rprop',
 'SGD',
 'SparseAdam',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '_functional',
 '_multi_tensor',
 'lr_scheduler',
 'swa_utils']

In [6]:
# Validate that the the new model is working

params = torch.tensor([1.0, 1.0, 0.0], requires_grad=True)
learning_rate = 1e-4
optimizer = optim.SGD([params], lr=learning_rate)

t_p = model(t_u, *params)
loss = loss_fn(t_p, t_c)

loss.backward()
optimizer.step()

params

tensor([-2.3139e+03, -3.5181e+01, -5.9642e-01], requires_grad=True)

In [7]:
# Does zeroing the optimizer gradient at the begining make a difference 

params = torch.tensor([1.0, 1.0, 0.0], requires_grad=True)
learning_rate = 1e-4
optimizer = optim.SGD([params], lr=learning_rate)

t_p = model(t_u, *params)
loss = loss_fn(t_p, t_c)

optimizer.zero_grad() # <1>

loss.backward()
optimizer.step()

params

tensor([-2.3139e+03, -3.5181e+01, -5.9642e-01], requires_grad=True)

In [8]:
# Significance of T_un

params = torch.tensor([1.0, 1.0, 0.0], requires_grad=True)
learning_rate = 1e-4
optimizer = optim.SGD([params], lr=learning_rate)

t_p = model(t_u, *params)
loss = loss_fn(t_p, t_c)

loss.backward()
optimizer.step()

params

tensor([-2.3139e+03, -3.5181e+01, -5.9642e-01], requires_grad=True)

In [9]:
# Significance of T_un with zero gradient

params = torch.tensor([1.0, 1.0, 0.0], requires_grad=True)
learning_rate = 1e-4
optimizer = optim.SGD([params], lr=learning_rate)

t_p = model(t_u, *params)
loss = loss_fn(t_p, t_c)

optimizer.zero_grad() # <1>

loss.backward()
optimizer.step()

params

tensor([-2.3139e+03, -3.5181e+01, -5.9642e-01], requires_grad=True)

## final setup

In [10]:
rates_to_learn_at = [1/x for x in [10, 100, 1000, 10000, 100000]]
rates_to_learn_at

[0.1, 0.01, 0.001, 0.0001, 1e-05]

In [11]:
def training_loop(n_epochs, optimizer, params, t_u, t_c, epoch_report_val = 500):
    for epoch in range(1, n_epochs + 1):
        t_p = model(t_u, *params) 
        loss = loss_fn(t_p, t_c)
        
        optimizer.zero_grad()
        
        loss.backward()
        optimizer.step()

        if epoch % epoch_report_val == 0:
            print(f'Epoch {epoch}, Loss {float(loss)}')
            
    return params

## Deciding which optimizer to use

In [12]:
params = torch.tensor([1.0, 1.0, 0.0], requires_grad=True)
learning_rate = rates_to_learn_at[3]
optimizer = optim.SGD([params], lr=learning_rate) 

training_loop(
    n_epochs = 5000, 
    optimizer = optimizer,
    params = params, 
    t_u = t_un,
    t_c = t_c)

Epoch 500, Loss 10.708596229553223
Epoch 1000, Loss 8.642083168029785
Epoch 1500, Loss 7.1710052490234375
Epoch 2000, Loss 6.123478412628174
Epoch 2500, Loss 5.377227306365967
Epoch 3000, Loss 4.8452863693237305
Epoch 3500, Loss 4.465787887573242
Epoch 4000, Loss 4.194724082946777
Epoch 4500, Loss 4.0008015632629395
Epoch 5000, Loss 3.8617441654205322


tensor([ 0.5570, -0.8881, -0.8753], requires_grad=True)

In [13]:
params = torch.tensor([1.0, 1.0, 0.0], requires_grad=True)
learning_rate = rates_to_learn_at[3]
optimizer = optim.Adam([params], lr=learning_rate) 

training_loop(
    n_epochs = 5000, 
    optimizer = optimizer,
    params = params,
    t_u = t_u, 
    t_c = t_c
    )

Epoch 500, Loss 10577728.0
Epoch 1000, Loss 9524402.0
Epoch 1500, Loss 8545122.0
Epoch 2000, Loss 7634292.5
Epoch 2500, Loss 6787368.0
Epoch 3000, Loss 6000706.0
Epoch 3500, Loss 5271407.5
Epoch 4000, Loss 4597170.0
Epoch 4500, Loss 3976134.25
Epoch 5000, Loss 3406753.75


tensor([ 0.5412,  0.5412, -0.4588], requires_grad=True)

Best model is subjective but loss seems to be better with SGD with higher optimizers

## Training loop at different learning rates

### SGD

In [14]:
for rate in rates_to_learn_at:
    print(f"Learning rate is {rate} :\n"+"/\\"*15)
    
    params = torch.tensor([1.0, 1.0, 0.0], requires_grad=True)
    learning_rate = rate
    optimizer = optim.SGD([params], lr=learning_rate) 

    training_loop(
        n_epochs = 5000, 
        optimizer = optimizer,
        params = params, 
        t_u = t_un,
        t_c = t_c)
    
    print(f"Final Paramaters: {params}")
    
    print('\n'+"--"*30+'\n')

Learning rate is 0.1 :
/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\
Epoch 500, Loss nan
Epoch 1000, Loss nan
Epoch 1500, Loss nan
Epoch 2000, Loss nan
Epoch 2500, Loss nan
Epoch 3000, Loss nan
Epoch 3500, Loss nan
Epoch 4000, Loss nan
Epoch 4500, Loss nan
Epoch 5000, Loss nan
Final Paramaters: tensor([nan, nan, nan], requires_grad=True)

------------------------------------------------------------

Learning rate is 0.01 :
/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\
Epoch 500, Loss nan
Epoch 1000, Loss nan
Epoch 1500, Loss nan
Epoch 2000, Loss nan
Epoch 2500, Loss nan
Epoch 3000, Loss nan
Epoch 3500, Loss nan
Epoch 4000, Loss nan
Epoch 4500, Loss nan
Epoch 5000, Loss nan
Final Paramaters: tensor([nan, nan, nan], requires_grad=True)

------------------------------------------------------------

Learning rate is 0.001 :
/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\
Epoch 500, Loss nan
Epoch 1000, Loss nan
Epoch 1500, Loss nan
Epoch 2000, Loss nan
Epoch 2500, Loss nan
Epoch 3000, Loss nan
Epoch 3500, Loss nan
Epoch 4000, Loss nan


### ADAM

In [15]:
for rate in rates_to_learn_at:
    print(f"Learning rate is {rate} :\n"+"/\\"*15)
    
    params = torch.tensor([1.0, 1.0, 0.0], requires_grad=True)
    learning_rate = rate
    optimizer = optim.Adam([params], lr=learning_rate) 

    training_loop(
        n_epochs = 5000, 
        optimizer = optimizer,
        params = params, 
        t_u = t_un,
        t_c = t_c)
    
    print(f"Final Paramaters: {params}")
    
    print('\n'+"--"*30+'\n')

Learning rate is 0.1 :
/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\
Epoch 500, Loss 2.7825067043304443
Epoch 1000, Loss 2.4860265254974365
Epoch 1500, Loss 2.2615137100219727
Epoch 2000, Loss 2.144075393676758
Epoch 2500, Loss 2.101926565170288
Epoch 3000, Loss 2.092149019241333
Epoch 3500, Loss 2.0908169746398926
Epoch 4000, Loss 2.0907232761383057
Epoch 4500, Loss 2.090721368789673
Epoch 5000, Loss 2.090721368789673
Final Paramaters: tensor([  0.2830,   2.4760, -10.6496], requires_grad=True)

------------------------------------------------------------

Learning rate is 0.01 :
/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\
Epoch 500, Loss 6.111172199249268
Epoch 1000, Loss 3.936776638031006
Epoch 1500, Loss 3.1178040504455566
Epoch 2000, Loss 2.931839942932129
Epoch 2500, Loss 2.8712592124938965
Epoch 3000, Loss 2.8129403591156006
Epoch 3500, Loss 2.7440879344940186
Epoch 4000, Loss 2.664674997329712
Epoch 4500, Loss 2.5763678550720215
Epoch 5000, Loss 2.482455253601074
Final Paramaters: tensor([ 0.4673,  0.4768,

### ADAGRAD

In [16]:
for rate in rates_to_learn_at:
    print(f"Learning rate is {rate} :\n"+"/\\"*15)
    
    params = torch.tensor([1.0, 1.0, 0.0], requires_grad=True)
    learning_rate = rate
    optimizer = optim.Adagrad([params], lr=learning_rate) 

    training_loop(
        n_epochs = 5000, 
        optimizer = optimizer,
        params = params, 
        t_u = t_un,
        t_c = t_c)
    
    print(f"Final Paramaters: {params}")
    
    print('\n'+"--"*30+'\n')

Learning rate is 0.1 :
/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\
Epoch 500, Loss 5.5968017578125
Epoch 1000, Loss 4.2388691902160645
Epoch 1500, Loss 3.5880417823791504
Epoch 2000, Loss 3.2693517208099365
Epoch 2500, Loss 3.109431266784668
Epoch 3000, Loss 3.026103973388672
Epoch 3500, Loss 2.9799187183380127
Epoch 4000, Loss 2.951834201812744
Epoch 4500, Loss 2.932614803314209
Epoch 5000, Loss 2.917778730392456
Final Paramaters: tensor([ 0.5440, -0.3944, -3.3654], requires_grad=True)

------------------------------------------------------------

Learning rate is 0.01 :
/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\
Epoch 500, Loss 130.22694396972656
Epoch 1000, Loss 51.7548828125
Epoch 1500, Loss 24.96746063232422
Epoch 2000, Loss 14.81622314453125
Epoch 2500, Loss 10.820043563842773
Epoch 3000, Loss 9.208422660827637
Epoch 3500, Loss 8.536825180053711
Epoch 4000, Loss 8.238479614257812
Epoch 4500, Loss 8.088947296142578
Epoch 5000, Loss 7.999006748199463
Final Paramaters: tensor([ 0.3664,  0.3244, -0.7380], re

### ASGD

In [17]:
for rate in rates_to_learn_at:
    print(f"Learning rate is {rate} :\n"+"/\\"*15)
    
    params = torch.tensor([1.0, 1.0, 0.0], requires_grad=True)
    learning_rate = rate
    optimizer = optim.ASGD([params], lr=learning_rate) 

    training_loop(
        n_epochs = 5000, 
        optimizer = optimizer,
        params = params, 
        t_u = t_un,
        t_c = t_c)
    
    print(f"Final Paramaters: {params}")
    
    print('\n'+"--"*30+'\n')

Learning rate is 0.1 :
/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\
Epoch 500, Loss nan
Epoch 1000, Loss nan
Epoch 1500, Loss nan
Epoch 2000, Loss nan
Epoch 2500, Loss nan
Epoch 3000, Loss nan
Epoch 3500, Loss nan
Epoch 4000, Loss nan
Epoch 4500, Loss nan
Epoch 5000, Loss nan
Final Paramaters: tensor([nan, nan, nan], requires_grad=True)

------------------------------------------------------------

Learning rate is 0.01 :
/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\
Epoch 500, Loss nan
Epoch 1000, Loss nan
Epoch 1500, Loss nan
Epoch 2000, Loss nan
Epoch 2500, Loss nan
Epoch 3000, Loss nan
Epoch 3500, Loss nan
Epoch 4000, Loss nan
Epoch 4500, Loss nan
Epoch 5000, Loss nan
Final Paramaters: tensor([nan, nan, nan], requires_grad=True)

------------------------------------------------------------

Learning rate is 0.001 :
/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\
Epoch 500, Loss nan
Epoch 1000, Loss nan
Epoch 1500, Loss nan
Epoch 2000, Loss nan
Epoch 2500, Loss nan
Epoch 3000, Loss nan
Epoch 3500, Loss nan
Epoch 4000, Loss nan
