In [None]:
# %% Deep learning - Section 6.37
#    Code challenge 3: fixed vs. dynamic learning rate

#    1) Use code from Sec_06_032_gradient_descent_1D
#    2) Think about how to change the learning rate:
#       - Time (training epoch)
#       - Derivative
#       - Loss
#       - Current local minimum value
#    3) Implement and test your ideas

# This code pertains to a deep learning course provided by Mike X. Cohen on Udemy:
#   > https://www.udemy.com/course/deeplearning_x
# The "base" code in this repository is adapted (with very minor modifications)
# from code developed by the course instructor (Mike X. Cohen), while the
# "exercises" and the "code challenges" contain more original solutions and
# creative input from my side. If you are interested in DL (and if you are
# reading this statement, chances are that you are), go check out the course, it
# is singularly good.


In [None]:
# %% Libraries and modules
import numpy               as np
import matplotlib.pyplot   as plt
import sympy               as sym
import copy

from google.colab                     import files
from matplotlib_inline.backend_inline import set_matplotlib_formats
set_matplotlib_formats('svg')


In [None]:
# %% Define function

# The function
def fx(x):
    """Evaluate the quadratic f(x) = 3x^2 - 3x + 4.

    Accepts anything that supports arithmetic with integers; in this
    notebook it is called with both scalars and NumPy arrays.
    """
    quadratic_term = 3 * x**2
    linear_term = 3 * x
    return quadratic_term - linear_term + 4

def df(x):
    """Evaluate 6x - 3, the analytic derivative of 3x^2 - 3x + 4.

    Accepts anything that supports arithmetic with integers; in this
    notebook it is called with both scalars and NumPy arrays.
    """
    scaled = 6 * x
    return scaled - 3


In [None]:
# %% Gradient descent
#    Method 0 - No scaling (fixed learning rate)

# Grid of candidate x values; also used as the plotting domain
x = np.linspace(-2,2,2001)

# Draw one random starting guess and keep a copy of it, so that other
# methods can start from the same point
local_min_start = np.random.choice(x,1).item()
local_min       = local_min_start

print(f'Starting local minimum: {local_min:.8f}')

learning_rate   = .01
training_epochs = 50
# One row per epoch: [updated estimate, gradient used for the step, learning rate]
model_paramsFix = np.zeros((training_epochs,3))

for i in range(training_epochs):
    gradient             = df(local_min)
    lr                   = learning_rate   # constant step size for this method
    local_min            = local_min - gradient*lr
    model_paramsFix[i,:] = local_min,gradient,lr

print(f'Estimated local minimum: {local_min:.8f}')


In [None]:
# %% Plotting
#    Plot the function, its derivative, and the estimated minimum on both curves

# Labels are attached to the artists themselves so that the legend cannot
# drift out of sync with the plotting order
plt.plot(x,fx(x),label="f(x)")
plt.plot(x,df(x),label="f'(x)")
# Marker on the derivative curve: left unlabelled so it stays out of the legend
plt.plot(local_min,df(local_min),'ro')
# Marker on the function curve: this is the point the "f(x) min" entry refers to
plt.plot(local_min,fx(local_min),'ro',label="f(x) min")

plt.xlim(x[[0,-1]])
plt.grid()
plt.xlabel('x')
plt.ylabel('f(x)')
plt.legend()
plt.suptitle('A function and its derivative')
plt.title('Empirical local minimum: %s' %np.round(local_min,4))

# Save before show(): the figure must still be open when it is written to disk
plt.savefig('figure42_code_challenge_3.png')

plt.show()

files.download('figure42_code_challenge_3.png')


In [None]:
# %% Gradient descent
#    Method 1 - Scale learning rate by gradient
#    Useful because adaptive, but for more complex functions it requires additional parameters
#    and appropriate scaling
#    In the industry, it is usually incorporated in optimizers such as RMSprop and Adam

# Choose between a random or a fixed starting point.
# By default, start from the shared starting point (local_min_start) so that
# the methods can be compared on equal terms; set use_random_start to True to
# draw a fresh random starting point instead.
use_random_start = False
if use_random_start:
    local_min = np.random.choice(x,1).item()
else:
    local_min = local_min_start
print(f'Starting local minimum: {local_min:.8f}')

learning_rate   = .01
training_epochs = 50
# One row per epoch: [updated estimate, gradient used for the step, learning rate]
model_paramsGrd = np.zeros((training_epochs,3))

for i in range(training_epochs):
    gradient             = df(local_min)
    # Step size proportional to the gradient magnitude: large steps on steep
    # slopes, vanishing steps as the gradient approaches zero
    lr                   = learning_rate * abs(gradient)
    local_min            = local_min - gradient*lr
    model_paramsGrd[i,:] = local_min,gradient,lr

print(f'Estimated local minimum: {local_min:.8f}')

In [None]:
# %% Gradient descent
#    Method 2 - Scale learning rate by number of epochs
#    Quite good, often done in blocks, but unrelated to the model performance or
#    accuracy, from which the l.r. is scaled independently
#    In the industry it is called "learning rate decay"

# Choose between a random or a fixed starting point.
# By default, start from the shared starting point (local_min_start) so that
# the methods can be compared on equal terms; set use_random_start to True to
# draw a fresh random starting point instead.
use_random_start = False
if use_random_start:
    local_min = np.random.choice(x,1).item()
else:
    local_min = local_min_start
print(f'Starting local minimum: {local_min:.8f}')

learning_rate   = .01
training_epochs = 50
# One row per epoch: [updated estimate, gradient used for the step, learning rate]
model_paramsTim = np.zeros((training_epochs,3))

for i in range(training_epochs):
    gradient             = df(local_min)
    # Linear decay with the epoch index; the factor reaches 0 on the final
    # epoch, so the last iteration does not move the estimate
    lr                   = learning_rate * (1-(i+1)/training_epochs)
    local_min            = local_min - gradient*lr
    model_paramsTim[i,:] = local_min,gradient,lr

print(f'Estimated local minimum: {local_min:.8f}')


In [None]:
# %% Plotting
#    Compare the three learning-rate strategies, one panel per recorded quantity

fig,ax = plt.subplots(1,3,figsize=(13,5))

# (history array, line/marker format) for each learning-rate strategy, in legend order
histories = (
    (model_paramsFix,'o-'),
    (model_paramsGrd,'s-'),
    (model_paramsTim,'^-'),
)
# Panel i shows column i of every history array
panel_titles = ('Local Minimum','Gradient','Learning rate')

for i,panel_title in enumerate(panel_titles):
    for history,fmt in histories:
        ax[i].plot(history[:,i],fmt,markerfacecolor='none')
    ax[i].set_xlabel('Iteration')
    ax[i].set_title(panel_title)

ax[2].legend(['Fixed l.r.','Grad-based l.r.','Time-based l.r.'])
plt.tight_layout()

plt.savefig('figure43_code_challenge_3.png')

plt.show()

files.download('figure43_code_challenge_3.png')


In [None]:
# %% Exercise 1
#    Change the initial learning rate in the "time" experiment from .1 to .01. Do you still reach the same conclusion that
#    dynamic learning rates are better than a fixed learning rate?

# No, it is now actually performing quite poorly


In [None]:
# %% Exercise 2
#    Compute the average of all time-based learning rates (see variable 'model_paramsTim'). Next, replace the fixed
#    learning rate with the average over all dynamic learning rates. How does that affect the model's performance?

# Column 2 of the history array holds the learning rate used at each epoch
avg_lr = np.mean(model_paramsTim[:,2])

local_min = local_min_start
print(f'Starting local minimum: {local_min:.8f}')

learning_rate   = avg_lr
training_epochs = 50
# Stored in its own array so that the original fixed-l.r. history
# (model_paramsFix) is not overwritten and earlier cells stay re-runnable.
# One row per epoch: [updated estimate, gradient used for the step, learning rate]
model_paramsAvg = np.zeros((training_epochs,3))

for i in range(training_epochs):
    gradient             = df(local_min)
    lr                   = learning_rate
    local_min            = local_min - gradient*lr
    model_paramsAvg[i,:] = local_min,gradient,lr

print(f'Estimated local minimum: {local_min:.8f}')

fig,ax = plt.subplots(1,3,figsize=(13,5))

for i in range(3):
    ax[i].plot(model_paramsAvg[:,i],'o-',markerfacecolor='none')
    ax[i].plot(model_paramsGrd[:,i],'s-',markerfacecolor='none')
    ax[i].plot(model_paramsTim[:,i],'^-',markerfacecolor='none')
    ax[i].set_xlabel('Iteration')

ax[0].set_title('Local Minimum')
ax[1].set_title('Gradient')
ax[2].set_title('Learning rate')
ax[2].legend(['Average time-based l.r.','Grad-based l.r.','Time-based l.r.'])
plt.tight_layout()

# Distinct file name so the figure saved by the main comparison plot is not overwritten
plt.savefig('figure43_code_challenge_3_exercise_2.png')

plt.show()

files.download('figure43_code_challenge_3_exercise_2.png')

# With this relatively low number of epochs, making the fixed learning rate so small
# makes the algorithm collapse, performing even worse than the time-based dynamic
# learning rate


In [None]:
# %% Exercise 3
#    Going back to the original code (without the modifications above), you saw that the fixed learning rate model didn't
#    get to the same local minimum. What happens if you increase the number of training epochs from 50 to 500? Does that
#    improve the situation, and what does that tell you about the relationship between learning rate and training epochs?

# Increasing the epochs improves the estimation of the local minimum, simply because the algorithm is given
# the opportunity to "learn for longer"; a similar effect is also observed for the dynamic approaches


In [None]:
# %% Exercise 4
#    The code here initializes the starting value as a random number, which will differ for each learning rate method.
#    Is that appropriate or inappropriate for this experiment? Why? Change the code so that the starting value is the
#    same for all three learning rate models.

# In this case the random starting point constitutes an uncontrolled variable in our parametric experiment; keeping
# the initial value constant makes things easier to interpret. Indeed, by doing so, one can see that, at least for this function,
# this number of epochs (N=50), and an initial learning rate of 0.01, the gradient approach is outperforming all the others,
# the fixed approach still gives a good approximation (it would do better with more epochs), while the time-based approach
# produces a poor result
