In [None]:
# %% Deep learning - Section 6.32
#    Gradient descent in 1D

# This code pertains to a deep learning course provided by Mike X. Cohen on Udemy:
#   > https://www.udemy.com/course/deeplearning_x
# The "base" code in this repository is adapted (with very minor modifications)
# from code developed by the course instructor (Mike X. Cohen), while the
# "exercises" and the "code challenges" contain more original solutions and
# creative input from my side. If you are interested in DL (and if you are
# reading this statement, chances are that you are), go check out the course, it
# is singularly good.


In [None]:
# %% Libraries and modules
import numpy               as np
import matplotlib.pyplot   as plt
import copy

from google.colab                     import files
from matplotlib_inline.backend_inline import set_matplotlib_formats
set_matplotlib_formats('svg')


In [None]:
# %% Define function

# The function
def fx(x):
    """Evaluate the quadratic f(x) = 3*x**2 - 3*x + 4.

    Only arithmetic operators are applied to `x`, so any operand that
    supports them (a plain number, a NumPy array, ...) is accepted and the
    result has whatever type those operators produce for it.
    """
    quadratic_term = 3*x**2
    linear_term    = 3*x
    return quadratic_term - linear_term + 4

def df(x):
    """Evaluate 6*x - 3, the analytical derivative of 3*x**2 - 3*x + 4.

    Only arithmetic operators are applied to `x`, so any operand that
    supports them (a plain number, a NumPy array, ...) is accepted.
    """
    slope_term = 6*x
    return slope_term - 3


In [None]:
# %% Plotting

# Evaluation grid: 2001 evenly spaced samples on [-2, 2]
x = np.linspace(-2,2,2001)

# One line per curve: the function first, then its derivative
plt.plot(x,fx(x))
plt.plot(x,df(x))

# Annotate the figure and clamp the x-axis to the first/last grid sample
plt.title('A function and its derivative')
plt.xlabel('x')
plt.ylabel('f(x)')
plt.legend(["f(x)","f'(x)"])
plt.xlim(x[0],x[-1])
plt.grid()
plt.show()


In [None]:
# %% Algorithm for gradient descent

# 1) Starting guess: one sample drawn at random from the grid x
local_min = np.random.choice(x,size=1).item()

print(f'Random starting local mininum: {local_min:.8f}')

# 2) Hyper-parameters: step size and number of update steps
learning_rate   = .01
training_epochs = 100

# 3) Gradient descent: at every epoch, move against the derivative
#    evaluated at the current estimate
for _ in range(training_epochs):
    gradient   = df(local_min)
    local_min -= learning_rate*gradient

print(f'Estimated local mininum: {local_min:.8f}')


In [None]:
# %% Plotting

# Plot the function and its derivative, and mark the gradient descent estimate
# on both curves. Labels are attached to the artists themselves (instead of
# being passed to plt.legend as a positional list) so that every legend entry
# describes the artist it is drawn with.
figure_file = 'figure1_gradient_descent_1d.png'

plt.plot(x,fx(x),label="f(x)")
plt.plot(x,df(x),label="f'(x)")
plt.plot(local_min,df(local_min),'ro')                    # marker on f'(x): no label, so no legend entry
plt.plot(local_min,fx(local_min),'ro',label="f(x) min")   # marker on f(x): this is the "f(x) min" entry

plt.xlim(x[[0,-1]])
plt.grid()
plt.xlabel('x')
plt.ylabel('f(x)')
plt.legend()
plt.suptitle('A function and its derivative')
plt.title('Empirical local minimum: %s' %np.round(local_min,4))

# Write the figure to disk, display it, then hand the saved file to the
# google.colab download helper
plt.savefig(figure_file)

plt.show()

files.download(figure_file)


In [None]:
# %% Store gradient descent iterations for visualisation

# Fresh random starting point, drawn from the grid x
local_min = np.random.choice(x,size=1).item()

# Hyper-parameters
learning_rate   = .01
training_epochs = 100

# One row per epoch: column 0 holds the estimate after the update,
# column 1 the derivative that was used to make that update
model_params = np.zeros((training_epochs,2))

for epoch in range(training_epochs):
    gradient            = df(local_min)
    local_min          -= learning_rate*gradient
    model_params[epoch] = (local_min,gradient)


In [None]:
# %% Plotting

# Two side-by-side panels, one per column of model_params
fig,ax = plt.subplots(1,2,figsize=(12,4))

# Both panels share the same title and x-label; only the y-label differs
panel_title = f'Estimated minimum on last iteration: {local_min:.4f}'
y_labels    = ('Local minimum','Derivative')

for column,(panel,y_label) in enumerate(zip(ax,y_labels)):
    panel.plot(model_params[:,column],'o-')
    panel.set_xlabel('Iteration')
    panel.set_ylabel(y_label)
    panel.set_title(panel_title)

# Save, display, then download the saved file
output_name = 'figure2_gradient_descent_1d.png'
plt.savefig(output_name)

plt.show()

files.download(output_name)


In [None]:
# %% Explore effects of learning rate and training epochs

# Gradient descent with a smaller step size and more epochs
local_min = np.random.choice(x,size=1).item()

learning_rate   = .001
training_epochs = 1000

# History buffer: one (estimate, derivative) pair per epoch
model_params = np.zeros((training_epochs,2))

for epoch in range(training_epochs):
    gradient  = df(local_min)
    local_min = local_min - learning_rate*gradient

    model_params[epoch,0] = local_min
    model_params[epoch,1] = gradient

# Plotting: one panel per column of the stored history
fig,ax = plt.subplots(1,2,figsize=(12,4))

shared_title = f'Estimated minimum on last iteration: {local_min:.4f}'

# model_params.T yields the history column by column, in panel order
for trace,panel in zip(model_params.T,ax):
    panel.plot(trace,'o-')
    panel.set_xlabel('Iteration')
    panel.set_title(shared_title)

ax[0].set_ylabel('Local minimum')
ax[1].set_ylabel('Derivative')

# Figure-level title records the hyper-parameters used for this run
plt.suptitle(f'Learning rate = {learning_rate} ; Epochs (N) = {training_epochs}')

saved_figure = 'figure4_gradient_descent_1d.png'
plt.savefig(saved_figure)

plt.show()

files.download(saved_figure)

In [None]:
# %% Exercise 1
#    Most often in DL, the model trains for a set number of iterations, which is what we do here. But there are other ways
#    of defining how long the training lasts. Modify the code so that training ends when the derivative is smaller than
#    some threshold, e.g., 0.1. Make sure your code is robust for negative derivatives.

# Gradient descent that stops on the size of the derivative rather than after
# a fixed number of epochs. abs() makes the criterion work for negative
# derivatives as well.
local_min = np.random.choice(x,1).item()

learning_rate  = .01
grad_threshold = 1e-4
max_epochs     = 100_000   # safety cap so the loop also ends if the threshold is never reached
epoch_counter  = 0
converged      = False

# Stop as soon as the derivative used for the latest update is below the
# threshold, or once the epoch counter has exceeded the safety cap
while not converged and epoch_counter <= max_epochs:
    gradient       = df(local_min)
    local_min      = local_min - gradient*learning_rate
    epoch_counter += 1

    converged = abs(gradient) < grad_threshold

print(f'Estimated local mininum: {local_min:.8f}')

# Only claim the requested precision when the threshold actually ended the
# loop; otherwise say that the cap was hit and show how far off the run was
if converged:
    print(f'Required {epoch_counter} iterations, for precision of {grad_threshold}')
else:
    print(f'Stopped after {epoch_counter} iterations without reaching precision of '
          f'{grad_threshold} (last |derivative| = {abs(gradient):.3g})')


In [None]:
# %% Exercise 2
#    Does this change to the code produce a more accurate result? What if you change the stopping threshold?

# Modify script in Ex. 1 to explore
# Maybe, but not necessarily: choosing a threshold requires knowing the scale of the data, and the procedure still
# depends on the random starting point. If anything, a threshold lets you cap the precision at an arbitrary level (e.g.,
# when you decide that a certain level of precision is enough and you would rather save on the
# number of iterations).


In [None]:
# %% Exercise 3
#    Can you think of any potential problems that might arise when the stopping criterion is based on the derivative
#    instead of a specified number of training epochs?

# As mentioned above, it requires knowing at least roughly how the data are distributed and what their range is;
# otherwise the choice of threshold might be fatal (but, to be fair, this can also be a problem when choosing a fixed
# number of iterations with too small a learning rate).
