In [None]:
# %% Deep learning - Section 6.36
#    Parametric experiments on gradient descent

# This code pertains to a deep learning course provided by Mike X. Cohen on Udemy:
#   > https://www.udemy.com/course/deeplearning_x
# The "base" code in this repository is adapted (with very minor modifications)
# from code developed by the course instructor (Mike X. Cohen), while the
# "exercises" and the "code challenges" contain more original solutions and
# creative input from my side. If you are interested in DL (and if you are
# reading this statement, chances are that you are), go check out the course, it
# is singularly good.


In [None]:
# %% Libraries and modules
import numpy               as np
import matplotlib.pyplot   as plt
import sympy               as sym
import copy

from google.colab                     import files
from matplotlib_inline.backend_inline import set_matplotlib_formats
set_matplotlib_formats('svg')


In [None]:
# %% Function

# Sample f(x) = sin(x)*exp(-0.05*x^2) and its analytical derivative on a dense
# grid for a quick visual check.
# NOTE: the sampled curves get their own names (fx_vals, df_vals) so that they
# do not occupy `fx` and `df`, which the next cell defines as callables;
# re-running this cell can therefore never replace those functions with arrays.
x       = np.linspace(-2*np.pi,2*np.pi,401)
fx_vals = np.sin(x)*np.exp(-x**2*.05)
df_vals = np.cos(x)*np.exp(-x**2*.05) + np.sin(x)*(-.1*x)*np.exp(-x**2*.05)

# Quick inspection
plt.plot(x,fx_vals,x,df_vals)
plt.legend(['f(x)',"f'(x)"])
plt.show()


In [None]:
# %% Define Python functions for ease

def fx(x):
    """Evaluate f(x) = sin(x) * exp(-0.05 * x**2).

    Works on anything numpy can broadcast over (scalar or array); the result
    has the same shape as the input.
    """
    envelope = np.exp(-x**2*.05)   # Gaussian-shaped damping term
    return np.sin(x)*envelope

def df(x):
    """Evaluate the analytical derivative of sin(x) * exp(-0.05 * x**2).

    Product rule: cos(x)*exp(-0.05*x**2) + sin(x)*(-0.1*x)*exp(-0.05*x**2).
    Works on scalars and numpy arrays alike.
    """
    envelope = np.exp(-x**2*.05)            # shared exponential factor
    from_sine     = np.cos(x)*envelope      # derivative of the sine part
    from_envelope = np.sin(x)*(-.1*x)*envelope  # derivative of the envelope part
    return from_sine + from_envelope


In [None]:
# %% Gradient descent

# Hyperparameters (Experiment 1 below reuses learn_rate and train_epochs)
learn_rate   = .01
train_epochs = 1000

# Random starting guess picked from the sampled grid (a length-1 array)
local_min = np.random.choice(x,1)

# Descent loop: step against the slope, scaled by the learning rate
for _ in range(train_epochs):
    local_min = local_min - df(local_min)*learn_rate

# Plotting: f(x) solid, f'(x) dashed, final guess marked on both curves
plt.plot(x,fx(x),x,df(x),'--')
for curve in (df,fx):
    plt.plot(local_min,curve(local_min),'ro')

plt.xlim(x[[0,-1]])
plt.grid()
plt.xlabel('x')
plt.ylabel('f(x)')
plt.legend(['f(x)',"f'(x)"])

final_guess = round(local_min[0],4)
plt.title('Empirical local minimum: %s' %final_guess)

# Save before show(), then hand the file to the browser (Colab helper)
plt.savefig('figure33_parametric_experiment_0.png')

plt.show()

files.download('figure33_parametric_experiment_0.png')


In [None]:
# %% Experiment 1
#    Vary only the starting point; learn_rate and train_epochs are whatever
#    the session currently holds (set in the gradient descent cell)

# Grid of initial guesses and a matching container for the end points
start_loc = np.linspace(-5,5,50)
final_res = np.zeros(start_loc.size)

# One complete gradient-descent run per initial guess
for idx,start in enumerate(start_loc):
    local_min = start

    for _ in range(train_epochs):
        local_min = local_min - df(local_min)*learn_rate

    final_res[idx] = local_min

# Plotting: where each run ended up as a function of where it started
plt.plot(start_loc,final_res,'s-')
plt.suptitle('Minima identified by gradient descent for a range of initial values')
plt.title('True global min: x = -1.4289')
plt.xlabel('Starting point')
plt.ylabel('Final guess')

plt.savefig('figure34_parametric_experiment_1.png')

plt.show()

files.download('figure34_parametric_experiment_1.png')


In [None]:
# %% Experiment 2
#    Vary only the learning rate
#    NOTE: from here on `learn_rate` is an array of candidate rates, no longer
#    the scalar used in the cells above

# Candidate learning rates and a matching container for the end points
learn_rate = np.linspace(1e-10,1e-1,50)
final_res  = np.zeros(learn_rate.size)

# One complete run per learning rate, always starting from x = 0
for idx,learnRate in enumerate(learn_rate):
    local_min = 0

    # Training
    for _ in range(train_epochs):
        local_min = local_min - df(local_min)*learnRate

    final_res[idx] = local_min

# Plotting: final guess as a function of the learning rate
plt.plot(learn_rate,final_res,'s-')
plt.suptitle('Minima identified by gradient descent for a range of learning rates ')
plt.title('True global min: x = -1.4289, fixed initial location = 0')
plt.xlabel('Learning rates')
plt.ylabel('Final guess')

plt.savefig('figure35_parametric_experiment_2.png')

plt.show()

files.download('figure35_parametric_experiment_2.png')


In [None]:
# %% Experiment 3
#    Vary the learning rate and the number of training epochs jointly

# Parameter grids; final_res rows follow learn_rate, columns follow train_epochs
learn_rate   = np.linspace(1e-10,1e-1,50)
train_epochs = np.round(np.linspace(10,500,40))
final_res    = np.zeros((learn_rate.size,train_epochs.size))

# Visit every (learning rate, epochs) pair in row-major order; each run starts
# from x = 0, as in experiment 2
for Lidx,Tidx in np.ndindex(*final_res.shape):
    learnRate   = learn_rate[Lidx]
    trainEpochs = train_epochs[Tidx]
    local_min   = 0

    # train_epochs holds floats (output of np.round), hence the int() cast
    for _ in range(int(trainEpochs)):
        local_min = local_min - df(local_min)*learnRate

    final_res[Lidx,Tidx] = local_min


In [None]:
# %% Experiment 3
#    Plotting

fig,ax = plt.subplots(figsize=(7,5))

# final_res is indexed [learning rate, training epochs], whereas imshow draws
# rows along the y-axis and columns along the x-axis. Transpose the matrix so
# that the image orientation agrees with `extent` and with the axis labels
# (x = learning rate, y = training epochs).
plt.imshow(final_res.T,extent=[learn_rate[0],learn_rate[-1],train_epochs[0],train_epochs[-1]],
           aspect='auto',origin='lower',vmin=-1.45,vmax=-1.2,cmap='jet')
plt.xlabel('Learning rate')
plt.ylabel('Training epochs')
plt.suptitle('Final guess by manipulating learning rate and epochs number')
plt.title('True global min: x = -1.4289')
plt.colorbar()

plt.savefig('figure36_parametric_experiment_3.png')

plt.show()

files.download('figure36_parametric_experiment_3.png')

# Another visualization
# Plotting the 2-D final_res against learn_rate draws one line per column,
# i.e. one line per training-epochs setting.

plt.plot(learn_rate,final_res)
plt.xlabel('Learning rates')
plt.ylabel('Final function estimate')
plt.title('Each line is a training epochs N')

plt.savefig('figure37_parametric_experiment_3.png')

plt.show()

files.download('figure37_parametric_experiment_3.png')



In [None]:
# %% Exercise 1
#    In experiment 3, set the starting location to be 1.6. Re-run the experiment and the image. You'll need to re-adjust
#    the figure color limits; check the line plots at the top of the code to determine a useful color range. Does the new
#    starting value change your conclusions about the interaction between learning rate and training epochs?

# Not really: a similar relationship is also seen for a local minimum. However, since the starting point for the gradient
# descent is farther from the minimum than with x = 0, the 'suboptimal area' for epochs and learning rates is much wider


In [None]:
# %% Exercise 2
#    In the same experiment, now change the starting location to be random (use code: np.random.choice(x,1)). How do these
#    results look? Are you surprised? Are the results of this experiment still interpretable and what does this tell you
#    about running experiments in DL?

# This is quite a nice one. It makes sense that by introducing randomness in the starting point the structure of the
# whole matrix falls (almost entirely) apart. It is however interesting to note that for high numbers of epochs and high
# learning rates, the values still (mostly) settle at either the global minimum (~x=-1.43) or the local minimum (~x=4.4), while
# in the areas of low values (more visible when keeping a fixed starting point as a sort of hyperbola), the gradient descent tends
# to get stuck at more heterogeneous locations.
# So, it's harder to see, but there is still some structure left; that said, it is probably better not to vary too many variables
# at once when doing parametric experiments
