In [None]:
# %% Deep learning - Section 6.34
#    Gradient descent in 2D

# This code pertains to a deep learning course provided by Mike X. Cohen on Udemy:
#   > https://www.udemy.com/course/deeplearning_x
# The "base" code in this repository is adapted (with very minor modifications)
# from code developed by the course instructor (Mike X. Cohen), while the
# "exercises" and the "code challenges" contain more original solutions and
# creative input from my side. If you are interested in DL (and if you are
# reading this statement, chances are that you are), go check out the course, it
# is singularly good.


In [None]:
# %% Libraries and modules
import numpy               as np
import matplotlib.pyplot   as plt
import sympy               as sym
import copy

from mpl_toolkits.mplot3d             import Axes3D
from google.colab                     import files
from matplotlib_inline.backend_inline import set_matplotlib_formats
set_matplotlib_formats('svg')


In [None]:
# %% Define function

# This function is called 'peaks' and is in fact the function in the MATLAB logo;
# may I be forgiven for implementing it in bloody Python

def peaks(x,y):
    """Evaluate the 'peaks' surface on the grid spanned by `x` and `y`.

    The inputs are expanded with np.meshgrid before evaluation, so for 1D
    inputs the returned array has one row per element of `y` and one column
    per element of `x`.
    """

    # Build the 2D coordinate matrices from the two axes
    grid_x,grid_y = np.meshgrid(x,y)

    # The surface is a combination of three exponentially-damped terms
    term_a = 3*(1-grid_x)**2 * np.exp(-(grid_x**2) - (grid_y+1)**2)
    term_b = 10*(grid_x/5 - grid_x**3 - grid_y**5) * np.exp(-grid_x**2 - grid_y**2)
    term_c = 1/3*np.exp(-(grid_x+1)**2 - grid_y**2)

    return term_a - term_b - term_c


In [None]:
# %% Plotting

# Sample both axes on the same 201-point grid over [-3,3] and evaluate the landscape
x = np.linspace(-3,3,201)
y = x.copy()
z = peaks(x,y)

# Show the landscape as an image; origin='lower' puts y[0] at the bottom of the plot
plt.imshow(
    z,
    origin='lower',
    cmap='jet',
    vmin=-5,
    vmax=5,
    extent=[x[0],x[-1],y[0],y[-1]],
)
plt.show()


In [None]:
# %% Compute derivative with sympy

# Symbolic variables and a sympy re-statement of the 'peaks' function
sx,sy = sym.symbols('sx,sy')
sz    = (
    3*(1-sx)**2 * sym.exp(-(sx**2) - (sy+1)**2)
    - 10*(sx/5 - sx**3 - sy**5) * sym.exp(-sx**2 - sy**2)
    - 1/3*sym.exp(-(sx+1)**2 - sy**2)
)

# Partial derivatives wrapped as Python callables of (sx,sy). The 'sympy'
# backend is used, so a call hands back a sympy object and .evalf() is what
# turns it into a number
df_x = sym.lambdify( (sx,sy),sz.diff(sx),modules='sympy' )
df_y = sym.lambdify( (sx,sy),sz.diff(sy),modules='sympy' )

# Example: value of the partial derivative along x at the point (1,1)
df_x(1,1).evalf()


In [None]:
# %% Gradient descent in 2D

# 1) Random starting point (uniform between -2 and +2); try also fixed at [0,1.4]
local_min = np.random.rand(2)*4-2
# Keep an independent copy of the start: slicing a numpy array with [:] only
# returns a view onto the same data, which would silently follow any in-place
# change of local_min
start_pnt = local_min.copy()

print(f'Random starting local mininum: {local_min}')

# 2) Learning parameters
learning_rate   = .01
training_epochs = 1000

# 3) Loop over epochs (row i of trajectory holds the estimate after step i+1)
trajectory = np.zeros((training_epochs,2))

for i in range(training_epochs):
    # df_x/df_y hand back sympy numbers; cast to float so that gradient and
    # local_min stay plain float arrays instead of object arrays of sympy objects
    gradient = np.array([ df_x(local_min[0],local_min[1]).evalf(),
                          df_y(local_min[0],local_min[1]).evalf()
                          ],dtype=float)
    # Step against the gradient, scaled by the learning rate
    local_min       = local_min - gradient*learning_rate
    trajectory[i,:] = local_min

print(f'Estimated local mininum: {local_min}')


In [None]:
# %% Plotting

# Background: the function landscape
plt.imshow(
    z,
    origin='lower',
    cmap='jet',
    vmin=-5,
    vmax=5,
    extent=[x[0],x[-1],y[0],y[-1]],
)

# Foreground: start (blue square), final estimate (red dot) and the descent path
plt.plot(*start_pnt,'bs')
plt.plot(*local_min,'ro')
plt.plot(*trajectory.T,'r')
plt.legend(['Rand start','Local min'])

# Titles and colour scale
plt.suptitle('Random starting point')
plt.title(f'Training epochs: {training_epochs} and learning rate: {learning_rate}')
plt.colorbar()

# Write the figure to disk before showing it, then download the file (Colab)
plt.savefig('figure16_gradient_descent_2d.png')
plt.show()
files.download('figure16_gradient_descent_2d.png')


In [None]:
# %% Plot the function in 3D for fun

# Axes of the landscape grid and the matching 2D coordinate matrices
x   = np.linspace(-3,3,201)
y   = x.copy()
X,Y = np.meshgrid(x,y)

# Dedicated figure with a single 3D axes
fig = plt.figure(figsize=(8,6))
ax  = fig.add_subplot(1,1,1,projection='3d')

# Surface of the landscape `z` computed in the earlier plotting cell
ax.plot_surface(X,Y,z,cmap='jet')
ax.set(xlabel='x axis',ylabel='y axis',zlabel='f(x,y)',title='3D Surface Plot')
ax.view_init(elev=30,azim=250)

# Write the figure to disk before showing it, then download the file (Colab)
plt.savefig('figure24_gradient_descent_2d.png')
plt.show()
files.download('figure24_gradient_descent_2d.png')


In [None]:
# %% Visualise the basins of attraction for the gradient descent algorithm

# Grid for function visualization
x = np.linspace(-4,4,201)
y = np.linspace(-4,4,201)

# Evaluate the landscape on this wider grid. The `z` from the earlier cells
# was sampled on [-3,3]; drawing it with a [-4,4] extent would stretch the
# image and misalign it with the trajectories plotted on top. A separate name
# is used so the earlier `z` is left untouched
z_basins = peaks(x,y)

# Gradient descent parameters
learning_rate   = 0.01
training_epochs = 100

# Generate multiple starting points in a 25x25 grid (one (x,y) pair per row)
start_x      = np.linspace(-2.5,2.5,25)
start_y      = np.linspace(-2.5,2.5,25)
start_points = np.array(np.meshgrid(start_x,start_y)).T.reshape(-1,2)

# Plot function
plt.figure(figsize=(10,8))
plt.imshow(z_basins,extent=[x[0],x[-1],y[0],y[-1]],vmin=-5,vmax=5,origin='lower',cmap='jet')

# Run gradient descent for each starting point
for start_pnt in start_points:
    local_min  = start_pnt.copy()
    trajectory = np.zeros((training_epochs, 2))

    for i in range(training_epochs):
        # df_x/df_y hand back sympy objects; cast to float so that gradient
        # and local_min stay plain float arrays instead of object arrays
        gradient = np.array([df_x(local_min[0],local_min[1]),
                             df_y(local_min[0],local_min[1])
                             ],dtype=float)
        # Step against the gradient, scaled by the learning rate
        local_min       = local_min - gradient*learning_rate
        trajectory[i,:] = local_min

    # Plot trajectory: start (blue square), end point (black dot), path (red)
    plt.plot(start_pnt[0],start_pnt[1],'bs',markersize=2)
    plt.plot(local_min[0],local_min[1],'ko',markersize=3)
    plt.plot(trajectory[:,0],trajectory[:,1],'r',alpha=0.5)

# Only the first two plotted artists (first start marker, first end marker) get a label
plt.legend(['Start point','Local min'])
plt.suptitle('Basins of Attraction for Gradient Descent')
plt.title(f'Training epochs: {training_epochs}, Learning rate: {learning_rate}')

plt.savefig('figure25_gradient_descent_2d.png')

plt.show()

files.download('figure25_gradient_descent_2d.png')


In [None]:
# %% Exercise 1
#    Modify the code to force the initial guess to be [0,1.4]. Does the model reach a reasonable local minimum?

# No, it gets stuck at a poor local minimum on every run ([0.296 0.320])


In [None]:
# %% Exercise 2
#    Using the same starting point, change the number of training epochs to 10,000. Does the final solution differ from
#    using 1000 epochs?

# No, it is still stuck at [0.296 0.320]


In [None]:
# %% Exercise 3
#    (Again with the same starting location) Change the learning rate to .1 (1000 epochs). What do you notice about the trajectory?
#    Try again with the learning rate set to .5, and then to .00001.

# A rate of .1 doesn't change much, the descent still ends up in the same poor local minimum; a rate of .5 produces
# a catastrophic result: the rate is so out of scale compared to the data that, when multiplied by the gradient,
# it makes the estimate jump all over the function; finally, a rate of .00001 is also out of scale
# because it generates steps so small that after 1000 iterations the estimate is still nowhere close to a minimum
