# Understanding gradient descent in machine learning

## *Python* code example for fitting two parameters
Now we extend the problem by defining a hypothesis function with two parameters, 

$$h_\theta(x) = \theta_0 + \theta_1 x.$$

This time, we visualize the cost function $J(\theta_0,\theta_1)$ as a colour map with contour lines (right panel), with the gradient-descent steps drawn on top as arrows. The rest follows the same procedure as described for the one-parameter case.

In [83]:
import numpy as np
import matplotlib.pyplot as plt
from ipywidgets import interact, FloatSlider, IntSlider

# Synthetic data set: m sample positions spread evenly over [-1, 1].
m = 20
x = np.linspace(start=-1, stop=1, num=m)
# Placeholder for the targets; update_plots() rebinds this global on each call.
y = None

def hypothesis(x, theta0, theta1):
    """Evaluate the straight-line model ``theta0 + theta1 * x``.

    Parameters
    ----------
    x : array_like, shape (m,)
        Sample positions.
    theta0 : scalar or array_like
        Intercept(s). Flattened into a column, so each intercept produces
        one row of the result.
    theta1 : scalar or array_like
        Slope(s). Used as given, so it must broadcast against ``x`` and the
        intercept column: pass a scalar, or a column of shape (k, 1) that
        matches ``k`` intercepts.

    Returns
    -------
    numpy.ndarray
        2-D array of model values, one row per intercept; shape (1, m) for
        scalar parameters.
    """
    # np.asarray lets plain Python floats through; calling .reshape on them
    # directly would raise AttributeError.
    intercepts = np.asarray(theta0).reshape(-1, 1)
    slopes = np.asarray(theta1)
    return intercepts + slopes * np.asarray(x)

def cost_func(x, y, theta0, theta1):
    """Return the cost J(theta0, theta1): half the mean squared residual.

    Parameters
    ----------
    x, y : array_like, shape (m,)
        Sample positions and the observed values to fit.
    theta0, theta1 : scalar or array_like of equal length k
        Intercept(s) and slope(s) to evaluate, paired element by element.

    Returns
    -------
    numpy.ndarray, shape (k,)
        One cost value per (theta0, theta1) pair; shape (1,) for scalars.
    """
    # Slopes become a column so that every parameter pair gets its own row
    # of predictions inside hypothesis().
    slope_column = np.asarray(theta1).reshape(-1, 1)
    residuals = y - hypothesis(x, theta0, slope_column)
    return np.average(residuals ** 2, axis=1) / 2

def _gradient_descent_path(x, y, theta_start, alpha, n_points):
    """Return the parameter iterates visited by batch gradient descent.

    Starting from ``theta_start = (theta0, theta1)``, takes ``n_points - 1``
    steps with learning rate ``alpha`` on the squared-error cost of the
    straight-line model, and returns all iterates (start included) as a list
    of length-2 arrays.
    """
    n_samples = x.size
    path = [np.array(theta_start)]
    for _ in range(n_points - 1):
        theta0, theta1 = path[-1]
        # One model evaluation per step serves both partial derivatives.
        residuals = hypothesis(x, theta0, theta1) - y
        path.append(np.array((
            theta0 - alpha / n_samples * np.sum(residuals),
            theta1 - alpha / n_samples * np.sum(residuals * x),
        )))
    return path

def update_plots(N, alpha, theta0_true, theta1_true, theta0_start, theta1_start):
    """Draw the data/fit panel and the cost-function panel for one setting.

    Parameters
    ----------
    N : int
        Number of parameter points to show, counting the starting point, so
        N - 1 gradient-descent steps are taken.
    alpha : float
        Learning rate of the descent.
    theta0_true, theta1_true : float
        Intercept and slope used to generate the noise-free data ``y``.
    theta0_start, theta1_start : float
        Starting point of the descent.

    Side effects
    ------------
    Rebinds the module-level ``y`` to the generated data, and creates and
    shows a new matplotlib figure.
    """
    # The module defines ``y`` globally and this function has always rebound
    # it; that side effect is kept.
    global y
    y = theta0_true + theta1_true * x

    fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(10, 4.15))
    data_ax, cost_ax = ax

    # Data points generated from the true parameters.
    data_ax.scatter(x, y, marker='x', s=40, color='#009E73', label=r"true $\theta$")

    theta = _gradient_descent_path(x, y, (theta0_start, theta1_start), alpha, N)

    # Cost evaluated on a regular (theta0, theta1) grid for the right panel.
    theta0_grid = np.linspace(-4, 4, 101)
    theta1_grid = np.linspace(-5, 5, 101)
    X, Y = np.meshgrid(theta0_grid, theta1_grid)
    J_grid = cost_func(x, y, X.flatten(), Y.flatten()).reshape(X.shape)

    # Colour map of the cost with labelled contour lines on top.
    pcolor_plot = cost_ax.pcolormesh(X, Y, J_grid, cmap='viridis')
    fig.colorbar(pcolor_plot, ax=cost_ax)
    contours = cost_ax.contour(X, Y, J_grid, 30, colors='gray')
    cost_ax.clabel(contours, inline=True, fontsize=8, colors='w')

    # Target parameters: a white dot drawn inside a larger black one.
    cost_ax.scatter([theta0_true] * 2, [theta1_true] * 2, s=[50, 10], color=['k', 'w'])

    # A red arrow for every descent step, and the fitted line reached by that
    # step on the data panel.
    for previous, current in zip(theta, theta[1:]):
        cost_ax.annotate('', xy=current, xytext=previous,
                         arrowprops={'arrowstyle': '->', 'color': 'r', 'lw': 1},
                         va='center', ha='center')
        data_ax.plot(x, hypothesis(x, *current).flatten(), lw=2, c="#0072B2", alpha=0.75)

    # Highlight the final iterate. theta[-1] is used instead of a leftover
    # loop index so that N < 2 (no steps taken) does not raise NameError.
    final_theta = theta[-1]
    data_ax.plot(x, hypothesis(x, *final_theta).flatten(), lw=2, c="#E69F00")
    cost_ax.scatter(*zip(*theta), s=40, lw=0)
    cost_ax.plot(final_theta[0], final_theta[1], 'o', c="orange")

    # Labels, titles, and a legend.
    cost_ax.set_xlabel(r'$\theta_0$')
    cost_ax.set_ylabel(r'$\theta_1$')
    cost_ax.set_title('Cost function')
    data_ax.set_xlabel(r'$x$')
    data_ax.set_ylabel(r'$y$')
    data_ax.set_title('Data and fit')
    data_ax.legend(loc='upper left', fontsize='small')

    plt.tight_layout()
    plt.show()

# One slider per update_plots() argument. The four theta sliders share the
# same range and resolution, so those settings are written down once.
_THETA_SLIDER_RANGE = {'min': -4, 'max': 4, 'step': 0.1}

N_slider = IntSlider(value=15, min=2, max=20, step=1)
alpha_slider = FloatSlider(value=0.7, min=0.1, max=1.0, step=0.1)
theta0_true_slider = FloatSlider(value=2, **_THETA_SLIDER_RANGE)
theta1_true_slider = FloatSlider(value=2, **_THETA_SLIDER_RANGE)
theta0_start_slider = FloatSlider(value=0, **_THETA_SLIDER_RANGE)
theta1_start_slider = FloatSlider(value=0, **_THETA_SLIDER_RANGE)

# Bind each slider to the update_plots() argument of the same name.
interact(
    update_plots,
    N=N_slider,
    alpha=alpha_slider,
    theta0_true=theta0_true_slider,
    theta1_true=theta1_true_slider,
    theta0_start=theta0_start_slider,
    theta1_start=theta1_start_slider,
)


interactive(children=(IntSlider(value=15, description='N', max=20, min=2), FloatSlider(value=0.7, description=…

<function __main__.update_plots(N, alpha, theta0_true, theta1_true, theta0_start, theta1_start)>

## Acknowledgement
The main code is based on the blog post ["Visualizing the gradient descent method"](https://scipython.com/blog/visualizing-the-gradient-descent-method/) from [*scipython.com*](https://scipython.com).