This notebook contains simple one- and two-variable examples and visualizations of gradient descent, gradient descent with momentum, and linear regression trained with gradient descent.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import math

import os

# Create the output folder for the saved figures. exist_ok=True makes this a
# no-op when the folder already exists (e.g. when the notebook is re-run), so
# no separate existence check is needed.
os.makedirs("visualizations", exist_ok=True)

In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

Show a simple gradient descent algorithm for the function
$$y=x^2$$

The actual minimum is $x=0$

Sources:
- https://nbviewer.jupyter.org/url/courses.d2l.ai/berkeley-stat-157/slides/4_30/gd-sgd.ipynb

In [None]:
def gd(lr, initial_x, epochs):
    """Run plain gradient descent on y = x^2 and record every iterate.

    Parameters:
        lr: learning rate (step size) that scales the gradient.
        initial_x: starting value of x.
        epochs: number of update steps to perform.

    Returns:
        A list holding the starting value followed by the value of x after
        each update, i.e. ``epochs + 1`` entries.
    """
    current = initial_x
    trajectory = [current]
    for _ in range(epochs):
        # The derivative of x^2 is 2*x; step against it.
        current = current - lr * 2 * current
        trajectory.append(current)
    return trajectory

In [None]:
def show_trace(ax, x_values, func):
    """Plot ``func`` together with the points visited by gradient descent.

    Parameters:
        ax: matplotlib axes to draw on.
        x_values: sequence of x values, one per gradient descent iterate.
        func: one-argument function that is being minimised.

    The curve is drawn on a symmetric interval around zero that is wide
    enough to contain the iterate with the largest absolute value.
    """
    half_width = max(abs(min(x_values)), abs(max(x_values)))
    grid = np.arange(-half_width, half_width, 0.1)

    # the function itself
    curve = [func(x) for x in grid]
    ax.plot(grid, curve)

    # the x values produced after every gd epoch
    visited = [func(x) for x in x_values]
    ax.plot(x_values, visited, "-o", color="red")

    ax.grid()
    ax.set_xlabel("x")
    ax.set_ylabel("y")

In [None]:
# Experiment setup: start far away from the minimum and compare four step sizes.
initial_x = -10
epochs = 10
func = lambda x: x**2
learning_rates = [0.01, 0.2, 0.8, 0.99]
infos = ["too small", "good", "too large", "way too large"]

fig, axs = plt.subplots(2, 2, figsize=(10, 7), gridspec_kw={"hspace": 0.35})
fig.suptitle(r"Gradient Descent behaviour for $y=x^2$"f" and different learning rates ({epochs} epochs)")

# axs.flat walks the 2x2 grid row by row, so each learning rate gets the next
# free subplot without any manual row/column bookkeeping.
for ax, lr, info in zip(axs.flat, learning_rates, infos):
    ax.set_title(f"lr={lr} ({info})")
    x_values = gd(lr, initial_x=initial_x, epochs=epochs)
    show_trace(ax=ax, x_values=x_values, func=func)

plt.savefig("visualizations/gradient_descent_1d_example.png")

Show the behaviour of gradient descent with different learning rates for the multivariate function
$$y=x_1^2+2x_2^2$$

The actual minimum is $x_1=0$ and $x_2=0$

In [None]:
def gd_2d(lr, initial_point, epochs):
    """Run plain gradient descent on y = x1^2 + 2*x2^2 and record the path.

    Parameters:
        lr: learning rate (step size) that scales the gradient.
        initial_point: starting point; element 0 is x1 and element 1 is x2.
        epochs: number of update steps to perform.

    Returns:
        A list holding ``initial_point`` followed by one ``(x1, x2)`` tuple
        per update, i.e. ``epochs + 1`` entries.
    """
    current = initial_point
    path = [current]
    for _ in range(epochs):
        current = (
            current[0] - lr * 2 * current[0],  # partial derivative for x1 is 2*x1
            current[1] - lr * 4 * current[1],  # partial derivative for x2 is 4*x2
        )
        path.append(current)
    return path

In [None]:
def show_trace_2d(ax, points, func, x1lim=(-5.5, 1.0), x2lim=(-2.5, 0.5)):
    """Draw contour lines of ``func`` and overlay an optimisation path.

    Parameters:
        ax: matplotlib axes to draw on.
        points: sequence of points of the path; element 0 of each point is
            x1 and element 1 is x2.
        func: two-argument function evaluated on the meshgrid arrays.
        x1lim: (lower, upper) bounds of the contour grid along x1.
        x2lim: (lower, upper) bounds of the contour grid along x2.
    """
    x1 = np.arange(x1lim[0], x1lim[1], 0.1)
    x2 = np.arange(x2lim[0], x2lim[1], 0.1)
    X1, X2 = np.meshgrid(x1, x2)
    ax.contour(X1, X2, func(X1, X2), 40)

    # Turn autoscaling off so the path drawn below cannot change the axis
    # limits that were set up by the contour plot.
    ax.autoscale(False)

    # Draw the whole path with a single call; looping over the points and
    # re-plotting the full path every time only stacks identical lines.
    ax.plot([p[0] for p in points], [p[1] for p in points], "-o", color="red")

    ax.set_xlabel("x1")
    ax.set_ylabel("x2")

In [None]:
# Experiment setup: one starting point, four learning rates to compare.
initial_point = (-5, -2)
epochs = 20
learning_rates = [0.01, 0.1, 0.5, 0.9]
infos = ["too small", "good", "too large", "way too large"]
func = lambda X1, X2: X1**2 + 2*X2**2

fig, axs = plt.subplots(2, 2, figsize=(10, 7), gridspec_kw={"hspace": 0.35})
fig.suptitle(r"Gradient Descent behaviour for $y=x_1^2+2x_2^2$"f" and different learning rates ({epochs} epochs)")

# axs.flat walks the 2x2 grid row by row, so each learning rate gets the next
# free subplot without any manual row/column bookkeeping.
for ax, lr, info in zip(axs.flat, learning_rates, infos):
    ax.set_title(f"lr={lr} ({info})")
    points = gd_2d(lr, initial_point=initial_point, epochs=epochs)
    show_trace_2d(ax, points, func)

plt.savefig("visualizations/gradient_descent_2d_example.png")

The same experiment for gradient descent with momentum, using different learning rates and momentum coefficients $\gamma$

Sources:
- https://nbviewer.jupyter.org/url/courses.d2l.ai/berkeley-stat-157/slides/5_2/momentum.ipynb

In [None]:
def momentum_2d(lr, gamma, initial_point, epochs):
    """Run gradient descent with momentum on y = x1^2 + 2*x2^2.

    Parameters:
        lr: learning rate that scales the current gradient.
        gamma: momentum coefficient that scales the previous update.
        initial_point: starting point; element 0 is x1 and element 1 is x2.
        epochs: number of update steps to perform.

    Returns:
        A list holding ``initial_point`` followed by one ``(x1, x2)`` tuple
        per update, i.e. ``epochs + 1`` entries.
    """
    # Accumulated update per coordinate; nothing has been accumulated yet.
    step_x1 = step_x2 = 0
    current = initial_point
    path = [current]
    for _ in range(epochs):
        # new update = gamma * previous update + lr * partial derivative
        step_x1 = gamma * step_x1 + lr * 2 * current[0]  # d/dx1 = 2*x1
        step_x2 = gamma * step_x2 + lr * 4 * current[1]  # d/dx2 = 4*x2
        current = (current[0] - step_x1, current[1] - step_x2)
        path.append(current)
    return path

In [None]:
# Experiment setup: every learning rate is combined with every momentum
# coefficient, giving one subplot per (lr, gamma) pair.
initial_point = (-5, -2)
epochs = 20
learning_rates = [0.01, 0.1, 0.5, 0.9]
gammas = [0.4, 0.5, 0.9, 0.99]
fig, axs = plt.subplots(4, 4, figsize=(25, 14), gridspec_kw=dict(hspace=0.4))
fig.suptitle("Gradient Descent with momentum\n"r"behaviour for $y=x_1^2+2x_2^2$"f" and different learning rates ({epochs} epochs)")
func = lambda X1, X2: X1**2 + 2*X2**2

# Rows follow the learning rates and columns follow the gammas, so the
# enumerate indices address the subplot grid directly.
for row, lr in enumerate(learning_rates):
    for col, gamma in enumerate(gammas):
        ax = axs[row, col]
        # Separate the two values so the title does not read "lr=0.01γ=0.4".
        ax.set_title(rf"lr={lr}, $\gamma$={gamma}")
        points = momentum_2d(lr, gamma=gamma, initial_point=initial_point, epochs=epochs)
        # Wider limits than the defaults because momentum can overshoot the minimum.
        show_trace_2d(ax, points, func, x1lim=(-5.5, 3.0), x2lim=(-2.5, 3.0))

plt.savefig("visualizations/momentum_2d_example.png")

Using Gradient Descent for Linear Regression

Sources:
- https://towardsdatascience.com/linear-regression-using-gradient-descent-97a6c8700931

In [None]:
# generate training data: noisy samples scattered around a known straight line
np.random.seed(666)  # fixed seed so the same noise is drawn on every run


def truth(x):
    """Ground-truth line y = 1.5 * x - 20 that the noisy samples are drawn around."""
    return 1.5*x - 20


amount = 250
X = range(amount)
target_line = list(map(truth, X))

# One Gaussian sample per x, centred on the truth line; size=(1, amount) gives
# a single row with `amount` columns.
training_data = np.random.normal(loc=target_line, scale=amount/3, size=(1, amount))

plt.scatter(X, training_data)
plt.plot(target_line, color="lime", linewidth=4)
plt.title(r"training data around truth function $y=\dfrac{3}{2}x-20$")
plt.xlabel("x")
plt.ylabel("y")

In [None]:
# model y = m * x + c
def regression_model(m, c, x):
    """Evaluate the linear model with slope ``m`` and intercept ``c`` at ``x``."""
    return m * x + c

# loss function: mean squared error (mse)
def mse(training_data, predictions):
    """Return the mean squared error between targets and predictions.

    Parameters:
        training_data: 2-D array indexed as ``training_data[0, idx]``; row 0
            holds the target value of sample ``idx``.
        predictions: sequence with one precomputed model output per sample
            (``m * idx + c``).

    Returns:
        The sum of squared differences divided by ``len(predictions)``.
    """
    total = 0
    for position, predicted in enumerate(predictions):
        residual = training_data[0, position] - predicted
        total += residual * residual
    return total / len(predictions)

def partial_derivative_mse_m(training_data, predictions):
    """Return the partial derivative of the MSE with respect to the slope m.

    Parameters:
        training_data: 2-D array indexed as ``training_data[0, idx]``; row 0
            holds the target value of sample ``idx``.
        predictions: sequence with one precomputed model output per sample.

    Note that the position of a sample in ``predictions`` is used as its x
    value, so the result is only the gradient for inputs x = 0, 1, 2, ...
    """
    weighted_sum = 0
    for position, predicted in enumerate(predictions):
        residual = training_data[0, position] - predicted
        weighted_sum += position * residual
    return -2 / len(predictions) * weighted_sum

def partial_derivative_mse_c(training_data, predictions):
    """Return the partial derivative of the MSE with respect to the intercept c.

    Parameters:
        training_data: 2-D array indexed as ``training_data[0, idx]``; row 0
            holds the target value of sample ``idx``.
        predictions: sequence with one precomputed model output per sample.
    """
    residual_sum = 0
    for position, predicted in enumerate(predictions):
        residual = training_data[0, position] - predicted
        residual_sum += residual
    return -2 / len(predictions) * residual_sum

def calc_predictions_and_loss(m, c, X, training_data):
    """Evaluate the model on every x in ``X`` and score it with the MSE.

    Parameters:
        m: slope of the linear model.
        c: intercept of the linear model.
        X: iterable of input values.
        training_data: targets, passed through to ``mse``.

    Returns:
        A ``(predictions, loss)`` tuple where ``predictions`` is a list with
        one model output per element of ``X``.
    """
    predicted = []
    for x in X:
        predicted.append(regression_model(m, c, x))
    loss = mse(training_data, predicted)
    return predicted, loss

# training
def train_regression_model(epochs, lr=0.0001, initial_m=0, initial_c=0):
    """Fit slope and intercept of the linear model with gradient descent.

    Reads the module-level ``X`` and ``training_data`` as the data set.

    Parameters:
        epochs: number of gradient descent steps.
        lr: learning rate applied to both partial derivatives.
        initial_m: starting value of the slope.
        initial_c: starting value of the intercept.

    Returns:
        A ``(m, c, loss_per_epoch)`` tuple; ``loss_per_epoch`` holds the loss
        measured before the update of each epoch.
    """
    slope, intercept = initial_m, initial_c
    history = []
    for _ in range(epochs):
        predictions, loss = calc_predictions_and_loss(slope, intercept, X, training_data)
        history.append(loss)
        # Both gradients come from the same predictions, i.e. from the
        # parameters as they were before this epoch's update.
        grad_m = partial_derivative_mse_m(training_data, predictions)
        grad_c = partial_derivative_mse_c(training_data, predictions)
        slope = slope - lr * grad_m
        intercept = intercept - lr * grad_c
    return slope, intercept, history

In [None]:
# Train the regression model and show the fitted line (left) next to the
# loss curve (right).
epochs = 30
fig, axs = plt.subplots(1, 2, figsize=(20, 5), gridspec_kw=dict(hspace=0.35))
left = axs[0]
right = axs[1]
fig.suptitle(f"Result of training the regression model for {epochs} epochs", y=1.15)

m, c, loss_per_epoch = train_regression_model(epochs=epochs, lr=0.00001)

# Predictions of the trained model; the loss is not needed here.
predictions, _ = calc_predictions_and_loss(m, c, X, training_data)

# Each artist carries its own label so the legend cannot get out of sync with
# the order in which the artists are drawn.
left.scatter(X, training_data, label="training data")
left.plot(target_line, color="lime", linewidth=4, label="truth line")
left.plot(predictions, color="red", linewidth=4, label="predicted line")
left.set_title(r"training data around truth function $y=\dfrac{3}{2}x-20$" + "\n"
               + r"and learned function $y=mx+c$" + "\nwith m = "
               + f"{round(m, 4)} c = {round(c, 4)}")
left.legend()
left.set_xlabel("x")
left.set_ylabel("y")

right.plot(range(0, len(loss_per_epoch)), loss_per_epoch)
right.set_title(f"loss from Mean Squared Error\nwith latest loss: {round(loss_per_epoch[-1], 4)}")
right.set_xlabel("epoch")
right.set_ylabel("loss")

# bbox_inches="tight" keeps the raised suptitle (y=1.15) inside the saved image.
plt.savefig("./visualizations/gradient_descent_linear_regression_example.png", bbox_inches="tight")