# Stochastic Gradient Descent


In [None]:
%matplotlib inline
import math
import tensorflow as tf
from dl import tensorflow as dl


## Stochastic Gradient Updates



In [None]:
def f(x1, x2):  # Objective function
    """Evaluate the objective ``x1**2 + 2 * x2**2`` at the point ``(x1, x2)``."""
    first_term = x1**2
    second_term = 2 * x2**2
    return first_term + second_term

def f_grad(x1, x2):  # Gradient of the objective function
    """Return ``(2 * x1, 4 * x2)``, the partial derivatives of ``x1**2 + 2 * x2**2``."""
    df_dx1 = 2 * x1
    df_dx2 = 4 * x2
    return df_dx1, df_dx2

In [None]:
def sgd(x1, x2, s1, s2, f_grad):
    """Take one descent step using a gradient perturbed by random noise.

    Args:
        x1: Current first coordinate.
        x2: Current second coordinate.
        s1: Not read by this update; accepted as part of the signature.
        s2: Not read by this update; accepted as part of the signature.
        f_grad: Callable returning the two gradient components at
            ``(x1, x2)``.

    Returns:
        A 4-tuple ``(new_x1, new_x2, 0, 0)``; the last two entries are
        always the constant 0.

    The step size is ``eta * lr()``, where ``eta`` and ``lr`` are looked up
    as module-level globals each time the function is called.
    """
    grad_x1, grad_x2 = f_grad(x1, x2)
    # Simulate a noisy gradient: each component gets its own independent
    # draw of shape [1] with mean 0.0 and standard deviation 1.
    grad_x1 += tf.random.normal(shape=[1], mean=0.0, stddev=1)
    grad_x2 += tf.random.normal(shape=[1], mean=0.0, stddev=1)
    # lr() is called exactly once per step, after the noise draws.
    step_size = eta * lr()
    new_x1 = x1 - step_size * grad_x1
    new_x2 = x2 - step_size * grad_x2
    return (new_x1, new_x2, 0, 0)

In [None]:
def constant_lr():
    """Return the fixed multiplier 1, i.e. a schedule that never changes."""
    multiplier = 1
    return multiplier

# Configure the globals that sgd reads on every step: lr supplies the
# schedule multiplier and eta the base step size it is multiplied with.
lr = constant_lr  # Constant learning rate
eta = 0.1
# Hand sgd to dl.train_2d with steps=50 and plot what it returns over f.
dl.show_trace_2d(
    f,
    dl.train_2d(sgd, steps=50, f_grad=f_grad),
)


## Dynamic Learning Rate


In [None]:
def exponential_lr():
    """Advance the global step counter and return ``exp(-0.1 * t)``.

    The counter ``t`` is a module-level variable: it must be defined before
    the first call and is incremented by one on every call, *before* the
    multiplier is evaluated.
    """
    # Global variable that is defined outside this function and updated inside
    global t
    t = t + 1
    decay_exponent = -0.1 * t
    return math.exp(decay_exponent)

# Switch sgd over to the exponential schedule. The global counter t is
# reset to 1 before the run because exponential_lr increments it on
# every call.
lr = exponential_lr
t = 1
# Hand sgd to dl.train_2d with steps=1000 and plot what it returns over f.
dl.show_trace_2d(
    f,
    dl.train_2d(sgd, steps=1000, f_grad=f_grad),
)

In [None]:
def polynomial_lr():
    """Advance the global step counter and return ``(1 + 0.1 * t)**(-0.5)``.

    The counter ``t`` is a module-level variable: it must be defined before
    the first call and is incremented by one on every call, *before* the
    multiplier is evaluated.
    """
    # Global variable that is defined outside this function and updated inside
    global t
    t = t + 1
    base = 1 + 0.1 * t
    return base ** (-0.5)

# Switch sgd over to the polynomial schedule. The global counter t is
# reset to 1 before the run because polynomial_lr increments it on
# every call.
lr = polynomial_lr
t = 1
# Hand sgd to dl.train_2d with steps=50 and plot what it returns over f.
dl.show_trace_2d(
    f,
    dl.train_2d(sgd, steps=50, f_grad=f_grad),
)

## Exercises

1. Experiment with different learning rate schedules for stochastic gradient descent and with different numbers of iterations. In particular, plot the distance from the optimal solution $(0, 0)$ as a function of the number of iterations.
1. Prove that for the function $f(x_1, x_2) = x_1^2 + 2 x_2^2$ adding normal noise to the gradient is equivalent to minimizing a loss function $f(\mathbf{x}, \mathbf{w}) = (x_1 - w_1)^2 + 2 (x_2 - w_2)^2$ with respect to $\mathbf{w}$, where $\mathbf{x}$ is drawn from a normal distribution.
1. Compare convergence of stochastic gradient descent when you sample from $\{(x_1, y_1), \ldots, (x_n, y_n)\}$ with replacement and when you sample without replacement.
1. How would you change the stochastic gradient descent solver if some gradient (or rather some coordinate associated with it) was consistently larger than all the other gradients?
1. Assume that $f(x) = x^2 (1 + \sin x)$. How many local minima does $f$ have? Can you change $f$ in such a way that to minimize it one needs to evaluate all the local minima?
