# Momentum



In [None]:
%matplotlib inline
import tensorflow as tf
from dl import tensorflow as dl

# Learning rate for the 2-D demo; gd_2d reads this module-level name at call time.
eta = 0.4
def f_2d(x1, x2):
    """Evaluate the quadratic objective 0.1 * x1**2 + 2 * x2**2.

    The coefficient on the x2 term is 20 times the one on the x1 term, so the
    surface is much steeper along x2 than along x1.
    """
    x1_term = 0.1 * x1 ** 2
    x2_term = 2 * x2 ** 2
    return x1_term + x2_term
def gd_2d(x1, x2, s1, s2):
    """Take one plain gradient-descent step on f_2d.

    The gradient of f_2d is (0.2 * x1, 4 * x2); each coordinate moves against
    it, scaled by the module-level learning rate ``eta``.

    The state slots ``s1`` and ``s2`` are accepted but ignored, and zeros are
    returned in their place, so the result is a 4-tuple
    ``(new_x1, new_x2, 0, 0)``.
    """
    step_x1 = eta * 0.2 * x1
    step_x2 = eta * 4 * x2
    return (x1 - step_x1, x2 - step_x2, 0, 0)

# Feed the gd_2d update rule to dl.train_2d, then pass its result together
# with the objective f_2d to dl.show_trace_2d (presumably a trajectory plot
# over f_2d -- confirm against the dl helpers).
gd_trace = dl.train_2d(gd_2d)
dl.show_trace_2d(f_2d, gd_trace)

In [None]:
# Repeat the gd_2d run with a larger learning rate. With eta = 0.6 the x2
# update in gd_2d multiplies x2 by 1 - 0.6 * 4 = -1.4 on every step, so its
# magnitude grows instead of shrinking.
eta = 0.6
gd_trace = dl.train_2d(gd_2d)
dl.show_trace_2d(f_2d, gd_trace)

### The Momentum Method


In [None]:
def momentum_2d(x1, x2, v1, v2):
    """Take one gradient-descent-with-momentum step on f_2d.

    Each velocity becomes ``beta`` times its previous value plus the current
    gradient component (0.2 * x1 or 4 * x2). Each coordinate then moves by
    ``eta`` times its *updated* velocity. ``eta`` and ``beta`` are read from
    module level at call time.

    Returns the 4-tuple ``(new_x1, new_x2, new_v1, new_v2)``.
    """
    new_v1 = beta * v1 + 0.2 * x1
    new_v2 = beta * v2 + 4 * x2
    new_x1 = x1 - eta * new_v1
    new_x2 = x2 - eta * new_v2
    return new_x1, new_x2, new_v1, new_v2

# Same learning rate as the preceding gd_2d run (0.6), now with momentum
# coefficient 0.5. Both names are module-level and read by momentum_2d.
eta = 0.6
beta = 0.5
momentum_trace = dl.train_2d(momentum_2d)
dl.show_trace_2d(f_2d, momentum_trace)

In [None]:
# Keep the learning rate at 0.6 and halve the momentum coefficient to 0.25.
# Both names are module-level and read by momentum_2d.
eta = 0.6
beta = 0.25
momentum_trace = dl.train_2d(momentum_2d)
dl.show_trace_2d(f_2d, momentum_trace)


### Effective Sample Weight


In [None]:
# Plot beta**t against time step t for several values of beta.
dl.set_figsize()
betas = [0.95, 0.9, 0.6, 0]
# The time axis (integers 0..39) is the same for every curve, so build it
# once here instead of re-creating it on each loop iteration.
x = tf.range(40).numpy()
# NOTE: the loop rebinds the module-level name `beta`, which momentum_2d also
# reads; it is left at 0 once the loop finishes.
for beta in betas:
    dl.plt.plot(x, beta**x, label=f'beta = {beta:.2f}')
dl.plt.xlabel('time')
dl.plt.legend();

## Practical Experiments

### Implementation from Scratch


In [None]:
def init_momentum_states(features_dim):
    """Create the zero-initialised state variables for sgd_momentum.

    Returns a 2-tuple of ``tf.Variable`` objects: one built from zeros of
    shape ``(features_dim, 1)`` and one from ``tf.zeros(1)``. Presumably these
    are the velocities for a weight column and a scalar bias -- confirm
    against the model built by the training helper.
    """
    weight_shape = (features_dim, 1)
    velocity_w = tf.Variable(tf.zeros(weight_shape))
    velocity_b = tf.Variable(tf.zeros(1))
    return velocity_w, velocity_b

In [None]:
def sgd_momentum(params, grads, states, hyperparams):
    """Apply one SGD-with-momentum update to every parameter, in place.

    ``params``, ``states`` and ``grads`` are walked in lockstep. For each
    triple the velocity is overwritten with ``momentum * velocity + grad``,
    and the parameter is then overwritten with ``param - lr * velocity``
    using the freshly updated velocity.

    ``hyperparams`` must provide the keys ``'momentum'`` and ``'lr'``.
    Nothing is returned; parameters and states are mutated through
    ``[:].assign(...)``.
    """
    for param, velocity, grad in zip(params, states, grads):
        momentum = hyperparams['momentum']
        velocity[:].assign(momentum * velocity + grad)
        # Looked up after the velocity write, and the step below reads the
        # velocity that was just assigned.
        step_size = hyperparams['lr']
        param[:].assign(param - step_size * velocity)

In [None]:
def train_momentum(lr, momentum, num_epochs=2):
    """Run dl.train_ch11 with the from-scratch momentum optimizer.

    Builds fresh optimizer states with init_momentum_states(feature_dim) and
    packs ``lr`` and ``momentum`` into the hyperparameter dict that
    sgd_momentum reads. ``data_iter`` and ``feature_dim`` are module-level
    names resolved at call time. Returns nothing.
    """
    states = init_momentum_states(feature_dim)
    hyperparams = {'lr': lr, 'momentum': momentum}
    dl.train_ch11(sgd_momentum, states, hyperparams, data_iter, feature_dim,
                  num_epochs)

# Load the data once with a minibatch size of 10; train_momentum reads both
# module-level names when it is called.
data_iter, feature_dim = dl.get_data_ch11(batch_size=10)
# First run: learning rate 0.02 with momentum 0.5.
train_momentum(lr=0.02, momentum=0.5)

In [None]:
# Raise the momentum to 0.9 and halve the learning rate to 0.01.
train_momentum(lr=0.01, momentum=0.9)

In [None]:
# Keep the momentum at 0.9 and halve the learning rate again, to 0.005.
train_momentum(lr=0.005, momentum=0.9)

### Concise Implementation


In [None]:
# Pass the Keras SGD optimizer class itself (not an instance) to the helper,
# along with the keyword settings it should be configured with. The values
# match the last from-scratch run: learning rate 0.005, momentum 0.9.
trainer = tf.keras.optimizers.SGD
sgd_hyperparams = {'learning_rate': 0.005, 'momentum': 0.9}
dl.train_concise_ch11(trainer, sgd_hyperparams, data_iter)

In [None]:
# Plot the factor (1 - eta * lambda)**t against time step t for several
# values of lambda.
lambdas = [0.1, 1, 10, 19]
# NOTE: this rebinds the module-level `eta` that gd_2d and momentum_2d read.
eta = 0.1
dl.set_figsize((6, 4))
# The time axis (integers 0..19) is the same for every curve, so build it
# once here instead of re-creating it on each loop iteration.
t = tf.range(20).numpy()
for lam in lambdas:
    # For lambda = 19 the base is 1 - 1.9 = -0.9, so that curve alternates
    # in sign.
    dl.plt.plot(t, (1 - eta * lam)**t, label=f'lambda = {lam:.2f}')
dl.plt.xlabel('time')
dl.plt.legend();



## Exercises

1. Try other combinations of momentum hyperparameters and learning rates, then observe and analyze the different experimental results.
1. Try out GD and momentum for a quadratic problem where you have multiple eigenvalues, i.e., $f(x) = \frac{1}{2} \sum_i \lambda_i x_i^2$, e.g., $\lambda_i = 2^{-i}$. Plot how the values of $x$ decrease for the initialization $x_i = 1$.
1. What changes when we perform stochastic gradient descent with momentum? What happens when we use minibatch stochastic gradient descent with momentum? Experiment with the parameters.
