# Adagrad


## The Algorithm

We use the variable $\mathbf{s}_t$ to accumulate the squares of past gradients (a proxy for past gradient variance) as follows; all operations are applied coordinate-wise.

$$\begin{aligned}
    \mathbf{g}_t & = \partial_{\mathbf{w}} l(y_t, f(\mathbf{x}_t, \mathbf{w})), \\
    \mathbf{s}_t & = \mathbf{s}_{t-1} + \mathbf{g}_t^2, \\
    \mathbf{w}_t & = \mathbf{w}_{t-1} - \frac{\eta}{\sqrt{\mathbf{s}_t + \epsilon}} \cdot \mathbf{g}_t.
\end{aligned}$$


In [None]:
%matplotlib inline
import math
import tensorflow as tf
from dl import tensorflow as dl


In [None]:
def adagrad_2d(x1, x2, s1, s2):
    """Perform one Adagrad step on the two coordinates (x1, x2).

    The gradients used are those of ``0.1 * x1**2 + 2 * x2**2``, i.e.
    ``0.2 * x1`` and ``4 * x2``. ``s1`` and ``s2`` hold the running sums of
    squared gradients for each coordinate. The learning rate is read from
    the module-level name ``eta``.

    Returns the updated ``(x1, x2, s1, s2)`` tuple.
    """
    eps = 1e-6  # keeps the denominator away from zero on the first step
    # Both gradients are taken at the incoming point, before any update.
    grad_x1 = 0.2 * x1
    grad_x2 = 4 * x2

    # First coordinate: accumulate, then take a step scaled by 1/sqrt(s1).
    s1 += grad_x1**2
    step_x1 = eta / math.sqrt(s1 + eps)
    x1 -= step_x1 * grad_x1

    # Second coordinate: same rule with its own accumulator.
    s2 += grad_x2**2
    step_x2 = eta / math.sqrt(s2 + eps)
    x2 -= step_x2 * grad_x2

    return x1, x2, s1, s2

def f_2d(x1, x2):
    """Evaluate the quadratic objective ``0.1 * x1**2 + 2 * x2**2``."""
    x1_term = 0.1 * x1**2
    x2_term = 2 * x2**2
    return x1_term + x2_term

# Learning rate; adagrad_2d reads this module-level name on every step.
eta = 0.4
# Presumably dl.train_2d iterates adagrad_2d and dl.show_trace_2d plots the
# resulting trajectory against f_2d -- confirm against the dl helpers.
dl.show_trace_2d(f_2d, dl.train_2d(adagrad_2d))

In [None]:
# Rebind the global learning rate to a larger value (2 instead of 0.4);
# adagrad_2d picks it up because it looks up eta at call time.
eta = 2
# Same experiment as before, now with the larger learning rate.
dl.show_trace_2d(f_2d, dl.train_2d(adagrad_2d))

## Implementation from Scratch


In [None]:
def init_adagrad_states(feature_dim):
    """Create zero-initialized Adagrad accumulators.

    Returns a tuple ``(s_w, s_b)`` of ``tf.Variable`` objects: ``s_w`` has
    shape ``(feature_dim, 1)`` and ``s_b`` has shape ``(1,)``.
    """
    weight_shape = (feature_dim, 1)
    weight_state = tf.Variable(tf.zeros(weight_shape))
    bias_state = tf.Variable(tf.zeros(1))
    return weight_state, bias_state

def adagrad(params, grads, states, hyperparams):
    """Apply one Adagrad update to every parameter, in place.

    ``params``, ``states`` and ``grads`` are walked in lockstep. For each
    triple the state first absorbs the squared gradient, then the parameter
    is moved by ``lr * g / sqrt(s + eps)`` using the freshly updated state.
    The learning rate is read from ``hyperparams['lr']``.

    Nothing is returned; both states and parameters are overwritten through
    sliced ``assign`` calls (presumably they are ``tf.Variable`` objects --
    verify against the caller).
    """
    eps = 1e-6  # guards the square root against a zero accumulator
    for param, state, grad in zip(params, states, grads):
        # Accumulate the squared gradient before computing the step.
        grad_sq = tf.math.square(grad)
        state[:].assign(state + grad_sq)

        # Scale by the learning rate first, then divide by the root of the
        # accumulator, matching the (lr * g) / sqrt(s + eps) ordering.
        scaled_grad = hyperparams['lr'] * grad
        param[:].assign(param - scaled_grad / tf.math.sqrt(state + eps))

In [None]:
# Fetch the data iterator and the feature dimension with minibatches of 10.
data_iter, feature_dim = dl.get_data_ch11(batch_size=10)
# Train with the from-scratch adagrad optimizer, fresh zero accumulators and
# a learning rate of 0.1. The trailing semicolon suppresses the notebook's
# echo of the call's return value.
dl.train_ch11(adagrad, init_adagrad_states(feature_dim), {'lr': 0.1},
               data_iter, feature_dim);

## Concise Implementation



In [None]:
# The optimizer class itself is passed, not an instance; presumably
# dl.train_concise_ch11 constructs it from the keyword dict -- confirm.
trainer = tf.keras.optimizers.Adagrad
# Reuses data_iter from the from-scratch training cell above.
dl.train_concise_ch11(trainer, {'learning_rate': 0.1}, data_iter)