# Minibatch Stochastic Gradient Descent

## Vectorization and Caches



In [None]:
%matplotlib inline
import numpy as np
import tensorflow as tf
from dl import tensorflow as dl

# One shared stopwatch; its recorded durations are read back through
# `timer.times` by the reporting code further down.
timer = dl.Timer()
# A receives the product; B and C are 256 x 256 operands drawn from a normal
# distribution with mean 0 and standard deviation 1.
A = tf.Variable(tf.zeros(shape=[256, 256]))
B = tf.Variable(tf.random.normal(shape=(256, 256), mean=0, stddev=1))
C = tf.Variable(tf.random.normal(shape=(256, 256), mean=0, stddev=1))

In [None]:
# Compute A = BC one element at a time: entry (row, col) is the dot product
# of a row of B with a column of C
timer.start()
for row in range(256):
    for col in range(256):
        entry = tf.tensordot(B[row, :], C[:, col], axes=1)
        A[row, col].assign(entry)
timer.stop()

In [None]:
# Compute A = BC one column at a time: each column of A is B times the
# matching column of C
timer.start()
for col in range(256):
    column = tf.tensordot(B, C[:, col], axes=1)
    A[:, col].assign(column)
timer.stop()

In [None]:
# Compute A = BC in one go
timer.start()
A.assign(tf.tensordot(B, C, axes=1))
timer.stop()

# A 256 x 256 by 256 x 256 product needs 256**3 multiply-add pairs. Multiply
# and add count as separate operations (fused in practice), so one full
# product costs 2 * 256**3 floating point operations, i.e. roughly 0.03
# Gigaflop. Dividing that amount of work by the elapsed seconds gives the
# throughput in Gigaflops.
gigaflop_per_product = 2 * 256**3 / 1e9
gigaflops = [gigaflop_per_product / i for i in timer.times]
print(f'performance in Gigaflops: element {gigaflops[0]:.3f}, '
      f'column {gigaflops[1]:.3f}, full {gigaflops[2]:.3f}')

## Minibatches




In [None]:
# Compute A = BC in blocks of 64 columns at a time
timer.start()
for j in range(0, 256, 64):
    A[:, j:j + 64].assign(tf.tensordot(B, C[:, j:j + 64], axes=1))
timer.stop()
# The four blocks together still amount to one full 256 x 256 product, i.e.
# 2 * 256**3 floating point operations (multiply and add counted separately).
# Convert that to Gigaflop before dividing by the elapsed seconds.
block_gigaflops = 2 * 256**3 / 1e9 / timer.times[3]
print(f'performance in Gigaflops: block {block_gigaflops:.3f}')


## Reading the Dataset


In [None]:

# Register the airfoil dataset under the key 'airfoil': the first entry is
# its URL, the second a hex digest string (presumably used to verify the
# download -- confirm against dl.download).
dl.DATA_HUB['airfoil'] = (
    dl.DATA_URL + 'airfoil_self_noise.dat',
    '76e5be1548fd8222e5074cf0faae75edff8cf93f',
)

def get_data_ch11(batch_size=10, n=1500):
    """Load the airfoil dataset, standardize it and build a data iterator.

    Args:
        batch_size: Minibatch size handed to `dl.load_array`.
        n: Number of leading rows of the file to use.

    Returns:
        A pair `(data_iter, feature_dim)`: the iterator produced by
        `dl.load_array` and the number of feature columns (all columns of
        the file except the last one, which is used as the label).
    """
    raw = np.genfromtxt(dl.download('airfoil'), dtype=np.float32,
                        delimiter='\t')
    # Standardize every column, label included, using statistics of the
    # whole file (not only of the first n rows)
    standardized = (raw - raw.mean(axis=0)) / raw.std(axis=0)
    features = standardized[:n, :-1]
    labels = standardized[:n, -1]
    data_iter = dl.load_array((features, labels), batch_size, is_train=True)
    feature_dim = standardized.shape[1] - 1
    return data_iter, feature_dim

## Implementation from Scratch

In [None]:
def sgd(params, grads, states, hyperparams):
    """Apply one plain gradient descent step to every parameter in place.

    Args:
        params: Iterable of objects exposing `assign_sub`.
        grads: Iterable of gradients, paired with `params` in order.
        states: Unused here; kept so every optimizer shares one signature.
        hyperparams: Mapping that provides the learning rate under 'lr'.
    """
    for variable, gradient in zip(params, grads):
        step = hyperparams['lr'] * gradient
        variable.assign_sub(step)

In [None]:

def train_ch11(trainer_fn, states, hyperparams, data_iter, feature_dim,
               num_epochs=2):
    """Train a linear regression model with a hand-written optimizer.

    Args:
        trainer_fn: Called as `trainer_fn(params, grads, states,
            hyperparams)` after every minibatch to update the parameters.
        states: Optimizer state passed through to `trainer_fn` unchanged.
        hyperparams: Hyperparameters passed through to `trainer_fn`.
        data_iter: Iterable of `(X, y)` minibatches; also passed to
            `tf.data.experimental.cardinality` to get the batch count.
        feature_dim: Number of input features (rows of the weight matrix).
        num_epochs: Number of passes over `data_iter`.

    Returns:
        The pair `(timer.cumsum(), animator.Y[0])`, i.e. the timer's
        cumulative values and the first series recorded by the animator
        (the evaluated losses added during training).
    """
    # Model parameters: small random weights and a zero bias
    w = tf.Variable(
        tf.random.normal(shape=(feature_dim, 1), mean=0, stddev=0.01),
        trainable=True)
    b = tf.Variable(tf.zeros(1), trainable=True)

    def net(X):
        return dl.linreg(X, w, b)

    loss = dl.squared_loss
    animator = dl.Animator(xlabel='epoch', ylabel='loss',
                           xlim=[0, num_epochs], ylim=[0.22, 0.35])
    num_seen = 0
    timer = dl.Timer()

    for _ in range(num_epochs):
        for X, y in data_iter:
            with tf.GradientTape() as tape:
                batch_loss = tf.math.reduce_mean(loss(net(X), y))

            dw, db = tape.gradient(batch_loss, [w, b])
            trainer_fn([w, b], [dw, db], states, hyperparams)
            num_seen += X.shape[0]
            if num_seen % 200 != 0:
                continue
            # Every time the example count hits a multiple of 200: stop the
            # timer around evaluation and plotting so they are not timed
            timer.stop()
            batches_done = num_seen / X.shape[0]
            num_batches = tf.data.experimental.cardinality(data_iter).numpy()
            epochs_done = batches_done / num_batches
            current_loss = (dl.evaluate_loss(net, data_iter, loss),)
            animator.add(epochs_done, current_loss)
            timer.start()
    print(f'loss: {animator.Y[0][-1]:.3f}, {timer.avg():.3f} sec/epoch')
    return timer.cumsum(), animator.Y[0]

In [None]:
def train_sgd(lr, batch_size, num_epochs=2):
    """Run `train_ch11` with the `sgd` optimizer on the airfoil data.

    Args:
        lr: Learning rate, passed to `sgd` as `hyperparams['lr']`.
        batch_size: Minibatch size used to build the data iterator.
        num_epochs: Number of passes over the data.

    Returns:
        Whatever `train_ch11` returns.
    """
    data_iter, feature_dim = get_data_ch11(batch_size)
    hyperparams = {'lr': lr}
    # `sgd` keeps no optimizer state, hence None for `states`
    return train_ch11(sgd, None, hyperparams, data_iter, feature_dim,
                      num_epochs)

# Full-batch gradient descent: the batch size equals the 1500 examples that
# get_data_ch11 loads by default, so each epoch is a single update
gd_res = train_sgd(lr=1, batch_size=1500, num_epochs=10)

In [None]:
# Stochastic gradient descent: one example per parameter update
sgd_res = train_sgd(lr=0.005, batch_size=1)

In [None]:
# Minibatch stochastic gradient descent with 100 examples per update
mini1_res = train_sgd(lr=0.4, batch_size=100)

In [None]:
# Minibatch stochastic gradient descent with 10 examples per update
mini2_res = train_sgd(lr=0.05, batch_size=10)

In [None]:
dl.set_figsize([6, 3])
# Each result is a (cumulative times, losses) pair; regroup the four results
# into one list of time series and one list of loss series
time_series, loss_series = (
    list(group) for group in zip(gd_res, sgd_res, mini1_res, mini2_res))
dl.plot(time_series, loss_series, 'time (sec)', 'loss', xlim=[1e-2, 10],
        legend=['gd', 'sgd', 'batch size=100', 'batch size=10'])
# Put the time axis on a log scale
dl.plt.gca().set_xscale('log')

## Concise Implementation


In [None]:

def train_concise_ch11(trainer_fn, hyperparams, data_iter, num_epochs=2):
    """Train a linear regression model with a built-in Keras optimizer.

    Args:
        trainer_fn: Optimizer factory, called as `trainer_fn(**hyperparams)`;
            the result must provide `apply_gradients`.
        hyperparams: Keyword arguments for `trainer_fn`.
        data_iter: Iterable of `(X, y)` minibatches; also passed to
            `tf.data.experimental.cardinality` to get the batch count.
        num_epochs: Number of passes over `data_iter`.

    Prints the last recorded loss and the timer's average; returns nothing.
    """
    # Initialization
    net = tf.keras.Sequential()
    net.add(
        tf.keras.layers.Dense(
            1, kernel_initializer=tf.random_normal_initializer(stddev=0.01)))
    optimizer = trainer_fn(**hyperparams)
    # Note: L2 Loss = 1/2 * MSE Loss. TensorFlow has MSE Loss which is
    # slightly different from MXNet's L2Loss by a factor of 2. Hence we halve
    # the loss value to get L2Loss in TensorFlow
    loss = tf.keras.losses.MeanSquaredError()
    animator = dl.Animator(xlabel='epoch', ylabel='loss',
                           xlim=[0, num_epochs], ylim=[0.22, 0.35])
    n, timer = 0, dl.Timer()
    for _ in range(num_epochs):
        for X, y in data_iter:
            # Only the forward pass has to run under the tape
            with tf.GradientTape() as g:
                out = net(X)
                l = loss(y, out) / 2
            # Differentiate after leaving the tape context, as train_ch11
            # does
            params = net.trainable_variables
            grads = g.gradient(l, params)
            optimizer.apply_gradients(zip(grads, params))
            n += X.shape[0]
            if n % 200 == 0:
                # Stop the timer around evaluation and plotting so they
                # are not timed
                timer.stop()
                p = n / X.shape[0]
                q = p / tf.data.experimental.cardinality(data_iter).numpy()
                # Halved for the same reason as the training loss above
                r = (dl.evaluate_loss(net, data_iter, loss) / 2,)
                animator.add(q, r)
                timer.start()
    print(f'loss: {animator.Y[0][-1]:.3f}, {timer.avg():.3f} sec/epoch')

In [None]:
# Same setting as the from-scratch run with batch size 10 and learning rate
# 0.05, now using the Keras SGD optimizer class as the factory
data_iter, _ = get_data_ch11(batch_size=10)
trainer = tf.keras.optimizers.SGD
train_concise_ch11(trainer, hyperparams={'learning_rate': 0.05},
                   data_iter=data_iter)


## Exercises

1. Modify the batch size and learning rate and observe how quickly the value of the objective function declines and how much time each epoch takes.
1. Compare minibatch stochastic gradient descent with a variant that actually *samples with replacement* from the training set. What happens?
1. An evil genie replicates your dataset without telling you (i.e., each observation occurs twice and your dataset grows to twice its original size, but nobody told you). How does the behavior of stochastic gradient descent, minibatch stochastic gradient descent, and gradient descent change?
