# DL(w4) DNN - Learning Rate Scheduling
student ID: 7110018036\
name: Chieh-An, Chou

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras

## Optimizer

In [3]:
# Public (non-underscore) attributes exposed by keras.optimizers
list(filter(lambda attr: not attr.startswith("_"), dir(keras.optimizers)))

['Adadelta',
 'Adagrad',
 'Adam',
 'Adamax',
 'Ftrl',
 'Nadam',
 'Optimizer',
 'RMSprop',
 'SGD',
 'deserialize',
 'experimental',
 'get',
 'legacy',
 'schedules',
 'serialize']

### 1. Momentum Optimizer
$$
\begin{align*}
\mathbf{v}^{t}&=\beta\mathbf{v}^{t-1}-\eta\nabla_\theta J(\theta^{t-1})\\
\theta^t &= \theta^{t-1}+\mathbf{v}^t\\
&= (\theta^{t-1}+\beta\mathbf{v}^{t-1})-\eta\nabla_\theta J(\theta^{t-1})
\end{align*}
$$

In [5]:
# SGD with momentum: eta = 1e-3, beta = 0.9
opt = keras.optimizers.SGD(
    learning_rate=1e-3,
    momentum=0.9,
)

### 2. Nesterov Accelerated Gradient (NAG)
$$
\begin{align*}
\mathbf{v}^{t}&=\beta\mathbf{v}^{t-1}-\eta\nabla_\theta J(\theta^{t-1}+\beta\mathbf{v}^{t-1})\\
\theta^t &= \theta^{t-1}+\mathbf{v}^t\\
&= (\theta^{t-1}+\beta\mathbf{v}^{t-1})-\eta\nabla_\theta J(\theta^{t-1}+\beta\mathbf{v}^{t-1})
\end{align*}
$$

In [6]:
# Same SGD settings as the momentum optimizer, with the Nesterov variant enabled
opt = keras.optimizers.SGD(
    learning_rate=1e-3,
    momentum=0.9,
    nesterov=True,
)

### 3. Adaptive Gradient (AdaGrad)
$$
\begin{align*}
\mathbf{s}^{t}&=\mathbf{s}^{t-1}+\nabla_\theta J(\theta^{t-1})\otimes\nabla_\theta J(\theta^{t-1})\\
\theta^t &= \theta^{t-1}-\eta\nabla_\theta J(\theta^{t-1})\oslash\sqrt{\mathbf{s}^{t}+\epsilon}
\end{align*}
$$

In [7]:
# AdaGrad with eta = 1e-3
opt = keras.optimizers.Adagrad(
    learning_rate=1e-3,
)

### 4. Root Mean Square Propagation (RMSProp)
$$
\begin{align*}
\mathbf{s}^{t}&=\beta\mathbf{s}^{t-1}+(1-\beta) \nabla_\theta J(\theta^{t-1})\otimes\nabla_\theta J(\theta^{t-1})\\
\theta^t &= \theta^{t-1}-\eta\nabla_\theta J(\theta^{t-1})\oslash\sqrt{\mathbf{s}^{t}+\epsilon}
\end{align*}
$$

In [8]:
# RMSProp: eta = 1e-3; `rho` is the decay factor (beta in the formula above)
opt = keras.optimizers.RMSprop(
    learning_rate=1e-3,
    rho=0.9,
)

### 5. Adaptive Moment Estimation (Adam)
$$
\begin{align*}
\theta^t &= \theta^{t-1}-\eta\hat{\mathbf{v}}^{t}\oslash\sqrt{\hat{\mathbf{s}}^{t}+\epsilon}\\
\mathbf{v}^{t}&=\beta_1\mathbf{v}^{t-1}+(1-\beta_1)\nabla_\theta J(\theta^{t-1}),\;&
\hat{\mathbf{v}}^{t}= \dfrac{\mathbf{v}^{t}}{1-(\beta_1)^t} \\
\mathbf{s}^{t}&=\beta_2\mathbf{s}^{t-1}+(1-\beta_2) \nabla_\theta J(\theta^{t-1})\otimes\nabla_\theta J(\theta^{t-1}),\;&
\hat{\mathbf{s}}^{t}= \dfrac{\mathbf{s}^{t}}{1-(\beta_2)^t}
\end{align*}
$$

In [9]:
# Adam: eta = 1e-3, first/second-moment decay rates beta_1 = 0.9, beta_2 = 0.999
opt = keras.optimizers.Adam(
    learning_rate=1e-3,
    beta_1=0.9,
    beta_2=0.999,
)

### 6. AdaMax
$$
\begin{align*}
\mathbf{v}^{t}&=\beta_1\mathbf{v}^{t-1}+(1-\beta_1)\nabla_\theta J(\theta^{t-1}),\;&\hat{\mathbf{v}}^{t}&= \dfrac{\mathbf{v}^{t}}{1-(\beta_1)^t}\\
\mathbf{s}^{t}&=\max(\beta_2\mathbf{s}^{t-1},|\nabla_\theta J(\theta^{t-1})|)\\
\theta^t &= \theta^{t-1}-\eta\hat{\mathbf{v}}^{t}\oslash(\mathbf{s}^{t}+\epsilon)
\end{align*}
$$

In [10]:
# AdaMax: eta = 1e-3, beta_1 = 0.9, beta_2 = 0.999
opt = keras.optimizers.Adamax(
    learning_rate=1e-3,
    beta_1=0.9,
    beta_2=0.999,
)

### 7. Nadam
$$
\begin{align*}
\hat{\mathbf{g}}^t&=\dfrac{\mathbf{g}^t}{1-\Pi_{i=1}^t\mu_i},\;&\mathbf{g}^t&=\nabla_\theta J(\theta^{t-1})\\
\mathbf{v}^{t}&=\mu_t\mathbf{v}^{t-1}+(1-\mu_t)\mathbf{g}^t,\;&
\hat{\mathbf{v}}^{t}&= \dfrac{\mathbf{v}^{t}}{1-\Pi_{i=1}^{t+1}\mu_i}\\
\bar{\mathbf{v}}^t&=(1-\mu_t)\hat{\mathbf{g}}^t+\mu_{t+1}\hat{\mathbf{v}}^{t}\\
\mathbf{s}^{t}&=\beta\mathbf{s}^{t-1}+(1-\beta) \mathbf{g}^t\otimes\mathbf{g}^t,\;&
\hat{\mathbf{s}}^{t}&= \dfrac{\mathbf{s}^{t}}{1-(\beta)^t}\\
\theta^t &= \theta^{t-1}-\eta\bar{\mathbf{v}}^{t}\oslash\sqrt{\hat{\mathbf{s}}^{t}+\epsilon}
\end{align*}
$$
+ `beta_1`: initial value of $\mu_t$
+ `beta_2`: $\beta$

In [11]:
# Nadam: eta = 1e-3, beta_1 = 0.9, beta_2 = 0.999
opt = keras.optimizers.Nadam(
    learning_rate=1e-3,
    beta_1=0.9,
    beta_2=0.999,
)

## Example: training with the Nadam optimizer

In [12]:
# Imports used by this cell
from sklearn.model_selection import train_test_split
from tensorflow.keras.datasets import fashion_mnist

# Load Fashion-MNIST, then carve a validation split off the training set
# (fixed random_state so the split is repeatable)
(x_train_set, y_train_set), (x_test, y_test) = fashion_mnist.load_data()
x_train, x_valid, y_train, y_valid = train_test_split(
    x_train_set, y_train_set, random_state=1
)

# Per-pixel statistics, computed on the training split only:
# (N, 28, 28) -> (1, 28, 28)
pixel_means = x_train.mean(axis=0, keepdims=True)
pixel_stds = x_train.std(axis=0, keepdims=True)

# Standardize every split with the training-split statistics
x_train_scaled, x_valid_scaled, x_test_scaled = (
    (images - pixel_means) / pixel_stds
    for images in (x_train, x_valid, x_test)
)

In [13]:
# Reset the Keras session first, then fix the TensorFlow and NumPy seeds
keras.backend.clear_session()
tf.random.set_seed(1)
np.random.seed(1)

In [14]:
# Fully connected classifier: 28x28 input -> 300 -> 100 -> 10 (softmax).
# Hidden layers pair SELU with LeCun-normal initialization.
model = keras.models.Sequential()
model.add(keras.layers.Flatten(input_shape=[28, 28]))
model.add(keras.layers.Dense(units=300, activation='selu', kernel_initializer='lecun_normal'))
model.add(keras.layers.Dense(units=100, activation='selu', kernel_initializer='lecun_normal'))
model.add(keras.layers.Dense(units=10, activation='softmax'))

In [15]:
# Optimizer for this run: Nadam with a constant learning rate (no schedule here)
opt = keras.optimizers.Nadam(
    learning_rate=1e-3,
    beta_1=0.9,
    beta_2=0.999,
)

# Integer class labels -> sparse categorical cross-entropy; track accuracy
model.compile(
    optimizer=opt,
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [17]:
# Train for 2 epochs. Validation must use the standardized validation images
# (x_valid_scaled): the model is fitted on standardized inputs, so feeding it
# the raw x_valid pixels would make the reported validation metrics meaningless.
train = model.fit(
    x_train_scaled, y_train,
    epochs=2,
    validation_data=(x_valid_scaled, y_valid),
)

Epoch 1/2
Epoch 2/2


## Schedules

In [18]:
# Public (non-underscore) attributes exposed by keras.optimizers.schedules
[attr for attr in dir(keras.optimizers.schedules) if attr[:1] != "_"]

['CosineDecay',
 'CosineDecayRestarts',
 'ExponentialDecay',
 'InverseTimeDecay',
 'LearningRateSchedule',
 'PiecewiseConstantDecay',
 'PolynomialDecay',
 'deserialize',
 'serialize']

### 1. Polynomial Decay
$$
\eta_t = (\eta_0-\eta_T)\times(1-t/s)^c+\eta_T
$$
+ $\eta_0$: initial_learning_rate (0.1 in the cell below)
+ $\eta_T$: end_learning_rate (0.0001)
+ $s$: decay steps, $s=\begin{cases}s &\text{, if cycle = False}\\s\times\text{ceil}(t/s)&\text{, if cycle = True}\end{cases}$; when cycle = False, $t$ is clipped to $\min(t, s)$
+ $c$: power, default = 1

In [19]:
# Polynomial decay from 0.1 down to 1e-4 over 1000 steps, power 0.5, no cycling
PolynomialDecay = keras.optimizers.schedules.PolynomialDecay
lr_poly = PolynomialDecay(
    initial_learning_rate=0.1,
    end_learning_rate=0.0001,
    decay_steps=1000,
    power=0.5,
    cycle=False,
)

### 2. Exponential Decay
$$
\eta_t = \eta_0r^{t/s}
$$
+ $r$: decay_rate

In [20]:
# Exponential decay: eta_0 = 0.1, multiplied by 0.96 every 1,000,000 steps
lr_exp = keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=0.1,
    decay_rate=0.96,
    decay_steps=1_000_000,
)

### 3. Piecewise Constant Decay
$$
\eta = \begin{cases}
\eta_0 &\text{if }t\le s_0\\ 
\eta_1 &\text{if }s_0< t\le s_1\\
\vdots & \vdots\\
\eta_T &\text{if }t> s_{T-1}
\end{cases}
$$
+ boundaries: $[s_0,\dots,s_{T-1}]$
+ values: $[\eta_0, \dots,\eta_{T-1},\eta_T]$

In [21]:
# Piecewise-constant schedule: three rates separated by two step boundaries
step_boundaries = [100000, 110000]
rate_values = [1.0, 0.5, 0.1]
lr_pcd = keras.optimizers.schedules.PiecewiseConstantDecay(
    boundaries=step_boundaries,
    values=rate_values,
)

## Example: training with exponential learning-rate decay

In [22]:
# Imports used by this cell
from sklearn.model_selection import train_test_split
from tensorflow.keras.datasets import fashion_mnist

# Load the dataset
(x_train_set, y_train_set), (x_test, y_test) = fashion_mnist.load_data()

# Hold out a validation split from the training set (repeatable via random_state)
x_train, x_valid, y_train, y_valid = train_test_split(
    x_train_set,
    y_train_set,
    random_state=1,
)

# Standardize each split with per-pixel mean/std taken from the training split
pixel_means = x_train.mean(axis=0, keepdims=True)
pixel_stds = x_train.std(axis=0, keepdims=True)
x_train_scaled = (x_train - pixel_means) / pixel_stds
x_valid_scaled = (x_valid - pixel_means) / pixel_stds
x_test_scaled = (x_test - pixel_means) / pixel_stds

In [23]:
# Reset the Keras session first, then fix the TensorFlow and NumPy seeds
keras.backend.clear_session()
tf.random.set_seed(1)
np.random.seed(1)

In [24]:
# Fully connected classifier: 28x28 input -> 300 -> 100 -> 10 (softmax).
# Hidden layers pair SELU with LeCun-normal initialization.
layer_stack = [
    keras.layers.Flatten(input_shape=[28, 28]),
    keras.layers.Dense(units=300, activation='selu', kernel_initializer='lecun_normal'),
    keras.layers.Dense(units=100, activation='selu', kernel_initializer='lecun_normal'),
    keras.layers.Dense(units=10, activation='softmax'),
]
model = keras.models.Sequential(layer_stack)

In [25]:
# Learning-rate schedule for this run: exponential decay
# (eta_0 = 0.1, multiplied by 0.96 every 100,000 steps)
lr_exp = keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=0.1,
    decay_rate=0.96,
    decay_steps=100_000,
)

# Plain SGD driven by the schedule; integer labels -> sparse cross-entropy
model.compile(
    optimizer=keras.optimizers.SGD(learning_rate=lr_exp),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [26]:
# Train for 2 epochs. Validation must use the standardized validation images
# (x_valid_scaled): the model is fitted on standardized inputs, so feeding it
# the raw x_valid pixels would make the reported validation metrics meaningless.
train = model.fit(
    x_train_scaled, y_train,
    epochs=2,
    validation_data=(x_valid_scaled, y_valid),
)

Epoch 1/2
Epoch 2/2
