In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
import math
import tensorflow_addons as tfa
import keras_tuner as kt

from sklearn.model_selection import GridSearchCV



TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). 

For more information see: https://github.com/tensorflow/addons/issues/2807 



In [2]:
from pathlib import Path

# Directory (relative to the working directory) where figures are saved;
# created on first run, left alone if it already exists.
IMAGES_PATH = Path() / "images" / "deep"
IMAGES_PATH.mkdir(parents=True, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure under ``IMAGES_PATH``.

    Args:
        fig_id: Base name of the output file (without extension).
        tight_layout: When true, call ``plt.tight_layout()`` before saving.
        fig_extension: File extension, also passed to matplotlib as the format.
        resolution: Resolution in dots per inch passed to ``plt.savefig``.
    """
    target = IMAGES_PATH / "{}.{}".format(fig_id, fig_extension)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(target, format=fig_extension, dpi=resolution)

In [3]:
# Load CIFAR10 and carve a validation split out of the training data.
cifar10 = tf.keras.datasets.cifar10.load_data()
(X_train_full, y_train_full), (X_test, y_test) = cifar10

# The first 5,000 training instances form the validation set; the rest is
# used for training.
X_valid, X_train = X_train_full[:5000], X_train_full[5000:]
y_valid, y_train = y_train_full[:5000], y_train_full[5000:]

## A. Build a DNN with 20 hidden layers of 100 neurons each (that’s too many, but it’s the point of this exercise). Use He initialization and the Swish activation function.

In [4]:
# Reset Keras' global state before building the first model.
tf.keras.backend.clear_session()

In [None]:
# Inspect the shape of the training split.
X_train.shape

In [None]:
# Baseline network: flatten each 32x32x3 image, then stack 20 hidden Dense
# layers of 100 units (swish activation, He-normal initialization).
# The output layer is added in a later cell.
model = tf.keras.Sequential([tf.keras.layers.Flatten(input_shape=[32, 32, 3])])
for _ in range(20):
    model.add(
        tf.keras.layers.Dense(100, kernel_initializer="he_normal", activation="swish")
    )


## B. Using Nadam optimization and early stopping, train the network on the CIFAR10 dataset.

In [None]:
# List the distinct label values present in the training set.
np.unique(y_train)

有 10 個類，因此需要一個具有 10 個神經元的 softmax 輸出層。

add the output layer to the model:

In [None]:
# Output layer: 10 units with softmax, one per class.
model.add(tf.keras.layers.Dense(10,activation="softmax"))


In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

Compiling the Model

Nadam 優化是 Adam 優化加上 Nesterov 技巧，因此它往往會比 Adam 收斂得稍快一些。

In [None]:
# Nadam optimizer with its learning rate and beta coefficients spelled out.
optimizer = tf.keras.optimizers.Nadam(
    learning_rate=0.001,
    beta_1=0.9,
    beta_2=0.999,
)

In [None]:
# Compile with sparse categorical cross-entropy and the matching sparse
# accuracy metric. The metric is passed as a function object, so the history
# keys are presumably named after that function - verify before relying on them.
model.compile(loss=tf.keras.losses.sparse_categorical_crossentropy,
              optimizer=optimizer,
              metrics=[tf.keras.metrics.sparse_categorical_accuracy])

EarlyStopping callback and Tensorboard callback


In [None]:
# Callbacks for the baseline run: early stopping (restoring the best weights),
# best-only checkpointing, and TensorBoard logging into a per-run directory.
run_index = 1 # increment every time you train the model
run_logdir = Path("my_cifar10_logs", f"run_{run_index:03d}")

early_stopping_cb = tf.keras.callbacks.EarlyStopping(
    patience=20,
    restore_best_weights=True,
)
model_checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(
    "my_cifar10_model",
    save_best_only=True,
)
tensorboard_cb = tf.keras.callbacks.TensorBoard(run_logdir)

callbacks = [early_stopping_cb, model_checkpoint_cb, tensorboard_cb]

現在模型已準備好接受訓練。 為此，我們只需要調用它的 fit() 方法。


In [None]:
# Train for at most 100 epochs, evaluating on the validation split after each
# epoch; the callbacks list defined above is applied during training.
history = model.fit(X_train, y_train, epochs=100,
          validation_data=(X_valid, y_valid),
          callbacks=callbacks)

In [None]:
# Learning curves for the baseline run.
# The x-axis limit is derived from the recorded history rather than hard-coded,
# so the plot stays correct when early stopping ends at a different epoch.
last_epoch = max(len(history.history["loss"]) - 1, 1)
# Each of the four curves gets its own line style so that the training loss
# and training metric can be told apart.
pd.DataFrame(history.history).plot(
    figsize=(8, 5), xlim=[0, last_epoch], ylim=[0, 2.5], grid=True,
    xlabel="Epoch", style=["r--", "r--.", "b-", "b-*"])
plt.legend(loc="lower left")
plt.show()

In [None]:
total_time = 12  # Total training time in minutes (hard-coded; presumably read off the run above)
num_epochs = 60  # Number of epochs (hard-coded)

# Average time per epoch, in minutes (multiply by 60 for seconds).
time_per_epoch = total_time / num_epochs
time_per_epoch

In [None]:
# Evaluate on the validation split; return_dict=True requests the metrics as a dict.
model.evaluate(X_valid, y_valid, return_dict=True)

驗證集loss最低的模型在驗證集上的準確率約為 42.4%。 花了 28 個Epoch才達到最低驗證損失，在我的筆記本電腦上每個epoch大約需要 12 秒(0.2min)。

In [None]:
# Save the model in TensorFlow SavedModel format.
# NOTE(review): this is the same path the ModelCheckpoint callback writes to.
model.save("my_cifar10_model", save_format="tf")

 看看是否可以使用Batch Normalization來改進模型。

## C.  try adding Batch Normalization and compare the learning curves: Is it converging faster than before? Does it produce a better model? How does it affect training speed?

In [None]:
# Reload the saved baseline model.
# NOTE(review): the next code cell rebinds `model` to a freshly built network,
# so the model loaded here is not used afterwards.
model = tf.keras.models.load_model("my_cifar10_model")

The code below is very similar to the code above, with a few changes:

* I added a BN layer after every Dense layer (before the activation function), except for the output layer.

* I moved the TensorBoard run directories to `my_cifar10_logs_BN/run_*` and changed the model file name to `my_cifar10_bn_model`.

In [None]:
tf.random.set_seed(42)

# Architecture: Flatten -> 20 x (Dense -> BatchNormalization -> swish) -> softmax.
# BN sits between each hidden Dense layer and its activation; the output
# layer has no BN.
model = tf.keras.Sequential([tf.keras.layers.Flatten(input_shape=[32, 32, 3])])
for _ in range(20):
    model.add(tf.keras.layers.Dense(100, kernel_initializer="he_normal"))
    model.add(tf.keras.layers.BatchNormalization())
    model.add(tf.keras.layers.Activation("swish"))
model.add(tf.keras.layers.Dense(10, activation="softmax"))

optimizer = tf.keras.optimizers.Nadam(learning_rate=0.001, beta_1=0.9, beta_2=0.999)
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer=optimizer,
    metrics=["accuracy"],
)

# Callbacks: early stopping, best-only checkpoint, TensorBoard logs for this run.
run_index = 1 # increment every time you train the model
run_logdir = Path("my_cifar10_logs_BN", f"run_{run_index:03d}")
early_stopping_cb = tf.keras.callbacks.EarlyStopping(
    patience=20, restore_best_weights=True)
model_checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(
    "my_cifar10_bn_model", save_best_only=True)
tensorboard_cb = tf.keras.callbacks.TensorBoard(run_logdir)
callbacks = [early_stopping_cb, model_checkpoint_cb, tensorboard_cb]

history = model.fit(
    X_train,
    y_train,
    epochs=100,
    validation_data=(X_valid, y_valid),
    callbacks=callbacks,
)


In [None]:
# Learning curves for the Batch Normalization run.
# The x-axis limit is derived from the recorded history rather than hard-coded,
# so the plot stays correct when early stopping ends at a different epoch.
last_epoch = max(len(history.history["loss"]) - 1, 1)
# Each of the four curves gets its own line style so that the training loss
# and training accuracy can be told apart.
pd.DataFrame(history.history).plot(
    figsize=(8, 5), xlim=[0, last_epoch], ylim=[0, 2.5], grid=True,
    xlabel="Epoch", style=["r--", "r--.", "b-", "b-*"])
plt.legend(loc="lower left")
plt.show()

In [None]:
# Report the epoch with the lowest validation loss, together with the
# validation loss and accuracy recorded at that epoch.
val_losses = history.history['val_loss']
val_accs = history.history['val_accuracy']

best_epoch = np.argmin(val_losses)  # zero-based position in the history lists
lowest_val_loss, lowest_val_acc = val_losses[best_epoch], val_accs[best_epoch]
epoch_with_lowest_loss = best_epoch + 1  # shown 1-based

print("Epoch with the lowest validation loss:", epoch_with_lowest_loss)
print("Lowest validation loss:", lowest_val_loss)
print("Validation accuracy at the epoch:", lowest_val_acc)

In [None]:
# Evaluate the BN model on the (unscaled) validation split.
model.evaluate(X_valid, y_valid)

In [None]:
total_time = 14  # Total training time in minutes (hard-coded; presumably read off the run above)
num_epochs = 40  # Number of epochs (hard-coded)

# Average time per epoch, in minutes (multiply by 60 for seconds).
time_per_epoch = total_time / num_epochs
time_per_epoch

* The final model is also much better, with 50.7% validation accuracy

* How does BN affect training speed? Each epoch took about 21s instead of 12s, because of the extra computations required by the BN layers.

## D. Try replacing Batch Normalization with SELU, and make the necessary adjustments to ensure the network self-normalizes (i.e., standardize the input features, use LeCun normal initialization, make sure the DNN contains only a sequence of dense layers, etc.).

In [None]:
# Reset Keras' global state before building the SELU model.
tf.keras.backend.clear_session()

In [None]:
tf.random.set_seed(42)

# Architecture: Flatten -> 20 x Dense(100, SELU, LeCun-normal init) -> softmax.
# Only plain Dense layers are stacked, as the exercise requires for
# self-normalization.
model = tf.keras.Sequential([tf.keras.layers.Flatten(input_shape=[32, 32, 3])])
for _ in range(20):
    model.add(
        tf.keras.layers.Dense(100, activation="selu", kernel_initializer="lecun_normal")
    )
model.add(tf.keras.layers.Dense(10, activation="softmax"))

optimizer = tf.keras.optimizers.Nadam(learning_rate=0.001, beta_1=0.9, beta_2=0.999)
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer=optimizer,
    metrics=["accuracy"],
)

# Callbacks: early stopping, best-only checkpoint, TensorBoard logs for this run.
run_index = 1 # increment every time you train the model
run_logdir = Path("my_cifar10_logs_selu", f"run_selu_{run_index:03d}")
early_stopping_cb = tf.keras.callbacks.EarlyStopping(
    patience=20, restore_best_weights=True)
model_checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(
    "my_cifar10_selu_model", save_best_only=True)
tensorboard_cb = tf.keras.callbacks.TensorBoard(run_logdir)
callbacks = [early_stopping_cb, model_checkpoint_cb, tensorboard_cb]

# Input features must be standardized: mean 0 and standard deviation 1.
# The statistics come from the training split only and are reused for the
# validation and test splits.
X_means = X_train.mean(axis=0)
X_stds = X_train.std(axis=0)
X_train_scaled, X_valid_scaled, X_test_scaled = (
    (split - X_means) / X_stds for split in (X_train, X_valid, X_test)
)

history = model.fit(
    X_train_scaled,
    y_train,
    epochs=100,
    validation_data=(X_valid_scaled, y_valid),
    callbacks=callbacks,
)


In [None]:
# Learning curves for the SELU run.
# The x-axis limit is derived from the recorded history rather than hard-coded,
# so the plot stays correct when early stopping ends at a different epoch.
last_epoch = max(len(history.history["loss"]) - 1, 1)
# Each of the four curves gets its own line style so that the training loss
# and training accuracy can be told apart.
pd.DataFrame(history.history).plot(
    figsize=(8, 5), xlim=[0, last_epoch], ylim=[0, 2.5], grid=True,
    xlabel="Epoch", style=["r--", "r--.", "b-", "b-*"])
plt.legend(loc="lower left")
plt.show()

In [None]:
# Report the epoch with the lowest validation loss, together with the
# validation loss and accuracy recorded at that epoch.
val_losses = history.history['val_loss']
val_accs = history.history['val_accuracy']

best_epoch = np.argmin(val_losses)  # zero-based position in the history lists
lowest_val_loss, lowest_val_acc = val_losses[best_epoch], val_accs[best_epoch]
epoch_with_lowest_loss = best_epoch + 1  # shown 1-based

print("Epoch with the lowest validation loss:", epoch_with_lowest_loss)
print("Lowest validation loss:", lowest_val_loss)
print("Validation accuracy at the epoch:", lowest_val_acc)

In [None]:
total_time = 6.5  # Total training time in minutes (hard-coded; presumably read off the run above)
num_epochs = 34  # Number of epochs (hard-coded)

# Average time per epoch, in minutes (multiply by 60 for seconds).
time_per_epoch = total_time / num_epochs
time_per_epoch

This model reached its lowest validation loss with about 49.2% accuracy, which is better than the original model (42.4%) and close to the batch normalization model (49.5%). Each epoch took only 11 seconds, so it's the fastest model to train so far.

## E. Try regularizing the model with alpha dropout. Then, without retraining your model, see if you can achieve better accuracy using MC Dropout.

In [None]:
# Reset Keras' global state before building the alpha-dropout model.
tf.keras.backend.clear_session()

In [None]:
# Inspect the shape of the training split.
X_train.shape

In [None]:
tf.random.set_seed(42)

# Architecture: Flatten -> 20 x Dense(100, SELU, LeCun-normal init)
# -> AlphaDropout(0.1) -> softmax. A single alpha-dropout layer is placed
# right before the output layer.
model = tf.keras.Sequential([tf.keras.layers.Flatten(input_shape=[32, 32, 3])])
for _ in range(20):
    model.add(
        tf.keras.layers.Dense(100, activation="selu", kernel_initializer="lecun_normal")
    )
model.add(tf.keras.layers.AlphaDropout(rate=0.1))
model.add(tf.keras.layers.Dense(10, activation="softmax"))

optimizer = tf.keras.optimizers.Nadam(learning_rate=0.001, beta_1=0.9, beta_2=0.999)
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer=optimizer,
    metrics=["accuracy"],
)

# Callbacks: early stopping, best-only checkpoint, TensorBoard logs for this run.
run_index = 1 # increment every time you train the model
run_logdir = Path(
    "my_cifar10_logs_alpha_dropout", f"run_alpha_dropout_{run_index:03d}")
early_stopping_cb = tf.keras.callbacks.EarlyStopping(
    patience=20, restore_best_weights=True)
model_checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(
    "my_cifar10_alpha_dropout_model", save_best_only=True)
tensorboard_cb = tf.keras.callbacks.TensorBoard(run_logdir)
callbacks = [early_stopping_cb, model_checkpoint_cb, tensorboard_cb]

# Standardize every split with the training split's per-feature statistics.
X_means = X_train.mean(axis=0)
X_stds = X_train.std(axis=0)
X_train_scaled, X_valid_scaled, X_test_scaled = (
    (split - X_means) / X_stds for split in (X_train, X_valid, X_test)
)

history = model.fit(
    X_train_scaled,
    y_train,
    epochs=100,
    validation_data=(X_valid_scaled, y_valid),
    callbacks=callbacks,
)

In [None]:
# Learning curves for the alpha-dropout run.
# The x-axis limit is derived from the recorded history rather than hard-coded,
# so the plot stays correct when early stopping ends at a different epoch.
last_epoch = max(len(history.history["loss"]) - 1, 1)
# Each of the four curves gets its own line style so that the training loss
# and training accuracy can be told apart.
pd.DataFrame(history.history).plot(
    figsize=(8, 5), xlim=[0, last_epoch], ylim=[0, 2.5], grid=True,
    xlabel="Epoch", style=["r--", "r--.", "b-", "b-*"])
plt.legend(loc="lower left")
plt.show()

In [None]:
# Report the epoch with the lowest validation loss, together with the
# validation loss and accuracy recorded at that epoch.
val_losses = history.history['val_loss']
val_accs = history.history['val_accuracy']

best_epoch = np.argmin(val_losses)  # zero-based position in the history lists
lowest_val_loss, lowest_val_acc = val_losses[best_epoch], val_accs[best_epoch]
epoch_with_lowest_loss = best_epoch + 1  # shown 1-based

print("Epoch with the lowest validation loss:", epoch_with_lowest_loss)
print("Lowest validation loss:", lowest_val_loss)
print("Validation accuracy at the epoch:", lowest_val_acc)

In [None]:
# Evaluate on the standardized validation split (the model was trained on standardized inputs).
model.evaluate(X_valid_scaled, y_valid)

In [None]:
total_time = 5  # Total training time in minutes (hard-coded; presumably read off the run above)
num_epochs = 28  # Number of epochs (hard-coded)

# Average time per epoch, in minutes (multiply by 60 for seconds).
time_per_epoch = total_time / num_epochs
time_per_epoch

The model reaches 46.2% accuracy on the validation set. That's worse than without dropout (49.2%). With an extensive hyperparameter search, it might be possible to do better.

Let's use MC Dropout now. We will need the MCAlphaDropout class:

In [None]:
class MCAlphaDropout(tf.keras.layers.AlphaDropout):
    """AlphaDropout variant used for Monte Carlo Dropout predictions.

    The ``training`` argument received by ``call`` is ignored: the parent
    implementation is always invoked with ``training=True``.
    """
    def call(self, inputs,training=None):
        # Force training mode regardless of what the caller passed.
        return super().call(inputs, training=True)

Now let's create a new model, identical to the one we just trained (with the same weights), but with MCAlphaDropout dropout layers instead of AlphaDropout layers:

In [None]:
# Build the MC Dropout model from the trained network: every AlphaDropout
# layer is replaced by an MCAlphaDropout layer with the same rate, and all
# other layers are reused as-is.
# The check must be against AlphaDropout: it is not a subclass of
# tf.keras.layers.Dropout, so testing for Dropout would never match and the
# resulting model would contain no MC dropout layer at all.
mc_model = tf.keras.Sequential([
    MCAlphaDropout(layer.rate)
    if isinstance(layer, tf.keras.layers.AlphaDropout)
    else layer
    for layer in model.layers
])


In [None]:
# Copy the trained weights into mc_model.
# NOTE(review): the layers kept by the cell above are the very same objects as
# in `model`, so this is presumably a no-op safeguard - verify.
mc_model.set_weights(model.get_weights())

In [None]:
# Print the layer-by-layer summary of the MC Dropout model.
mc_model.summary()

Then let's add a couple of utility functions. The first runs the model several times (10 by default) and returns the mean predicted class probabilities. The second uses these mean probabilities to predict the most likely class for each instance:

In [None]:
def mc_dropout_predict_probas(mc_model, X, n_samples=10):
    """Average the model's predictions over several forward passes.

    Args:
        mc_model: Object exposing ``predict(X)``.
        X: Inputs forwarded unchanged to every ``predict`` call.
        n_samples: Number of times ``predict`` is called.

    Returns:
        The element-wise mean of the ``n_samples`` prediction arrays.
    """
    sampled_probas = []
    for _ in range(n_samples):
        sampled_probas.append(mc_model.predict(X))
    return np.mean(sampled_probas, axis=0)

def mc_dropout_predict_classes(mc_model, X, n_samples=10):
    """Predict one class per instance from the averaged MC Dropout probabilities.

    Args:
        mc_model: Model passed through to ``mc_dropout_predict_probas``.
        X: Inputs to predict on.
        n_samples: Number of forward passes to average.

    Returns:
        For each instance, the index of the largest mean probability (axis 1).
    """
    mean_probas = mc_dropout_predict_probas(mc_model, X, n_samples)
    predicted_classes = mean_probas.argmax(axis=1)
    return predicted_classes

Now let's make predictions for all the instances in the validation set, and compute the accuracy:

In [None]:
tf.random.set_seed(42)

# MC Dropout accuracy on the validation set: the fraction of instances whose
# predicted class equals the label (taken from column 0 of y_valid).
y_pred = mc_dropout_predict_classes(mc_model, X_valid_scaled)
is_correct = y_pred == y_valid[:, 0]
accuracy = is_correct.mean()
accuracy

## So the best model we got in this exercise is the Batch Normalization model.

# 我自己來調參做做看

## Cyclical learning rate scheduling (used here instead of 1cycle)

In [8]:
n_epochs = 25  # number of epochs
batch_size = 32  # used below to derive steps_per_epoch for the learning-rate schedule

In [14]:
# Learning-rate schedule shared by every optimizer built in build_model.
START_LEARNING_RATE = 0.001
MAX_LEARNING_RATE = 0.01
steps_per_epoch = len(X_train) // batch_size

CYCLICAL_LEARNING_RATE = tfa.optimizers.CyclicalLearningRate(
    initial_learning_rate=START_LEARNING_RATE,
    maximal_learning_rate=MAX_LEARNING_RATE,
    # step_size spans two epochs' worth of batches.
    step_size=2 * steps_per_epoch,
    # Scaling factor halves each time x grows by 1 (x is presumably the cycle
    # number under TFA's default scale mode - verify).
    scale_fn=lambda x: 1/(2.**(x-1)),
)

## Optimization

In [15]:
def build_model(hp):
    """Build and compile a Dense/BatchNormalization classifier from hyperparameters.

    Hyperparameters registered on ``hp``:
        n_hidden: number of hidden blocks (15-25, default 20).
        n_neurons: units per hidden Dense layer (50-270).
        optimizer: one of "sgd", "nadam", "adam", "adamw".
        activation_function: one of "elu", "relu", "swish".

    Every optimizer uses the module-level ``CYCLICAL_LEARNING_RATE`` schedule.

    Args:
        hp: Keras Tuner hyperparameter container.

    Returns:
        A compiled ``tf.keras.Sequential`` model for 32x32x3 inputs and 10 classes.

    Raises:
        ValueError: If the optimizer choice is not one of the supported names.
    """
    n_hidden = hp.Int("n_hidden", min_value=15, max_value=25, default=20)
    n_neurons = hp.Int("n_neurons", min_value=50, max_value=270)
    optimizer_name = hp.Choice("optimizer", values=["sgd", "nadam", "adam", "adamw"])
    activation_function = hp.Choice("activation_function", values=["elu", "relu", "swish"])

    # Turn the chosen name into an optimizer instance. The first branch must
    # compare against "sgd" (the value offered above); otherwise the raw string
    # would reach compile() and the Nesterov/cyclical-LR settings would be lost.
    if optimizer_name == "sgd":
        optimizer = tf.keras.optimizers.SGD(momentum=0.9, nesterov=True, learning_rate=CYCLICAL_LEARNING_RATE)
    elif optimizer_name == "nadam":
        optimizer = tf.keras.optimizers.Nadam(beta_1=0.9, beta_2=0.999, learning_rate=CYCLICAL_LEARNING_RATE)
    elif optimizer_name == "adam":
        optimizer = tf.keras.optimizers.Adam(beta_1=0.9, beta_2=0.999, learning_rate=CYCLICAL_LEARNING_RATE)
    elif optimizer_name == "adamw":
        optimizer = tfa.optimizers.AdamW(weight_decay=1e-5, beta_1=0.9, beta_2=0.999, learning_rate=CYCLICAL_LEARNING_RATE)
    else:
        raise ValueError(f"Unsupported optimizer choice: {optimizer_name!r}")

    # Flatten -> n_hidden x (Dense -> BatchNormalization -> activation) -> softmax.
    my_model = tf.keras.Sequential()
    my_model.add(tf.keras.layers.Flatten(input_shape=[32, 32, 3]))
    for _ in range(n_hidden):
        my_model.add(tf.keras.layers.Dense(n_neurons, kernel_initializer="he_normal"))
        my_model.add(tf.keras.layers.BatchNormalization())
        my_model.add(tf.keras.layers.Activation(activation=activation_function))
    my_model.add(tf.keras.layers.Dense(10, activation="softmax"))
    my_model.compile(loss="sparse_categorical_crossentropy", optimizer=optimizer, metrics=["accuracy"])

    return my_model
        
    

In [16]:
class MyClassificationHyperModel(kt.HyperModel):
    """Hypermodel that builds models via ``build_model`` and tunes a "normalize" flag."""

    def build(self, hp):
        """Return the compiled model produced by ``build_model(hp)``."""
        return build_model(hp)

    def fit(self, hp, model, X, y, **kwargs):
        """Fit ``model``, optionally passing X through a Normalization layer first.

        The boolean hyperparameter "normalize" decides whether X is sent
        through a freshly created ``tf.keras.layers.Normalization`` layer
        before ``model.fit`` is called; ``kwargs`` are forwarded untouched.

        NOTE(review): the layer is never adapted in this method, and any
        ``validation_data`` in ``kwargs`` is not passed through it. Presumably
        the layer therefore still holds its initial statistics - verify
        whether "normalize" actually changes the inputs.
        """
        if not hp.Boolean("normalize"):
            return model.fit(X, y, **kwargs)
        X = tf.keras.layers.Normalization()(X)
        return model.fit(X, y, **kwargs)

In [17]:
# Bayesian-optimization tuner: 10 trials, maximizing validation accuracy.
# overwrite=True is set, so existing results under the project directory are
# presumably discarded when the tuner is created - verify.
bayesian_opt_tuner = kt.BayesianOptimization(
    MyClassificationHyperModel(),
    objective="val_accuracy",
    seed=42,
    max_trials=10,
    alpha=1e-4,
    beta=2.6,
    overwrite=True,
    directory="my_cifar10_best",
    project_name="bayesian_opt",
)




## callbacks

In [18]:

# Callbacks handed to the tuner search.
# TensorBoard logs go to a "tensorboard" folder inside the tuner's project directory.
root_logdir = Path(bayesian_opt_tuner.project_dir, "tensorboard")

early_stopping_cb = tf.keras.callbacks.EarlyStopping(
    patience=20,
    restore_best_weights=True,
)
model_checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(
    "my_best_model",
    save_best_only=True,
)
tensorboard_cb = tf.keras.callbacks.TensorBoard(root_logdir)

callbacks = [early_stopping_cb, model_checkpoint_cb, tensorboard_cb]

## Tuning

Using 25 epochs to find the best hyperparameters

In [19]:
# Run the hyperparameter search.
# epochs and batch_size come from the constants defined in the scheduling
# section (n_epochs = 25, batch_size = 32), so the search stays consistent
# with the steps_per_epoch value the learning-rate schedule was built from.
bayesian_opt_tuner.search(X_train, y_train,
                          epochs=n_epochs,
                          batch_size=batch_size,
                          validation_data=(X_valid, y_valid),
                          callbacks=callbacks)

Trial 10 Complete [00h 09m 39s]
val_accuracy: 0.527400016784668

Best val_accuracy So Far: 0.5465999841690063
Total elapsed time: 01h 56m 49s
INFO:tensorflow:Oracle triggered exit


INFO:tensorflow:Oracle triggered exit


In [20]:
# Retrieve the three best models found by the search.
bayesian_opt_tuner.get_best_models(num_models=3)

[<keras.engine.sequential.Sequential at 0x197b5323e90>,
 <keras.engine.sequential.Sequential at 0x197b4e977d0>,
 <keras.engine.sequential.Sequential at 0x197f0c9b290>]

In [21]:
# Fetch the single best trial from the tuner's oracle and print its summary.
best_trial = bayesian_opt_tuner.oracle.get_best_trials(num_trials=1)[0]
best_trial.summary()

Trial 02 summary
Hyperparameters:
n_hidden: 17
n_neurons: 266
optimizer: adam
activation_function: swish
normalize: False
Score: 0.5465999841690063


In [22]:
# Last recorded validation accuracy of the best trial.
best_trial.metrics.get_last_value("val_accuracy")

0.5465999841690063

## TensorBoard

In [32]:
# Write demo summaries (scalar, histogram, image, text, audio) for TensorBoard.
# The log directory is derived from the tuner's project directory instead of a
# hard-coded absolute path on one machine, so the notebook runs anywhere.
test_logdir = Path(bayesian_opt_tuner.project_dir) / "tensorboard"
writer = tf.summary.create_file_writer(str(test_logdir))
with writer.as_default():
    for step in range(1, 1000 + 1):
        tf.summary.scalar("my_scalar", np.sin(step / 10), step=step)

        data = (np.random.randn(100) + 2) * step / 100  # gets larger
        tf.summary.histogram("my_hist", data, buckets=50, step=step)

        images = np.random.rand(2, 32, 32, 3) * step / 1000  # gets brighter
        tf.summary.image("my_images", images, step=step)

        texts = ["The step is " + str(step), "Its square is " + str(step ** 2)]
        tf.summary.text("my_text", texts, step=step)

        # Sine wave of 12,000 samples whose frequency grows with the step.
        sine_wave = tf.math.sin(tf.range(12000) / 48000 * 2 * np.pi * step)
        audio = tf.reshape(tf.cast(sine_wave, tf.float32), [1, -1, 1])
        tf.summary.audio("my_audio", audio, sample_rate=48000, step=step)


In [33]:
%load_ext tensorboard
%tensorboard --logdir="C:/Users/Jonathan/Documents/BA/Python Data Science/Sklearn and TensorFlow ML Book/Deep learning/11. Training Deep Neural Networks/my_cifar10_best/bayesian_opt/tensorboard"


The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


Reusing TensorBoard on port 6006 (pid 8232), started 0:06:06 ago. (Use '!kill 8232' to kill it.)

## Continue training on best model

In [40]:
# Take the top-ranked model from the search.
best_model  = bayesian_opt_tuner.get_best_models(num_models=1)[0]

In [43]:
# Continue training on the full training set (training + validation splits) for 20 epochs.
# NOTE(review): no validation_data or callbacks are passed here, so there is no early stopping.
best_model.fit(X_train_full, y_train_full, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x197e21cd050>

In [44]:
# Final evaluation on the test set.
# NOTE(review): the results are assigned but never displayed in this notebook.
test_loss, test_accuracy = best_model.evaluate(X_test, y_test)

