# Challenges of Deep Neural Network Training

In [None]:
# needed modules
import tensorflow as tf
from tensorflow import keras
import sklearn
import numpy as np
import pandas as pd
import os

%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
# Notebook-wide matplotlib defaults: label size for axes, then for x and y ticks.
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

import warnings
# Silence any warning whose message starts with "internal gelsd".
warnings.filterwarnings(action="ignore", message="^internal gelsd")

## Preventive measures for vanishing/exploding gradients
* weight initialization strategy
* activation function 
* batch normalization
* gradient clipping

In [None]:
# Batch normalization is placed after the input and after each activated
# hidden layer; hidden layers use ELU with He-normal initialization.
model = keras.models.Sequential()
model.add(keras.layers.Flatten(input_shape=[28, 28]))
model.add(keras.layers.BatchNormalization())
model.add(keras.layers.Dense(300, activation="elu",
                             kernel_initializer="he_normal"))
model.add(keras.layers.BatchNormalization())
model.add(keras.layers.Dense(100, activation="elu",
                             kernel_initializer="he_normal"))
model.add(keras.layers.BatchNormalization())
# 10-way softmax output layer.
model.add(keras.layers.Dense(10, activation="softmax"))



# customizing
'''
# He init. with fan_avg
he_avg_init = keras.initializers.VarianceScaling(scale = 2., mode = 'fan_avg',
                                                 distribution = 'uniform')
keras.layers.Dense(10, activation = "sigmoid", kernel_initializer = he_avg_init)

# using LeakyReLU
keras.layers.Dense(10, kernel_initializer = "he_normal")
keras.layers.LeakyReLU(alpha = 0.2)

# using SELU
layer = keras.layers.Dense(10, activation = "selu",
                           kernel_initializer = "lecun_normal")
                           
# using BN layers before activation
model = keras.models.Sequential([
    keras.layers.Flatten(input_shape = [28,28]),
    keras.layers.BatchNormalization(),
    keras.layers.Dense(300, kernel_initializer = "he_normal", use_bias = False),
    keras.layers.BatchNormalization(),
    keras.layers.Activation("elu"),
    keras.layers.Dense(100, kernel_initializer = "he_normal", use_bias = False),
    keras.layers.BatchNormalization(),
    keras.layers.Activation("elu"),
    keras.layers.Dense(10, activation = "softmax")
])
'''

In [None]:
# Show the summary of the current `model`.
model.summary()

In [None]:
# gradient clipping
# Two alternative ways to request clipping (by value or by norm). The second
# assignment rebinds `optimizer`, so only the clipnorm variant reaches compile().
optimizer = keras.optimizers.SGD(clipvalue = 1.0)
optimizer = keras.optimizers.SGD(clipnorm=1.0)
model.compile(loss = "mse", optimizer = optimizer)

## transfer learning(전이학습)

In [None]:
# Reuse every layer of model A except its output layer inside model B.
model_a = keras.models.load_model("my_model_a.h5")
model_b_on_a = keras.models.Sequential(model_a.layers[:-1])
# create new layer (only output layer)
model_b_on_a.add(keras.layers.Dense(1,activation="sigmoid")) 

# Keep a separate copy of model A's original weights
# (training model B also affects model A).
model_a_clone = keras.models.clone_model(model_a)
model_a_clone.set_weights(model_a.get_weights())

# Freeze the weights of all reused layers (trainable = False).
for layer in model_b_on_a.layers[:-1]:
    layer.trainable = False

# Always compile the model after freezing or unfreezing layers,
# because compile() collects the weights that will be trained.
model_b_on_a.compile(loss = "binary_crossentropy", optimizer = "sgd",
                    metrics = ["accuracy"])

#--------------------------------------------------------------------------------------

# First training pass: run a few epochs and track validation performance.
history = model_b_on_a.fit(X_train_b, y_train_b, epochs=4,
                           validation_data=(X_valid_b, y_valid_b))

# Unfreeze the layers below the output layer (trainable = True).
for layer in model_b_on_a.layers[:-1]:
    layer.trainable = True

# After unfreezing, a lower learning rate is advisable for fine-grained
# weight tuning. `learning_rate` is used because the `lr` keyword is
# deprecated and no longer accepted by current Keras optimizers.
optimizer = keras.optimizers.SGD(learning_rate=1e-4)
# Recompile so the change in trainable layers takes effect.
model_b_on_a.compile(loss="binary_crossentropy", optimizer=optimizer,
                     metrics=["accuracy"])

history = model_b_on_a.fit(X_train_b, y_train_b, epochs=16,
                           validation_data=(X_valid_b, y_valid_b))


## fast optimizer

In [None]:
# Catalogue of faster optimizers. Each line rebinds `optimizer`, so the lines
# are alternatives; only the last one stays bound after the cell runs.
# `learning_rate` replaces the deprecated `lr` keyword.
optimizer = keras.optimizers.SGD(learning_rate=0.001, momentum=0.9)
optimizer = keras.optimizers.SGD(learning_rate=0.001, momentum=0.9,
                                 nesterov=True)
optimizer = keras.optimizers.Adagrad(learning_rate=0.001)
# The class is spelled `RMSprop`; `RMSProp` raises AttributeError.
optimizer = keras.optimizers.RMSprop(learning_rate=0.001, rho=0.9)
optimizer = keras.optimizers.Adam(learning_rate=0.001, beta_1=0.9,
                                  beta_2=0.999)
optimizer = keras.optimizers.Adamax(learning_rate=0.001, beta_1=0.9,
                                    beta_2=0.999)
optimizer = keras.optimizers.Nadam(learning_rate=0.001, beta_1=0.9,
                                   beta_2=0.999)

#### Learning Rate Scheduling

In [None]:
# Power scheduling: lr = lr0 / (1 + step / s), here with lr0 = 0.01 and
# s = 10000 steps. The legacy `decay` argument was the inverse of s (1e-4);
# current Keras optimizers reject both `lr` and `decay`, so the same curve is
# expressed with an InverseTimeDecay schedule:
#   lr0 / (1 + decay_rate * step / decay_steps)
optimizer = keras.optimizers.SGD(
    learning_rate=keras.optimizers.schedules.InverseTimeDecay(
        initial_learning_rate=0.01, decay_steps=10000, decay_rate=1.0))

# Exponential scheduling: the rate is multiplied by 0.1 every `s` epochs.
# The returned function is meant for a LearningRateScheduler callback, which
# updates the optimizer's learning_rate attribute at the start of each epoch.
def exponential_decay(lr0, s):
    """Return a function mapping an epoch index to lr0 * 0.1 ** (epoch / s)."""
    def decayed_rate(epoch):
        exponent = epoch / s
        return lr0 * pow(0.1, exponent)
    return decayed_rate

exponential_decay_fn = exponential_decay(lr0=0.01, s=20)
# The callback class is `LearningRateScheduler`, and the variable name must
# match the one passed to fit() below.
lr_scheduler = keras.callbacks.LearningRateScheduler(exponential_decay_fn)
# Add further fit() arguments (epochs, validation_data, ...) as needed; the
# former `[...]` placeholder was being passed as a positional argument.
history = model.fit(X_train_scaled, y_train, callbacks=[lr_scheduler])

# Piecewise constant scheduling: a fixed learning rate per epoch range.
def piecewise_constant_fn(epoch):
    """Return 0.01 before epoch 5, 0.005 before epoch 15, and 0.001 afterwards."""
    # (exclusive upper epoch bound, learning rate) pairs, checked in order.
    for upper_bound, rate in ((5, 0.01), (15, 0.005)):
        if epoch < upper_bound:
            return rate
    return 0.001
def piecewise_constant(boundaries, values):
    """Build a piecewise-constant learning-rate schedule.

    Args:
        boundaries: iterable of increasing epoch indices at which the rate
            changes (list, tuple or array).
        values: learning rates, one per segment; values[0] applies from
            epoch 0 up to the first boundary, values[-1] from the last
            boundary onwards.

    Returns:
        A function mapping an epoch index to the matching entry of `values`.
    """
    # Unpacking (instead of `[0] + boundaries`) also accepts tuples and
    # arrays; list + ndarray would broadcast-add rather than prepend 0.
    boundaries = np.array([0, *boundaries])
    values = np.array(values)
    def piecewise_constant_fn(epoch):
        # argmax gives the first boundary strictly above `epoch`. When none
        # is above, it returns 0 and the resulting index -1 picks the last
        # value.
        return values[np.argmax(boundaries > epoch) - 1]
    return piecewise_constant_fn

piecewise_constant_fn = piecewise_constant([5, 15], [0.01, 0.005, 0.001])
# The callback class is `LearningRateScheduler`, and the variable name must
# match the one passed to fit() below.
lr_scheduler = keras.callbacks.LearningRateScheduler(piecewise_constant_fn)
# Add further fit() arguments (epochs, validation_data, ...) as needed; the
# former `[...]` placeholder was being passed as a positional argument.
history = model.fit(X_train_scaled, y_train, callbacks=[lr_scheduler])
    
# Performance scheduling:
# multiply the learning rate by 0.5 whenever the best validation loss has not
# improved for 5 consecutive epochs.
lr_scheduler = keras.callbacks.ReduceLROnPlateau(factor = 0.5, patience = 5)



## regularization

### l1, l2 규제

In [None]:
# Regularization (l1, l2 or l1_l2) is specified per layer; the default
# strength is 0.01.
layer = keras.layers.Dense(100, activation = "elu",
                          kernel_initializer = "he_normal",
                          kernel_regularizer = keras.regularizers.l2(0.01))
# Both penalties can be applied together, e.g. l1_l2(0.1, 0.01).

# Hidden layers usually share the same activation, initializer and
# regularizer, so a loop or functools.partial() avoids repeating them.
from functools import partial

# Dense-layer factory with the shared defaults bound in advance; each call
# may still override any of them.
RegularizedDense = partial(
    keras.layers.Dense,
    kernel_regularizer=keras.regularizers.l2(0.01),
    kernel_initializer="he_normal",
    activation="elu",
)

model = keras.models.Sequential()
model.add(keras.layers.Flatten(input_shape=[28, 28]))
model.add(RegularizedDense(300))
model.add(RegularizedDense(100))
# The output layer overrides the activation and the initializer.
model.add(RegularizedDense(10, activation="softmax",
                           kernel_initializer="glorot_uniform"))

### Dropout

In [None]:
# A Dropout layer (rate 0.2) precedes every Dense layer, including the
# softmax output layer.
model = keras.models.Sequential()
model.add(keras.layers.Flatten(input_shape=[28, 28]))
model.add(keras.layers.Dropout(rate=0.2))
model.add(keras.layers.Dense(300, activation="elu",
                             kernel_initializer="he_normal"))
model.add(keras.layers.Dropout(rate=0.2))
model.add(keras.layers.Dense(100, activation="elu",
                             kernel_initializer="he_normal"))
model.add(keras.layers.Dropout(rate=0.2))
model.add(keras.layers.Dense(10, activation="softmax"))

### Monte Carlo Dropout

In [None]:
# Call the model 100 times with training=True and stack the outputs along a
# new leading axis, then take the mean and standard deviation across calls.
y_probas = np.stack(
    [model(X_test_scaled, training=True) for _ in range(100)]
)
y_proba = np.mean(y_probas, axis=0)
y_std = np.std(y_probas, axis=0)

### max norm regularization

In [None]:
# Example Dense layer whose kernel carries a max_norm(1.) constraint.
# The layer is only constructed here; it is not assigned or added to a model.
keras.layers.Dense(100, activation = "elu", kernel_initializer = "he_normal",
                  kernel_constraint = keras.constraints.max_norm(1.))