# DL(w5) DNN - Avoiding Overfitting / Transfer Learning
student ID: 7110018036\
name: Chieh-An, Chou

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras

## Avoiding Overfitting 
### 1. Regularization
增加 penalty term 限制 weight，避免其過大。
+ $l_1$ regularization: $J'(\theta)= J(\theta)+\alpha_1\sum_{j=1}^n|w_j|$
+ $l_2$ regularization: $J'(\theta)= J(\theta)+\alpha_2\sum_{j=1}^nw_j^2$
+ Elastic net regularization: $J'(\theta)= J(\theta)+\alpha_1\sum_{j=1}^n|w_j|+\alpha_2\sum_{j=1}^nw_j^2$

In [3]:
# Public (non-underscore) names exposed by keras.regularizers.
list(filter(lambda attr: not attr.startswith("_"), dir(keras.regularizers)))

['L1',
 'L1L2',
 'L2',
 'OrthogonalRegularizer',
 'Regularizer',
 'deserialize',
 'get',
 'l1',
 'l1_l2',
 'l2',
 'orthogonal_regularizer',
 'serialize']

In [4]:
# Example regularizer objects (l1, l2 and combined l1+l2), all built with the
# same penalty coefficient.
penalty = .01
r_l1 = keras.regularizers.l1(l1=penalty)
r_l2 = keras.regularizers.l2(l2=penalty)
r_l1l2 = keras.regularizers.l1_l2(l1=penalty, l2=penalty)

### Ex1

In [5]:
# --- Load Fashion-MNIST (train set + held-out test set) ---
from tensorflow.keras.datasets import fashion_mnist
(x_train_set, y_train_set), (x_test, y_test) = fashion_mnist.load_data()

# --- Carve a validation split out of the training set (fixed random_state) ---
from sklearn.model_selection import train_test_split
x_train, x_valid, y_train, y_valid = train_test_split(x_train_set, y_train_set, random_state = 1)

# --- Per-pixel standardization ---
# Statistics are computed on the training split only, (N, 28, 28) -> (1, 28, 28),
# and then applied unchanged to the validation and test splits.
pixel_means = x_train.mean(axis=0, keepdims=True)
pixel_stds = x_train.std(axis=0, keepdims=True)
x_train_scaled, x_valid_scaled, x_test_scaled = (
    (images - pixel_means) / pixel_stds
    for images in (x_train, x_valid, x_test)
)

In [6]:
# Reset the Keras global state, then fix the NumPy and TensorFlow random seeds
# before the next model is built.
keras.backend.clear_session()
np.random.seed(1)
tf.random.set_seed(1)

In [8]:
# l2 weight penalty (coefficient 0.01) applied to the kernel of every Dense layer.
r_l2 = keras.regularizers.l2(l2=.01)

# Settings shared by the two hidden layers.
hidden_kwargs = dict(activation='relu',
                     kernel_initializer='he_normal',
                     kernel_regularizer=r_l2)

model = keras.models.Sequential([
    keras.layers.Flatten(input_shape=[28, 28]),
    keras.layers.Dense(units=300, **hidden_kwargs),
    keras.layers.Dense(units=100, **hidden_kwargs),
    keras.layers.Dense(units=10, activation='softmax', kernel_regularizer=r_l2),
])

In [9]:
# Integer class labels -> sparse categorical cross-entropy; report accuracy.
model.compile(optimizer='nadam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [11]:
# Short 2-epoch run on the standardized data, validating on the standardized validation split.
train = model.fit(
    x=x_train_scaled,
    y=y_train,
    epochs=2,
    validation_data=(x_valid_scaled, y_valid),
)

Epoch 1/2
Epoch 2/2


### 2. Dropout

In [12]:
# Same 300-100-10 network, with a Dropout layer in front of every Dense layer.
drop_rate = .2
relu_dense = dict(activation='relu', kernel_initializer='he_normal')

model = keras.models.Sequential([
    keras.layers.Flatten(input_shape=[28, 28]),
    keras.layers.Dropout(rate=drop_rate),
    keras.layers.Dense(units=300, **relu_dense),
    keras.layers.Dropout(rate=drop_rate),
    keras.layers.Dense(units=100, **relu_dense),
    keras.layers.Dropout(rate=drop_rate),
    keras.layers.Dense(units=10, activation='softmax'),
])

In [13]:
# Integer class labels -> sparse categorical cross-entropy; report accuracy.
model.compile(optimizer='nadam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [14]:
# During training, a random subset of neurons is dropped at every step.
train = model.fit(
    x=x_train_scaled,
    y=y_train,
    epochs=2,
    validation_data=(x_valid_scaled, y_valid),
)

Epoch 1/2
Epoch 2/2


In [15]:
# Evaluate on the standardized test set; all neurons are used here
# (dropout is only applied during training).
model.evaluate(x_test_scaled, y_test)



[0.38988885283470154, 0.8575000166893005]

In [16]:
# Predict with all neurons active (no dropout at prediction time).
y_proba = model.predict(x_test_scaled)
y_proba[:3].round(2) # per-class probabilities of the first three test samples



array([[0.  , 0.  , 0.  , 0.  , 0.  , 0.07, 0.  , 0.29, 0.  , 0.64],
       [0.  , 0.  , 0.99, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ],
       [0.  , 1.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ]],
      dtype=float32)

In [17]:
# Predicted class = index of the largest probability in each row.
y_pred = y_proba.argmax(axis=1)
print(f"y_true = {y_test[:3]} , y_pred = {y_pred[:3]}")

y_true = [9 2 1] , y_pred = [9 2 1]


### 3. Monte Carlo Dropout (MC Dropout)

In [18]:
# Monte Carlo Dropout: run 100 forward passes on the first three test images
# with training=True so that the Dropout layers stay active in every pass.
mc_samples = [model(x_test_scaled[:3], training=True) for _ in range(100)]
y_probs = np.stack(mc_samples)
y_probs.shape # (number of passes, samples, classes)

(100, 3, 10)

In [19]:
# Average over the stochastic passes (axis 0) to get the MC Dropout estimate.
y_prob_mc = np.mean(y_probs, axis=0)
y_prob_mc.round(2)

array([[0.  , 0.  , 0.  , 0.  , 0.  , 0.12, 0.  , 0.32, 0.  , 0.56],
       [0.  , 0.  , 0.98, 0.  , 0.01, 0.  , 0.01, 0.  , 0.  , 0.  ],
       [0.  , 1.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ]],
      dtype=float32)

In [20]:
# Class prediction from the Monte Carlo estimate: take the argmax of the
# probabilities averaged over the stochastic forward passes (y_prob_mc),
# not of the single deterministic prediction y_proba from model.predict.
y_pred = np.argmax(y_prob_mc, axis=1)
print(f"y_true = {y_test[:3]} , y_pred = {y_pred[:3]}")

y_true = [9 2 1] , y_pred = [9 2 1]


### Alpha Dropout
`SELU` 需要標準化後的神經元，使用`AlphaDropout` 才能使剩下的(未被去除的)神經元滿足此條件

In [21]:
# SELU hidden layers (lecun_normal initialization), each preceded by AlphaDropout.
selu_dense = dict(activation='selu', kernel_initializer='lecun_normal')

model = keras.models.Sequential([
    keras.layers.Flatten(input_shape=[28, 28]),
    keras.layers.AlphaDropout(rate=.2),
    keras.layers.Dense(units=300, **selu_dense),
    keras.layers.AlphaDropout(rate=.2),
    keras.layers.Dense(units=100, **selu_dense),
    keras.layers.AlphaDropout(rate=.2),
    keras.layers.Dense(units=10, activation='softmax'),
])

In [22]:
# Integer class labels -> sparse categorical cross-entropy; report accuracy.
model.compile(optimizer='nadam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [23]:
# Short 2-epoch run on the standardized data, validating on the standardized validation split.
train = model.fit(
    x=x_train_scaled,
    y=y_train,
    epochs=2,
    validation_data=(x_valid_scaled, y_valid),
)

Epoch 1/2
Epoch 2/2


### 4. Max-Norm Regularization
限制每個神經元的輸入權重向量 $\mathbf{w}$，使其滿足 $\|\mathbf{w}\|_2\le r$；每次訓練步驟後若 $\|\mathbf{w}\|_2 > r$，則重新縮放 $\mathbf{w}\leftarrow r\dfrac{\mathbf{w}}{\|\mathbf{w}\|_2}$

In [24]:
# AlphaDropout + SELU network whose two hidden layers additionally carry a
# max-norm constraint (r = 1) on their kernels.
selu_dense = dict(activation='selu', kernel_initializer='lecun_normal')

model = keras.models.Sequential([
    keras.layers.Flatten(input_shape=[28, 28]),
    keras.layers.AlphaDropout(rate=.2),
    keras.layers.Dense(units=300,
                       kernel_constraint=keras.constraints.max_norm(1.),
                       **selu_dense),
    keras.layers.AlphaDropout(rate=.2),
    keras.layers.Dense(units=100,
                       kernel_constraint=keras.constraints.max_norm(1.),
                       **selu_dense),
    keras.layers.AlphaDropout(rate=.2),
    keras.layers.Dense(units=10, activation='softmax'),
])

In [25]:
# Integer class labels -> sparse categorical cross-entropy; report accuracy.
model.compile(optimizer='nadam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [26]:
# Validate on the standardized validation images: the model is trained on
# x_train_scaled, so the validation inputs must go through the same scaling
# (the raw x_valid pixels would make the validation metrics meaningless).
train = model.fit(x_train_scaled, y_train, epochs = 2,
                  validation_data=(x_valid_scaled, y_valid))

Epoch 1/2
Epoch 2/2


## Transfer Learning
步驟:
1. 載入已存在模型
2. 使用較低層(Reuse Lower Layers): trained
3. 替換較高層(Replace Upper Layers): untrained
4. 初期訓練(Train in the First Few Epochs)
    + Upper Layers: `trainable = True`
    + Lower Layers: `trainable = False`
5. 後期訓練(Continue Training)
    + Upper Layers: `trainable = True`
    + Lower Layers: `trainable = True`

In [27]:
# --- Reload Fashion-MNIST from scratch for the transfer-learning part ---
from tensorflow.keras.datasets import fashion_mnist
(x_train_set, y_train_set), (x_test, y_test) = fashion_mnist.load_data()

# --- Same train/validation split as before (fixed random_state) ---
from sklearn.model_selection import train_test_split
x_train, x_valid, y_train, y_valid = train_test_split(x_train_set, y_train_set, random_state = 1)

# --- Divide the pixel values of every split by 255 ---
x_train, x_valid, x_test = (
    images / 255 for images in (x_train, x_valid, x_test)
)

In [29]:
# Split the data into two tasks: B = coat (4) vs. sandal (5), A = all other classes.
def split_data(x,y):
    """Partition (x, y) into task A (all classes except 4 and 5) and task B (4 vs. 5).

    Returns ``((x_A, y_A), (x_B, y_B))`` where

    * ``y_B`` is float32 with label 4 mapped to 0 and label 5 mapped to 1;
    * ``y_A`` keeps the dtype of ``y``; labels greater than 5 are shifted
      down by 2 so the remaining classes are numbered contiguously.

    The input arrays are not modified: boolean-mask indexing returns copies.
    """
    in_task_b = np.isin(y, (4, 5))
    in_task_a = np.logical_not(in_task_b)

    # Binary target for task B: 1.0 where the label is 5, else 0.0.
    labels_b = np.equal(y[in_task_b], 5).astype(np.float32)

    # Task-A labels: close the gap left by removing classes 4 and 5.
    labels_a = y[in_task_a]
    np.subtract(labels_a, 2, out=labels_a, where=labels_a > 5)

    return (x[in_task_a], labels_a), (x[in_task_b], labels_b)
# Apply the A/B split to the training, validation and test sets.
(x_train_A, y_train_A), (x_train_B, y_train_B) = split_data(x_train, y_train)
(x_valid_A, y_valid_A), (x_valid_B, y_valid_B) = split_data(x_valid, y_valid)
(x_test_A, y_test_A), (x_test_B, y_test_B) = split_data(x_test, y_test)

# Keep only the first 100 task-B training samples, so task B has very little training data.
x_train_B, y_train_B = x_train_B[:100], y_train_B[:100]

In [35]:
# Sanity check: training-set shape and the distinct labels of each task.
print("Dataset A: " , x_train_A.shape, np.unique(y_train_A))
print("Dataset B: " , x_train_B.shape, np.unique(y_train_B))

Dataset A:  (35968, 28, 28) [0 1 2 3 4 5 6 7]
Dataset B:  (100, 28, 28) [0. 1.]


### Model A

In [36]:
# Reset the Keras global state, then fix the NumPy and TensorFlow random seeds
# before model_A is built.
keras.backend.clear_session()
np.random.seed(1)
tf.random.set_seed(1)

In [37]:
# Model A: 8-class classifier with four ReLU hidden layers (200-150-100-50).
model_A = keras.models.Sequential()
model_A.add(keras.layers.Flatten(input_shape=[28, 28]))
for n_units in [200, 150, 100, 50]:
    model_A.add(keras.layers.Dense(n_units, activation='relu'))
# Softmax output over the 8 task-A classes.
model_A.add(keras.layers.Dense(8, activation='softmax'))

In [38]:
# Show the layer-by-layer architecture and parameter counts of model_A.
model_A.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten (Flatten)           (None, 784)               0         
                                                                 
 dense (Dense)               (None, 200)               157000    
                                                                 
 dense_1 (Dense)             (None, 150)               30150     
                                                                 
 dense_2 (Dense)             (None, 100)               15100     
                                                                 
 dense_3 (Dense)             (None, 50)                5050      
                                                                 
 dense_4 (Dense)             (None, 8)                 408       
                                                                 
Total params: 207,708
Trainable params: 207,708
Non-trai

In [39]:
# Multi-class task with integer labels; plain SGD with learning rate 1e-3.
model_A.compile(optimizer=keras.optimizers.SGD(learning_rate=1e-3),
                loss='sparse_categorical_crossentropy',
                metrics=['accuracy'])

In [42]:
# Train model A for 20 epochs on the task-A data.
train = model_A.fit(
    x=x_train_A,
    y=y_train_A,
    epochs=20,
    validation_data=(x_valid_A, y_valid_A),
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [43]:
# Loss and accuracy of model A on the task-A test split.
model_A.evaluate(x_test_A, y_test_A)



[0.37990137934684753, 0.8668749928474426]

In [44]:
# Persist the trained model A to 'model_A.h5' so it can be reloaded later for transfer learning.
model_A.save('model_A.h5')

### Model B (Without Transfer Learning)

In [45]:
# Model B: binary classifier trained from scratch (baseline without transfer learning).
model_B = keras.models.Sequential()
model_B.add(keras.layers.Flatten(input_shape=[28, 28]))
for n_units in [300, 150, 100, 80]:
    model_B.add(keras.layers.Dense(n_units, activation='relu'))
# Single sigmoid unit for the binary target.
model_B.add(keras.layers.Dense(units=1, activation='sigmoid'))

In [46]:
# Show the layer-by-layer architecture and parameter counts of model_B.
model_B.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten_1 (Flatten)         (None, 784)               0         
                                                                 
 dense_5 (Dense)             (None, 300)               235500    
                                                                 
 dense_6 (Dense)             (None, 150)               45150     
                                                                 
 dense_7 (Dense)             (None, 100)               15100     
                                                                 
 dense_8 (Dense)             (None, 80)                8080      
                                                                 
 dense_9 (Dense)             (None, 1)                 81        
                                                                 
Total params: 303,911
Trainable params: 303,911
Non-tr

In [47]:
# Binary task with a sigmoid output -> binary cross-entropy; SGD with learning rate 1e-3.
model_B.compile(optimizer=keras.optimizers.SGD(learning_rate=1e-3),
                loss='binary_crossentropy',
                metrics=['accuracy'])

In [48]:
# Train the baseline for 20 epochs on the small task-B training set.
train = model_B.fit(
    x=x_train_B,
    y=y_train_B,
    epochs=20,
    validation_data=(x_valid_B, y_valid_B),
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [49]:
# Loss and accuracy of the from-scratch baseline on the task-B test split.
model_B.evaluate(x_test_B, y_test_B)



[0.5432047843933105, 0.609000027179718]

### Model_tr (Transfer Learning)

#### 1. Load model_A

In [50]:
# Reload the saved model A from disk and display its architecture.
model_A = keras.models.load_model('model_A.h5')
model_A.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten (Flatten)           (None, 784)               0         
                                                                 
 dense (Dense)               (None, 200)               157000    
                                                                 
 dense_1 (Dense)             (None, 150)               30150     
                                                                 
 dense_2 (Dense)             (None, 100)               15100     
                                                                 
 dense_3 (Dense)             (None, 50)                5050      
                                                                 
 dense_4 (Dense)             (None, 8)                 408       
                                                                 
Total params: 207,708
Trainable params: 207,708
Non-trai

#### 2. Reuse Lower Layers
保留 `flatten` - `dense_2`, 捨棄最後兩層

In [52]:
# Clone model_A (architecture + weights) before reusing its layers. Building
# model_tr directly from model_A.layers would share the layer objects, so
# training model_tr would silently modify model_A as well.
model_A_clone = keras.models.clone_model(model_A)
model_A_clone.set_weights(model_A.get_weights())

# Reuse everything except the last two layers of the (cloned) model A.
model_tr = keras.models.Sequential(model_A_clone.layers[:-2])
model_tr.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten (Flatten)           (None, 784)               0         
                                                                 
 dense (Dense)               (None, 200)               157000    
                                                                 
 dense_1 (Dense)             (None, 150)               30150     
                                                                 
 dense_2 (Dense)             (None, 100)               15100     
                                                                 
Total params: 202,250
Trainable params: 202,250
Non-trainable params: 0
_________________________________________________________________


#### 3. Replace Upper Layers

In [53]:
# Append new, untrained upper layers for the binary task B:
# an 80-unit ReLU hidden layer followed by a single sigmoid output unit.
model_tr.add(keras.layers.Dense(80, activation='relu'))
model_tr.add(keras.layers.Dense(1, activation='sigmoid'))
model_tr.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten (Flatten)           (None, 784)               0         
                                                                 
 dense (Dense)               (None, 200)               157000    
                                                                 
 dense_1 (Dense)             (None, 150)               30150     
                                                                 
 dense_2 (Dense)             (None, 100)               15100     
                                                                 
 dense_10 (Dense)            (None, 80)                8080      
                                                                 
 dense_11 (Dense)            (None, 1)                 81        
                                                                 
Total params: 210,411
Trainable params: 210,411
Non-tr

#### 4. Train in the First Few Epochs (epoch 1-4)

In [54]:
# Freeze every layer except the last two, so the first epochs only update
# the newly added upper layers.
for layer in model_tr.layers[:-2]:
    layer.trainable  = False # freeze the reused lower layers
model_tr.summary(show_trainable=True)

Model: "sequential_2"
____________________________________________________________________________
 Layer (type)                Output Shape              Param #   Trainable  
 flatten (Flatten)           (None, 784)               0         N          
                                                                            
 dense (Dense)               (None, 200)               157000    N          
                                                                            
 dense_1 (Dense)             (None, 150)               30150     N          
                                                                            
 dense_2 (Dense)             (None, 100)               15100     N          
                                                                            
 dense_10 (Dense)            (None, 80)                8080      Y          
                                                                            
 dense_11 (Dense)            (None, 1)                

In [55]:
# Compile after changing `trainable` so the frozen/unfrozen state is picked up.
model_tr.compile(optimizer=keras.optimizers.SGD(learning_rate=1e-3),
                 loss='binary_crossentropy',
                 metrics=['accuracy'])

In [56]:
# Warm-up phase: 4 epochs with the lower layers frozen.
train = model_tr.fit(
    x=x_train_B,
    y=y_train_B,
    epochs=4,
    validation_data=(x_valid_B, y_valid_B),
)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


#### 5. Continue Training

In [57]:
# Unfreeze the reused lower layers so the whole network is trained from now on.
for layer in model_tr.layers[:-2]:
    layer.trainable  = True # lower layers become trainable again
model_tr.summary(show_trainable=True)

Model: "sequential_2"
____________________________________________________________________________
 Layer (type)                Output Shape              Param #   Trainable  
 flatten (Flatten)           (None, 784)               0         Y          
                                                                            
 dense (Dense)               (None, 200)               157000    Y          
                                                                            
 dense_1 (Dense)             (None, 150)               30150     Y          
                                                                            
 dense_2 (Dense)             (None, 100)               15100     Y          
                                                                            
 dense_10 (Dense)            (None, 80)                8080      Y          
                                                                            
 dense_11 (Dense)            (None, 1)                

In [58]:
# Re-compile after changing `trainable` so the unfrozen layers are actually updated.
model_tr.compile(optimizer=keras.optimizers.SGD(learning_rate=1e-3),
                 loss='binary_crossentropy',
                 metrics=['accuracy'])

In [59]:
# Fine-tuning phase: 16 more epochs with all layers trainable.
train = model_tr.fit(
    x=x_train_B,
    y=y_train_B,
    epochs=16,
    validation_data=(x_valid_B, y_valid_B),
)

Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16


### Comparison: model_tr vs model_B

In [60]:
# Loss and accuracy of the transfer-learning model on the task-B test split.
model_tr.evaluate(x_test_B, y_test_B)



[0.22385764122009277, 0.9904999732971191]

In [61]:
# Same evaluation for the from-scratch baseline model_B, for comparison.
model_B.evaluate(x_test_B, y_test_B)



[0.5432047843933105, 0.609000027179718]