In [1]:
import os
import numpy as np
import time
import tensorflow as tf
import matplotlib.pyplot as plt

np.random.seed(1234)
tf.random.set_seed(1234)
tf.config.list_physical_devices('GPU')

[]

Set Parameter

In [2]:
from keras.datasets import mnist

# load and normalize data
#(X_train, y_train), (X_test, y_test) =  tf.keras.datasets.mnist.load_data()
(X_train, y_train), (X_test, y_test) = tf.keras.datasets.fashion_mnist.load_data()

#Normalization
X_train = tf.reshape(X_train, (X_train.shape[0],-1))/255
X_test = tf.reshape(X_test, (X_test.shape[0],-1))/255

#last 10000 training examples for validation
X_train, X_val = X_train[:-10000], X_train[-10000:]
y_train, y_val = y_train[:-10000], y_train[-10000:]

print("train_size:", X_train.shape)
print("val_size:", X_val.shape)
print("test_size:", X_test.shape)
print("train_output_size", y_train.shape)
print("val_output_size:", y_val.shape)
print("test_output_size", y_test.shape)
print("max_val:", np.max(X_train))



train_size: (50000, 784)
val_size: (10000, 784)
test_size: (10000, 784)
train_output_size (50000,)
val_output_size: (10000,)
test_output_size (10000,)
max_val: 1.0


In [3]:
size_input = X_train.shape[1]
size_output = len(set(y_train))
size_hidden1 = 256
size_hidden2 = 128

Build MLP using Eager Execution

In [4]:
class MLP(object):
    def __init__(self, size_input, size_hidden1, size_hidden2, size_output, device=None, regularizer=None, R_lambda = 1e-4, drop_prob=0):
        """
        size_input: int, size of input layer
        size_hidden: int, size of hidden layer
        size_output: int, size of output layer
        device: str or None, either 'cpu' or 'gpu' or None. If None, the device to be used will be decided automatically during Eager Execution
        regularizer: str or None
        R_lambda: the parameter for regularizer
        drop_prob: 0 to 1
        """
        self.size_input, self.size_hidden1, self.size_hidden2, self.size_output, self.device =\
        size_input, size_hidden1, size_hidden2, size_output, device
        
        self.regularizer, self.R_lambda, self.drop_prob = regularizer, R_lambda, drop_prob
        
        # Initialize weights between input layer and hidden layer 1
        self.W1 = tf.Variable(tf.random.normal([self.size_input, self.size_hidden1],stddev=0.1)) 
        # Initialize biases for hidden layer 1
        self.b1 = tf.Variable(tf.random.normal([1, self.size_hidden1]))# 0 or constant(0.01)

        # Initialize weights between hidden layer 1 and hidden layer 2
        self.W2 = tf.Variable(tf.random.normal([self.size_hidden1, self.size_hidden2],stddev=0.1))
        # Initialize biases for hidden layer 2
        self.b2 = tf.Variable(tf.random.normal([1, self.size_hidden2]))

         # Initialize weights between hidden layer 2 and output layer
        self.W3 = tf.Variable(tf.random.normal([self.size_hidden2, self.size_output],stddev=0.1))
        # Initialize biases for output layer
        self.b3 = tf.Variable(tf.random.normal([1, self.size_output]))

        # Define variables to be updated during backpropagation
        self.variables = [self.W1, self.W2, self.W3, self.b1, self.b2, self.b3]
        
        # Initialize the state of custom optimizer
        self.v_W1 = tf.Variable(tf.zeros([self.size_input, self.size_hidden1]))
        self.v_b1 = tf.Variable(tf.zeros([1,self.size_hidden1]))
        self.u_W1 = tf.Variable(tf.zeros([self.size_input, self.size_hidden1]))
        self.u_b1 = tf.Variable(tf.zeros([1,self.size_hidden1]))
        self.m_W1 = tf.Variable(tf.zeros([self.size_input, self.size_hidden1]))
        self.m_b1 = tf.Variable(tf.zeros([1,self.size_hidden1]))
        
        self.v_W2 = tf.Variable(tf.zeros([self.size_hidden1, self.size_hidden2]))
        self.v_b2 = tf.Variable(tf.zeros([1,self.size_hidden2]))
        self.u_W2 = tf.Variable(tf.zeros([self.size_hidden1, self.size_hidden2]))
        self.u_b2 = tf.Variable(tf.zeros([1,self.size_hidden2]))
        self.m_W2 = tf.Variable(tf.zeros([self.size_hidden1, self.size_hidden2]))
        self.m_b2 = tf.Variable(tf.zeros([1,self.size_hidden2]))
        
        self.v_W3 = tf.Variable(tf.zeros([self.size_hidden2, self.size_output]))
        self.v_b3 = tf.Variable(tf.zeros([1,self.size_output]))
        self.u_W3 = tf.Variable(tf.zeros([self.size_hidden2, self.size_output]))
        self.u_b3 = tf.Variable(tf.zeros([1,self.size_output]))
        self.m_W3 = tf.Variable(tf.zeros([self.size_hidden2, self.size_output]))
        self.m_b3 = tf.Variable(tf.zeros([1,self.size_output]))
        
        self.v_state = [self.v_W1,self.v_W2,self.v_W3,self.v_b1,self.v_b2,self.v_b3]
        self.u_state = [self.u_W1,self.u_W2,self.u_W3,self.u_b1,self.u_b2,self.u_b3]
        self.m_state = [self.m_W1,self.m_W2,self.m_W3,self.m_b1,self.m_b2,self.m_b3]
       
    def forward(self, X):
        """
        forward pass
        X: Tensor, inputs
        """
        
        if self.device is not None:
            with tf.device('gpu:0' if self.device=='gpu' else 'cpu'):
                self.y = self.compute_output(X)
        else:
            self.y = self.compute_output(X)

        return self.y
    
    def loss(self, y_pred, y_true):
        '''
        y_pred - Tensor of shape (batch_size, size_output)
        y_true - Tensor of shape (batch_size, size_output)
        '''  
        #cross entropy loss for classifation mission
        return tf.losses.sparse_categorical_crossentropy(y_true,y_pred, from_logits = False)
        
    def backward(self, X_train, y_train, hyperparams, method='custom'):
        """
        backward pass
        """
        with tf.GradientTape() as tape:
            predicted = self.forward(X_train)
            current_loss = self.loss(predicted, y_train)
            
            num_layer = 3
            if not self.regularizer:
                current_loss = self.loss(predicted, y_train)
            #l2 norm
            elif self.regularizer == 'l2':
                w = tf.concat([tf.reshape(w,[-1]) for w in self.variables[:num_layer]],0)#self.variable[:3] -> w1,w2,w3
                current_loss  += self.R_lambda * tf.nn.l2_loss(w)
            #l1 norm
            elif self.regularizer == 'l1':
                w = tf.concat([tf.reshape(w,[-1]) for w in self.variables[:num_layer]],0)
                current_loss  += self.R_lambda * tf.nn.l1_loss(w)
            
        grads = tape.gradient(current_loss, self.variables)
        
        if method == 'sgd':
            optimizer = tf.keras.optimizers.SGD(learning_rate = hyperparams['lr'])
            optimizer.apply_gradients(zip(grads, self.variables))
        elif method == 'adam':
            optimizer = tf.keras.optimizers.Adam(learning_rate=hyperparams['lr'], beta_1=0.9, beta_2=0.999, epsilon=1e-6,amsgrad=False,\
                                                 name='Adam')
            optimizer.apply_gradients(zip(grads, self.variables))
        elif method == 'RMSprop':
            optimizer = tf.keras.optimizers.RMSprop(learning_rate = hyperparams['lr'])
            optimizer.apply_gradients(zip(grads, self.variables))
        elif method == 'custom':
            #Custom optimizer
            beta1,beta2,beta3,eps = 0.9,0.999,0.999987,1e-8

            for p,m,v,u,grad in zip(self.variables, self.m_state, self.v_state, self.u_state, grads):
                m[:].assign(beta1 * m  + (1 - beta1) * grad)
                v[:].assign(beta2 * v  + (1 - beta2) * tf.math.square(grad))
                u[:].assign(beta3 * u  + (1 - beta3) * tf.math.pow(grad, 3))
                m_bias_corr = m / (1 - beta1 ** hyperparams['t'])
                v_bias_corr = v / (1 - beta1 ** hyperparams['t'])
                u_bias_corr = u / (1 - beta2 ** hyperparams['t'])
                p[:].assign(p - hyperparams['lr'] * m_bias_corr/ (tf.math.sqrt(v_bias_corr) +\
                                                                  eps*tf.math.sign(u_bias_corr)*tf.math.pow(abs(u_bias_corr),1.0/3.0) + eps))
    def compute_output(self,X):
        # Cast X to float32
        X_tf = tf.cast(X, dtype=tf.float32)
            
        #set the dropout prob
        prob = self.drop_prob

        # Compute values in hidden layer 1
        what1 = tf.matmul(X_tf, self.W1) + self.b1
        hhat1 = tf.nn.experimental.stateless_dropout(tf.nn.relu(what1), rate = prob, seed = [1,0])/(1-prob)

        # Compute values in hidden layer 2
        what2 = tf.matmul(hhat1, self.W2) + self.b2
        hhat2 = tf.nn.experimental.stateless_dropout(tf.nn.relu(what2), rate = prob, seed = [1,0])/(1-prob)

        # Compute output
        output = tf.nn.softmax(tf.matmul(hhat2, self.W3) + self.b3)
        return output

    def accuracy(self,y_pred, y_true):
        """
        compute the correct num
        y_pred: the probability distribution [[...]] or the predicted label [...]
        y_true: the 1-D true label
        """
        #detect if y_pred is a probability distribution 
        if len(y_pred.shape) > 1 and y_pred.shape[1] > 1:
            y_pred = tf.argmax(y_pred, axis=1)
            
        cmp = tf.cast(y_pred, y_true.dtype) == y_true
        
        return float(tf.reduce_sum(tf.cast(cmp, tf.int32)))

Custom Optimizer Training

In [5]:
# Set number of simulations and epochs
result_ten = []
NUM_SIM = 10
NUM_EPOCHS = 200

for num_sim in range(NUM_SIM):
    np.random.seed(num_sim)
    tf.random.set_seed(num_sim)
    '''
    Initialize model using GPU or load an exitsing MLP
    '''
    mlp_custom = MLP(size_input, size_hidden1, size_hidden2, size_output, device='GPU',\
                 regularizer='l2', R_lambda = 1e-4, drop_prob=0.)

    time_start = time.time()
    hyperparams = {'t':1, 'lr':2e-5}

    for epoch in range(NUM_EPOCHS):
        train_ds = tf.data.Dataset.from_tensor_slices((X_train, y_train)).shuffle(25, seed=epoch*num_sim).batch(128)

        for inputs, outputs in train_ds:
            preds = mlp_custom.forward(inputs)

            #use custom optimizer to train the model
            mlp_custom.backward(inputs, outputs, hyperparams,'custom')
            hyperparams['t'] += 1
        
        if (epoch + 1)%10 == 0:
            #compute the result for the current epoch
            logits = mlp_custom.forward(X_train)
            train_loss = np.sum(mlp_custom.loss(logits, y_train))/len(y_train)
            train_acc = mlp_custom.accuracy(logits,y_train)/len(y_train)

            logits = mlp_custom.forward(X_val)
            val_loss = np.sum(mlp_custom.loss(logits, y_val))/len(y_val)
            val_acc = mlp_custom.accuracy(logits,y_val)/len(y_val)
            
            logits = mlp_custom.forward(X_test)
            test_loss = np.sum(mlp_custom.loss(logits, y_test))/len(y_test)
            test_acc = mlp_custom.accuracy(logits,y_test)/len(y_test)
            
            print('Number of Simulation = {} - Number of Epoch = {}'.format(num_sim+1, epoch + 1))
            print('Train loss:= {:.4f} - Val loss: {:.4f} - Test loss: {:.4f} - Train acc:= {:.2%} - Val acc:= {:.2%} - Test acc:= {:.2%}'\
                  .format(train_loss, val_loss, test_loss, train_acc, val_acc, test_acc))
            
    time_taken = time.time() - time_start
    result_ten.append([train_acc,val_acc,test_acc,time_taken])
    print('\nTotal time taken (in seconds): {:.2f}'.format(time_taken))

Number of Simulation = 1 - Number of Epoch = 10
Train loss:= 0.4398 - Val loss: 0.4563 - Test loss: 0.4779 - Train acc:= 84.82% - Val acc:= 83.78% - Test acc:= 82.97%
Number of Simulation = 1 - Number of Epoch = 20
Train loss:= 0.3850 - Val loss: 0.4082 - Test loss: 0.4315 - Train acc:= 86.65% - Val acc:= 85.61% - Test acc:= 84.57%
Number of Simulation = 1 - Number of Epoch = 30
Train loss:= 0.3550 - Val loss: 0.3841 - Test loss: 0.4080 - Train acc:= 87.56% - Val acc:= 86.38% - Test acc:= 85.61%
Number of Simulation = 1 - Number of Epoch = 40
Train loss:= 0.3336 - Val loss: 0.3683 - Test loss: 0.3926 - Train acc:= 88.35% - Val acc:= 86.80% - Test acc:= 86.08%
Number of Simulation = 1 - Number of Epoch = 50
Train loss:= 0.3168 - Val loss: 0.3565 - Test loss: 0.3815 - Train acc:= 88.83% - Val acc:= 87.15% - Test acc:= 86.42%
Number of Simulation = 1 - Number of Epoch = 60
Train loss:= 0.3029 - Val loss: 0.3475 - Test loss: 0.3729 - Train acc:= 89.32% - Val acc:= 87.28% - Test acc:= 86.77

Number of Simulation = 5 - Number of Epoch = 100
Train loss:= 0.2610 - Val loss: 0.3242 - Test loss: 0.3490 - Train acc:= 90.81% - Val acc:= 88.22% - Test acc:= 87.57%

Total time taken (in seconds): 981.37


Train Model with SGD

In [None]:
# Set number of simulations and epochs
NUM_SIM = 10
NUM_EPOCHS = 200
result_ten_SGD = []

for num_sim in range(NUM_SIM):
    np.random.seed(num_sim)
    tf.random.set_seed(num_sim)
    '''
    Initialize model using GPU or load an exitsing MLP
    '''
    mlp_SGD = MLP(size_input, size_hidden1, size_hidden2, size_output, device='GPU',\
                 regularizer=None, R_lambda = 1e-4, drop_prob=0.)

    time_start = time.time()
    hyperparams = {'t':1, 'lr':2e-5}

    for epoch in range(NUM_EPOCHS):
        train_ds = tf.data.Dataset.from_tensor_slices((X_train, y_train)).shuffle(25, seed=epoch*num_sim).batch(128)

        for inputs, outputs in train_ds:
            preds = mlp_SGD.forward(inputs)

            #use custom optimizer to train the model
            mlp_SGD.backward(inputs, outputs, hyperparams,'sgd')
            hyperparams['t'] += 1
        
        if (epoch + 1)%10 == 0:
            #compute the result for the current epoch
            logits = mlp_SGD.forward(X_train)
            train_loss = np.sum(mlp_SGD.loss(logits, y_train))/len(y_train)
            train_acc = mlp_SGD.accuracy(logits,y_train)/len(y_train)

            logits = mlp_SGD.forward(X_val)
            val_loss = np.sum(mlp_SGD.loss(logits, y_val))/len(y_val)
            val_acc = mlp_SGD.accuracy(logits,y_val)/len(y_val)
            
            logits = mlp_SGD.forward(X_test)
            test_loss = np.sum(mlp_SGD.loss(logits, y_test))/len(y_test)
            test_acc = mlp_SGD.accuracy(logits,y_test)/len(y_test)
            
            print('Number of Simulation = {} - Number of Epoch = {}'.format(num_sim+1, epoch + 1))
            print('Train loss:= {:.4f} - Val loss: {:.4f} - Test loss: {:.4f} - Train acc:= {:.2%} - Val acc:= {:.2%} - Test acc:= {:.2%}'\
                  .format(train_loss, val_loss, test_loss, train_acc, val_acc, test_acc))
            
    time_taken = time.time() - time_start
    result_ten_SGD.append([train_acc,val_acc,test_acc,time_taken])
    print('\nTotal time taken (in seconds): {:.2f}'.format(time_taken))

Train Model with RMSprop

In [64]:
# Set number of simulations and epochs
NUM_SIM = 10
NUM_EPOCHS = 200

for num_sim in range(NUM_SIM):
    np.random.seed(num_sim)
    tf.random.set_seed(num_sim)
    '''
    Initialize model using GPU or load an exitsing MLP
    '''
    mlp_rms = MLP(size_input, size_hidden1, size_hidden2, size_output, device='GPU',\
                 regularizer='l2', R_lambda = 1e-4, drop_prob=0.)

    time_start = time.time()
    hyperparams = {'t':1, 'lr':2e-5}

    for epoch in range(NUM_EPOCHS):
        train_ds = tf.data.Dataset.from_tensor_slices((X_train, y_train)).shuffle(25, seed=epoch*num_sim).batch(128)

        for inputs, outputs in train_ds:
            preds = mlp_rms.forward(inputs)

            #use custom optimizer to train the model
            mlp_rms.backward(inputs, outputs, hyperparams,'RMSprop')
            hyperparams['t'] += 1
        
        if (epoch + 1)%10 == 0:
            #compute the result for the current epoch
            logits = mlp_rms.forward(X_train)
            train_loss = np.sum(mlp_rms.loss(logits, y_train))/len(y_train)
            train_acc = mlp_rms.accuracy(logits,y_train)/len(y_train)


            logits = mlp_rms.forward(X_val)
            val_loss = np.sum(mlp_rms.loss(logits, y_val))/len(y_val)
            val_acc = mlp_rms.accuracy(logits,y_val)/len(y_val)

            
            logits = mlp_rms.forward(X_test)
            test_loss = np.sum(mlp_rms.loss(logits, y_test))/len(y_test)
            test_acc = mlp_rms.accuracy(logits,y_test)/len(y_test)

            
            print('Number of Simulation = {} - Number of Epoch = {}'.format(num_sim+1, epoch + 1))
            print('Train loss:= {:.4f} - Val loss: {:.4f} - Test loss: {:.4f} - Train acc:= {:.2%} - Val acc:= {:.2%} - Test acc:= {:.2%}'\
                  .format(train_loss, val_loss, test_loss, train_acc, val_acc, test_acc))
            
    time_taken = time.time() - time_start 
    print('\nTotal time taken (in seconds): {:.2f}'.format(time_taken))

Number of Simulation = 1 - Number of Epoch = 10
Train loss:= 0.3873 - Val loss: 0.4128 - Test loss: 0.4352 - Train acc:= 86.35% - Val acc:= 85.59% - Test acc:= 84.62%
Number of Simulation = 1 - Number of Epoch = 20
Train loss:= 0.3400 - Val loss: 0.3772 - Test loss: 0.4017 - Train acc:= 88.03% - Val acc:= 86.75% - Test acc:= 85.84%
Number of Simulation = 1 - Number of Epoch = 30
Train loss:= 0.3134 - Val loss: 0.3642 - Test loss: 0.3876 - Train acc:= 88.98% - Val acc:= 87.25% - Test acc:= 86.56%
Number of Simulation = 1 - Number of Epoch = 40
Train loss:= 0.2948 - Val loss: 0.3576 - Test loss: 0.3816 - Train acc:= 89.64% - Val acc:= 87.58% - Test acc:= 86.89%
Number of Simulation = 1 - Number of Epoch = 50
Train loss:= 0.2804 - Val loss: 0.3546 - Test loss: 0.3803 - Train acc:= 90.10% - Val acc:= 87.71% - Test acc:= 87.02%
Number of Simulation = 1 - Number of Epoch = 60
Train loss:= 0.2680 - Val loss: 0.3534 - Test loss: 0.3813 - Train acc:= 90.56% - Val acc:= 87.80% - Test acc:= 87.25

Train Model with ADAM

In [9]:

# Set number of simulations and epochs
NUM_SIM = 10
NUM_EPOCHS = 200

for num_sim in range(NUM_SIM):
    np.random.seed(num_sim)
    tf.random.set_seed(num_sim)
    '''
    Initialize model using GPU or load an exitsing MLP
    '''
    mlp_adam = MLP(size_input, size_hidden1, size_hidden2, size_output, device='GPU',\
                 regularizer='l2', R_lambda = 1e-4, drop_prob=0.)

    time_start = time.time()
    hyperparams = {'t':1, 'lr':2e-5}

    for epoch in range(NUM_EPOCHS):
        train_ds = tf.data.Dataset.from_tensor_slices((X_train, y_train)).shuffle(25, seed=epoch*num_sim).batch(128)

        for inputs, outputs in train_ds:
            preds = mlp_adam.forward(inputs)

            #use custom optimizer to train the model
            mlp_adam.backward(inputs, outputs, hyperparams,'adam')
            hyperparams['t'] += 1
        
        if (epoch + 1)%10 == 0:
            #compute the result for the current epoch
            logits = mlp_adam.forward(X_train)
            train_loss = np.sum(mlp_adam.loss(logits, y_train))/len(y_train)
            train_acc = mlp_adam.accuracy(logits,y_train)/len(y_train)

            logits = mlp_adam.forward(X_val)
            val_loss = np.sum(mlp_adam.loss(logits, y_val))/len(y_val)
            val_acc = mlp_adam.accuracy(logits,y_val)/len(y_val)
            
            logits = mlp_adam.forward(X_test)
            test_loss = np.sum(mlp_adam.loss(logits, y_test))/len(y_test)
            test_acc = mlp_adam.accuracy(logits,y_test)/len(y_test)
            
            print('Number of Simulation = {} - Number of Epoch = {}'.format(num_sim+1, epoch + 1))
            print('Train loss:= {:.4f} - Val loss: {:.4f} - Test loss: {:.4f} - Train acc:= {:.2%} - Val acc:= {:.2%} - Test acc:= {:.2%}'\
                  .format(train_loss, val_loss, test_loss, train_acc, val_acc, test_acc))
            
    time_taken = time.time() - time_start 
    print('\nTotal time taken (in seconds): {:.2f}'.format(time_taken))
#save_object(mlp_DIY,'mlp_DIY_dropout.pkl')

Number of Simulation = 1 - Number of Epoch = 10
Train loss:= 0.4798 - Val loss: 0.4926 - Test loss: 0.5140 - Train acc:= 83.43% - Val acc:= 82.87% - Test acc:= 81.99%
Number of Simulation = 1 - Number of Epoch = 20
Train loss:= 0.4171 - Val loss: 0.4377 - Test loss: 0.4606 - Train acc:= 85.50% - Val acc:= 84.55% - Test acc:= 83.80%
Number of Simulation = 1 - Number of Epoch = 30
Train loss:= 0.3866 - Val loss: 0.4114 - Test loss: 0.4347 - Train acc:= 86.51% - Val acc:= 85.72% - Test acc:= 84.46%
Number of Simulation = 1 - Number of Epoch = 40
Train loss:= 0.3666 - Val loss: 0.3950 - Test loss: 0.4190 - Train acc:= 87.21% - Val acc:= 86.26% - Test acc:= 85.11%
Number of Simulation = 1 - Number of Epoch = 50
Train loss:= 0.3516 - Val loss: 0.3834 - Test loss: 0.4081 - Train acc:= 87.73% - Val acc:= 86.51% - Test acc:= 85.57%
Number of Simulation = 1 - Number of Epoch = 60
Train loss:= 0.3394 - Val loss: 0.3749 - Test loss: 0.4006 - Train acc:= 88.08% - Val acc:= 86.73% - Test acc:= 85.98