In [6]:
import numpy as np

In [7]:
# those some helper activation functions and thier derivatives 
def sigmoid(input):  # sigmoid 
  return 1 / (1+np.exp(-input)) 
def sigmoid_d(output) : # sigmoid derivative
  return output * (1-output)
def tanh(input) : # tanh  
  return np.tanh(input)
def tanh_d(output): # tanh derivative 
  return 1 - (output**2) 
def relu(input) : # relu
  mask = (input > 0 )
  return input * mask
def relu_d(output): # relu derivative
  mask = (output > 0)
  return output * mask

In [8]:
def deep_nn(X, Y, l ,layer_dims ,lr =.001 , num_iter =4000 , act_function = tanh , act_deriv = tanh_d , lambd = .1 , optimizer = 'adam' ,beta1 = .9, beta2= .999 , epsilon= 1e-8) :
  '''
    inputs :
    X : input data with shape(num_features , num_examples) 
    Y : input labels with shape(1 , num_examples)
    l : number of layers 
    layer_dims : list containing the hidden units for each hidden layer for example [num_features ,128 ,64,32 ]
    lr :learning_rate 
    num_iter : number of  iteration for training
    act_function : the activation function for forword propagation 
    act_deriv : the derivative of the activaion function for backowrd propagation 
    lambd : regularization parameter 
    optimizer : optimizer name 
    beta1 : optimization parameter
    beta2 : optimization parameter
    epsilon : small number to add to avoid dividing by zero   
  '''
  parameters = {}  # parameter dict 
  activations = {} # activation dict to keep the activation because we need them throw the back propagation
  losses =[] # list to store the losses 
  grads = {} # gradients dict 
  m = X.shape[-1] # number of training examples 
  activations['A' + str(0)] = X  # we set the first activation to the input data 
  
  for i in range(1, l+1 ):  # initialize the weights and biases
    parameters['W' + str(i)] = np.random.randn(layer_dims[i] ,layer_dims[i-1] ) * np.sqrt(1/layer_dims[i-1])
    parameters['b' + str(i)] = np.zeros((layer_dims[i] , 1))
  
  
  if optimizer =='rms': # this initializtion related to rms_prob optimizer you can back to it leter 
    s ={}
    for i in range(1 , l+1) : 
      s["dW" + str(i)] = np.zeros((parameters['W' + str(i)].shape))
      s["db" + str(i)] = np.zeros((parameters['b' + str(i)].shape))
  if optimizer == 'adam' : # this initializtion related to adam optimizer you can back to it leter 
    v = {}
    s = {}
    for i in range(1 , l+1) :
        v["dW" + str(i)] = np.zeros((parameters['W' + str(i)].shape))
        v["db" + str(i)] = np.zeros((parameters['b' + str(i)].shape))
        s["dW" + str(i)] = np.zeros((parameters['W' + str(i)].shape))
        s["db" + str(i)] = np.zeros((parameters['b' + str(i)].shape))

  
  for it in range(num_iter) : # iterating  
    for i in range(1, l) : # compute the forword propagation for each layer except the output layer
      Z = np.dot(parameters['W' + str(i)] , activations['A' + str(i-1)]) + parameters['b'+ str(i)] 
      activations['A' + str(i)] = act_function(Z) 
    # compute the output layer 
    lastZ = np.dot(parameters['W' + str(l)] , activations['A' + str(l-1)]) + parameters['b' + str(l)] 
    activations['A' + str(l)] = sigmoid(lastZ)

    # compute the losses and add the regularization 
    loss = - (np.dot(Y , np.log(activations['A' + str(l)]).T) + np.dot((1-Y) , np.log((1-activations['A' + str(l)]).T))) /m
    l2_reg = 0
    for i in range(1 , l+1) : 
      l2_reg += np.sum(np.square(parameters['W' + str(i)])) 
    l2_reg = lambd * l2_reg / (2 * m)
    loss = loss + l2_reg
    losses.append(loss)

    # compute the derivative of the outputs with respect to the loss 
    dAL = - (np.divide(Y, activations['A' + str(l)]) - np.divide(1 -Y , 1 - activations['A'+ str(l)])) 
    
    # check the kind of optimizer 
    if optimizer =='gd' :
      # compute the gradients for the weights and biases for the output layer
      grads['dZ' + str(l)] = dAL * sigmoid_d(activations['A' + str(l)])
      grads['dW' + str(l)] =1. /m * np.dot(  grads['dZ' + str(l)] , activations['A' + str(l-1)].T) + (lambd / m ) * parameters['W' + str(l)] 
      grads['db' + str(l)] = np.sum(grads['dZ' + str(l)] , axis = 1, keepdims =True) / m
      grads['dA' + str(l-1)] = np.dot(parameters['W' + str(l)].T ,grads['dZ' + str(l)] )  

      for i in reversed(range(1 , l)) : # iteraite backword throw the hidden layers 
        # compute the gradients for weights and biases for the hidden layers  
        grads['dZ'+ str(i)] = grads['dA' + str(i)] * act_deriv(activations['A' + str(i)])
        grads['dW' + str(i)] =1. / m *  np.dot( grads['dZ'+ str(i)] ,activations['A' + str (i-1)].T) + (lambd / m ) * parameters['W' + str(i)]
        grads['db' + str(i)] = np.sum(grads['dZ'+ str(i)] , keepdims= True , axis =1) /m
        grads['dA' + str(i-1)] = np.dot(parameters['W' + str(i)].T ,grads['dZ' + str(i)] )
      
      # update the weights and biases
      for i in range(1, l+1) : 
        parameters['W' + str(i)] -= lr * grads['dW' + str(i)]
        parameters['b' + str(i)] -= lr * grads['db' + str(i)]
    
    
    elif optimizer == 'rms' : # rms_prob
      # compute the gradients for the weights and biases for the output layer
      grads['dZ' + str(l)] = dAL * sigmoid_d(activations['A' + str(l)])
      grads['dW' + str(l)] =1. /m * np.dot(  grads['dZ' + str(l)] , activations['A' + str(l-1)].T) + (lambd / m ) * parameters['W' + str(l)] 
      grads['db' + str(l)] = np.sum(grads['dZ' + str(l)] , axis = 1, keepdims =True) / m
      s['dW' + str(l)] = beta2 * s['dW' + str(l)] + (1-beta2) * (grads['dW' + str(l)] ** 2)
      s['db' + str(l)] = beta2 * s['db' + str(l)] + (1-beta2) * (grads['db' + str(l)] ** 2)
      grads['dA' + str(l-1)] = np.dot(parameters['W' + str(l)].T ,grads['dZ' + str(l)] ) 
      
      for i in reversed(range(1 , l)) : 
        # compute the gradients for weights and biases for the hidden layers
        grads['dZ'+ str(i)] = grads['dA' + str(i)] * act_deriv(activations['A' + str(i)])
        grads['dW' + str(i)] =1. / m *  np.dot( grads['dZ'+ str(i)] ,activations['A' + str (i-1)].T) + (lambd / m ) * parameters['W' + str(i)]
        grads['db' + str(i)] = np.sum(grads['dZ'+ str(i)] , keepdims= True , axis =1) /m
        s['dW' + str(i)] = beta2 * s['dW' + str(i)] + (1-beta2) * (grads['dW' + str(i)] ** 2)
        s['db' + str(i)] = beta2 * s['db' + str(i)] + (1-beta2) * (grads['db' + str(i)] ** 2)
        grads['dA' + str(i-1)] = np.dot(parameters['W' + str(i)].T ,grads['dZ' + str(i)] )
      
      # update the parameters 
      for i in range(1, l+1) : 
        parameters['W' + str(i)] -= lr * (grads['dW' + str(i)] / (np.sqrt(s['dW' + str(i)]) + epsilon))
        parameters['b' + str(i)] -= lr * (grads['db' + str(i)] / (np.sqrt(s['db' + str(i)]) + epsilon))
    
    elif optimizer =='adam' : # adam optimizer
        # compute the gradients for the weights and biases for the output layer
        grads['dZ' + str(l)] = dAL * sigmoid_d(activations['A' + str(l)])
        grads['dW' + str(l)] =1. /m * np.dot(  grads['dZ' + str(l)] , activations['A' + str(l-1)].T) + (lambd / m ) * parameters['W' + str(l)] 
        grads['db' + str(l)] = np.sum(grads['dZ' + str(l)] , axis = 1, keepdims =True) / m
        v['dW' + str(l)] = beta1 * v['dW' + str(l)] + (1- beta1) * grads['dW' + str(l)]
        v['db' + str(l)] = beta1 * v['db' + str(l)] + (1- beta1) * grads['db' + str(l)]
        s['dW' + str(l)] = beta2 * s['dW' + str(l)] + (1-beta2) * (grads['dW' + str(l)] ** 2)
        s['db' + str(l)] = beta2 * s['db' + str(l)] + (1-beta2) * (grads['db' + str(l)] ** 2)
        grads['dA' + str(l-1)] = np.dot(parameters['W' + str(l)].T ,grads['dZ' + str(l)] ) 
        
        for i in reversed(range(1 , l)) : 
           # compute the gradients for weights and biases for the hidden layers
          grads['dZ'+ str(i)] = grads['dA' + str(i)] * act_deriv(activations['A' + str(i)])
          grads['dW' + str(i)] =1. / m *  np.dot( grads['dZ'+ str(i)] ,activations['A' + str (i-1)].T) + (lambd / m ) * parameters['W' + str(i)]
          grads['db' + str(i)] = np.sum(grads['dZ'+ str(i)] , keepdims= True , axis =1) /m
          v['dW' + str(i)] = beta1 * v['dW' + str(i)] + (1- beta1) * grads['dW' + str(i)]
          v['db' + str(i)] = beta1 * v['db' + str(i)] + (1- beta1) * grads['db' + str(i)]
          s['dW' + str(i)] = beta2 * s['dW' + str(i)] + (1-beta2) * (grads['dW' + str(i)] ** 2)
          s['db' + str(i)] = beta2 * s['db' + str(i)] + (1-beta2) * (grads['db' + str(i)] ** 2)
          grads['dA' + str(i-1)] = np.dot(parameters['W' + str(i)].T ,grads['dZ' + str(i)] )
        
        # update the parameters
        for i in range(1, l+1) : 
          parameters['W' + str(i)] -= lr * (v['dW' + str(i)]/ (np.sqrt(s['dW' + str(i)]) + epsilon))
          parameters['b' + str(i)] -= lr * (v['db' + str(i)] / (np.sqrt(s['db' + str(i)]) + epsilon))
  
  # return the parameters and losses
  return parameters, losses

In [9]:
# this function to predict and evaulate the model 
def predict(X , y , parameters) :
  '''
  X : input data with the same shape as before
  y : input labels 
  parameters : dict containing the updated weights and biases which we will use to predict  
  '''

  activations ={} # activation dict as before 
  activations['A0'] = X 
  num_correct = 0 # we will use that to compute the accuracy 
  l =int(len(parameters.keys()) /2 )  # we calculate the number of hidden layer 
  
  # predict the output with our updated weights and biases 
  for i in range(1 , l ) : 
     Z = np.dot(parameters['W' + str(i)] , activations['A' + str(i-1)]) + parameters['b'+ str(i)] 
     activations['A' + str(i)] = tanh(Z)
  lastZ = np.dot(parameters['W' + str(l)] , activations['A' + str(l-1)]) + parameters['b' + str(l)] 
  activations['A' + str(l)] = sigmoid(lastZ)

  # we convert the output activations to ones or zeros to compare it to the labels
  activations['A' + str(l)] = (activations['A' + str(l)] > .5) 
  
  # comparing the outputs to the label
  for i in range(y.shape[0]) : 
    if activations['A' + str(l)][0][i] == y[i] : 
      num_correct += 1 
  # compute the accuracy
  acc = num_correct / y.shape[0]
  return acc 

In [10]:
# load the data 
import tensorflow 
(train_data , train_labels ) , (test_data , test_labels) = tensorflow.keras.datasets.mnist.load_data()

In [11]:
# normalize the data
train_data , test_data = train_data / 255 , test_data /255

In [12]:
# reshape the data 
train_data = train_data.reshape((60000 , 28*28))

In [13]:
# take subset of the train data to train on because the dataset is so big
train_subset = train_data[:5000 , :]
train_subset_label = train_labels[:5000]

# because the mnist is multiclass problem and we want it to be binary we will change it 
# instead of predicting the number we will predict if this number is odd or even

# convert the label to binary
for i in range(5000) : 
  if train_subset_label[i] // 2 == 0 :
    train_subset_label[i] = 0
  else :
    train_subset_label[i] = 1

In [14]:
# take subset of the train data to test on because the dataset is so big
test_subset = train_data[5001 :6001 , :]  # i took the subset from the training data but you can take it from the test data
test_subset_label = train_labels[5001 : 6001]

# because the mnist is multiclass problem and we want it to be binary we will change it 
# instead of predicting the number we will predict if this number is odd or even

# convert the label to binary
for i in range(1000) : 
  if test_subset_label[i] // 2 == 0 :
    test_subset_label[i] = 0
  else :
    test_subset_label[i] = 1

In [15]:
parameters , losses = deep_nn(train_subset.T, train_subset_label.T , l = 3 ,layer_dims= [784 ,  64 , 32,1 ] ,optimizer='adam')
# we pass the subsets transposed to fit the require shape 
# we choose 3 layers 
# the layer dims [784 -> the number of features or the pixels of the images 28*28 , 64 -> hidden_units , 32-> hidden_units , 1 -> the output]
# we choose adam as optimizer 


In [16]:
# the accuracy on the training subset 
# you will get 100% accuracy 
acc = predict(train_subset.T , train_subset_label.T , parameters)
# parameters : dict include the weights and biases returned from the model 
print(acc)

1.0


In [17]:
# the accuracy on the test subset 
# you will get about 99% 
acc = predict(test_subset.T , test_subset_label.T , parameters) 
print(acc) 

0.982
