# A Configurable N-Layer Neural Net with Dropout

With this Neural Net class, you can:

* Choose the number and size of your hidden layers
* Choose an alpha (a.k.a. step size, learning rate, defaults to 1)
* Choose a bias (defaults to 1)
* Choose the number of training iterations (defaults to 1, but you probably want 10,000+)
* Enable dropout, and choose a dropout ratio

The output layer is a single node.
The arguments in the running example at the end of the notebook build a neural net with 4 hidden layers. Not necessarily because that's optimal, but just to show how a basic deep neural net can be built with this class.

The provided example learns this concept given a vector `x = <x0, x1, x2, x3>` where each element is `0` or `1`.

    (x0 and x1) or (x2 and x3)
    
...and outputs 0 for false, 1 for true.

In [None]:
import numpy as np

class NN:
    """A fully connected feed-forward neural net with sigmoid units.

    The net has one or more hidden layers (their sizes are chosen when
    ``train`` is called) and a single sigmoid output node.  The input layer
    and every hidden layer carry one extra constant "bias" node, stored in
    the last column of the layer.

    Attributes set by ``train``:
        weights: list of weight matrices, one per layer-to-layer transition
            (``None`` until ``train`` has run).
        bias: the constant value held by the bias nodes.
        keep_probability: probability with which a hidden node was kept
            during training (``1.0`` when dropout was disabled).
    """

    def __init__(self):
        # re-seed NumPy's global generator without a fixed seed,
        # so separate runs start from different random weights
        np.random.seed()

        # nothing has been learned yet; train() fills these in
        self.weights = None
        self.bias = 1
        self.keep_probability = 1.0

    def sigmoid(self, x):
        """Return the logistic function 1 / (1 + e^-x), element-wise."""
        return 1/(1 + np.exp(-x))

    def d_sigmoid_output(self, out):
        """Return the sigmoid derivative, given the sigmoid's *output*.

        ``out`` must already be ``sigmoid(x)``; the derivative with respect
        to ``x`` is then ``out * (1 - out)``.
        """
        return out * (1 - out)

    def _feed_forward(self, x, weights, bias, keep_probability=None):
        """Return the activations of every layer for the mini-batch ``x``.

        When ``keep_probability`` is not None, each hidden node is kept with
        that probability and zeroed otherwise (dropout).
        """
        layers = [x]

        # feed forward to all hidden layers
        for w in weights[:-1]:
            # compute node values
            layer = self.sigmoid(np.dot(layers[-1], w))

            if keep_probability is not None:
                # array of 1's (keep) and 0's (drop), one entry per node
                # per example; multiplying "turns off" the dropped nodes
                keep_mask = np.random.binomial(1, keep_probability, layer.shape)
                layer = layer * keep_mask

            # the last column is the bias node: it is a constant, never
            # a computed (or dropped) value
            layer[:, -1] = bias
            layers.append(layer)

        # feed forward to output layer
        layers.append(self.sigmoid(np.dot(layers[-1], weights[-1])))
        return layers

    def _backpropagate(self, layers, weights, training_error):
        """Return one delta matrix per weight matrix, in forward order."""
        deltas = [training_error * self.d_sigmoid_output(layers[-1])]

        # continue backwards, calculating the error contribution from each
        # hidden layer and an appropriate delta for the weight updates
        for j in reversed(range(len(weights) - 1)):
            error = np.dot(deltas[0], weights[j + 1].T)
            delta = error * self.d_sigmoid_output(layers[j + 1])

            # the bias node is a constant, so nothing is propagated
            # into the weights that feed it
            delta[:, -1] = 0
            deltas.insert(0, delta)

        return deltas

    def train(self, X=np.array([]), Y=np.array([]), iterations=1, alpha=1, bias=1, batch_size=0, hl_sizes=(4,), dropout=False, dropout_ratio=0.5):
        """Train the net with gradient descent and keep the learned weights.

        Weights are drawn uniformly from [-1, 1) and then updated once per
        iteration, after the gradients of all mini-batches of that iteration
        have been accumulated.  Progress is printed every 10000 iterations
        and once more after the last iteration; the reported error is the
        sum over mini-batches of the mean absolute output error.

        Args:
            X: 2-D array-like, one training example per row.
            Y: targets, one row per example (a 1-D sequence is treated as
                a single column).
            iterations: number of passes over the training set.
            alpha: step size (learning rate).
            bias: constant value of the bias node in the input layer and
                in every hidden layer.
            batch_size: examples per mini-batch; zero or less means the
                whole training set is one batch.
            hl_sizes: sequence with the number of nodes of each hidden layer.
            dropout: whether to randomly drop hidden nodes while training.
            dropout_ratio: probability of dropping a hidden node.

        Raises:
            Exception: if X or Y is empty, or their row counts differ.
        """
        X = np.asarray(X)
        Y = np.asarray(Y)

        if X.size == 0 or Y.size == 0:
            raise Exception("You must specify both X and Y")
        if X.shape[0] != Y.shape[0]:
            raise Exception("Number of rows in X and Y must be the same: %d != %d" % (X.shape[0],Y.shape[0]))

        # a flat target vector would broadcast against the (n, 1) output
        # of the net, so turn it into a column
        if Y.ndim == 1:
            Y = Y.reshape(-1, 1)

        hl_sizes = list(hl_sizes)

        # how many training examples are provided?
        num_inputs = X.shape[0]

        # how many nodes do we need for the input layer?
        input_vector_length = X.shape[1]

        # if batch_size is not provided (i.e. is set to zero by default),
        # then default to full batch for gradient descent
        if batch_size <= 0:
            batch_size = num_inputs

        # dropout keeps each hidden node with this probability
        keep_probability = (1 - dropout_ratio) if dropout else None

        # add the bias node to the input layer
        # (hidden layers get theirs during feed forward)
        X = np.append(X, bias * np.ones((num_inputs, 1)), axis=1)

        # randomly initialize weights between -1 and 1 for
        # input_layer -> h1_layer and between all hidden layers;
        # the "+ 1" on each side accounts for the bias nodes
        layer_widths = [input_vector_length] + hl_sizes
        weights = [
            2*np.random.random((fan_in + 1, fan_out + 1)) - 1
            for fan_in, fan_out in zip(layer_widths, hl_sizes)
        ]

        # randomly initialize weights between -1 and 1
        # for last hidden layer -> output_layer
        weights.append(2*np.random.random((hl_sizes[-1] + 1, 1)) - 1)

        for i in range(iterations):

            weight_updates = [np.zeros_like(w) for w in weights]
            total_avg_training_error = 0

            # --------------------------------------------------------
            # TRAIN IN MINI BATCHES
            # (or full batch if no batch_size was provided)
            #
            # TODO: This current implementation isn't very useful
            #       because it still processes batches serially.
            #       The whole point of mini-batches is so you can
            #       parallelize the gradient descent during backpropagation.
            #       So, the next iteration of this code should take
            #       advantage of threading to actually parallelize the
            #       crunching of the mini-batches
            # --------------------------------------------------------

            for batch_start in range(0, num_inputs, batch_size):

                batch_end = batch_start + batch_size

                # TODO: make batch selection random so each iteration
                #       isn't always seeing the same mini-batches
                x = X[batch_start : batch_end]
                y = Y[batch_start : batch_end]

                # FEED FORWARD
                layers = self._feed_forward(x, weights, bias, keep_probability)

                # measure error
                training_error = layers[-1] - y
                total_avg_training_error += np.mean(np.abs(training_error))

                # BACKPROPAGATE
                deltas = self._backpropagate(layers, weights, training_error)

                # accumulate this batch's weight updates
                for j in range(len(weights)):
                    weight_updates[j] += np.dot(layers[j].T, deltas[j])

            # At this point every batch has been processed, and all the
            # updates for the weights have been calculated

            # update ALL weights, including last hidden layer -> output
            for j in range(len(weights)):
                weights[j] -= (alpha * weight_updates[j])

            if i % 10000 == 0:
                print("Iteration: %d Error: %.5f" % (i,total_avg_training_error))

        # with zero iterations there is no error to report
        if iterations > 0:
            print("Iteration: %d Error: %.5f" % (i,total_avg_training_error))

        # keep what was learned so the net can be used after training
        self.weights = weights
        self.bias = bias
        self.keep_probability = 1.0 if keep_probability is None else keep_probability

    def predict(self, X):
        """Return the net's output, one row per row of ``X``.

        No nodes are dropped here.  If the net was trained with dropout,
        hidden activations are scaled by the keep probability instead, so
        each layer sees inputs of the same expected size as in training.

        Raises:
            Exception: if ``train`` has not been called yet.
        """
        weights = getattr(self, "weights", None)
        if weights is None:
            raise Exception("You must train the network before calling predict")

        X = np.asarray(X)
        layer = np.append(X, self.bias * np.ones((X.shape[0], 1)), axis=1)

        for w in weights[:-1]:
            layer = self.sigmoid(np.dot(layer, w)) * self.keep_probability
            # restore the constant bias node
            layer[:, -1] = self.bias

        return self.sigmoid(np.dot(layer, weights[-1]))

In [None]:
# Training set: every row of X is one example <x0, x1, x2, x3>.
X = np.array([
    [0, 0, 0, 1],
    [0, 0, 1, 1],
    [1, 0, 0, 1],
    [1, 1, 1, 0],
    [1, 1, 0, 0],
    [1, 0, 0, 0],
    [1, 0, 1, 0],
    [1, 0, 0, 0],
])

# One 0/1 target per example, as a column vector.
# NOTE(review): the targets for [0,0,0,1] and for both [1,0,0,0] rows are 1,
# which does not match "(x0 and x1) or (x2 and x3)" -- confirm the intended
# concept before relying on this data.
Y = np.array([1, 1, 0, 1, 1, 1, 0, 1]).reshape(-1, 1)

# Four hidden layers (8, 16, 32 and 64 nodes), trained on the full batch
# of 8 examples with dropout enabled.
net = NN()
net.train(
    X,
    Y,
    iterations=100000,
    alpha=14,
    batch_size=8,
    hl_sizes=[8,16,32,64],
    dropout=True,
    dropout_ratio=0.5,
)