In [10]:
import numpy as np

class NN():
    """
    Class for a neural network. Requires input/output sizes, number of hidden layers, and number of neurons
    at each layer (we assume all hidden layers are of the same size)
    """
    def __init__(self, input_size, num_HL, hidden_size, output_size):
        # Initialize by setting random 
        self.input_size = input_size
        self.hidden_layers = num_HL
        # NOTE we are assuming all hidden layers are the same size
        self.hidden_size = hidden_size
        self.output_size = output_size
        # Activations for each neuron
        self.activations_in = np.ones(self.input_size)
        # Hidden can comprise multiple layers, so we have a matrix
        self.activations_hidden = np.ones((self.hidden_layers, self.hidden_size))
        self.activations_out = np.ones(self.output_size)
        # Weights of all the edges, randomized for good results
        self.weights_in = np.random.randn(self.input_size, self.hidden_size)
        # We will only have hidden weights if there are multiple hidden layers
        if self.hidden_layers > 1:
            self.weights_hidden = np.random.randn(self.hidden_layers - 1, self.hidden_size, self.hidden_size)
        else:
            self.weights_hidden = []
        self.weights_out = np.random.randn(self.hidden_size, self.output_size)
        # To be valued when run() is called
        self.learning_rate = 0.0
    
    def _sigmoid(self, x):
        """
        Sigmoid function for calculating a distribution over 2 classes
        """
        return 1 / (1 + np.exp(-x))
    
    def _derivative_sigmoid(self, x):
        """
        Derivative of the sigmoid function where x = the output of the sigmoid
        
        This can be used in backpropogation, wherein we would have 
        already computed the sigmoid in the forward pass, and we can draw upon its cached value
        """
        return x * (1.0 - x)
    
    def _relu(self, x):
        """
        relu function used for activation
        """
        return max(x, 0.0)
    
    def _derivative_relu(self, x):
        """
        Derivative of the relu function, the input will be the output of the relu function.
        This is because in practice we will have already performed this computation in the forward pass
        so in the backward pass, we need to find its derivative drawing upon the cached relu(x).
        
        Use 0.5 because derivative of relu is undefined at x = 0
        """
        return 1 if x > 0.0 else 0.5
    
    def _binary_cross_ent(self, y_hat, y):
        """
        This basically finds the negative of the log probability of class1 - its inverse
        """
        return (-y * np.log(y_hat)) - ((1 - y) * np.log(1 - y_hat))
    
    def _derivative_binary_cross_ent(self, y_hat, y):
        """
        Derivative of binary cross-entropy
        
        This description is misleading. 
        This is the part of the partial derivative of binary cross-entropy 
        w.r.t the parameters of our function. In practice, the other part is 
        the dot product of this and the activations (activate(w, x))
        """
        #return -(y / y_hat) - ((1 - y) / (1 - y_hat))
        return (y_hat - y)

    def _activate(self, x):
        """
        RELU for non-linear activation function
        """
        return self._relu(x)
    
    def _activate_vector(self, X):
        """
        Run on a numpy vector
        """
        activations = np.vectorize(self._activate)
        return activations(X)
    
    def _derivative_activation(self, x):
        """
        Compute the derivative of the activation function given the activation output
        
        x: activate(node)
        """
        return self._derivative_relu(x)
    
    def _derivative_vector_activation(self, X):
        """
        Derivative for each scalar in a numpy vector
        """
        derivative_activations = np.vectorize(self._derivative_activation)
        return derivative_activations(X)

    def _loss(self, y_hat, y):
        """
        Compute loss form prediction y_hat, and gold value y

        'Unlike linear models, the loss function of multi-layer neural networks 
        with respect to their parameters is not convex'

        We can still use logistic_loss/binary cross-entropy:
        """
        return self._binary_cross_ent(y_hat, y)
    
    def _derivative_loss(self, y_hat, y):
        """
        This will be used in backpropogation for finding L'(output_layer_node)
        """
        return self._derivative_binary_cross_ent(y_hat, y)
    
    def _vector_loss(self, Y_hat, Y):
        """
        Get a vector of losses from numpy vectors
        """
        loss = np.vectorize(self._loss)
        return loss(Y_hat, Y)
    
    def _derivative_vector_loss(self, Y_hat, Y):
        """
        Get a vector of derivative of losses from numpy vectors
        """
        d_loss = np.vectorize(self._derivative_loss)
        return d_loss(Y_hat, Y)
        
    def forward(self, inputs):
        """
        Forward pass: Calculate the activations of each neuron
        """
        if len(inputs) != self.input_size:
          raise Exception("That is not the size of the input layer... try %i" % self.input_size)
        
        # Set input activations
        for i, input in enumerate(inputs):
            self.activations_in[i] = input
        
        """
        Print all of the weights, to see updates
        """
        print(self.weights_in)
        for w in self.weights_hidden:
            print(w)
        print(self.weights_out)
        
        # calculate the activations for each hidden layer
        for h_layer_i in range(self.hidden_layers):
            # Need to take previous layer activation value * weights for a given layer
            # Starting with input layer X first hidden layer
            if h_layer_i == 0:
                # multiply the previous layer's activations by its weight vector for this layer's activations
                self.activations_hidden[h_layer_i] = np.dot(np.transpose(self.activations_in), self.weights_in)
                self.activations_hidden[h_layer_i] = self._activate_vector(self.activations_hidden[h_layer_i])
            else:
                # multiply the previous layer's activations by its weight vector for this layer's activations
                self.activations_hidden[h_layer_i] = np.dot(np.transpose(self.activations_hidden[h_layer_i - 1]), self.weights_hidden[h_layer_i - 1])
                self.activations_hidden[h_layer_i] = self._activate_vector(self.activations_hidden[h_layer_i])
                
        # Output activations will be the dot product of the final hidden layer, and the output weights
        self.activations_out = np.transpose(self.activations_hidden[-1]).dot(self.weights_out)
        
        self.activations_out = self._activate_vector(self.activations_out)

    def backward(self, targets):
        """
        Backpropogation for finding the partial derivative of the each node w.r.t the loss function,
        and updating weights based on those gradients
        """
        if len(targets) != len(self.activations_out):
            raise Exception("Your labels are not the same size as your output layer!")
        
        print("OUTPUT ACTIVATIONS:")
        print(self.activations_out)
        # Calculate loss - there will be a value for each node in the output layer
        # Take the simoid of the activations of the output layer, because we are doing 2 class classification
        # ***If we have >2 classes, we would use softmax***
        loss = self._vector_loss(self._sigmoid(self.activations_out), targets)
        print("LOSS: %f" % loss)
        
        """
        Now we need to calculate the partial derivative of the loss w.r.t each weight.
        Think of this as finding the amount that each node contributes to a change in the final loss.
        
        Each node has a value "delta", which represents the partial derivative of the loss w.r.t. its value:
        Use the partial derivative of the loss function, in our case binary cross-entropy
        """
        deltas_out = np.zeros([self.output_size])
        for i, activation_out in enumerate(self.activations_out):
            deltas_out[i] = self._derivative_vector_loss(activation_out, targets).dot(self._derivative_vector_activation(activation_out))
        
        """
        Find derivative of activation (activation was found in the forward pass) * derivative of the inner function,
        which is the parameter w
        """
        Deltas_hidden = np.zeros([self.hidden_layers, self.hidden_size])
        
        # Traverse in the reverse, as each delta depends on the layer in the direction of the loss
        for h_layer_i in reversed(range(self.hidden_layers)):
            # If it is the last hidden layer, then we look at the activation of the output layer,
            # not the next hidden layer
            if h_layer_i == self.hidden_layers - 1:
                #  sum(derivative of hidden activation[i] * weight from hidden node to output node * output delta[i] for i in output_deltas)
                Deltas_hidden[h_layer_i] = deltas_out.dot(self.weights_out.T) * self._derivative_vector_activation(self.activations_hidden[h_layer_i])
            else:
                # Do the same to find the hidden deltas
                Deltas_hidden[h_layer_i] = Deltas_hidden[h_layer_i + 1].dot(self.weights_hidden[h_layer_i].T) * self._derivative_vector_activation(self.activations_hidden[h_layer_i])
        
        """
        Now just do the same for L'(input layer)
        """
        deltas_in = Deltas_hidden[0].dot(self.weights_in.T) * self._derivative_vector_activation(self.activations_in)
        
        # Now we can use the deltas to adjust each weight by L'(w_i_j)    
        self.weights_out -= self.activations_out.T.dot(deltas_out) * self.learning_rate
        
        for w_i in reversed(range(len(self.weights_hidden))):
            self.weights_hidden[w_i] -= self.activations_hidden[w_i].T.dot(Deltas_hidden[w_i]) * self.learning_rate
            
        self.weights_in -= self.activations_in.T.dot(deltas_in) * self.learning_rate
    
    def run(self, inputs, targets, epochs=50, lr=.1):
        self.learning_rate = lr
        for e in range(epochs):
            print("EPOCH %i" % e)
            self.forward(inputs)
            self.backward(targets)
            

test = NN(2, 2, 3, 1)
test.run([2, 2], [3])
"""
class Node(Object):
    def __init__(self):
        
    def 
"""

EPOCH 0
[[ 0.20090393 -0.22356656 -0.86735668]
 [-2.34857919 -0.74047588  0.00639031]]
[[-1.41727425 -1.23301982  0.71657163]
 [-0.30906171 -0.02480866  2.62426889]
 [-0.84927827  0.03795672 -0.72816421]]
[[ 1.72316358]
 [-0.04285805]
 [ 0.065113  ]]
OUTPUT ACTIVATIONS:
[ 0.]
LOSS: 0.693147
EPOCH 1
[[ 0.70199398  0.27752349 -0.36626663]
 [-1.84748914 -0.23938583  0.50748037]]
[[-1.41727425 -1.23301982  0.71657163]
 [-0.30906171 -0.02480866  2.62426889]
 [-0.84927827  0.03795672 -0.72816421]]
[[ 1.72316358]
 [-0.04285805]
 [ 0.065113  ]]
OUTPUT ACTIVATIONS:
[ 0.]
LOSS: 0.693147
EPOCH 2
[[ 0.86459074  0.44012024 -0.20366988]
 [-1.68489239 -0.07678908  0.67007712]]
[[-1.45140319 -1.26714877  0.68244269]
 [-0.34319066 -0.0589376   2.59013994]
 [-0.88340722  0.00382778 -0.76229316]]
[[ 1.72316358]
 [-0.04285805]
 [ 0.065113  ]]
OUTPUT ACTIVATIONS:
[ 0.07625247]
LOSS: 0.503243
EPOCH 3
[[ 0.89947181  0.47500131 -0.16878881]
 [-1.65001132 -0.04190801  0.70495819]]
[[-1.69926728 -1.51501285  0.

'\nclass Node(Object):\n    def __init__(self):\n        \n    def \n'