In [12]:
import numpy as np

class NN():
    """
    Class for a neural network. Requires input/output sizes, number of hidden layers, and number of neurons
    at each layer (we assume all hidden layers are of the same size)
    """
    def __init__(self, input_size, num_HL, hidden_size, output_size):
        # Initialize by setting random 
        self.input_size = input_size
        self.hidden_layers = num_HL
        # NOTE we are assuming all hidden layers are the same size
        self.hidden_size = hidden_size
        self.output_size = output_size
        # Activations for each neuron
        self.activations_in = np.ones(self.input_size)
        # Hidden can comprise multiple layers, so we have a matrix
        self.activations_hidden = np.ones((self.hidden_layers, self.hidden_size))
        self.activations_out = np.ones(self.output_size)
        # Weights of all the edges, randomized for good results
        self.weights_in = np.random.randn(self.input_size, self.hidden_size)
        # We will only have hidden weights if there are multiple hidden layers
        if self.hidden_layers > 1:
            self.weights_hidden = np.random.randn(self.hidden_layers - 1, self.hidden_size, self.hidden_size)
        else:
            self.weights_hidden = []
        self.weights_out = np.random.randn(self.hidden_size, self.output_size)
    
    def _sigmoid(self, x):
        """
        Sigmoid function for calculating a distribution over 2 classes
        """
        return 1 / (1 + np.exp(-x))
    
    def _relu(self, x):
        """
        relu function used for activation
        """
        return max(x, 0.0)
    
    def _derivative_relu(self, x):
        """
        Derivative of the relu function
        
        Use 0.5 because derivative of relu is undefined at x = 0
        """
        return 1 if x > 0.0 else 0.5
    
    def _binary_cross_ent(self, y_hat, y)
        """
        This basically finds the negative of the log probability of class1 - its inverse
        """
        return (-y * np.log(y_hat)) - ((1 - y) * np.log(1 - y_hat))
    
    def _derivative_binary_cross_ent(y_hat, y):
        """
        Derivative of binary cross-entropy
        """
        return -(y / y_hat) - ((1 - y) / (1 - y_hat))

    def _activate(self, x):
        """
        RELU for non-linear activation function
        """
        return self._relu(x)
    
    def _activate_vector(self, X):
        """
        Run on a numpy vector
        """
        activations = np.vectorize(self._activate)
        return activations(X)
    
    def _derivative_activation(self, x):
        return self._derivative_relu(x)

    def _loss(self, y_hat, y):
        """
        Compute loss form prediction y_hat, and gold value y

        'Unlike linear models, the loss function of multi-layer neural networks 
        with respect to their parameters is not convex'

        We can still use logistic_loss/binary cross-entropy:
        """
        return self._binary_cross_ent(y_hat, y)
    
    def _derivative_loss(self, y_hat, y):
        """
        This will be used in backpropogation for finding L'(output_layer_node)
        """
        return self._derivative_binary_cross_ent(y_hat, y)
    
    def _vector_loss(self, Y_hat, Y):
        """
        Get a vector of losses from numpy vectors
        """
        loss = np.vectorize(self._loss)
        return loss(Y_hat, Y)
    
    def _derivative_vector_loss(self, Y_hat, Y):
        """
        Get a vector of derivative of losses from numpy vectors
        """
        d_loss = np.vectorize(self._derivative_loss)
        return d_loss(Y_hat, Y)
        
    def forward(self, inputs):
        """
        Forward pass: Calculate the activations of each neuron
        """
        if len(inputs) != self.input_size:
          raise Exception("That is not the size of the input layer... try %i" % self.input_size)
        
        # Set input activations
        self.activations_in = inputs[:]
        
        # calculate the activations for each hidden layer
        for h_layer_i in range(self.hidden_layers):
            # Need to take previous layer activation value * weights for a given layer
            # Starting with input layer X first hidden layer
            if h_layer_i == 0:
                # multiply the previous layer's activations by its weight vector for this layer's activations
                self.activations_hidden[h_layer_i] = np.dot(np.transpose(self.activations_in), self.weights_in)
                self.activations_hidden[h_layer_i] = self._activate_vector(self.activations_hidden[h_layer_i])
            else:
                # multiply the previous layer's activations by its weight vector for this layer's activations
                self.activations_hidden[h_layer_i] = np.dot(np.transpose(self.activations_hidden[h_layer_i - 1]), self.weights_hidden[h_layer_i - 1])
                self.activations_hidden[h_layer_i] = self._activate_vector(self.activations_hidden[h_layer_i])
        # Output activations will be the dot product of the final hidden layer, and the output weights
        self.activations_out = np.dot(np.transpose(self.activations_hidden[-1]), self.weights_out)
        # Take the simoid of the activations of the output layer, because we are doing 2 class classification
        # ***If we have >2 classes, we would use softmax***
        self.activations_out = self._sigmoid(self._activate_vector(self.activations_out))

    def backward(self, targets):
        """
        Backpropogation for finding the partial derivative of the each node w.r.t the loss function,
        and updating weights based on those gradients
        """
        if len(targets) != len(self.activations_out):
            raise Exception("Your labels are not the same size as your output layer!")
        
        # Calculate loss - there will be a value for each node in the output layer
        loss = self._vector_loss(self.activations_out, targets)
        print("LOSS: %f" % loss)
        
        """
        Now we need to calculate the partial derivative of the loss w.r.t each weight.
        Think of this as finding the amount that each node contributes to a change in the final loss.
        
        Each node has a value "delta", which represents the partial derivative of the loss w.r.t. its value:
        Use the partial derivative of the loss function, in our case binary cross-entropy
        """
        deltas_out = [0.0] * self.output_size
        for i, l in enumerate(loss):
            deltas_out[i] = self._derivative_vector_loss(self.activations_out, targets)
        
        print("DETLAS_OUT:")
        print(deltas_out)
        
        """
        Find derivative of activation (activation was found in the forward pass) * derivative of the inner function,
        which is either the parameter w, or the activation value of the previous node
        """
        Hidden_deltas = np.zeros([self.hidden_layers, self.hidden_size])
        for h_layer_i in range(self.hidden_layers):
            # If it is the last hidden layer, then we look at the activation of the output layer,
            # not the next hidden layer
            if h_layer_i == len(self.hidden_layers - 1):
                Hidden_deltas[h_layer_i] = self._derivative_activation(self.activations_hidden[h_layer_i])
        return True
    
        """
        We can simply add the effects of all paths to f_i in order to find the partial derivative of
        any f_i w.r.t any child pi_i. 
        """

test = NN(2, 2, 3, 1)
test.forward([2, 2])
print(test.activations_in)
print(test.activations_hidden)
print(test.activations_out)
test.backward([3])
"""
class Node(Object):
    def __init__(self):
        
    def 
"""

[2, 2]
[[ 0.  0.  0.]
 [ 0.  0.  0.]]
[ 0.5]
LOSS: 0.693147
DETLAS_OUT:
[array([-2.])]


'\nclass Node(Object):\n    def __init__(self):\n        \n    def \n'