# Neural Network
Neural networks are one of the most widely used tools in machine learning. The basic idea is to combine many perceptrons into layers to build a more expressive model. Here, I will be using an input layer (64 nodes) -> a hidden layer (5 nodes) -> an output layer (10 nodes).

In [39]:
import pandas as pd
import numpy as np
import numpy.linalg as lin
import statistics as stat
import matplotlib.pyplot as plt
from sklearn import datasets

In [51]:
# Load the digits dataset that ships with scikit-learn and keep its
# feature matrix in x (one row per image); the last line displays its shape
digits = datasets.load_digits()
x = digits.data
x.shape

(1797, 64)

In [52]:
x[0]

array([ 0.,  0.,  5., 13.,  9.,  1.,  0.,  0.,  0.,  0., 13., 15., 10.,
       15.,  5.,  0.,  0.,  3., 15.,  2.,  0., 11.,  8.,  0.,  0.,  4.,
       12.,  0.,  0.,  8.,  8.,  0.,  0.,  5.,  8.,  0.,  0.,  9.,  8.,
        0.,  0.,  4., 11.,  0.,  1., 12.,  7.,  0.,  0.,  2., 14.,  5.,
       10., 12.,  0.,  0.,  0.,  0.,  6., 13., 10.,  0.,  0.,  0.])

In [53]:
# Each row of x is one 8x8 image flattened into 64 values,
# which is why the data has 64 columns
pd.DataFrame(x).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,54,55,56,57,58,59,60,61,62,63
0,0.0,0.0,5.0,13.0,9.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,6.0,13.0,10.0,0.0,0.0,0.0
1,0.0,0.0,0.0,12.0,13.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,11.0,16.0,10.0,0.0,0.0
2,0.0,0.0,0.0,4.0,15.0,12.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,3.0,11.0,16.0,9.0,0.0
3,0.0,0.0,7.0,15.0,13.0,1.0,0.0,0.0,0.0,8.0,...,9.0,0.0,0.0,0.0,7.0,13.0,13.0,9.0,0.0,0.0
4,0.0,0.0,0.0,1.0,11.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.0,16.0,4.0,0.0,0.0


In [54]:
# The target array of the dataset: the labels that go with the rows of x
# (used later as the correct classifications); the last line displays its shape
y = digits.target
y.shape

(1797,)

In [7]:
y[range(20)]

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [69]:
# Input layer -> hidden layer weights, drawn uniformly from [0, 1):
# 64 inputs plus 1 bias row, one column for each of the 5 hidden nodes
w1 = np.random.random((64 + 1, 5))

# Hidden layer -> output layer weights, drawn the same way:
# 5 hidden nodes plus 1 bias row, one column for each of the 10 output nodes
w2 = np.random.random((5 + 1, 10))
w2[:, 0]
# Column 0 of w2 holds the weights from every hidden layer node (plus the
# bias weight in the last entry) into the first node of the output layer

array([0.83703249, 0.43184937, 0.02175572, 0.06054406, 0.66804577,
       0.16697953])

In [64]:
def σ(z):
    """
    Logistic sigmoid. Given a number (or NumPy array) z, gives back
    1 / (1 + e^(-z)), a value lying between 0 and 1
    """
    denominator = 1 + np.exp(-z)
    return denominator ** (-1)

def loss(y, y_hat):
    """
    Sum of squares loss. y and y_hat are two arrays of equal length;
    the element-by-element differences are squared and then added up
    """
    squared_errors = (y - y_hat) ** 2
    return sum(squared_errors)

# I'm calling all the weights between the input nodes and
# the hidden layer w1
def gradient_w1():
    """
    Placeholder for the function to calculate the gradient for any one
    of the weights between the input layer and the hidden layer.
    It has no body beyond this docstring, so calling it returns None.
    NOTE: a second definition of gradient_w1, one that takes arguments,
    appears further down in this same cell and replaces this one when
    the cell is run
    """
    
def gradient_w2(y_hat, y, w2, α, hidden=None):
    """
    Function to update the set of weights between the hidden layer and
    one output node, using the squared error (y - y_hat)**2 of that node
    INPUTS:
        - y_hat, the calculated value of the output node that w2 goes to
        - y, the expected value of the output layer node
        - w2, an array of the weights from the hidden layer to the same
            output node referenced in y_hat
        - α, the learning rate with which to scale the change in weights
        - hidden, (optional) an array the same length as w2 holding the
            values that were multiplied by w2 to produce y_hat, i.e. the
            hidden layer outputs followed by a 1 for the bias weight.
            When given, a true gradient descent step is taken. When left
            out, the older behaviour is kept (see below)
    OUTPUTS:
        - w2_new, an array of new weights
    """
    w2 = np.asarray(w2, dtype=float)

    # Shared factor of the derivative: 2 * (y - y_hat) * σ'(z), where
    # σ'(z) = y_hat * (1 - y_hat) for the sigmoid
    coeff = 2 * (y - y_hat) * y_hat * (1 - y_hat)

    if hidden is None:
        # Backward-compatible path: this only rescales w2 by (1 - α*coeff),
        # so it can never change the direction of w2 and leaves zero
        # weights at zero. It is NOT a gradient step; pass `hidden` to get one.
        return w2 - (α * coeff) * w2

    # d/dw2 of (y - y_hat)**2 is -coeff * hidden, so moving against the
    # gradient means ADDING α * coeff * hidden
    w2_new = w2 + (α * coeff) * np.asarray(hidden, dtype=float)
    return w2_new
    
def gradient_w1(y_hat, y, w1, w2, w2_node, x, α):
    """
    Function to update the set of weights between the input layer and
    one hidden layer node with a single gradient descent step on the
    sum of squares loss over all the output nodes
    INPUTS:
        - y_hat, an array with the calculated values of every
                node in the output layer (size 10 for this network)
        - y, an array of the same size as y_hat, the expected values of
            the output layer nodes
        - w1, one of the sets of weights from the input layer to
            a specific node in the hidden layer (same length as x)
        - w2, the matrix of weights from the hidden layer to the output
            layer, one column per output node
        - w2_node, a single int, the index of the hidden layer node that
            w1 feeds, i.e. the row of w2 holding that node's weights
            to every output node
        - x, the given array of length 64 + 1 (data + bias) that was
            fed through the input layer
        - α, the learning rate with which to scale the change in weights
    OUTPUTS:
        - w1_new, an array of new weights
    """
    y = np.asarray(y, dtype=float)
    y_hat = np.asarray(y_hat, dtype=float)
    w1 = np.asarray(w1, dtype=float)
    x = np.asarray(x, dtype=float)

    # Error signal at each output node: 2 * (y_i - y_hat_i) * σ'(z_i)
    delta_out = 2 * (y - y_hat) * y_hat * (1 - y_hat)

    # Carry every output node's signal back through the weight that links
    # it to this hidden node and add them up (one term per output node)
    total = np.dot(delta_out, np.asarray(w2, dtype=float)[w2_node, :])

    # Derivative of the hidden node's own sigmoid
    h = σ(np.dot(w1, x))
    coeff = total * h * (1 - h)

    # d(loss)/d(w1) is -coeff * x, so moving against the gradient means
    # ADDING α * coeff * x
    w1_new = w1 + (α * coeff) * x
    return w1_new

def feed_forward(x, w1, w2):
    """
    Function to feed forward the array, x, and give outputs
    INPUTS:
        - x, a 1-d array holding one observation. It may already end
            with the bias entry of 1 (length 65 for the digits data), or
            be the raw data (length 64), in which case the bias entry
            is appended here
        - w1, the matrix holding all the weights from the input
            layer to the hidden layer, one column per hidden node;
            its last row holds the bias weights
        - w2, the matrix holding all the weights from the hidden layer
            to the output layer, one column per output node;
            its last row holds the bias weights
    OUTPUTS:
        - a dictionary with two entries:
            'o', an array with the value of every output node (length 10)
            'hl', an array with the value of every hidden node (length 5)
    """
    x = np.asarray(x, dtype=float)
    w1 = np.asarray(w1, dtype=float)
    w2 = np.asarray(w2, dtype=float)

    # The last row of w1 multiplies a constant input of 1 (the bias), so
    # add that entry when x is exactly one short of the number of rows
    if x.shape[0] == w1.shape[0] - 1:
        x = np.append(x, 1.0)

    # One dot product per column of w1; the bias goes inside the sigmoid
    hidden_layer = σ(np.dot(x, w1))

    # The last row of w2 is the bias weight of each output node, so the
    # hidden layer values are extended with a constant 1 before the product
    outputs = σ(np.dot(np.append(hidden_layer, 1.0), w2))

    return {'o': outputs, 'hl': hidden_layer}

def NeuralNet(x, y, w1, w2, num_iterations=1000, ϵ=.01, α=.01):
    """
    Function for optimizing the weights for the 3-layer Neural Network
    using batch gradient descent on the sum of squares loss
    INPUTS:
        - x, the input data matrix with size m x n, where each row
            is an observation (no bias column; it is added here)
        - y, an array of length m with all the correct classifications
            of the data in x, given as integers from 0 up to
            (number of output nodes - 1)
        - w1, a matrix of size (n + 1) x 5 (65 x 5 for the digits data),
            where each column is a set of weights corresponding to one
            of the hidden layer nodes and the last row is the bias weights
        - w2, a matrix of size 6 x 10, where each column is a set of weights
            from the hidden layer to one of the output nodes and the
            last row is the bias weights
        - num_iterations, an integer, the maximum number of iterations
                        the algorithm will perform before stopping
        - ϵ, the threshold of error for stopping the algorithm. The error
            is the sum of squares loss averaged over the observations,
            measured during the most recent feed forward pass
        - α, the learning rate with which to scale the change in weights
    OUTPUTS:
        - new_w1, the optimized weights for the first layer
        - new_w2, the optimized weights for the second layer
        (w1 and w2 themselves are left unmodified)
    """
    x = np.asarray(x, dtype=float)
    labels = np.asarray(y).astype(int)

    # Work on copies so the caller's starting weights are not changed
    new_w1 = np.array(w1, dtype=float)
    new_w2 = np.array(w2, dtype=float)

    m = len(x)
    if m == 0:
        # Nothing to learn from
        return new_w1, new_w2

    # Constant column of 1s so the last row of each weight matrix
    # acts as the bias
    ones = np.ones((m, 1))
    inputs = np.append(x, ones, axis=1)

    # Turn each class label into a row with a 1 at the position of the
    # correct output node and 0 everywhere else
    targets = np.zeros((m, new_w2.shape[1]))
    targets[np.arange(m), labels] = 1.0

    error = np.inf
    j = 0

    while (j < num_iterations) and (error >= ϵ):
        j += 1

        # Feed Forward
        hidden = σ(np.dot(inputs, new_w1))
        hidden_b = np.append(hidden, ones, axis=1)
        outputs = σ(np.dot(hidden_b, new_w2))

        error = np.sum((targets - outputs) ** 2) / m

        # Back Propagation
        # Error signal at the output nodes: 2 * (y - y_hat) * σ'(z)
        delta_out = 2 * (targets - outputs) * outputs * (1 - outputs)
        # Signal carried back to the hidden nodes; the bias row of w2 is
        # dropped because no hidden node feeds it
        delta_hidden = np.dot(delta_out, new_w2[:-1].T) * hidden * (1 - hidden)

        # The gradient of the loss is minus these products, so moving
        # against it means adding them (averaged over the observations)
        new_w2 += α * np.dot(hidden_b.T, delta_out) / m
        new_w1 += α * np.dot(inputs.T, delta_hidden) / m

    return new_w1, new_w2

In [29]:
np.dot(np.array([1,1,1]), np.array([2,2,2]))

6

In [68]:
x[0,:]

array([ 0.,  0.,  5., 13.,  9.,  1.,  0.,  0.,  0.,  0., 13., 15., 10.,
       15.,  5.,  0.,  0.,  3., 15.,  2.,  0., 11.,  8.,  0.,  0.,  4.,
       12.,  0.,  0.,  8.,  8.,  0.,  0.,  5.,  8.,  0.,  0.,  9.,  8.,
        0.,  0.,  4., 11.,  0.,  1., 12.,  7.,  0.,  0.,  2., 14.,  5.,
       10., 12.,  0.,  0.,  0.,  0.,  6., 13., 10.,  0.,  0.,  0.])