# Neural Network
Neural Networks are one of the most used tools in machine learning. Basically, what we are doing is putting together many perceptrons into layers for a more complex system. Here, I will be using an input layer (64 nodes) -> a hidden layer (5 nodes) -> output layer (10 nodes)

In [1]:
import pandas as pd
import numpy as np
import numpy.linalg as lin
import statistics as stat
import matplotlib.pyplot as plt
from sklearn import datasets

In [2]:
digits = datasets.load_digits()
x = digits.data

# Adding a bias term
temp = []
for i in range(len(x)):
    temp.append(x[i])
    temp[i] = np.append(temp[i], 1)
    temp

x = np.array(temp)
x.shape

(1797, 65)

In [3]:
x[0]

array([ 0.,  0.,  5., 13.,  9.,  1.,  0.,  0.,  0.,  0., 13., 15., 10.,
       15.,  5.,  0.,  0.,  3., 15.,  2.,  0., 11.,  8.,  0.,  0.,  4.,
       12.,  0.,  0.,  8.,  8.,  0.,  0.,  5.,  8.,  0.,  0.,  9.,  8.,
        0.,  0.,  4., 11.,  0.,  1., 12.,  7.,  0.,  0.,  2., 14.,  5.,
       10., 12.,  0.,  0.,  0.,  0.,  6., 13., 10.,  0.,  0.,  0.,  1.])

In [4]:
# We can see that since each row is an 8x8 picture, it has 64 columns in it
# Each row is an image
pd.DataFrame(x).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,55,56,57,58,59,60,61,62,63,64
0,0.0,0.0,5.0,13.0,9.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,6.0,13.0,10.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,12.0,13.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,11.0,16.0,10.0,0.0,0.0,1.0
2,0.0,0.0,0.0,4.0,15.0,12.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,11.0,16.0,9.0,0.0,1.0
3,0.0,0.0,7.0,15.0,13.0,1.0,0.0,0.0,0.0,8.0,...,0.0,0.0,0.0,7.0,13.0,13.0,9.0,0.0,0.0,1.0
4,0.0,0.0,0.0,1.0,11.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,16.0,4.0,0.0,0.0,1.0


In [5]:
y = digits.target
y.shape

(1797,)

In [6]:
y[range(20)]

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [48]:
range(10,15)[0]

10

In [49]:
train_x = x[range(1500)]
train_y = y[range(1500)]

test_x = x[range(1500, 1797)]
tesy_y = y[range(1500, 1797)]

In [100]:
def σ(z):
    """
    Sigmoid Function that takes in a number, z, and outputs
    the sigmoid function evaluated at z, which is in the
    interval [0, 1]
    """
    return (1 + np.exp(-z))**(-1)

def loss(y, y_hat):
    """
    Loss function that takes in two arrays of equal length,
    y and y_hat, and calculates the sum of squares loss between
    the two
    """
    return sum((y - y_hat)**2)

def gradient_w2(y_hat, y, hidden_layer, w2, α):
    """
    Function to calculate the gradient for any one of the set of
    weights between the hidden layer and one output node
    INPUTS:
        - y_hat, the calculation of output node k
        - y, the target value for output node k
        - w2, the array of weights that go from the hidden layer
            to one of the output nodes k
        - α, the learning rate
    OUTPUTS:
        - w2_new, an array of new weights for the weights
                between layer two and output node k
    """
    gradient = []
    for j in range(len(w2)):
        gradient.append(2 * (y - y_hat) * y_hat * (1 - y_hat) * hidden_layer[j])
    gradient = np.array(gradient)
    
    w2_new = w2 - (α * gradient)
    return w2_new
    
def gradient_w1(y_vals, y_hats, w2_j, node_j, x, w1, α):
    """
    Function to calculate the gradient for any one of the set of
    weights between the input layer and one hidden layer node
    INPUTS:
        - y_vals, the target outputs for the output layer
        - y_hats, the calculated values for the output layer
        - w2_j, an array of each of the weights from node j
                to the outputs
        - node_j, the value of node j of the hidden layer
        - x, the input values used in the feed forward algorithm
        - w1, a set of weights from all the input nodes to node j
             in the hidden layer
        - α, the learning rate with which to scale the change in weights
        
    OUTPUTS:
        - w1_new, an array of new weights 
    """
    total = 0
    for k in range(10):
        total += (y_vals[k] - y_hats[k]) * y_hats[k] * (1 - y_hats[k]) * w2_j[k]
        
    gradient = total * 2 * node_j * (1 - node_j) * x
     
    w1_new = w1 - (α * gradient)
    return w1_new

def feed_forward(x, w1, w2):
    """
    Function to feed forward the array, x, and give outputs
    INPUTS:
        - x, an array of length 64, the data to be passed through
        - w1, the matrix holding all the weights from the input
            layer to the hidden layer
        - w2, the matrix holding all the weights from the hidden layer
            to the output layer
    OUTPUTS:
        - outputs, an array of length 10
        - hidden_layer, an array of length 5
    """
    hidden_layer = []
    outputs = []
    
    for i in range(5):
        hidden_layer.append(σ(np.dot(w1[:,i], x)))
    hidden_layer.append(1)
        
    for i in range(10):
        z = np.dot(w2[:,i], np.array(hidden_layer))
        outputs.append(σ(z))
        
    return {'o':outputs, 'hl':hidden_layer}

def NeuralNet(x, y, weights1, weights2, num_iterations=100, α=0.25):
    """
    Function for optimizing the weights for the 3-layer Neural Network
    INPUTS:
        - x, the input data matrix with size m x n, where each row 
            is an observations
        - y, an array of length m with all the correct classifications
            of the data in x
        - weights1, a matrix of size 65 x 6, where each column is a set of weights
            corresponding to one of the hidden layer nodes
        - weights2, a matrix of size 6 x 10, where each column is a set of weights
            corresponding from the hidden layer to one of the output nodes
        - num_iterations, an integer, the maximum number of iterations
                        the algorithm will perform before stopping
        - α, the learning rate for the algorithm
    OUTPUTS:
        - new_w1, the optimized weights for the first layer
        - new_w2, the optimized weights for the second layer
    """
    m = len(x)
    n = len(x[0])
    j = 0
    
    w1 = weights1
    w2 = weights2
    error = []
    while (j <= num_iterations):
        j+=1
        
        # Feed Forward
        for i in range(m):
            obs = x[i]
            target = y[i]
            
            ff = feed_forward(x[i], w1, w2)
            y_hats = ff['o']
            hidden_layer = ff['hl']
            guess = np.argmax(np.array(y_hats))
    
            # Back Propagation
            if (guess != y[i]):
                
                target_y = np.zeros(10)
                target_y[target] = 1
                
                # Updating the weights between the hidden layer and the output layer
                for k in range(10):
                    w2[:,k] = gradient_w2(y_hats[k], target_y[k], hidden_layer, w2[:,k], α)
        
                # Updating the weights between the input layer and the hidden layer
                for k in range(6):
                    w1[:,k] = gradient_w1(target_y, y_hats, w2[k,:], hidden_layer[k], obs, w1[:,k], α)
            
    
    return w1, w2

In [105]:
# Weights between the input layer and layer 2
weights1 = np.random.random((64+1)*6).reshape(65, 6)

# Weights between layer 2 and the output layer
weights2 = np.random.random((5+1)*10).reshape(6, 10)
weights2[0,:]
# This would represent all of the weights from the first node in the 
# hidden layer to the output layer

array([ 0.,  0.,  0.,  2., 13.,  0.,  0.,  0.,  0.,  0.,  0.,  8., 15.,
        0.,  0.,  0.,  0.,  0.,  5., 16.,  5.,  2.,  0.,  0.,  0.,  0.,
       15., 12.,  1., 16.,  4.,  0.,  0.,  4., 16.,  2.,  9., 16.,  8.,
        0.,  0.,  0., 10., 14., 16., 16.,  4.,  0.,  0.,  0.,  0.,  0.,
       13.,  8.,  0.,  0.,  0.,  0.,  0.,  0., 13.,  6.,  0.,  0.,  1.])

In [106]:
weights = NeuralNet(train_x, train_y, weights1, weights2, num_iterations=10, α=.25)
w1 = weights[0]
w2 = weights[1]

[0.9798553361957417, 0.9144535830647382, 0.9699225406823817, 0.9587445467269784, 0.8330068661563267, 0.9364363413933672, 0.9288797059974924, 0.8043699459604298, 0.924288399721769, 0.9555047319267361]	0
[0.9798553361957417, 0.9144535830647382, 0.9699225406823817, 0.9587445467269784, 0.8330068661563267, 0.9364363413933672, 0.9288797059974924, 0.8043699459604298, 0.924288399721769, 0.9555047319267361]	0
[0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
[0.9809693277453215, 0.9128699151307337, 0.9723025391572658, 0.9630167713532648, 0.8759602913097224, 0.9456927012439448, 0.9401218607917221, 0.8573629800932563, 0.9367982568929089, 0.9604078806541525]	0
[0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
[0.981968320787239, 0.9287083730180483, 0.9722422133668133, 0.9665118262463165, 0.903806057693601, 0.9527079296142396, 0.9484591562076071, 0.8916882975823375, 0.9459628695889385, 0.9643699397951536]	0
[0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
[0.9828690757709481, 0.9399970387837803, 0.9742893521711021, 0.9664064193764761, 0.9224407452982

In [99]:
def predict(w1, w2, x):
    """
    Function to predict what number observation x
    is based on the given weights w1 and w2
    """
    outputs = feed_forward(x, w1, w2)['o']
    return outputs, np.argmax(outputs)

x_i = x[np.random.randint(1500, 1797)]
# predict(w1, w2, x_i)

feed_forward(x_i, w1, w2)

{'o': [0.9999986237217101,
  0.9999986237771608,
  0.9999987777621369,
  0.9999986214543345,
  0.9999986272288359,
  0.9999986225668063,
  0.9999986237866277,
  0.9999986260368263,
  0.999998629454289,
  0.9999986260556065],
 'hl': [1.0, 1.0, 1.0, 1.0, 1.0, 1]}

In [90]:
w2[:,0]

array([1.89124631, 1.86769133, 2.35137273, 2.2345828 , 1.44321127,
       1.69969278])