# Multi-layer Neural Network

- ## Multi-layer feed-forward NN with stochastic gradient descent


## A feedforward neural network is an artificial neural network where connections between the units do not form a cycle. This is different from recurrent neural networks.
 

## Comparison of the different sigmoids:

## (https://en.wikipedia.org/wiki/Activation_function, and there are more)

Logistic (a.k.a Soft step)	Activation logistic:	$f(x)=\frac{1}{1+e^{-x}}$, 	$\,f'(x)=f(x)(1-f(x))$,    (0,1)	

TanH	Activation tanh:	$f(x)=\tanh(x)=\frac{2}{1+e^{-2x}}-1$,	$\,f'(x)=1-f(x)^2$,   (-1,1)

In [1]:
%matplotlib inline
# All imports

# ----------> Note: new import: division! 
# ----------> try it: print(4/3)
from __future__ import print_function, division
from random import choice
import numpy as np
import matplotlib.pyplot as plt
from copy import copy

np.set_printoptions(formatter={'float': '{:.5f}'.format})


In [2]:
# np.atleast_2d demo: build one sample of each rank (scalar, 1-D, 2-D, 3-D),
# then view every sample as an array with at least two dimensions.
a = 3.0
b = np.linspace(1, 3, 3)
c = np.random.random((2, 3))
d = np.random.random((2, 3, 2))

# promote all four samples in one pass
a_2d, b_2d, c_2d, d_2d = (np.atleast_2d(sample) for sample in (a, b, c, d))

# show each sample next to its promoted form
for original, promoted in ((a, a_2d), (b, b_2d), (c, c_2d)):
    print(original)
    print(promoted, '\n')
print('d:', d)
print('d_2d:', d_2d)

3.0
[[3.00000]] 

[1.00000 2.00000 3.00000]
[[1.00000 2.00000 3.00000]] 

[[0.29778 0.21068 0.14650]
 [0.24008 0.16549 0.53205]]
[[0.29778 0.21068 0.14650]
 [0.24008 0.16549 0.53205]] 

d: [[[0.56773 0.51957]
  [0.61819 0.16315]
  [0.66924 0.83712]]

 [[0.30655 0.47655]
  [0.44006 0.11622]
  [0.18920 0.88075]]]
d_2d: [[[0.56773 0.51957]
  [0.61819 0.16315]
  [0.66924 0.83712]]

 [[0.30655 0.47655]
  [0.44006 0.11622]
  [0.18920 0.88075]]]


## XOR: 

## Exclusive OR is a logical operation that outputs true only when inputs differ (one is true, the other is false).

## Truth Table for XOR
| $x_0$ | $x_1$ | $y$   |
|:-:    |:-:    |:-:    |   
|   0   |   0   |   0   |
|   0   |   1   |   1   |
|   1   |   0   |   1   |
|   1   |   1   |   0   |

In [None]:
'''
Play with the learning_rate

The default is 0.2 (note that fit() below is currently set to 0.4).  You will see that setting it to 0.4 gets you a better result.


'''


def sigmoid(x):
    '''Logistic activation 1/(1 + exp(-x)), applied element-wise.'''
    denominator = 1.0 + np.exp(-x)
    return 1.0/denominator


def sigmoid_prime(z):
    '''Derivative of the logistic function, written in terms of its output
    z = sigmoid(x) rather than of x itself.'''
    complement = 1.0 - z
    return z*complement

def tanh(x):
    '''Hyperbolic-tangent activation, applied element-wise.

    Delegates to np.tanh: the hand-written quotient
    (exp(2x) - 1)/(exp(2x) + 1) overflows to inf/inf = nan for large
    positive x, whereas np.tanh saturates cleanly at +/-1.
    '''
    return np.tanh(x)

def tanh_prime(z):
    '''Derivative of tanh, written in terms of its output z = tanh(x).'''
    squared = z**2
    return 1. - squared



class NeuralNetwork:

    '''
    Multi-layer feed-forward ANN trained by stochastic gradient descent.

    Every non-output layer carries one extra unit at index 0 that is forced
    to 1 before it feeds the next layer; that unit plays the role of the
    bias.  Weights are drawn with np.random.random.
    '''

    def __init__(self, layers, activation, activation_prime):
        '''
        layers           -- number of units per layer, input layer first,
                            e.g. [2, 2, 1]; needs at least two entries
        activation       -- element-wise activation function
        activation_prime -- derivative of the activation; fit() calls it on
                            the activation's *output*, so it must be written
                            in terms of that output (e.g. z*(1 - z))

        Raises ValueError when fewer than two layers are given.
        '''
        if len(layers) < 2:
            raise ValueError('layers needs at least an input and an output layer')

        self.activation = activation
        self.activation_prime = activation_prime
        # L is the number of weight matrices (one per layer-to-layer step)
        self.L = len(layers) - 1

        # Set weights
        # range of weight values (0,1) -- it's an arbitrary choice
        # and it seems to work.
        self.weights = []
        # input and hidden layers, e.g. for layers = [2,2,1]:
        # random((2+1, 2+1)) : 3 x 3  (the +1 on each side is the bias unit)
        for l in range(1, self.L):
            w = np.random.random((layers[l-1] + 1, layers[l] + 1))
            self.weights.append(w)
        # output layer -- random((2+1, 1)) : 3 x 1
        # Indexed from self.L instead of the leaked loop variable, so a
        # network without hidden layers (len(layers) == 2) works as well.
        w = np.random.random((layers[self.L - 1] + 1, layers[self.L]))
        self.weights.append(w)

    def _forward(self, a0):
        '''
        Forward-propagate one bias-augmented input vector.

        Returns the list of activations [a0, a1, ..., aL]; the last entry
        is the network output.  a0 is used as-is: its element 0 is
        overwritten with 1 in place.
        '''
        a = [a0]
        for l in range(self.L):
            # force the bias unit of layer l to 1 before it feeds layer l+1
            a[l][0] = 1.
            tot_input_l = np.dot(a[l], self.weights[l])
            a.append(self.activation(tot_input_l))
        return a

    def fit(self, X, y, learning_rate=0.4, epochs=100000):
        '''
        Backpropagation with stochastic gradient descent.

        X             -- 2-D array of training inputs, one sample per row
        y             -- targets, indexed like the rows of X
        learning_rate -- step size of each weight update
        epochs        -- number of single-sample updates to perform

        Updates self.weights in place.  Prints the bias-augmented inputs
        once and a progress line every 10000 epochs.
        '''
        # Add column of ones to X
        # This is to add the bias unit to the input layer
        ones = np.atleast_2d(np.ones(X.shape[0]))
        X = np.concatenate((ones.T, X), axis=1)
        print('Input X with bias added:\n', X)

        for k in range(epochs):
            if k % 10000 == 0:
                print('epochs:', k)
            # i is a randomly chosen row index of X --
            # this is the stochastic part.
            i = np.random.randint(X.shape[0])

            # a is the list of activations: a[0] is simply the inputs and
            # the last one, a[-1], is the output, z.
            a = self._forward(X[i])

            # Now preparing for backpropagation by finding all the delta's
            # and store them in a list, deltas:

            # output layer; a[-1] is the output, z
            error = y[i] - a[-1]
            # deltas at this point is list that contains a single vector,
            # the delta closest to the output -- eqn (3) on slide 9.
            deltas = [error * self.activation_prime(a[-1])]

            # now find the rest of the deltas, walking from the output
            # back towards the input.
            # remember len(a) is L + 1, and len(self.weights) is L.
            for l in range(self.L - 1, 0, -1):
                delta_l = deltas[-1]
                # This is eqn (4) on slide 9.
                delta_lminusOne = delta_l.dot(self.weights[l].T)*self.activation_prime(a[l])
                deltas.append(delta_lminusOne)

            # reverse
            # [layer 2(output)->layer 1(hidden)]  => [layer 1(hidden)->layer 2(output)]
            deltas.reverse()

            # backpropagation: adjusting weights
            for l in range(self.L):
                a_l = np.atleast_2d(a[l])
                delta_l = np.atleast_2d(deltas[l])
                # implementing eqn (5) on slide 12
                # (to be precise, the equation just above (5))
                self.weights[l] += learning_rate * a_l.T.dot(delta_l)

    def predict(self, x):
        '''
        Forward propagation for a single sample.

        x -- 1-D sequence of input values, without the bias entry

        Returns the activation of the output layer as a 1-D array.
        '''
        # adding bias -- np.ones(1) seems silly:
        # But to concatenate, both objects have to be numpy arrays.
        # Both pieces are 1-D, so they are joined along the default axis 0;
        # asking for axis=1 on 1-D arrays is an error.
        a = np.concatenate((np.ones(1), np.asarray(x)))
        return self._forward(a)[-1]

if __name__ == '__main__':

    # XOR truth table: one row of X per input pair, y holds the targets.
    X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
    y = np.array([0, 1, 1, 0])

    # 2 inputs -> 2 hidden units -> 1 output, using tanh as the activation
    # (the logistic sigmoid doesn't seem to work so well).
    nn = NeuralNetwork([2,2,1], tanh, tanh_prime)
    nn.fit(X, y)

    # Print the trained network's output next to each input pair.
    for e in X:
        print(e,nn.predict(e))

Input X with bias added:
 [[1.00000 0.00000 0.00000]
 [1.00000 0.00000 1.00000]
 [1.00000 1.00000 0.00000]
 [1.00000 1.00000 1.00000]]
epochs: 0
> <ipython-input-4-e483c5b152bd>(83)fit()
-> a.append(a_lplusOne)
(Pdb) l
 78  	                tot_input_l = np.dot(a[l], self.weights[l])
 79  	                a_lplusOne = self.activation(tot_input_l)
 80  	
 81  	                a_lplusOne[0] = 1.
 82  	                import pdb;pdb.set_trace()
 83  ->	                a.append(a_lplusOne)
 84  	
 85  	            # Now preparing for backpropagation by finding all the delta's
 86  	            # and store them in a list, deltas:                    
 87  	                    
 88  	            # output layer; a[-1] is the output, z
(Pdb) p a_lplusOne
array([1.00000, 0.57686, 0.80135])
(Pdb) c
> <ipython-input-4-e483c5b152bd>(82)fit()
-> import pdb;pdb.set_trace()
(Pdb) l
 77  	#                 a[l][0] = 1.    
 78  	                tot_input_l = np.dot(a[l], self.weights[l])
 79  	        

## Breakout Exercise
- ## Allow the NN to have the flexibility of using either the logistic function or the tanh function

- ## Implement the bias correctly

In [None]:
'''
Lab breakout solution:

A general Perceptron classifier.

Takes in training and testing data.

'''
def perceptron_classifier(x_test, training_data = None, w_trained = None, show_train = False):
    '''
    Classify one test point with a perceptron, training it first if needed.

    x_test        -- input values of the point, without the bias entry
    training_data -- handed to training() when no weights are supplied
    w_trained     -- previously trained weights; when given, training is
                     skipped
    show_train    -- passed through to training()

    Returns (prediction, weights), so the weights can be fed back in through
    w_trained on later calls.
    Raises ValueError when neither weights nor training data are supplied.

    NOTE(review): training() and step_fun() are not defined in this cell;
    they are expected to exist already in the notebook namespace.
    '''
    # `is None`, not `== None`: comparing a numpy weight array with `==` is
    # element-wise and cannot be used as an `if` condition.
    if w_trained is None:
        if training_data is None:
            raise ValueError("Training data not provided")
        w_trained = training(training_data, show_train = show_train)

    # append the constant bias input as the last component of the point
    x_test = np.append(x_test, 1)
    return step_fun(np.dot(x_test, w_trained)), w_trained



# Train a perceptron on four labelled points, classify 1000 random points
# with it and plot the result together with the decision boundary.
#
# Each training sample is (inputs, target); the trailing 1 in the inputs is
# the constant bias input.  Exactly one of the data sets below is active.

# AND
# training_data = [
#     (np.array([0,0,1]), 0),
#     (np.array([0,1,1]), 0),
#     (np.array([1,0,1]), 0),
#     (np.array([1,1,1]), 1),
# ]


# "down the middle": the target equals the first input
training_data = [
    (np.array([0,0,1]), 0),
    (np.array([0,1,1]), 0),
    (np.array([1,0,1]), 1),
    (np.array([1,1,1]), 1),
]

# XOR
# training_data = [
#     (np.array([0,0,1]), 0),
#     (np.array([0,1,1]), 1),
#     (np.array([1,0,1]), 1),
#     (np.array([1,1,1]), 0),
# ]


# 1000 random test points, one (x0, x1) pair per row
x_arr = np.random.rand(1000, 2)
# predicted class of every test point, filled in below
z_arr = np.array([])

# The first call trains the perceptron and returns the weights;
# every later call reuses those weights instead of training again.
for i, x in enumerate(x_arr):
    if i == 0:
        z, w_trained = perceptron_classifier(x, training_data = training_data)
    else:
        z, _ = perceptron_classifier(x, w_trained = w_trained)
    z_arr = np.append(z_arr, z)

# colour code the test points: red where z is truthy, blue otherwise
for i, x in enumerate(x_arr):
    if z_arr[i]:
        plt.plot(x[0], x[1], 'r.')
    else:
        plt.plot(x[0], x[1], 'b.')

# decision boundary: w[0]*x1 + w[1]*x2 + w[2] = 0, solved for x2
# NOTE(review): this divides by w_trained[1]; a boundary with
# w_trained[1] == 0 (a vertical line) cannot be drawn this way.
x1 = np.array([-1, 1])
x2 = x1*(-w_trained[0]/w_trained[1]) - w_trained[2]/w_trained[1]
plt.plot(x1, x2, 'g-')
plt.axis([-0.1, 1.1, -0.1, 1.1])
# NOTE(review): this list is never read; the loop below rebinds x_train.
x_train = [np.array([0, 0, 1, 1]), np.array([0,1,0,1])]

# overlay the training points as large markers, coloured by their target
for x_train, target in training_data:
    if target:
        plt.plot(x_train[0], x_train[1], 'ro', ms = 10)
    else:
        plt.plot(x_train[0], x_train[1], 'bo', ms = 10)

plt.show()

## End of Week 6-2

In [None]:
from random import choice
from numpy import array, dot, random, sign

# Perceptron trained by sweeping through the training set in order.

# step activation: 0 for negative input, 1 otherwise
step_fn = lambda x: 0 if x < 0 else 1

# OR truth table; the third input is the constant bias input
training_data = [
    (array([0,0,1]), 0),
    (array([0,1,1]), 1),
    (array([1,0,1]), 1),
    (array([1,1,1]), 1),
]

# usu. random numbers for weights is not a bad starting point
w = random.rand(3)
# one entry per update: target minus prediction (-1, 0 or 1)
errors = []

# "learning rate"
alfa = 0.2

# use 100 passes over the training data
n = 100

# range (not xrange) so the cell runs on Python 3 as well as Python 2
for i in range(n):
    for x, target in training_data:
        y = dot(w, x)
        # step_fn is the activation defined in this cell
        error = target - step_fn(y)
        errors.append(error)
        # perceptron rule: move the weights towards a misclassified sample
        w += alfa * error * x

print('weights:', w)

# show the weighted sum and the resulting class for every training input
for x, _ in training_data:
    y = dot(x, w)
    print("{}: {} -> {}".format(x[:2], y, step_fn(y)))

In [None]:
from random import choice
from numpy import array, dot, random, sign

# Perceptron trained on one randomly chosen sample per step.

# step activation: 0 for negative input, 1 otherwise
step_fn = lambda x: 0 if x < 0 else 1

# OR truth table; the third input is the constant bias input
training_data = [
    (array([0,0,1]), 0),
    (array([0,1,1]), 1),
    (array([1,0,1]), 1),
    (array([1,1,1]), 1),
]

# usu. random numbers for weights is not a bad starting point
w = random.rand(3)
# one entry per update: target minus prediction (-1, 0 or 1)
errors = []

# "learning rate"
alfa = 0.2

# use 100 training steps
n = 100

# range (not xrange) so the cell runs on Python 3 as well as Python 2
for i in range(n):
    # the stochastic part: draw one training sample at random
    x, target = choice(training_data)
    y = dot(w, x)
    error = target - step_fn(y)
    errors.append(error)
    # perceptron rule: move the weights towards a misclassified sample
    w += alfa * error * x

print('weights:', w)

# show the weighted sum and the resulting class for every training input
for x, _ in training_data:
    y = dot(x, w)
    print("{}: {} -> {}".format(x[:2], y, step_fn(y)))

## Lab: github??

## Ask students to set up a mirror file system on their laptops (either through Dropbox, which requires them to have individual accounts on the lab iMacs(?), or through GitHub -- this could be the topic during Week 5-2)

In [None]:
# 1000 random points, one (x0, x1) pair per row: rand() samples [0, 1),
# which is stretched and shifted to cover [-10, 10) in both coordinates.
x_arr = 10*(2*np.random.rand(1000, 2) - 1)

In [None]:
'''
Not perfect, because there are only *four* training points!!

Figuring out the line takes thinking

'''
# Classify every point of x_arr with the NOR perceptron and plot the result
# together with the decision boundary.
# NOTE(review): NOR_perceptron_classifier and x_arr are not defined in this
# cell; they are expected to exist already in the notebook namespace.

# predicted class of every point, filled in below
z_arr = np.array([])
for x in x_arr:
    # w is rebound on every call; the value from the last call is used
    # for the boundary below
    z, w = NOR_perceptron_classifier(x)
    z_arr = np.append(z_arr, z)

# colour code the points: red where z is truthy, blue otherwise
for i, x in enumerate(x_arr):
    if z_arr[i]:
        plt.plot(x[0], x[1], 'r.')
    else:
        plt.plot(x[0], x[1], 'b.')
# decision boundary: w[0]*x1 + w[1]*x2 + w[2] = 0, solved for x2
# NOTE(review): this divides by w[1]; a boundary with w[1] == 0
# (a vertical line) cannot be drawn this way.
x1 = np.array([-10, 10])
x2 = x1*(-w[0]/w[1]) - w[2]/w[1]
print(x1, x2)
print(w[2])
plt.plot(x1, x2, 'g-')
plt.axis([-10, 10, -10, 10])
# NOTE(review): x_train is assigned here but never read in this cell.
x_train = [np.array([0, 0, 1, 1]), np.array([0,1,0,1])]
# the four training points as large markers: blue at (0,1), (1,0), (1,1)
# and red at (0,0)
plt.plot([0, 1, 1], [1, 0, 1], 'bo', ms = 10)
plt.plot([0,], [0,], 'ro', ms = 10)

plt.show()

## Breakout: Generate 100 pairs of random numbers in the range [-10, 10]

- ## Classify them according to their "z" value.

- ## Plot them, with color coding according to their "z" value.

- ## Plot the decision boundary (a line) using w.

## Now consider two inputs, and therefore two weights, w0 and w1.

In [None]:
# Mark the weight pair (w0, w1) with an 'x' and the guess
# (w0_guess, w1_guess) with a dot in the unit square of the weight plane.
w0, w1 = 0.4, 0.8
w0_guess, w1_guess = 0.1, 0.2
plt.plot(w0,  w1, 'x')
plt.plot(w0_guess, w1_guess, '.')
plt.axis([0, 1, 0, 1])
# show the figure, as the other plotting cells do; the original trailing
# plt.plot() had no arguments and drew nothing
plt.show()