In [29]:
import math
import random
import numpy as np
from numpy.random import default_rng
import matplotlib.pyplot as plt
import keras
from keras.datasets import mnist

def sigmoid(Z):
    A = 1/(1+np.exp(-Z))
    pass_variable = Z
    return A, pass_variable

def relu(Z):
    A = np.maximum(0,Z)  
    pass_variable = Z 
    return A, pass_variable
                                          
def sigmoid_backward(dA, pass_variable):
    Z = pass_variable 
    s = 1/(1+np.exp(-Z))
    dZ = dA * s * (1-s)
    return dZ

def relu_backward(dA, pass_variable):
    Z = pass_variable
    dZ = np.array(dA, copy=True) 
    dZ[Z <= 0] = 0
    return dZ

def initialize_Layer(input_layer, hidden_layer, output_layer):
    # initialize 1st layer output and input with random values
    W1 = np.random.randn(hidden_layer, input_layer) * 0.01
    # initialize 1st layer output bias
    b1 = np.zeros((hidden_layer, 1))
    # initialize 2nd layer output and input with random values
    W2 = np.random.randn(output_layer, hidden_layer) * 0.01
    # initialize 2nd layer output bias
    b2 = np.zeros((output_layer,1))
    
    parameters = {"W1": W1,
                  "b1": b1,
                  "W2": W2,
                  "b2": b2}
    
    return parameters

def initialize_parameters_deep(layer_dimension):
    parameters = {}

    L = len(layer_dimension)

    for l in range(1, L):
        parameters["W" + str(l)] = np.random.randn(layer_dimension[l], layer_dimension[l-1]) * 0.01
        parameters["b" + str(l)] = np.zeros((layer_dimension[l], 1))

    return parameters

def feedforward(A, W, b):

    Z = np.dot(W,A)+b

    pass_variable = (A, W, b)
    
    return Z, pass_variable

def activation_feedforward(A_prev, W, b, activation):
    
    if activation == "sigmoid":     
        Z, linear_pass_variable = feedforward(A_prev,W,b)
        A, activation_pass_variable = sigmoid(Z)      
    
    elif activation == "relu":     
        Z, linear_pass_variable  = feedforward(A_prev,W,b)
        A, activation_pass_variable = relu(Z)
    
  #   A, activation_pass_variable = softmax(Z)
    pass_variable = (linear_pass_variable , activation_pass_variable)
    
    return A, pass_variable

def model_forward(X, parameters):
    pass_variables = []
    A = X

    # number of layers in the neural network
    L = len(parameters) // 2
    
    
    for l in range(1, L):
        A_prev = A 
        A, pass_variable = linear_activation_forward(A_prev, parameters['W' + str(l)], parameters['b' + str(l)], activation = "relu")
        pass_variables.append(pass_variable)
   
    AL, pass_variable = linear_activation_forward(A, parameters['W' + str(L)], parameters['b' + str(L)], activation = "sigmoid")

    pass_variables.append(pass_variable)
            
    return AL, pass_variables

def compute_cost(AL, Y):
    # number of examples
    m = Y.shape[1]
    # Compute loss from AL and y.
    cost = -np.sum(Y*np.log(AL)+(1-Y)*np.log(1-AL))/m
    cost = np.squeeze(cost)  
    return cost

def backwardpass(dZ, pass_variable):
    A_prev, W, b = pass_variable
    m = A_prev.shape[1]

    dW = 1./m * np.dot(dZ, A_prev.T)
    db = 1./m * np.sum(dZ, axis=1, keepdims=True)
    dA_prev = np.dot(W.T, dZ)
    
    return dA_prev, dW, db

def activation_backwardpass(dA, pass_variable, activation):

    linear_pass_variable, activation_pass_variable = pass_variable
    
    if activation == "relu":
        dZ = relu_backward(dA, pass_variable[1])
        dA_prev, dW, db = backwardpass(dZ, pass_variable[0])
   
    elif activation == "sigmoid":
        dZ = sigmoid_backward(dA, pass_variable[1])
        dA_prev, dW, db = backwardpass(dZ, pass_variable[0])
    
    return dA_prev, dW, db

def model_backwardpass(AL, Y, pass_variables):
    grads = {}
    # the number of layers
    L = len(pass_variables)
    m = AL.shape[1]
    Y = Y.reshape(AL.shape)
    
    # Initializing the backpropagation
    dAL = - (np.divide(Y, AL) - np.divide(1 - Y, 1 - AL))
    current_pass_variable = pass_variables[L-1]
    grads["dA" + str(L-1)], grads["dW" + str(L)], grads["db" + str(L)] = linear_activation_backward(dAL, current_pass_variable, "sigmoid")

    for l in reversed(range(L-1)):

        current_pass_variable = pass_variables[l]
        dA_prev_temp, dW_temp, db_temp = activation_backwardpass(grads["dA"+str(l+1)], current_pass_variable, "relu")
        grads["dA" + str(l)] = dA_prev_temp
        grads["dW" + str(l + 1)] = dW_temp
        grads["db" + str(l + 1)] = db_temp

    return grads

def update_parameters(parameters, grads, learning_rate):
    # number of layers in the neural network
    L = len(parameters) // 2 

    # Update rule for each parameter
    for l in range(L):
        parameters["W" + str(l+1)] = parameters["W" + str(l+1)] - learning_rate*grads["dW" + str(l+1)]
        parameters["b" + str(l+1)] = parameters["b" + str(l+1)] - learning_rate*grads["db" + str(l+1)]

    return parameters

def layer_model(X, Y, layers_dims, learning_rate = 0.01, num_iterations = 100, print_cost=False):
    
    # keep track of cost
    costs = []
    
    # Parameters initialization.
    parameters = initialize_parameters_deep(layers_dims)

    # Loop (gradient descent)
    for i in range(0, num_iterations):

        # Forward propagation
        AL, pass_variables = model_forward(X, parameters)
        
        # Compute cost.
        cost = compute_cost(AL, Y)
    
        # Backward propagation.
        grads = model_backwardpass(AL, Y, pass_variables)
 
        # Update parameters.
        parameters = update_parameters(parameters, grads, learning_rate)
                
        # Print the cost every 100 training example
        if print_cost and i % 100 == 0:
            print ("Cost after iteration %i: %f" %(i, cost))
        if print_cost and i % 100 == 0:
            costs.append(cost)
            
    # plot the cost
    plt.plot(np.squeeze(costs))
    plt.ylabel('cost')
    plt.xlabel('iterations (per tens)')
    plt.title("Learning rate =" + str(learning_rate))
    plt.show()
    
    return parameters
                                          
layer_dims = [4,3,2,2,1]
parameters = initialize_parameters_deep(layer_dims)
X = np.random.rand(4, 4)
Y = np.array([[1, 1, 0, 0]])
AL, pass_variables = model_forward(X, parameters)

print("X.shape =", X.shape)
print("AL =", AL)
print("Length of pass_variable list = ", len(caches))
print("parameters:", parameters)
print("cost = ", compute_cost(AL, Y))


X.shape = (4, 4)
AL = [[0.5 0.5 0.5 0.5]]
Length of pass_variable list =  9
parameters: {'W1': array([[ 0.01132424, -0.00831164,  0.00245591,  0.01590985],
       [-0.00692956, -0.00052132,  0.01127494, -0.01044558],
       [-0.00389963, -0.01496864,  0.00289895,  0.02719597]]), 'b1': array([[0.],
       [0.],
       [0.]]), 'W2': array([[-0.02006203, -0.02200051, -0.0081802 ],
       [-0.00288442,  0.01046203, -0.00958938]]), 'b2': array([[0.],
       [0.]]), 'W3': array([[ 0.00926302, -0.01061401],
       [ 0.00788708,  0.00640199]]), 'b3': array([[0.],
       [0.]]), 'W4': array([[-0.00109395,  0.00150953]]), 'b4': array([[0.]])}
cost =  0.6931471805599453


In [None]:
import math
import random
import numpy as np
from numpy.random import default_rng
import matplotlib.pyplot as plt
import keras
from keras.datasets import mnist



(x_train, y_train), (x_test, y_test) = mnist.load_data()
print("Training label shape: ", y_train.shape)
print("First 5 training labels: ", y_train[:5])

image_vector_size = 28*28
x_train = torch.from_numpy(x_train.reshape(x_train.shape[0], image_vector_size))
x_test = torch.from_numpy(x_test.reshape(x_test.shape[0], image_vector_size ))
y_train = y_train
y_test = y_test

class Network(object): 
    
    def _init_(self, sizes):
        self.num_layers = len(sizes)
        self.sizes = sizes
        self.biases = [np.random.randn(y, 1) for y in sizes[1:]]
        self.weights = [np.random.randn(y, x)
                        for x, y in zip(sizes[:-1], sizes[1:])]

    def feedforward(self, a):
        for b, w in zip(self.biases, self.weights):
            a = sigmoid(np.dot(w, a)+b)
           """a = Relu(np.dot(w, a)+b)""" 
        output = softmax(np.dot(w, a)+b)
        return output
                                   
    def SGD(self, training_data, epochs, mini_batch_size, eta,
            test_data=None):  
                                   
        if test_data: n_test = len(test_data)
        n = len(training_data)
        for j in xrange(epochs):
            random.shuffle(training_data)
            mini_batches = [
                training_data[k:k+mini_batch_size]
                for k in xrange(0, n, mini_batch_size)]
            for mini_batch in mini_batches:
                self.update_mini_batch(mini_batch, eta)
            if test_data:
                print "Epoch {0}: {1} / {2}".format(
                    j, self.evaluate(test_data), n_test)
            else:
                print "Epoch {0} complete".format(j)
                                   
    def update_mini_batch(self, mini_batch, eta):
        nabla_b = [np.zeros(b.shape) for b in self.biases]
        nabla_w = [np.zeros(w.shape) for w in self.weights]
        
        for x, y in mini_batch:
            delta_nabla_b, delta_nabla_w = self.backprop(x, y)
            nabla_b = [nb+dnb for nb, dnb in zip(nabla_b, delta_nabla_b)]
            nabla_w = [nw+dnw for nw, dnw in zip(nabla_w, delta_nabla_w)]
        self.weights = [w-(eta/len(mini_batch))*nw
                        for w, nw in zip(self.weights, nabla_w)]
        self.biases = [b-(eta/len(mini_batch))*nb
                       for b, nb in zip(self.biases, nabla_b)]
        
    def backprop(self, x, y):
        nabla_b = [np.zeros(b.shape) for b in self.biases]
        nabla_w = [np.zeros(w.shape) for w in self.weights]
        # feedforward
        activation = x
        activations = [x] # list to store all the activations, layer by layer
        zs = [] # list to store all the z vectors, layer by layer
        for b, w in zip(self.biases, self.weights):
            z = np.dot(w, activation)+b
            zs.append(z)
            activation = sigmoid(z)
            """activation = relu(z)"""
            activations.append(activation)
        # backward pass
        """  delta = self.cost_derivative(activations[-1], y) * \
            softmax_prime(zs[-1])"""
        delta = self.cost_derivative(activations[-1], y) * \
            sigmoid_prime(zs[-1])
        """delta = self.cost_derivative(activations[-1], y) * \
            relu_prime(zs[-1])"""
        nabla_b[-1] = delta
        nabla_w[-1] = np.dot(delta, activations[-2].transpose())
        for l in xrange(2, self.num_layers):
            z = zs[-l]
            sp = sigmoid_prime(z)
            delta = np.dot(self.weights[-l+1].transpose(), delta) * sp
            """delta = np.dot(self.weights[-l+1].transpose(), delta) * relu_prime(z)"""
            nabla_b[-l] = delta
            nabla_w[-l] = np.dot(delta, activations[-l-1].transpose())
        return (nabla_b, nabla_w)
    
    def evaluate(self, test_data):
        test_results = [(np.argmax(self.feedforward(x)), y)
        for (x, y) in test_data]
            return sum(int(x == y) for (x, y) in test_results)
        
    def cost_derivative(self, output_activations, y):   
            return (output_activations-y)
        
    def sigmoid(z):
    
        return 1.0/(1.0+np.exp(-z))

    def sigmoid_prime(z):
        return sigmoid(z)*(1-sigmoid(z))

    def ReLU(z):
        return z * (z > 0)

    def ReLU_prime(z):
        return 1. * (z > 0)
    
    def softmax(z):
        # Numerically stable with large exponentials
        exps = np.exp(x - x.max())
        return exps / np.sum(exps, axis=0)
    
    def softmax_prime(z):
        exps = np.exp(x - x.max())
        return  exps / np.sum(exps, axis=0) * (1 - exps / np.sum(exps, axis=0)
                                       
net = network.Network([784, 30, 10])
net.SGD(x_train, 30, 10, 3.0, test_data=test_data)

In [1]:
""""Extra code not needed""""
def softmax(z):
    # Numerically stable with large exponentials
    exps = np.exp(x - x.max())
    return exps / np.sum(exps, axis=0)

def softmax_backward(z):
    exps = np.exp(x - x.max())
    return exps / np.sum(exps, axis=0) * (1 - exps / np.sum(exps, axis=0)


train_X, train_Y, test_X, test_Y =  mnist.load_data()
image_vector_size = 28*28                                          
x_train = torch.from_numpy(x_train.reshape(x_train.shape[0], image_vector_size))
x_test = torch.from_numpy(x_test.reshape(x_test.shape[0], image_vector_size ))
                                          
layers_dims = [2, 4, 1]
parameters = L_layer_model(x_train, y_train, layers_dims, learning_rate = 0.2, num_iterations = 15000, print_cost = True)

print("train accuracy: {} %".format(100 - np.mean(np.abs(predict(x_train, parameters) -  y_train)) * 100))
plot_decision_boundary(lambda x: predict(x.T, parameters), x_train, y_train)

SyntaxError: invalid syntax (<ipython-input-1-1e752760f240>, line 11)