## Content:
- [Part 1](#part1)- Importing the libraries, packages
- [Part 2](#part2)- Useful Functions
- [Part 3](#part3) -  One Hidden Layer Class
- [Part 4](#part4) -  Testing with Iris
- [Part 5](#part5) -  Two Hidden Layers Class 
- [Part 6](#part6)-  Loading Fashion MNIST
- [Part 7](#part7)-  Fashion MNIST One Hidden Layer
- [Part 8](#part8)) -  Fashion MNIST Two Hidden Layers
- [Part 9](#part9) -  Results 
- [Part 10](#part10) -  --
- [Part 11](#part11) -  --

Weight initialisation :

- https://machinelearningmastery.com/weight-initialization-for-deep-learning-neural-networks/
- https://www.deeplearning.ai/ai-notes/initialization/
- https://datascience-enthusiast.com/DL/Improving-DeepNeural-Networks-Initialization.html

[Back to top](#Content:)


<a id='part1'></a>

### Part 1 -   Importing the libraries, packages

In [727]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import base64
import os
import io
import requests
import random 

from sklearn import preprocessing
from sklearn import metrics
from sklearn.model_selection import train_test_split

from scipy.special import expit as activation_function
from scipy.stats import truncnorm

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras import datasets

[Back to top](#Content:)


<a id='part2'></a>

### Part 2 -   Useful Functions

In [703]:
rng = np.random.default_rng() 

In [704]:
def truncated_normal(mean=0, sd=1, low=0, upp=10):
    return truncnorm(
        (low - mean) / sd, (upp - mean) / sd, loc=mean, scale=sd)

def softmax(X):
    e = np.exp(X - np.max(X))
    return e / e.sum(axis=0, keepdims=True)


def cross_entropy(target, output):
    return -np.mean(target*np.log(output))

def cross_entropy_matrix(output, target):
    target = np.array(target)
    output = np.array(output)
    product = target*np.log(output)
    errors = -np.sum(product, axis=1)
    m = len(errors)
    errors = np.sum(errors) / m
    return errors

def sigmoid(x):
    return 1/(1+np.exp(-x))

def ds(x):
    return sigmoid(x)*(1-sigmoid(x))

def relu(x):
    return np.maximum(x,0)
  

def dr(x):
    dr = (np.sign(x) + 1) / 2
    return dr

def tanh(x):
    a = np.exp(x)
    b = np.exp(-x)
    return (a-b)/(a+b)

def dt(x):
    return 1-tanh(x)**2
    
def leaky(x,a):
    leaky = np.maximum(x,0)*x + a*np.minimum(x,0)
    return leaky

def dl(x,a):
    dl = (np.sign(x)+1)/2 - a*(np.sign(x)-1)/2
    return dl

def derivative(f):
    if f == sigmoid :
        return ds
    if f == tanh :
        return dt
    if f == relu :
        return dr
    if f == leaky :
        return dl
    return None

def y2indicator(y, K):
    N = len(y)
    ind = np.zeros((N,K))
    for i in range(N):
        ind[i][y[i]]=1
    return ind

def classification_rate(Y, P):
    return np.mean(Y==P)

In [694]:
def xavier(n):
    lower, upper = -(1.0 / np.sqrt(n)), (1.0 / np.sqrt(n))
    numbers = rng.random(1000)
    scaled = lower + numbers * (upper - lower)
    return scaled
    

[Back to top](#Content:)


<a id='part3'></a>

### Part 3 -   One Hidden Layer Class

# One Hidden Layer

# Variables :

- **X**     : N_Samples x N_features
- **W1**    : Hidden x N_features
- **b1**    : Hidden
- **W2**    : Output x Hidden
- **b2**    : Output

In [734]:
class HiddenOne:
     
    def __init__(self, 
                 input_nodes, 
                 output_nodes, 
                 hidden_nodes,
                 activation_hidden,
                 learning_rate=0.01,
                 optimizer = None,
                 beta1 = 0.9,   #ADAM optimization parameter, default value taken from practical experience
                 beta2 = 0.999, #ADAM optimization parameter, default value taken from practical experience
                 batch_size = None,
                 delta_stop = None,
                 patience = 1,
                 leaky_intercept=0.01
                ):         
        # Initializations
        self.input_nodes = input_nodes
        self.output_nodes = output_nodes       
        self.hidden_nodes = hidden_nodes          
        self.learning_rate = learning_rate 
        self.activation_hidden = activation_hidden
        self.hidden_derivative = derivative(self.activation_hidden)
        self.beta1 = beta1
        self.beta2 = beta2
        self.optimizer = optimizer
        self.batch_size = batch_size
        self.delta_stop = delta_stop
        self.patience = patience
        self.leaky_intercept = leaky_intercept
        self.create_weight_matrices()
        self.create_biases()
        self.reset_adam()
             
    def create_weight_matrices(self):       
        if self.activation_hidden == relu : # He initialization
            self.W1 = np.random.randn(self.hidden_nodes, self.input_nodes )/np.sqrt(self.input_nodes/2 ) # hidden x features
            self.W2 = np.random.randn(self.output_nodes, self.hidden_nodes )/np.sqrt(self.hidden_nodes/2 )  # output x hidden
        else : # Xavier initialization
            self.W1 = np.random.randn(self.hidden_nodes, self.input_nodes )/np.sqrt(self.input_nodes ) # hidden x features
            self.W2 = np.random.randn(self.output_nodes, self.hidden_nodes )/np.sqrt(self.hidden_nodes )  # output x hidden
            
            
    
        
    
    def create_biases(self):    
        #tn = truncated_normal(mean=2, sd=1, low=-0.5, upp=0.5)
        #self.b1 = tn.rvs(self.hidden_nodes).reshape(-1,1) 
        #self.b2 = tn.rvs(self.output_nodes).reshape(-1,1) 
        self.b1 =  np.zeros((self.hidden_nodes, 1 ))
        self.b2 = np.zeros((self.output_nodes, 1 ))
          
    def reset_adam(self):
        '''
        Creates Adam optimizations variables
        '''
        self.Vdw1 = np.zeros((self.hidden_nodes, self.input_nodes ))
        self.Vdw2 = np.zeros((self.output_nodes, self.hidden_nodes ))
        self.Vdb1 = np.zeros((self.hidden_nodes, 1 ))
        self.Vdb2 = np.zeros((self.output_nodes, 1 ))
        self.Sdw1 = np.zeros((self.hidden_nodes, self.input_nodes ))
        self.Sdw2 = np.zeros((self.output_nodes, self.hidden_nodes ))
        self.Sdb1 = np.zeros((self.hidden_nodes, 1 ))
        self.Sdb2 = np.zeros((self.output_nodes, 1 ))
        
        
    def forward(self, X):
        Z1 = self.W1.dot(X.T) + self.b1 # Hidden x N_samples
        A1 = self.activation_hidden(Z1)      # Hidden x N_samples
        Z2 = self.W2.dot(A1) + self.b2  # Output x N_samples
        A2 = softmax(Z2)      #Output x N_samples
        return A2, Z2, A1, Z1
    
    
    def backprop(self, X, target):
        # Forward prop
        A2, Z2, A1, Z1 = self.forward(X)
        # Compute cost
        cost = cross_entropy(target, A2)
        # N samples
        m = X.shape[0]
        # deltas
        dZ2 = A2 - target                                       #Output x N_samples
        dW2 = dZ2.dot(A1.T)/m                                   #Output x hidden
        db2 = np.sum(dZ2, axis=1, keepdims=True)/m              #Output x 1
        dZ1 = self.W2.T.dot(dZ2)*self.hidden_derivative(Z1)     # Hidden x N_samples
        dW1 = dZ1.dot(X)/m                                      # Hidden x N_Features
        db1 = np.sum(dZ1, axis=1, keepdims=True)/m              # Hidden x 1
        # Update
        lr = self.learning_rate
        self.W2 -= lr*dW2
        self.b2 -= lr*db2
        self.W1 -= lr*dW1
        self.b1 -= lr*db1
        return cost
        
    def backpropSGD(self, X, target):
        m = X.shape[0]                  #N_samples
        X_SGD = X.copy()
        u = rng.shuffle(np.arange(m))
        X_SGD = X_SGD[u,:].squeeze()    # N_samples x N_Features
        target_SGD = target[:,u].squeeze() # Output x N_samples
        cost = 0
        for i in range(m) :
            # Forward prop
            x = X_SGD[i,:].reshape(1,-1)                   # 1 x N_features
            a2, z2, a1, z1 = self.forward(x)
            # cost update
            cost = cost + cross_entropy(target_SGD[:,i].reshape(-1,1), a2)/m
            # deltas
            dz2 = a2 - target[:,i].reshape(-1,1)                    #Output x 1
            dW2 = dz2.dot(a1.T)                                     #Output x hidden
            db2 = dz2                                               #Output x 1
            dz1 = self.W2.T.dot(dz2)*self.hidden_derivative(z1)     # Hidden x 1
            dW1 = dz1.dot(x)                                        # Hidden x N_Features
            db1 = dz1                                               # Hidden x 1
            # Update
            lr = self.learning_rate
            self.W2 -= lr*dW2
            self.b2 -= lr*db2
            self.W1 -= lr*dW1
            self.b1 -= lr*db1
        return cost
        
    def backprop_minibatch(self, X, target):
        n = X.shape[1]               # N_features
        batch_size = X.shape[0]      # N_samples
        if self.batch_size == None :
            batch_size = self.minibatch_size(batch_size)
        else :
            batch_size = self.batch_size
            
        X_SGD = X.copy()
        u = rng.shuffle(np.arange(X.shape[0] ))
        X_SGD = X_SGD[u,:].squeeze()    # N_samples x N_Features
        target_SGD = target[:,u].squeeze() # Output x N_samples
        cost = 0
        
        pass_length = int(X.shape[0]/batch_size)
        for i in range(pass_length) :
            k = i*batch_size
            # Forward prop
            X = X_SGD[k:k+batch_size,:].reshape(batch_size,-1)              #batch_size x N_features
            A2, Z2, A1, Z1 = self.forward(X)
            # cost update
            cost = cost + cross_entropy(target_SGD[:,k:k+batch_size].reshape(-1,batch_size), A2)/pass_length
            # deltas
            dZ2 = A2 - target_SGD[:,k:k+batch_size].reshape(-1,batch_size)   #Output x batch_size
            dW2 = dZ2.dot(A1.T)/batch_size                                   #Output x hidden
            db2 = np.sum(dZ2, axis=1, keepdims=True)/batch_size              #Output x 1
            dZ1 = self.W2.T.dot(dZ2)*self.hidden_derivative(Z1)              # Hidden x batch_size
            dW1 = dZ1.dot(X)/batch_size                                      # Hidden x N_Features
            db1 = np.sum(dZ1, axis=1, keepdims=True)/batch_size              #Hidden x1                                            # Hidden x 1
            # Update
            lr = self.learning_rate
            self.W2 -= lr*dW2
            self.b2 -= lr*db2
            self.W1 -= lr*dW1
            self.b1 -= lr*db1
        return cost
    
    def backprop_adam_minibatch(self, X, target):
        n = X.shape[1]               # N_features
        batch_size = X.shape[0]      # N_samples
        if self.batch_size == None :
            batch_size = self.minibatch_size(batch_size)
        else :
            batch_size = self.batch_size
            
        X_SGD = X.copy()
        u = rng.shuffle(np.arange(X.shape[0] ))
        X_SGD = X_SGD[u,:].squeeze()    # N_samples x N_Features
        target_SGD = target[:,u].squeeze() # Output x N_samples
        cost = 0
        
        pass_length = int(X.shape[0]/batch_size)
        for i in range(pass_length) :
            k = i*batch_size
            X = X_SGD[k:k+batch_size,:].reshape(batch_size,-1)  
            t = target_SGD[:,k:k+batch_size].reshape(-1,batch_size)
            cost = cost + self.backpropADAM(X, t)/pass_length
        return cost
        
    
    def backpropADAM(self, X, target):
        # Forward prop
        A2, Z2, A1, Z1 = self.forward(X)
        # Compute cost
        cost = cross_entropy(target, A2)
        # N samples
        m = X.shape[0]
        # deltas
        dZ2 = A2 - target                                       #Output x N_samples
        dW2 = dZ2.dot(A1.T)/m                                   #Output x hidden
        db2 = np.sum(dZ2, axis=1, keepdims=True)/m              #Output x 1
        dZ1 = self.W2.T.dot(dZ2)*self.hidden_derivative(Z1)     # Hidden x N_samples
        dW1 = dZ1.dot(X)/m                                      # Hidden x N_Features
        db1 = np.sum(dZ1, axis=1, keepdims=True)/m              # Hidden x 1
        # Adam updates
        beta1 = self.beta1
        beta2 = self.beta2
        # V
        self.Vdw1 = beta1*self.Vdw1 + (1-beta1)*dW1
        self.Vdw2 = beta1*self.Vdw2 + (1-beta1)*dW2
        self.Vdb1 = beta1*self.Vdb1 + (1-beta1)*db1
        self.Vdb2 = beta1*self.Vdb2 + (1-beta1)*db2
        # S
        self.Sdw1 = beta2*self.Sdw1 + (1-beta2)*dW1**2
        self.Sdw2 = beta2*self.Sdw2 + (1-beta2)*dW2**2
        self.Sdb1 = beta2*self.Sdb1 + (1-beta2)*db1**2
        self.Sdb2 = beta2*self.Sdb2 + (1-beta2)*db2**2    
        # Update
        lr = self.learning_rate
        self.W2 -= lr * self.Vdw2 / (np.sqrt(self.Sdw2)+1e-8)
        self.b2 -= lr * self.Vdb2 / (np.sqrt(self.Sdb2)+1e-8)
        self.W1 -= lr * self.Vdw1 / (np.sqrt(self.Sdw1)+1e-8)
        self.b1 -= lr * self.Vdb1 / (np.sqrt(self.Sdb1)+1e-8)
        return cost  
    
    def predict(self, X_predict):
        A2, Z2, A1, Z1 = self.forward(X_predict)
        return A2
    
    def predict_class(self, X_predict):
        A2, Z2, A1, Z1 = self.forward(X_predict)
        y_pred = np.argmax(A2, axis=0)
        return y_pred
                   
    def run(self, X_train, target, epochs=10):
        costs = [1e-10]
        if self.delta_stop == None : 
            if self.optimizer == 'adam':
                self.reset_adam()
                for i in range(epochs):
                    cost = self.backpropADAM(X_train, target)
                    costs.append(cost)
                    if i%10 == 0 and i>0 :
                        print(f'Loss after epoch {i} : {cost}')
                print(f'Loss after 1epoch {len(costs)} : {costs[-1]}')        
                costs.pop(0)
                return costs
             
            elif self.optimizer == 'mini_adam':
                self.reset_adam()
                for i in range(epochs):
                    cost = self.backprop_adam_minibatch(X_train, target)
                    costs.append(cost)
                    if i%10 == 0 and i>0 :
                        print(f'Loss after epoch {i} : {cost}')
                print(f'Loss after epoch {len(costs)} : {costs[-1]}')        
                costs.pop(0)
                return costs
            elif self.optimizer == 'SGD' :
                for i in range(epochs):
                    cost = self.backpropSGD(X_train, target)
                    costs.append(cost)
                    if i%10 == 0 and i>0 :
                        print(f'Loss after epoch {i} : {cost}')
                print(f'Loss after epoch {len(costs)} : {costs[-1]}')        
                costs.pop(0)
                return costs
                
            elif self.optimizer == 'minibatch' :
                for i in range(epochs):
                    cost = self.backprop_minibatch(X_train, target)
                    costs.append(cost)
                    if i%10 == 0 and i>0 :
                        print(f'Loss after epoch {i} : {cost}')
                print(f'Loss after epoch {len(costs)} : {costs[-1]}')        
                costs.pop(0)
                return costs
                
            else :
                for i in range(epochs):  
                    cost = self.backprop(X_train, target)
                    costs.append(cost)
                    if i%10 == 0 and i>0 :
                        print(f'Loss after epoch {i} : {cost}')
                print(f'Loss after epoch {len(costs)} : {costs[-1]}')        
                costs.pop(0)
                return costs
                
            
        else :
            counter = 0
            if self.optimizer == 'adam':
                self.reset_adam()
                for i in range(epochs):
                    cost = self.backpropADAM(X_train, target)
                    costs.append(cost)
                    n = len(costs)
                    delta = np.abs(costs[n-1]/costs[n-2]-1)
                    if(delta < self.delta_stop) :
                        counter+=1
                        if(counter>=self.patience):
                            print(f'Early stop at epoch {i}, the cost is : {cost}')
                            costs.pop(0)
                            return costs
                    else :
                        counter =0
                    if i%10 == 0 and i>0 :
                        print(f'Loss after epoch {i} : {cost}')
                print(f'Loss after epoch {len(costs)} : {costs[-1]}')        
                costs.pop(0)
                return costs
            elif self.optimizer == 'mini_adam':
                self.reset_adam()
                for i in range(epochs):
                    cost = self.backprop_adam_minibatch(X_train, target)
                    costs.append(cost)
                    n = len(costs)
                    delta = np.abs(costs[n-1]/costs[n-2]-1)
                    if(delta < self.delta_stop) :
                        counter+=1
                        if(counter>=self.patience):
                            print(f'Early stop at epoch {i}, the cost is : {cost}')
                            costs.pop(0)
                            return costs
                    else :
                        counter =0
                    if i%10 == 0 and i>0 :
                        print(f'Loss after epoch {i} : {cost}')
                print(f'Loss after epoch {len(costs)} : {costs[-1]}')        
                costs.pop(0)
                return costs     
            elif self.optimizer == 'SGD' :
                for i in range(epochs):
                    cost = self.backpropSGD(X_train, target)
                    costs.append(cost)
                    n = len(costs)
                    delta = np.abs(costs[n-1]/costs[n-2]-1)
                    if(delta < self.delta_stop) :
                        counter+=1
                        if(counter>=self.patience):
                            print(f'Early stop at epoch {i}, the cost is : {cost}')
                            costs.pop(0)
                            return costs
                    else :
                        counter =0
                    if i%10 == 0 and i>0 :
                        print(f'Loss after epoch {i} : {cost}')
                print(f'Loss after epoch {len(costs)} : {costs[-1]}')        
                costs.pop(0)
                return costs
                
            elif self.optimizer == 'minibatch' :
                for i in range(epochs):
                    cost = self.backprop_minibatch(X_train, target)
                    costs.append(cost)
                    n = len(costs)
                    delta = np.abs(costs[n-1]/costs[n-2]-1)
                    if(delta < self.delta_stop) :
                        counter+=1
                        if(counter>=self.patience):
                            print(f'Early stop at epoch {i}, the cost is : {cost}')
                            costs.pop(0)
                            return costs
                    else :
                        counter =0
                    if i%10 == 0 and i>0 :
                        print(f'Loss after epoch {i} : {cost}')
                print(f'Loss after epoch {len(costs)} : {costs[-1]}')        
                costs.pop(0)
                return costs
                
            else :  
                for i in range(epochs): 
                    cost = self.backprop(X_train, target)
                    costs.append(cost)
                    n = len(costs)
                    delta = np.abs(costs[n-1]/costs[n-2]-1)
                    if(delta < self.delta_stop) :
                        counter+=1
                        if(counter>=self.patience):
                            print(f'Early stop at epoch {i}, the cost is : {cost}')
                            costs.pop(0)
                            return costs
                        else :
                            counter =0
                    if i%10 == 0 and i>0 :
                        print(f'Loss after epoch {i} : {cost}')        
                print(f'Loss after epoch {len(costs)} : {costs[-1]}')        
                costs.pop(0)
                return costs
          
            
        
       
    def evaluate(self, X_evaluate, target):
        '''
        return accuracy score, target must be the classes and not the hot encoded target
        '''
        
        y_pred = self.predict_class(X_evaluate)
        accuracy = classification_rate(y_pred, target)
        print('Accuracy :', accuracy)
        return accuracy
        
       
    def minibatch_size(self, n_samples):
        '''
        Compute minibatch size in case its not provided
        '''
        if n_samples < 2000:
            return n_samples
        if n_samples < 12800:
            return 64
        if n_samples < 25600:
            return 128
        if n_samples < 51200:
            return 256
        if n_samples < 102400:
            return 512
        return 1024
    
        
        
            

[Back to top](#Content:)


<a id='part4'></a>

### Part 4 -   Testing with Iris Dataset


## Loading and preparing Data

In [729]:
from sklearn import datasets
iris = datasets.load_iris()
data = iris.data
target = iris.target

In [730]:
from tensorflow.keras.utils import to_categorical
t = to_categorical(target)

In [731]:
M = 5
D = data.shape[1]
K = len(set(target))
X_train, X_test, y_train, y_test = train_test_split(data ,target ,test_size=0.25)
y_train_cat = to_categorical(y_train).T
y_test_cat = to_categorical(y_test).T



In [732]:
from sklearn import datasets
iris = datasets.load_iris()
data = iris.data
target = iris.target

from tensorflow.keras.utils import to_categorical
t = to_categorical(target)

M = 5
D = data.shape[1]
K = len(set(target))
X_train, X_test, y_train, y_test = train_test_split(data ,target ,test_size=0.25)
y_train_cat = to_categorical(y_train).T
y_test_cat = to_categorical(y_test).T

## One Hidden Layer 
### Activation Function Tests :


#### Sigmoid

In [735]:
nn_sigmoid = HiddenOne(input_nodes = D, 
               output_nodes = K, 
               hidden_nodes = M,
               learning_rate = 0.01,
               activation_hidden = sigmoid,
               #optimizer='minibatch',
               #batch_size = 28,
                #delta_stop = 1e-3,
                #patience = 5,
              )

In [736]:
c=nn_sigmoid.run(X_train, y_train_cat, epochs=1000 );

Loss after epoch 10 : 0.41585016740146774
Loss after epoch 20 : 0.40573786848963195
Loss after epoch 30 : 0.3974720817119876
Loss after epoch 40 : 0.39066877389972465
Loss after epoch 50 : 0.38502452166532963
Loss after epoch 60 : 0.3803001885314539
Loss after epoch 70 : 0.37630737578505624
Loss after epoch 80 : 0.3728975266059803
Loss after epoch 90 : 0.36995331277373605
Loss after epoch 100 : 0.3673818862354835
Loss after epoch 110 : 0.36510961353677895
Loss after epoch 120 : 0.36307797244997947
Loss after epoch 130 : 0.3612403526835652
Loss after epoch 140 : 0.35955955743298385
Loss after epoch 150 : 0.358005847639898
Loss after epoch 160 : 0.3565554067220696
Loss after epoch 170 : 0.3551891316139499
Loss after epoch 180 : 0.35389167770363383
Loss after epoch 190 : 0.35265070198511606
Loss after epoch 200 : 0.3514562615617964
Loss after epoch 210 : 0.3503003344222473
Loss after epoch 220 : 0.3491764368655973
Loss after epoch 230 : 0.34807931763469385
Loss after epoch 240 : 0.3470047

In [737]:
acc = nn_sigmoid.evaluate(X_test, y_test)

Accuracy : 0.6578947368421053


In [653]:
nn_sigmoid = HiddenTwo(input_nodes = D, 
               output_nodes = K, 
               hidden_nodes_1 = M,
                hidden_nodes_2 = M,
               learning_rate = 0.01,
               activation_hidden_1 = tanh,
                activation_hidden_2 = tanh,
               #optimizer='minibatch',
               #batch_size = 28,
                #delta_stop = 1e-3,
                #patience = 5,
              )

In [654]:
c=nn_sigmoid.run(X_train, y_train_cat, epochs=1000 );

Loss after epoch 10 : 0.3777789630926805
Loss after epoch 20 : 0.37470535527658955
Loss after epoch 30 : 0.3721889410177829
Loss after epoch 40 : 0.3701269316497885
Loss after epoch 50 : 0.36842850784899417
Loss after epoch 60 : 0.36701440319611284
Loss after epoch 70 : 0.3658160474793729
Loss after epoch 80 : 0.36477446893962095
Loss after epoch 90 : 0.36383918718767655
Loss after epoch 100 : 0.36296733893118754
Loss after epoch 110 : 0.36212325372862025
Loss after epoch 120 : 0.36127858379418915
Loss after epoch 130 : 0.3604128143363051
Loss after epoch 140 : 0.3595135399334522
Loss after epoch 150 : 0.3585755492100958
Loss after epoch 160 : 0.3575980497487224
Loss after epoch 170 : 0.3565805331672189
Loss after epoch 180 : 0.3555190180750677
Loss after epoch 190 : 0.3544043682732772
Loss after epoch 200 : 0.35322295849879215
Loss after epoch 210 : 0.3519585950991524
Loss after epoch 220 : 0.35059440369655703
Loss after epoch 230 : 0.3491140527462868
Loss after epoch 240 : 0.34750234

In [655]:
acc = nn_sigmoid.evaluate(X_test, y_test)

Accuracy : 0.631578947368421


#### tanh

In [738]:
nn_tanh = HiddenOne(input_nodes = D, 
               output_nodes = K, 
               hidden_nodes = M,
               learning_rate = 0.01,
               activation_hidden = tanh,
               #optimizer='minibatch',
               #batch_size = 28,
                delta_stop = 1e-3,
                patience = 5,
              )

In [739]:
c=nn_tanh.run(X_train, y_train_cat, epochs=1000 );

Loss after epoch 10 : 0.25910031125641897
Loss after epoch 20 : 0.24821932418995585
Loss after epoch 30 : 0.23853716433771743
Loss after epoch 40 : 0.22971347770735823
Loss after epoch 50 : 0.22186966639877345
Loss after epoch 60 : 0.21518766419640623
Loss after epoch 70 : 0.2095691330359506
Loss after epoch 80 : 0.20475465539027948
Loss after epoch 90 : 0.20051147760286078
Loss after epoch 100 : 0.19668108986829622
Loss after epoch 110 : 0.193162530038196
Loss after epoch 120 : 0.18989049688740942
Loss after epoch 130 : 0.18682082409245693
Loss after epoch 140 : 0.18392208803378093
Loss after epoch 150 : 0.18117085991788534
Loss after epoch 160 : 0.17854896608556578
Loss after epoch 170 : 0.17604185086199434
Loss after epoch 180 : 0.17363755826551253
Loss after epoch 190 : 0.17132607229102573
Loss after epoch 200 : 0.16909887232743698
Loss after epoch 210 : 0.1669486222735241
Loss after epoch 220 : 0.16486894564594703
Loss after epoch 230 : 0.16285425784842295
Loss after epoch 240 : 0

In [740]:
acc = nn_tanh.evaluate(X_test, y_test)

Accuracy : 0.9736842105263158


In [678]:
nn_tanh = HiddenTwo(input_nodes = D, 
               output_nodes = K, 
               hidden_nodes_1 = M,
                hidden_nodes_2 = M,
               learning_rate = 0.01,
               activation_hidden_1 = relu,
                activation_hidden_2 = relu,
                
               #optimizer='minibatch',
               #batch_size = 28,
                delta_stop = 1e-5,
                
                #patience = 5,
              )

In [679]:
c = nn_tanh.run(X_train, y_train_cat, epochs=1000)

Loss after epoch 10 : 0.4050681286131978
Loss after epoch 20 : 0.37308098007748347
Loss after epoch 30 : 0.36714758430342376
Loss after epoch 40 : 0.36258427677266697
Loss after epoch 50 : 0.3579589929824973
Loss after epoch 60 : 0.35313811785107835
Loss after epoch 70 : 0.34804318146871394
Loss after epoch 80 : 0.3426008985880527
Loss after epoch 90 : 0.33674126386489933
Loss after epoch 100 : 0.3303986046576607
Loss after epoch 110 : 0.3235139916546409
Loss after epoch 120 : 0.3160390799967083
Loss after epoch 130 : 0.30794141289220905
Loss after epoch 140 : 0.299210971170549
Loss after epoch 150 : 0.2898673226639232
Loss after epoch 160 : 0.27996614418701204
Loss after epoch 170 : 0.2696033057443923
Loss after epoch 180 : 0.2589144280660621
Loss after epoch 190 : 0.2480682408216655
Loss after epoch 200 : 0.2372534119879216
Loss after epoch 210 : 0.22666054387690332
Loss after epoch 220 : 0.21646291464101416
Loss after epoch 230 : 0.20680026841360633
Loss after epoch 240 : 0.19776902

In [680]:
acc = nn_tanh.evaluate(X_test, y_test)

Accuracy : 1.0


#### ReLU

In [741]:
nn_relu = HiddenOne(input_nodes = D, 
               output_nodes = K, 
               hidden_nodes = M,
               learning_rate = 0.01,
               activation_hidden = relu,
               #optimizer='minibatch',
               #batch_size = 28,
                delta_stop = 1e-3,
                patience = 5,
              )

In [742]:
c=nn_relu.run(X_train, y_train_cat, epochs=1000 );

Loss after epoch 10 : 0.36477918208715227
Loss after epoch 20 : 0.3586379317243374
Loss after epoch 30 : 0.3426363199708246
Loss after epoch 40 : 0.3284720014171839
Loss after epoch 50 : 0.3169393833120648
Loss after epoch 60 : 0.31114633406627706
Loss after epoch 70 : 0.3070230318161493
Loss after epoch 80 : 0.3031201023923956
Loss after epoch 90 : 0.29942907192387574
Loss after epoch 100 : 0.2959388676809844
Loss after epoch 110 : 0.2926129399584263
Loss after epoch 120 : 0.28946454694371215
Loss after epoch 130 : 0.28644129985660977
Loss after epoch 140 : 0.28357047569234506
Loss after epoch 150 : 0.28081222899281805
Loss after epoch 160 : 0.27817949077939275
Loss after epoch 170 : 0.2756495513122701
Loss after epoch 180 : 0.2732229985751936
Loss after epoch 190 : 0.2708945298133796
Loss after epoch 200 : 0.2686434778386828
Loss after epoch 210 : 0.2664731781528326
Loss after epoch 220 : 0.2643729598462644
Loss after epoch 230 : 0.2623226150683516
Loss after epoch 240 : 0.2603095869

In [743]:
acc = nn_relu.evaluate(X_test, y_test)

Accuracy : 0.8421052631578947


### Conclusion :

ReLU works better, need to confirm that later with Fashion-MNIST + multiple tests

### Optimizer  Tests :

### SGD :

Doing 1 sample each time

In [267]:
nn_SGD = HiddenOne(input_nodes = D, 
               output_nodes = K, 
               hidden_nodes = M,
               learning_rate = 0.01,
               activation_hidden = relu,
               optimizer='SGD',
               #batch_size = 28,
                #delta_stop = 1e-3,
                #patience = 5,
              )

In [268]:
c=nn_SGD.run(X_train, y_train_cat, epochs=1000 );

Loss after epoch 10 : 0.11178621200805929
Loss after epoch 20 : 0.06337019175301992
Loss after epoch 30 : 0.04610374353463256
Loss after epoch 40 : 0.03910961806157379
Loss after epoch 50 : 0.03602939429402628
Loss after epoch 60 : 0.034022388227977
Loss after epoch 70 : 0.0324823731979065
Loss after epoch 80 : 0.031209572960497433
Loss after epoch 90 : 0.03010116153760969
Loss after epoch 100 : 0.029103675610530535
Loss after epoch 110 : 0.028193257740260907
Loss after epoch 120 : 0.027361656010159573
Loss after epoch 130 : 0.02660611274483757
Loss after epoch 140 : 0.02592430512348329
Loss after epoch 150 : 0.025312164015108718
Loss after epoch 160 : 0.024764532903852384
Loss after epoch 170 : 0.02427552201412039
Loss after epoch 180 : 0.02383906911936654
Loss after epoch 190 : 0.023449477155096706
Loss after epoch 200 : 0.02310110629125307
Loss after epoch 210 : 0.022790075729744633
Loss after epoch 220 : 0.0225108672290472
Loss after epoch 230 : 0.02225898852349367
Loss after epoch

In [269]:
acc = nn_SGD.evaluate(X_test, y_test)

Accuracy : 0.9210526315789473


# Adam

In [681]:
nn_adam = HiddenOne(input_nodes = D, 
               output_nodes = K, 
               hidden_nodes = M,
               learning_rate = 0.01,
               activation_hidden = tanh,
               optimizer='adam',
               #batch_size = 28,
                #delta_stop = 1e-3,
                #patience = 5,
              )

In [682]:
c=nn_adam.run(X_train, y_train_cat, epochs=1000 );

Loss after epoch 10 : 0.2423529033168527
Loss after epoch 20 : 0.11808621810521255
Loss after epoch 30 : 0.057681940018810524
Loss after epoch 40 : 0.0363068780814127
Loss after epoch 50 : 0.02857660818256934
Loss after epoch 60 : 0.02544099789404296
Loss after epoch 70 : 0.023815722897536015
Loss after epoch 80 : 0.022837755598797405
Loss after epoch 90 : 0.022171546257504456
Loss after epoch 100 : 0.02167328608262237
Loss after epoch 110 : 0.021277490210395527
Loss after epoch 120 : 0.02095036787427598
Loss after epoch 130 : 0.020672246407231327
Loss after epoch 140 : 0.02043061367083427
Loss after epoch 150 : 0.020216989476848444
Loss after epoch 160 : 0.020025363525124004
Loss after epoch 170 : 0.019851346349154416
Loss after epoch 180 : 0.019691664087888888
Loss after epoch 190 : 0.019543837200721323
Loss after epoch 200 : 0.019405967716993284
Loss after epoch 210 : 0.019276592381687035
Loss after epoch 220 : 0.019154578530320408
Loss after epoch 230 : 0.019039047952167994
Loss af

In [683]:
acc = nn_adam.evaluate(X_test, y_test)

Accuracy : 1.0


In [684]:
nn_adam = HiddenTwo(input_nodes = D, 
               output_nodes = K, 
               hidden_nodes_1 = M,
                hidden_nodes_2 = M,
               learning_rate = 0.01,
               activation_hidden_1 = tanh,
                activation_hidden_2 = tanh,
               optimizer='adam',
               #batch_size = 28,
                #delta_stop = 1e-3,
                #patience = 5,
              )
c=nn_adam.run(X_train, y_train_cat, epochs=1000 );

Loss after epoch 10 : 0.3500144298537724
Loss after epoch 20 : 0.18573294678518648
Loss after epoch 30 : 0.12166271754655844
Loss after epoch 40 : 0.06105085373084974
Loss after epoch 50 : 0.03800788955076141
Loss after epoch 60 : 0.029745519624044127
Loss after epoch 70 : 0.026640336179486895
Loss after epoch 80 : 0.024949249099609334
Loss after epoch 90 : 0.023799460738369575
Loss after epoch 100 : 0.02297742203157648
Loss after epoch 110 : 0.02235699790527103
Loss after epoch 120 : 0.021860217912635937
Loss after epoch 130 : 0.021449439293187617
Loss after epoch 140 : 0.021102364227618502
Loss after epoch 150 : 0.02080416820535786
Loss after epoch 160 : 0.02054402480440697
Loss after epoch 170 : 0.020313954197251256
Loss after epoch 180 : 0.020108116042629572
Loss after epoch 190 : 0.019922196903320935
Loss after epoch 200 : 0.019752897201227256
Loss after epoch 210 : 0.01959766781336581
Loss after epoch 220 : 0.019454487207697324
Loss after epoch 230 : 0.019321725640316097
Loss aft

In [685]:
acc = nn_adam.evaluate(X_test, y_test)

Accuracy : 1.0


### SGD :

Testing different minibatch sizes

#### Minibatch size = 1

In [273]:
nn_mini1 = HiddenOne(input_nodes = D, 
               output_nodes = K, 
               hidden_nodes = M,
               learning_rate = 0.01,
               activation_hidden = relu,
               optimizer='minibatch',
               batch_size = 1,
                #delta_stop = 1e-3,
                #patience = 5,
              )
c=nn_mini1.run(X_train, y_train_cat, epochs=1000 );

Loss after epoch 10 : 0.09248320772599761
Loss after epoch 20 : 0.05695343488349709
Loss after epoch 30 : 0.04333175086150384
Loss after epoch 40 : 0.03713486874640103
Loss after epoch 50 : 0.0342721444044974
Loss after epoch 60 : 0.03244756431960605
Loss after epoch 70 : 0.03105749187255549
Loss after epoch 80 : 0.029895099503326177
Loss after epoch 90 : 0.028897275523321917
Loss after epoch 100 : 0.027985381546761327
Loss after epoch 110 : 0.027161892378088588
Loss after epoch 120 : 0.02640868153214715
Loss after epoch 130 : 0.025737243030114036
Loss after epoch 140 : 0.025108700711564718
Loss after epoch 150 : 0.024565576089699795
Loss after epoch 160 : 0.02407098097279119
Loss after epoch 170 : 0.023629793867818362
Loss after epoch 180 : 0.023236222530092603
Loss after epoch 190 : 0.022886431940656468
Loss after epoch 200 : 0.022584777644226582
Loss after epoch 210 : 0.022298031878595137
Loss after epoch 220 : 0.02205415737647696
Loss after epoch 230 : 0.02182746404648229
Loss afte

In [274]:
acc = nn_mini1.evaluate(X_test, y_test)

Accuracy : 0.9210526315789473


#### Minibatch size = 2

In [275]:
nn_mini2 = HiddenOne(input_nodes = D, 
               output_nodes = K, 
               hidden_nodes = M,
               learning_rate = 0.01,
               activation_hidden = relu,
               optimizer='minibatch',
               batch_size = 2,
                #delta_stop = 1e-3,
                #patience = 5,
              )

In [276]:
c=nn_mini2.run(X_train, y_train_cat, epochs=1000 );

Loss after epoch 10 : 0.10282358897185562
Loss after epoch 20 : 0.05929504528579455
Loss after epoch 30 : 0.04233529683615283
Loss after epoch 40 : 0.03499504634068722
Loss after epoch 50 : 0.03116344868111496
Loss after epoch 60 : 0.028864041263092922
Loss after epoch 70 : 0.027333551470736576
Loss after epoch 80 : 0.026229954603241082
Loss after epoch 90 : 0.025394955051465745
Loss after epoch 100 : 0.024749739980176478
Loss after epoch 110 : 0.024212422487534184
Loss after epoch 120 : 0.02376463508817755
Loss after epoch 130 : 0.023379204557115955
Loss after epoch 140 : 0.02304245845541183
Loss after epoch 150 : 0.022742797938772444
Loss after epoch 160 : 0.02247380128769662
Loss after epoch 170 : 0.022229128701572863
Loss after epoch 180 : 0.022004833945765304
Loss after epoch 190 : 0.021798369706974546
Loss after epoch 200 : 0.021608011690900753
Loss after epoch 210 : 0.021431347314564364
Loss after epoch 220 : 0.02126492719471676
Loss after epoch 230 : 0.021111108609688428
Loss a

In [277]:
acc = nn_mini2.evaluate(X_test, y_test)

Accuracy : 0.9473684210526315


# Minibatch size = 2 Adam

In [278]:
nn_miniadam2 = HiddenOne(input_nodes = D, 
               output_nodes = K, 
               hidden_nodes = M,
               learning_rate = 0.01,
               activation_hidden = relu,
               optimizer='mini_adam',
               batch_size = 2,
                #delta_stop = 1e-3,
                #patience = 5,
              )

In [279]:
c=nn_miniadam2.run(X_train, y_train_cat, epochs=1000 );

Loss after epoch 10 : 0.027900512884640422
Loss after epoch 20 : 0.0247428761477872
Loss after epoch 30 : 0.02471131744002746
Loss after epoch 40 : 0.028278536875041217
Loss after epoch 50 : 0.03706337001086922
Loss after epoch 60 : 0.031872339371941734
Loss after epoch 70 : 0.03075162566693863
Loss after epoch 80 : 0.02884076251029113
Loss after epoch 90 : 0.030960874549734813
Loss after epoch 100 : 0.027044146020956387
Loss after epoch 110 : 0.02641229269214752
Loss after epoch 120 : 0.025373377539337338
Loss after epoch 130 : 0.02491685676544792
Loss after epoch 140 : 0.02524965360807673
Loss after epoch 150 : 0.033615457472911746
Loss after epoch 160 : 0.024565990916517705
Loss after epoch 170 : 0.024453305688557248
Loss after epoch 180 : 0.022613608019364434
Loss after epoch 190 : 0.015859082855148202
Loss after epoch 200 : 0.01689771839525179
Loss after epoch 210 : 0.016531787049362097
Loss after epoch 220 : 0.01625680725106396
Loss after epoch 230 : 0.01657475732507573
Loss afte

In [280]:
acc = nn_miniadam2.evaluate(X_test, y_test)

Accuracy : 0.9210526315789473


#### Minibatch size = 8


In [686]:
nn_mini8 = HiddenOne(input_nodes = D, 
               output_nodes = K, 
               hidden_nodes = M,
               learning_rate = 0.01,
               activation_hidden = tanh,
               optimizer='minibatch',
               batch_size = 8,
                #delta_stop = 1e-3,
                #patience = 5,
              )
c=nn_mini8.run(X_train, y_train_cat, epochs=1000 );

Loss after epoch 10 : 0.34955779747014204
Loss after epoch 20 : 0.3093301480699783
Loss after epoch 30 : 0.26446663642882684
Loss after epoch 40 : 0.23327138931022703
Loss after epoch 50 : 0.21330501097112967
Loss after epoch 60 : 0.19982639334151892
Loss after epoch 70 : 0.1863919922603852
Loss after epoch 80 : 0.17502950673787115
Loss after epoch 90 : 0.16541417887066587
Loss after epoch 100 : 0.15611995377285431
Loss after epoch 110 : 0.14658186950209742
Loss after epoch 120 : 0.13664142077293603
Loss after epoch 130 : 0.1264493817051361
Loss after epoch 140 : 0.11633959113819174
Loss after epoch 150 : 0.10667752776511091
Loss after epoch 160 : 0.09774764897457794
Loss after epoch 170 : 0.08970965860087132
Loss after epoch 180 : 0.08260859385010708
Loss after epoch 190 : 0.07640904341555924
Loss after epoch 200 : 0.07103094496010426
Loss after epoch 210 : 0.06637659921025435
Loss after epoch 220 : 0.062347328741051235
Loss after epoch 230 : 0.05885206265752019
Loss after epoch 240 :

In [687]:
acc = nn_mini8.evaluate(X_test, y_test)

Accuracy : 1.0


In [689]:
nn_mini8 = HiddenTwo(input_nodes = D, 
               output_nodes = K, 
               hidden_nodes_1 = M,
                hidden_nodes_2 = M,
               learning_rate = 0.01,
               activation_hidden_1 = tanh,
                activation_hidden_2 = tanh,
               optimizer='minibatch',
               batch_size = 8,
                #delta_stop = 1e-3,
                #patience = 5,
              )
c=nn_mini8.run(X_train, y_train_cat, epochs=1000 );

Loss after epoch 10 : 0.3437912498193548
Loss after epoch 20 : 0.30006489984258555
Loss after epoch 30 : 0.24463139557617544
Loss after epoch 40 : 0.20213509984141115
Loss after epoch 50 : 0.1751779640241112
Loss after epoch 60 : 0.15630524888265312
Loss after epoch 70 : 0.14051291620136064
Loss after epoch 80 : 0.12610339713826918
Loss after epoch 90 : 0.11288690828389258
Loss after epoch 100 : 0.10106579532291171
Loss after epoch 110 : 0.09074828098797538
Loss after epoch 120 : 0.08188522776860784
Loss after epoch 130 : 0.07431476625812905
Loss after epoch 140 : 0.06783066565326304
Loss after epoch 150 : 0.06224081300813419
Loss after epoch 160 : 0.057397494522004835
Loss after epoch 170 : 0.05320644573500711
Loss after epoch 180 : 0.04962412454560188
Loss after epoch 190 : 0.04664372088742082
Loss after epoch 200 : 0.04426695235320462
Loss after epoch 210 : 0.04246722960141518
Loss after epoch 220 : 0.04115981184129717
Loss after epoch 230 : 0.04020226257632146
Loss after epoch 240 

In [690]:
acc = nn_mini8.evaluate(X_test, y_test)

Accuracy : 1.0


# minibatch size 8 adam

In [231]:
nn_miniadam8 = HiddenOne(input_nodes = D, 
               output_nodes = K, 
               hidden_nodes = M,
               learning_rate = 0.01,
               activation_hidden = relu,
               optimizer='mini_adam',
               batch_size = 8,
                #delta_stop = 1e-3,
                #patience = 5,
              )

In [232]:
c=nn_miniadam8.run(X_train, y_train_cat, epochs=1000 );

Loss after epoch 0 : 0.3632315261812418
Loss after epoch 1 : 0.20940654083564966
Loss after epoch 2 : 0.1409206736361145
Loss after epoch 3 : 0.11549881353845838
Loss after epoch 4 : 0.10194251691073343
Loss after epoch 5 : 0.09632635924935
Loss after epoch 6 : 0.08839683711279037
Loss after epoch 7 : 0.07399376829706299
Loss after epoch 8 : 0.06171369620750588
Loss after epoch 9 : 0.05477466371668464
Loss after epoch 10 : 0.05076560423785906
Loss after epoch 11 : 0.047215905483743076
Loss after epoch 12 : 0.043516284396157075
Loss after epoch 13 : 0.04017795164183857
Loss after epoch 14 : 0.037479590419841946
Loss after epoch 15 : 0.0352782442398913
Loss after epoch 16 : 0.03335760843522057
Loss after epoch 17 : 0.03165566543762084
Loss after epoch 18 : 0.030174050270327237
Loss after epoch 19 : 0.028874165318321498
Loss after epoch 20 : 0.02771552748300487
Loss after epoch 21 : 0.02667618891842569
Loss after epoch 22 : 0.025740357290919218
Loss after epoch 23 : 0.02489252624264768
Lo

Loss after epoch 283 : 0.006927305436733612
Loss after epoch 284 : 0.00691879306967563
Loss after epoch 285 : 0.006910331938896355
Loss after epoch 286 : 0.006901921550267489
Loss after epoch 287 : 0.006893561415573222
Loss after epoch 288 : 0.006885251052431789
Loss after epoch 289 : 0.006876989984216641
Loss after epoch 290 : 0.006868777739978831
Loss after epoch 291 : 0.0068606138543708586
Loss after epoch 292 : 0.006852497867570368
Loss after epoch 293 : 0.006844429325205572
Loss after epoch 294 : 0.006836407778281335
Loss after epoch 295 : 0.006828432783106354
Loss after epoch 296 : 0.006820503901220594
Loss after epoch 297 : 0.006812620699324951
Loss after epoch 298 : 0.00680478274921049
Loss after epoch 299 : 0.006796989627689449
Loss after epoch 300 : 0.006789240916527281
Loss after epoch 300 : 0.006789240916527281
Loss after epoch 301 : 0.006781536202374945
Loss after epoch 302 : 0.006773875076702782
Loss after epoch 303 : 0.006766257135734871
Loss after epoch 304 : 0.00675868

Loss after epoch 566 : 0.005573523594731344
Loss after epoch 567 : 0.005570624739063235
Loss after epoch 568 : 0.005567732246987232
Loss after epoch 569 : 0.005564846081061455
Loss after epoch 570 : 0.005561966204100495
Loss after epoch 571 : 0.005559092579173239
Loss after epoch 572 : 0.005556225169600726
Loss after epoch 573 : 0.005553363938954537
Loss after epoch 574 : 0.0055505088510540024
Loss after epoch 575 : 0.005547659869964647
Loss after epoch 576 : 0.005544816959995709
Loss after epoch 577 : 0.005541980085698853
Loss after epoch 578 : 0.005539149211865199
Loss after epoch 579 : 0.00553632430352417
Loss after epoch 580 : 0.00553350532594108
Loss after epoch 581 : 0.005530692244615347
Loss after epoch 582 : 0.005527885025278682
Loss after epoch 583 : 0.0055250836338926945
Loss after epoch 584 : 0.005522288036647758
Loss after epoch 585 : 0.005519498199960469
Loss after epoch 586 : 0.005516714090472296
Loss after epoch 587 : 0.005513935675047332
Loss after epoch 588 : 0.0055111

Loss after epoch 854 : 0.004902866847498431
Loss after epoch 855 : 0.004900880900270782
Loss after epoch 856 : 0.004898896258468803
Loss after epoch 857 : 0.004896912914843524
Loss after epoch 858 : 0.00489493086218025
Loss after epoch 859 : 0.004892950093298538
Loss after epoch 860 : 0.004890970601051452
Loss after epoch 861 : 0.004888992378326189
Loss after epoch 862 : 0.004887015418042931
Loss after epoch 863 : 0.0048850397131557435
Loss after epoch 864 : 0.004883065256651209
Loss after epoch 865 : 0.004881092041549168
Loss after epoch 866 : 0.004879120060901935
Loss after epoch 867 : 0.004877149307794738
Loss after epoch 868 : 0.0048751797753446265
Loss after epoch 869 : 0.004873211456701117
Loss after epoch 870 : 0.004871244345045524
Loss after epoch 871 : 0.004869278433591219
Loss after epoch 872 : 0.004867313715582621
Loss after epoch 873 : 0.004865350184296049
Loss after epoch 874 : 0.0048633878330389645
Loss after epoch 875 : 0.004861426655149497
Loss after epoch 876 : 0.00485

In [233]:
acc = nn_miniadam8.evaluate(X_test, y_test)

Accuracy : 0.9473684210526315


#### minibatch size 16

In [234]:
nn_mini16 = HiddenOne(input_nodes = D, 
               output_nodes = K, 
               hidden_nodes = M,
               learning_rate = 0.01,
               activation_hidden = sigmoid,
               optimizer='minibatch',
               batch_size = 16,
               delta_stop = 1e-3,
               patience = 5,
              )
c=nn_mini16.run(X_train, y_train_cat, epochs=1000 );

Loss after epoch 100 : 0.2743453475158421
Loss after epoch 200 : 0.20954586960829621
Loss after epoch 300 : 0.17529970402180023
Loss after epoch 400 : 0.15236567416199326
Loss after epoch 500 : 0.13268326362261756
Loss after epoch 600 : 0.11489683916849842
Loss after epoch 700 : 0.09933158655232376
Loss after epoch 800 : 0.08627951065055557
Loss after epoch 900 : 0.07563322472100956
Loss after 7epoch 1001 : 0.06711845536990935


In [235]:
acc = nn_mini16.evaluate(X_test, y_test)

Accuracy : 0.9736842105263158


## Minibatch size 16 adam

In [236]:
nn_miniadam16 = HiddenOne(input_nodes = D, 
               output_nodes = K, 
               hidden_nodes = M,
               learning_rate = 0.01,
               activation_hidden = relu,
               optimizer='minibatch',
               batch_size = 16,
               delta_stop = 1e-3,
               patience = 5,
              )
c=nn_miniadam16.run(X_train, y_train_cat, epochs=1000 );

Loss after epoch 100 : 0.10528785498700588
Loss after epoch 200 : 0.05313376837042597
Loss after epoch 300 : 0.03517328058472924
Loss after epoch 400 : 0.02744492827823468
Loss after epoch 500 : 0.02318547889239907
Loss after epoch 600 : 0.0204636690594039
Early stop at epoch 643, the cost is : 0.019568655025386034


In [237]:
acc = nn_miniadam16.evaluate(X_test, y_test)

Accuracy : 0.9736842105263158


#### Minibatch size 32

In [121]:
nn_mini32 = HiddenOne(input_nodes = D, 
               output_nodes = K, 
               hidden_nodes = M,
               learning_rate = 0.01,
               activation_hidden = relu,
               optimizer='minibatch',
               batch_size = 32,
                #delta_stop = 1e-3,
                #patience = 5,
              )
c=nn_mini32.run(X_train, y_train_cat, epochs=1000 );

Loss after epoch 100 : 0.24423174555697824
Loss after epoch 200 : 0.10864390175002289
Loss after epoch 300 : 0.07464147547420144
Loss after epoch 400 : 0.05800286097313573
Loss after epoch 500 : 0.04877020258397195
Loss after epoch 600 : 0.04319664204297226
Loss after epoch 700 : 0.03958215983536185
Loss after epoch 800 : 0.03710860405512004
Loss after epoch 900 : 0.03531851438494933
Loss after epoch 1001 : 0.03398528894579797


In [122]:
acc = nn_mini32.evaluate(X_test, y_test)

Accuracy : 0.9736842105263158


#### Minibatch size 64

In [123]:
nn_mini64 = HiddenOne(input_nodes = D, 
               output_nodes = K, 
               hidden_nodes = M,
               learning_rate = 0.01,
               activation_hidden = relu,
               optimizer='minibatch',
               batch_size = 64,
                #delta_stop = 1e-3,
                #patience = 5,
              )
c=nn_mini64.run(X_train, y_train_cat, epochs=1000 );

Loss after epoch 100 : 0.26834528254850826
Loss after epoch 200 : 0.2058438974711505
Loss after epoch 300 : 0.1689162735929759
Loss after epoch 400 : 0.14561402715011804
Loss after epoch 500 : 0.12773139351036716
Loss after epoch 600 : 0.11274114420052123
Loss after epoch 700 : 0.09998996333689264
Loss after epoch 800 : 0.08925000737231119
Loss after epoch 900 : 0.0803031409095878
Loss after epoch 1001 : 0.07295611140430698


In [124]:
acc = nn_mini64.evaluate(X_test, y_test)

Accuracy : 1.0


#### Minibactch size not specified

In [125]:
nn_mini = HiddenOne(input_nodes = D, 
               output_nodes = K, 
               hidden_nodes = M,
               learning_rate = 0.01,
               activation_hidden = sigmoid,
               optimizer='minibatch',
               #batch_size = 64,
                #delta_stop = 1e-3,
                #patience = 5,
              )
c=nn_mini.run(X_train, y_train_cat, epochs=1000 );

Loss after epoch 100 : 0.362080412007166
Loss after epoch 200 : 0.3553161314697229
Loss after epoch 300 : 0.3485307292971733
Loss after epoch 400 : 0.3399950463358105
Loss after epoch 500 : 0.33008496343476673
Loss after epoch 600 : 0.3192536157965336
Loss after epoch 700 : 0.30788814053331465
Loss after epoch 800 : 0.29630978638834077
Loss after epoch 900 : 0.2848041400661859
Loss after epoch 1001 : 0.273725451778514


In [126]:
acc = nn_mini.evaluate(X_test, y_test)

Accuracy : 0.8157894736842105


## To add :
- Stopping test


[Back to top](#Content:)


<a id='part5'></a>

### Part 5 -   Two Hidden Layers Class

# Two Hidden Layers

# Variables :

- **X**     : N_Samples x N_features
- **W1**    : Hidden1 x N_features
- **b1**    : Hidden1
- **W2**    : Hidden2 x Hidden1
- **b2**    : Hidden2
- **W3**    : Output x Hidden
- **b3**    : Output

In [585]:
class HiddenTwo:
     
    def __init__(self, 
                 input_nodes, 
                 output_nodes, 
                 hidden_nodes_1,
                 hidden_nodes_2,
                 activation_hidden_1,
                 activation_hidden_2,
                 learning_rate=0.01,
                 optimizer = None,
                 beta1 = 0.9,   #ADAM optimization parameter, default value taken from practical experience
                 beta2 = 0.999, #ADAM optimization parameter, default value taken from practical experience
                 batch_size = None,
                 delta_stop = None,
                 patience = 1,
                 leaky_intercept=0.01
                 
                ):         
        # Initializations
        self.input_nodes = input_nodes
        self.output_nodes = output_nodes       
        self.hidden_nodes_1 = hidden_nodes_1    
        self.hidden_nodes_2 = hidden_nodes_2    
        self.learning_rate = learning_rate 
        self.activation_hidden_1 = activation_hidden_1
        self.activation_hidden_2 = activation_hidden_2
        self.hidden_derivative_1 = derivative(self.activation_hidden_1)
        self.hidden_derivative_2 = derivative(self.activation_hidden_2)
        self.beta1 = beta1
        self.beta2 = beta2
        self.optimizer = optimizer
        self.batch_size = batch_size
        self.delta_stop = delta_stop
        self.patience = patience
        self.leaky_intercept = leaky_intercept
        self.create_weight_matrices()
        self.create_biases()
        self.reset_adam()
             
    def create_weight_matrices(self):
        tn = truncated_normal(mean=2, sd=1, low=-0.5, upp=0.5) 
        # W1 of size hidden x features
        n1 = self.input_nodes * self.hidden_nodes_1
        self.W1 = tn.rvs(n1).reshape((self.hidden_nodes_1, self.input_nodes )) # hidden1 x features
        # W2 of size hidden2 x hidden1
        n2 = self.hidden_nodes_2 * self.hidden_nodes_1
        self.W2 = tn.rvs(n2).reshape((self.hidden_nodes_2, self.hidden_nodes_1 )) # hidden1 x features
        # W3 of size output x hidden2
        n3 = self.hidden_nodes_2  * self.output_nodes
        self.W3 = tn.rvs(n3).reshape((self.output_nodes, self.hidden_nodes_2 )) # output x hidden
    
    def create_biases(self):    
        tn = truncated_normal(mean=2, sd=1, low=-0.5, upp=0.5)
        self.b1 = tn.rvs(self.hidden_nodes_1).reshape(-1,1) 
        self.b2 = tn.rvs(self.hidden_nodes_2).reshape(-1,1) 
        self.b3 = tn.rvs(self.output_nodes).reshape(-1,1) 
        
    def reset_adam(self):
        '''
        Creates Adam optimizations variables
        '''
        self.Vdw1 = np.zeros((self.hidden_nodes_1, self.input_nodes ))
        self.Vdw2 = np.zeros((self.hidden_nodes_2, self.hidden_nodes_1 ))
        self.Vdw3 = np.zeros((self.output_nodes, self.hidden_nodes_2))
       
        self.Vdb1 = np.zeros((self.hidden_nodes_1, 1 ))
        self.Vdb2 = np.zeros((self.hidden_nodes_2, 1 ))
        self.Vdb3 = np.zeros((self.output_nodes, 1 ))
        
        self.Sdw1 = np.zeros((self.hidden_nodes_1, self.input_nodes ))
        self.Sdw2 = np.zeros((self.hidden_nodes_2, self.hidden_nodes_1 ))
        self.Sdw3 = np.zeros((self.output_nodes, self.hidden_nodes_2))
       
        self.Sdb1 = np.zeros((self.hidden_nodes_1, 1 ))
        self.Sdb2 = np.zeros((self.hidden_nodes_2, 1 ))
        self.Sdb3 = np.zeros((self.output_nodes, 1 ))
                
    def forward(self, X):
        Z1 = self.W1.dot(X.T) + self.b1      # Hidden1 x N_samples
        A1 = self.activation_hidden_1(Z1)      # Hidden1 x N_samples
        Z2 = self.W2.dot(A1) + self.b2      # Hidden2 x N_samples
        A2 = self.activation_hidden_2(Z2)      # Hidden2 x N_samples
        Z3 = self.W3.dot(A2) + self.b3       # Output x N_samples
        A3 = softmax(Z3)                     #Output x N_samples
        return A3, Z3, A2, Z2, A1, Z1
    
    def backprop(self, X, target):
        # Forward prop
        A3, Z3, A2, Z2, A1, Z1 = self.forward(X)
        # Compute cost
        cost = cross_entropy(target, A3)
        # N_samples
        m = X.shape[0]
        # deltas
        dZ3 = A3 - target                                      #Output x N_samples
        dW3 = dZ3.dot(A2.T)/m                                  #Output x Hidden_2
        db3 = np.sum(dZ3, axis=1, keepdims=True)/m             #Output x 1
        dZ2 = self.W3.T.dot(dZ3)*self.hidden_derivative_2(Z2)    # Hidden2 x N_samples
        dW2 = dZ2.dot(A1.T)/m                                     # Hidden2 x Hidden1 
        db2 = np.sum(dZ2, axis=1, keepdims=True)/m             # Hidden2 x 1
        dZ1 = self.W2.T.dot(dZ2)*self.hidden_derivative_1(Z1)     # Hidden x N_samples
        dW1 = dZ1.dot(X)/m                                      # Hidden x N_Features
        db1 = np.sum(dZ1, axis=1, keepdims=True)/m              # Hidden x 1
     
        # Update
        lr = self.learning_rate
        self.W3 -= lr*dW3
        self.b3 -= lr*db3
        self.W2 -= lr*dW2
        self.b2 -= lr*db2
        self.W1 -= lr*dW1
        self.b1 -= lr*db1
        
        return cost
        
    
    def backprop_minibatch(self, X, target):
        n = X.shape[1]               # N_features
        batch_size = X.shape[0]      # N_samples
        if self.batch_size == None :
            batch_size = self.minibatch_size(batch_size)
        else :
            batch_size = self.batch_size
            
        X_SGD = X.copy()
        u = rng.shuffle(np.arange(X.shape[0] ))
        X_SGD = X_SGD[u,:].squeeze()    # N_samples x N_Features
        target_SGD = target[:,u].squeeze() # Output x N_samples
        cost = 0
        
        pass_length = int(X.shape[0]/batch_size)
        for i in range(pass_length) :
            k = i*batch_size
            # Forward prop
            X = X_SGD[k:k+batch_size,:].reshape(batch_size,-1)              #batch_size x N_features
            A3, Z3, A2, Z2, A1, Z1 = self.forward(X)
            # cost update
            cost = cost + cross_entropy(target_SGD[:,k:k+batch_size].reshape(-1,batch_size), A3)/pass_length
            # deltas
            dZ3 = A3 - target_SGD[:,k:k+batch_size].reshape(-1,batch_size)   #Output x batch_size
            dW3 = dZ3.dot(A2.T)/batch_size                                   #Output x hidden_2
            db3 = np.sum(dZ3, axis=1, keepdims=True)/batch_size              #Output x 1
            dZ2 = self.W3.T.dot(dZ3)*self.hidden_derivative_2(Z2)            # Hidden2 x batch_size
            dW2 = dZ2.dot(A1.T)/batch_size                                   # Hidden2 x Hidden1 
            db2 = np.sum(dZ2, axis=1, keepdims=True)/batch_size              # Hidden2 x 1
            dZ1 = self.W2.T.dot(dZ2)*self.hidden_derivative_1(Z1)            # Hidden x batch_size
            dW1 = dZ1.dot(X)/batch_size                                      # Hidden x N_Features
            db1 = np.sum(dZ1, axis=1, keepdims=True)/batch_size              # Hidden x 1                        
            # Update
            lr = self.learning_rate
            self.W3 -= lr*dW3
            self.b3 -= lr*db3
            self.W2 -= lr*dW2
            self.b2 -= lr*db2
            self.W1 -= lr*dW1
            self.b1 -= lr*db1
        return cost
    
    def backpropADAM(self, X, target):
        # Forward prop
        A3, Z3, A2, Z2, A1, Z1 = self.forward(X)
        # Compute cost
        cost = cross_entropy(target, A3)
        # N samples
        m = X.shape[0]   
        # deltas
        dZ3 = A3 - target                                      #Output x N_samples
        dW3 = dZ3.dot(A2.T)/m                                  #Output x Hidden_2
        db3 = np.sum(dZ3, axis=1, keepdims=True)/m             #Output x 1
        dZ2 = self.W3.T.dot(dZ3)*self.hidden_derivative_2(Z2)    # Hidden2 x N_samples
        dW2 = dZ2.dot(A1.T)/m                                     # Hidden2 x Hidden1 
        db2 = np.sum(dZ2, axis=1, keepdims=True)/m             # Hidden2 x 1
        dZ1 = self.W2.T.dot(dZ2)*self.hidden_derivative_1(Z1)     # Hidden x N_samples
        dW1 = dZ1.dot(X)/m                                      # Hidden x N_Features
        db1 = np.sum(dZ1, axis=1, keepdims=True)/m              # Hidden x 1
        # Adam updates
        beta1 = self.beta1
        beta2 = self.beta2
        # V
        self.Vdw1 = beta1*self.Vdw1 + (1-beta1)*dW1
        self.Vdw2 = beta1*self.Vdw2 + (1-beta1)*dW2
        self.Vdw3 = beta1*self.Vdw3 + (1-beta1)*dW3
        self.Vdb1 = beta1*self.Vdb1 + (1-beta1)*db1
        self.Vdb2 = beta1*self.Vdb2 + (1-beta1)*db2
        self.Vdb3 = beta1*self.Vdb3 + (1-beta1)*db3
        # S
        self.Sdw1 = beta2*self.Sdw1 + (1-beta2)*dW1**2
        self.Sdw2 = beta2*self.Sdw2 + (1-beta2)*dW2**2
        self.Sdw3 = beta2*self.Sdw3 + (1-beta2)*dW3**2
        self.Sdb1 = beta2*self.Sdb1 + (1-beta2)*db1**2
        self.Sdb2 = beta2*self.Sdb2 + (1-beta2)*db2**2
        self.Sdb3 = beta2*self.Sdb3 + (1-beta2)*db3**2  
        # Update
        lr = self.learning_rate
        self.W3 -= lr * self.Vdw3 / (np.sqrt(self.Sdw3)+1e-8)
        self.b3 -= lr * self.Vdb3 / (np.sqrt(self.Sdb3)+1e-8)
        self.W2 -= lr * self.Vdw2 / (np.sqrt(self.Sdw2)+1e-8)
        self.b2 -= lr * self.Vdb2 / (np.sqrt(self.Sdb2)+1e-8)
        self.W1 -= lr * self.Vdw1 / (np.sqrt(self.Sdw1)+1e-8)
        self.b1 -= lr * self.Vdb1 / (np.sqrt(self.Sdb1)+1e-8)
        return cost  
    
      
    def predict(self, X_predict):
        A3, Z3, A2, Z2, A1, Z1 = self.forward(X_predict)
        return A3
    
    def predict_class(self, X_predict):
        A3, Z3, A2, Z2, A1, Z1 = self.forward(X_predict)
        y_pred = np.argmax(A3, axis=0)
        return y_pred
    # To be deleted               
    def xrun(self, X_train, target, epochs=10):
        costs = [1e-10]
        for i in range(epochs):
            cost = self.backprop(X_train, target)
            costs.append(cost)
            if i%10 == 0:
                print(f'Loss after epoch {i} : {cost}')
        costs.pop(0)
        return costs  
         
    def run(self, X_train, target, epochs=10):
        costs = [1e-10]
        if self.delta_stop == None : 
            if self.optimizer == 'adam':
                self.reset_adam()
                for i in range(epochs):
                    cost = self.backpropADAM(X_train, target)
                    
                    costs.append(cost)
                    if i%10 == 0 and i>0 :
                        print(f'Loss after epoch {i} : {cost}')
                print(f'Loss after 1epoch {len(costs)} : {costs[-1]}')        
                costs.pop(0)
                return costs
                    
            elif self.optimizer == 'SGD' :
                for i in range(epochs):
                    cost = self.backpropSGD(X_train, target)
                    costs.append(cost)
                    if i%10 == 0 and i>0 :
                        print(f'Loss after epoch {i} : {cost}')
                print(f'Loss after 2epoch {len(costs)} : {costs[-1]}')        
                costs.pop(0)
                return costs
                
            elif self.optimizer == 'minibatch' :
                for i in range(epochs):
                    cost = self.backprop_minibatch(X_train, target)
                    costs.append(cost)
                    if i%10 == 0 and i>0 :
                        print(f'Loss after epoch {i} : {cost}')
                print(f'Loss after 3epoch {len(costs)} : {costs[-1]}')        
                costs.pop(0)
                return costs
                
            else :
                for i in range(epochs):  
                    cost = self.backprop(X_train, target)
                    costs.append(cost)
                    if i%10 == 0 and i>0 :
                        print(f'Loss after epoch {i} : {cost}')
                print(f'Loss after 4epoch {len(costs)} : {costs[-1]}')        
                costs.pop(0)
                return costs
                
            
        else :
            counter = 0
            if self.optimizer == 'adam':
                self.reset_adam()
                for i in range(epochs):
                    cost = self.backpropADAM(X_train, target)
                    print(f'Loss after epoch {i} : {cost}')
                    costs.append(cost)
                    n = len(costs)
                    delta = np.abs(costs[n-1]/costs[n-2]-1)
                    if(delta < self.delta_stop) :
                        counter+=1
                        if(counter>=self.patience):
                            print(f'Early stop at epoch {i}, the cost is : {cost}')
                            costs.pop(0)
                            return costs
                    else :
                        counter =0
                    if i%10 == 0 and i>0 :
                        print(f'Loss after epoch {i} : {cost}')
                print(f'Loss after 5epoch {len(costs)} : {costs[-1]}')        
                costs.pop(0)
                return costs
                    
            elif self.optimizer == 'SGD' :
                for i in range(epochs):
                    cost = self.backpropSGD(X_train, target)
                    costs.append(cost)
                    n = len(costs)
                    delta = np.abs(costs[n-1]/costs[n-2]-1)
                    if(delta < self.delta_stop) :
                        counter+=1
                        if(counter>=self.patience):
                            print(f'Early stop at epoch {i}, the cost is : {cost}')
                            costs.pop(0)
                            return costs
                    else :
                        counter =0
                    if i%10 == 0 and i>0 :
                        print(f'Loss after epoch {i} : {cost}')
                print(f'Loss after 6epoch {len(costs)} : {costs[-1]}')        
                costs.pop(0)
                return costs
                
            elif self.optimizer == 'minibatch' :
                for i in range(epochs):
                    cost = self.backprop_minibatch(X_train, target)
                    costs.append(cost)
                    n = len(costs)
                    delta = np.abs(costs[n-1]/costs[n-2]-1)
                    if(delta < self.delta_stop) :
                        counter+=1
                        if(counter>=self.patience):
                            print(f'Early stop at epoch {i}, the cost is : {cost}')
                            costs.pop(0)
                            return costs
                    else :
                        counter =0
                    if i%10 == 0 and i>0 :
                        print(f'Loss after epoch {i} : {cost}')
                print(f'Loss after 7epoch {len(costs)} : {costs[-1]}')        
                costs.pop(0)
                return costs
                
            else :  
                for i in range(epochs): 
                    cost = self.backprop(X_train, target)
                    costs.append(cost)
                    n = len(costs)
                    delta = np.abs(costs[n-1]/costs[n-2]-1)
                    if(delta < self.delta_stop) :
                        counter+=1
                        if(counter>=self.patience):
                            print(f'Early stop at epoch {i}, the cost is : {cost}')
                            costs.pop(0)
                            return costs
                        else :
                            counter =0
                    if i%10 == 0 and i>0 :
                        print(f'Loss after epoch {i} : {cost}')        
                print(f'Loss after 8epoch {len(costs)} : {costs[-1]}')        
                costs.pop(0)
                return costs
          
            
       
    def evaluate(self, X_evaluate, target):
        '''
        return accuracy score, target must be the classes and not the hot encoded target
        '''
        
        y_pred = self.predict_class(X_evaluate)
        accuracy = classification_rate(y_pred, target)
        print('Accuracy :', accuracy)
        return accuracy
    
    def minibatch_size(self, n_samples):
        '''
        Compute minibatch size in case its not provided
        '''
        if n_samples < 2000:
            return n_samples
        if n_samples < 12800:
            return 64
        if n_samples < 25600:
            return 128
        if n_samples < 51200:
            return 256
        if n_samples < 102400:
            return 512
        return 1024
        
        
        
        
            

In [595]:
nn = HiddenTwo(input_nodes = D, 
               output_nodes = K, 
               hidden_nodes_1 = 9,
               hidden_nodes_2 = 7,
               learning_rate = 0.01,
               activation_hidden_1 = relu,
               activation_hidden_2 = relu,
               optimizer='adam',
              #batch_size=200
              )


In [596]:
c=nn.run(X_train, y_train_cat, epochs=400 )

  import sys
  # This is added back by InteractiveShellApp.init_path()
  # This is added back by InteractiveShellApp.init_path()


Loss after epoch 10 : nan
Loss after epoch 20 : nan
Loss after epoch 30 : nan
Loss after epoch 40 : nan
Loss after epoch 50 : nan
Loss after epoch 60 : nan
Loss after epoch 70 : nan
Loss after epoch 80 : nan
Loss after epoch 90 : nan
Loss after epoch 100 : nan
Loss after epoch 110 : nan


KeyboardInterrupt: 

In [592]:
nn.evaluate(X_test, y_test)

Accuracy : 0.4152


0.4152

In [451]:
nn = HiddenOne(input_nodes = D, 
               output_nodes = K, 
               hidden_nodes = 5,
               #hidden_nodes_2 = 1,
               learning_rate = 0.01,
               activation_hidden = relu,
               #activation_hidden_2 = relu,
               optimizer='minibatch',
              batch_size=200)


In [452]:
c=nn.run(X1, y11, epochs=400 )

Loss after epoch 10 : 0.08597256371026611
Loss after epoch 20 : 0.07169481260925979
Loss after epoch 30 : 0.06567010778371164
Loss after epoch 40 : 0.0620478213421242
Loss after epoch 50 : 0.05950785690640602
Loss after epoch 60 : 0.05757478829274955
Loss after epoch 70 : 0.056051360056106406
Loss after epoch 80 : 0.05482564820920087
Loss after epoch 90 : 0.05382021171029689
Loss after epoch 100 : 0.052978879908718995
Loss after epoch 110 : 0.05226140707559768
Loss after epoch 120 : 0.05163947120984345
Loss after epoch 130 : 0.051092929718120156
Loss after epoch 140 : 0.05060807398578545
Loss after epoch 150 : 0.050173258883087074
Loss after epoch 160 : 0.04978128511246916
Loss after epoch 170 : 0.04942493618253959
Loss after epoch 180 : 0.04909974908371374
Loss after epoch 190 : 0.04880092430208747
Loss after epoch 200 : 0.048525469148654576
Loss after epoch 210 : 0.048270381760029984
Loss after epoch 220 : 0.048033530775136404
Loss after epoch 230 : 0.04781163041760578
Loss after epo

In [453]:
acc = nn.evaluate(X2, y2)

Accuracy : 0.8292666666666667


In [415]:
cost = nn.backprop(X_train, y_train_cat)

In [417]:
A3, Z3, A2, Z2, A1, Z1 = nn.forward(X_train)

In [418]:
print(A3-y_train_cat)

[[ 0.10485694 -0.89514306 -0.89514306 ...  0.10485694 -0.89514306
   0.10485694]
 [ 0.06021112  0.06021112  0.06021112 ...  0.06021112  0.06021112
   0.06021112]
 [ 0.0907259   0.0907259   0.0907259  ...  0.0907259   0.0907259
   0.0907259 ]
 ...
 [ 0.09401693  0.09401693  0.09401693 ...  0.09401693  0.09401693
   0.09401693]
 [ 0.09631682  0.09631682  0.09631682 ...  0.09631682  0.09631682
   0.09631682]
 [-0.86754949  0.13245051  0.13245051 ...  0.13245051  0.13245051
   0.13245051]]


[Back to top](#Content:)


<a id='part6'></a>

### Part 6 -  Loading Fashion MNIST

In [744]:
from tensorflow.keras.datasets import fashion_mnist


In [745]:
fashion = fashion_mnist.load_data()

In [746]:
(X_train, y_train),(X_test, y_test) = fashion

In [747]:
print(X_train.shape)

(60000, 28, 28)


In [748]:
M = X_train.shape[1]
N_train = X_train.shape[0]
N_test = X_test.shape[0]

In [749]:
X_train = X_train.reshape(N_train, M*M, 1).squeeze()
X_test = X_test.reshape(N_test, M*M, 1).squeeze()

In [750]:
y_train_cat = to_categorical(y_train).T
y_test_cat = to_categorical(y_test).T

In [751]:
#MAX = 255
#X_train = X_train/ MAX
#X_test =X_test/ MAX

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train= scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [613]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dropout

# Fashion MNIST with 1 hidden layer

In [752]:
D = X_train.shape[1]
K = y_train_cat.shape[0]
M=5
nn = HiddenOne(input_nodes = D, 
               output_nodes = K, 
               hidden_nodes = M,
               learning_rate = 0.01,
               activation_hidden = tanh)

In [753]:
c = nn.run(X_train, y_train_cat, epochs=500 )

Loss after epoch 10 : 0.19154971216465122
Loss after epoch 20 : 0.17858632406608232
Loss after epoch 30 : 0.1717064770369094
Loss after epoch 40 : 0.16710013143339192
Loss after epoch 50 : 0.16371579160276498
Loss after epoch 60 : 0.16103720289587167
Loss after epoch 70 : 0.15878692740868564
Loss after epoch 80 : 0.15681958484004502
Loss after epoch 90 : 0.15505234653845995
Loss after epoch 100 : 0.15343426493386386
Loss after epoch 110 : 0.1519320800152953
Loss after epoch 120 : 0.15052300191383247
Loss after epoch 130 : 0.14919077639074257
Loss after epoch 140 : 0.1479234156744191
Loss after epoch 150 : 0.1467118237828564
Loss after epoch 160 : 0.14554892750955123
Loss after epoch 170 : 0.14442910716277196
Loss after epoch 180 : 0.14334781260580529
Loss after epoch 190 : 0.14230129791537785
Loss after epoch 200 : 0.14128643414071335
Loss after epoch 210 : 0.14030057468320448
Loss after epoch 220 : 0.1393414568467086
Loss after epoch 230 : 0.13840712872140412
Loss after epoch 240 : 0.

In [754]:
acc = nn.evaluate(X_test, y_test)


Accuracy : 0.7104


In [755]:
D = X_train.shape[1]
K = y_train_cat.shape[0]
M=5
nn = HiddenOne(input_nodes = D, 
               output_nodes = K, 
               hidden_nodes = M,
               learning_rate = 0.01,
               activation_hidden = relu,
               optimizer='minibatch',
                delta_stop = 1e-7,
                patience = 5,
              )

In [756]:
c = nn.run(X_train, y_train_cat, epochs=200 )

Loss after epoch 10 : 0.08346670437041341
Loss after epoch 20 : 0.06607534078996419
Loss after epoch 30 : 0.05813084668547769
Loss after epoch 40 : 0.05410777141837007
Loss after epoch 50 : 0.05170879212531041
Loss after epoch 60 : 0.05006668268871798
Loss after epoch 70 : 0.0488503714771906
Loss after epoch 80 : 0.0478913971249985
Loss after epoch 90 : 0.04715041353048665
Loss after epoch 100 : 0.0465531032047038
Loss after epoch 110 : 0.04606523530199924
Loss after epoch 120 : 0.045659155225557094
Loss after epoch 130 : 0.04531193081977425
Loss after epoch 140 : 0.045006333891696004
Loss after epoch 150 : 0.04473485907331465
Loss after epoch 160 : 0.044491193855320174
Loss after epoch 170 : 0.04426974266248947
Loss after epoch 180 : 0.044071448571480897
Loss after epoch 190 : 0.04388706955877613
Loss after epoch 201 : 0.04373499407484345


In [757]:
acc = nn.evaluate(X_test, y_test)

Accuracy : 0.827


### SGD

In [315]:
nn_SGD = HiddenOne(input_nodes = D, 
               output_nodes = K, 
               hidden_nodes = M,
               learning_rate = 0.01,
               activation_hidden = relu,
               optimizer='SGD',
               #batch_size = 28,
                delta_stop = 1e-3,
                patience = 5,
              )

In [316]:
c = nn_SGD.run(X_train, y_train_cat, epochs=50 )

Loss after epoch 10 : 0.05229169746879698
Loss after epoch 20 : 0.050989022653112885
Loss after epoch 30 : 0.05047409297339392
Loss after epoch 40 : 0.05006307293211981
Loss after epoch 51 : 0.04998152154949375


In [317]:
acc = nn.evaluate(X_test, y_test)

Accuracy : 0.7947


# ADAM

In [318]:
nn_adam = HiddenOne(input_nodes = D, 
               output_nodes = K, 
               hidden_nodes = M,
               learning_rate = 0.01,
               activation_hidden = relu,
               optimizer='adam',
               #batch_size = 28,
                #delta_stop = 1e-3,
                #patience = 5,
              )

In [319]:
c = nn_adam.run(X_train, y_train_cat, epochs=300 )

Loss after epoch 10 : 0.23329427329855534
Loss after epoch 20 : 0.22581268191815174
Loss after epoch 30 : 0.21756016625246444
Loss after epoch 40 : 0.2123454866753959
Loss after epoch 50 : 0.20782633720502175
Loss after epoch 60 : 0.20225578209607875
Loss after epoch 70 : 0.19722409237726474
Loss after epoch 80 : 0.1927940246394797
Loss after epoch 90 : 0.18775465784580125
Loss after epoch 100 : 0.18286143393355006
Loss after epoch 110 : 0.179449483118376
Loss after epoch 120 : 0.17719147826304704
Loss after epoch 130 : 0.17550718613772454
Loss after epoch 140 : 0.1741688920517013
Loss after epoch 150 : 0.17307389693182942
Loss after epoch 160 : 0.17215258888543558
Loss after epoch 170 : 0.1713067040286042
Loss after epoch 180 : 0.1704599507829086
Loss after epoch 190 : 0.16944868175105815
Loss after epoch 200 : 0.16803511558947992
Loss after epoch 210 : 0.16629823343822747
Loss after epoch 220 : 0.16506756526536606
Loss after epoch 230 : 0.16371251732833814
Loss after epoch 240 : 0.16

In [320]:
acc = nn.evaluate(X_test, y_test)

Accuracy : 0.7947


# 2 layers

In [629]:
nn = HiddenTwo(input_nodes = D, 
               output_nodes = K, 
               hidden_nodes_1 = 5,
               hidden_nodes_2 = 7,
               learning_rate = 0.01,
               activation_hidden_1 = relu,
               activation_hidden_2 = relu,
               optimizer='adam',
              #delta_stop = 1e-5,
              #patience=5
              )


In [630]:
c= nn.run(X_train, y_train_cat, epochs=600 )

Loss after epoch 10 : 0.23009918657484552
Loss after epoch 20 : 0.20925003257289768
Loss after epoch 30 : 0.19687719692833824
Loss after epoch 40 : 0.1830438426783252
Loss after epoch 50 : 0.17300164977829482
Loss after epoch 60 : 0.1633852997230326
Loss after epoch 70 : 0.15475056906538215
Loss after epoch 80 : 0.14603352547099815
Loss after epoch 90 : 0.13740840593385814
Loss after epoch 100 : 0.1305420250692707
Loss after epoch 110 : 0.12421605815376298
Loss after epoch 120 : 0.11855536991873979
Loss after epoch 130 : 0.11297379956453496
Loss after epoch 140 : 0.1063691771139391
Loss after epoch 150 : 0.09924395750899022
Loss after epoch 160 : 0.09293506478708678
Loss after epoch 170 : 0.0884201466092327
Loss after epoch 180 : 0.08514571794258906
Loss after epoch 190 : 0.08252271186843954
Loss after epoch 200 : 0.08016558721847178
Loss after epoch 210 : 0.07820037675723318
Loss after epoch 220 : 0.07662357319133095
Loss after epoch 230 : 0.07534951891270372
Loss after epoch 240 : 0.

In [631]:
acc = nn.evaluate(X_test, y_test)

Accuracy : 0.7435


In [643]:
nn = HiddenTwo(input_nodes = D, 
               output_nodes = K, 
               hidden_nodes_1 = 22,
               hidden_nodes_2 = 22,
               learning_rate = 0.01,
               activation_hidden_1 = relu,
               activation_hidden_2 = relu,
               optimizer='adam',
              #delta_stop = 1e-5,
              #patience=5
              )


In [644]:
c= nn.run(X_train, y_train_cat, epochs=100 )

  import sys
  # This is added back by InteractiveShellApp.init_path()
  # This is added back by InteractiveShellApp.init_path()


Loss after epoch 10 : nan
Loss after epoch 20 : nan
Loss after epoch 30 : nan
Loss after epoch 40 : nan
Loss after epoch 50 : nan
Loss after epoch 60 : nan
Loss after epoch 70 : nan


KeyboardInterrupt: 

In [642]:
acc = nn.evaluate(X_test, y_test)

Accuracy : 0.3731


# Optimizer

In [366]:
nn = HiddenTwo(input_nodes = D, 
               output_nodes = K, 
               hidden_nodes_1 = M,
               hidden_nodes_2 = M,
               learning_rate = 0.01,
               activation_hidden_1 = relu,
               activation_hidden_2 = relu,
              optimizer = 'adam')
c= nn.run(X_train, y_train_cat, epochs=200 )

Loss after epoch 0 : 4.0248719069912555
Loss after epoch 1 : 2.276420196595539
Loss after epoch 2 : 1.4171308091484893
Loss after epoch 3 : 0.4909401642439696
Loss after epoch 4 : 0.23462532199671818
Loss after epoch 5 : 0.23362828058409388
Loss after epoch 6 : 0.2339391955562715
Loss after epoch 7 : 0.23403101872804918
Loss after epoch 8 : 0.23408034835165503
Loss after epoch 9 : 0.23408641247524933
Loss after epoch 10 : 0.23405133302759268
Loss after epoch 11 : 0.2339796667265713
Loss after epoch 12 : 0.23387689997194383
Loss after epoch 13 : 0.23374862251877318
Loss after epoch 14 : 0.2336030481667226
Loss after epoch 15 : 0.23349178793024195
Loss after epoch 16 : 0.23336433962006922
Loss after epoch 17 : 0.23322363763084164
Loss after epoch 18 : 0.23307246338027093
Loss after epoch 19 : 0.23291342250478173
Loss after epoch 20 : 0.23274895271160082
Loss after epoch 21 : 0.23258134424862817
Loss after epoch 22 : 0.23241275768847888
Loss after epoch 23 : 0.23224522931747094
Loss after

Loss after epoch 196 : 0.23025850931269418
Loss after epoch 197 : 0.23025850931118047
Loss after epoch 198 : 0.23025850930993022
Loss after epoch 199 : 0.2302585093088975
Loss after 1epoch 201 : 0.2302585093088975


In [367]:
acc = nn.evaluate(X_test, y_test)

Accuracy : 0.1


In [372]:
pred = nn.predict(X_test)

In [374]:
pred.shape

(10, 10000)

In [93]:
tn = truncated_normal(mean=2, sd=1, low=-0.5, upp=0.5) 
# W1 of size hidden x features
n = D * M
W1 = tn.rvs(n).reshape((M, D )) # hidden x features
# W2 of size output x hidden
m = M  * K
W2 = tn.rvs(m).reshape((K, M)) # output x hidden
b1 = tn.rvs(M).reshape(-1,1) 
b2 = tn.rvs(K).reshape(-1,1) 
                

In [95]:
X = X_train
Z1 = W1.dot(X.T) + b1 # Hidden x N_samples
A1 = tanh(Z1)      # Hidden x N_samples
Z2 = W2.dot(A1) + b2  # Output x N_samples
A2 = softmax(Z2)      #Output x N_samples

      

In [98]:
print(A2.shape)

(10, 60000)


In [99]:
print(y_train_cat.shape)

(10, 60000)


In [101]:
m = X.shape[0]
# deltas
dZ2 = A2 - y_train_cat                                   #Output x N_samples
dW2 = dZ2.dot(A1.T)/m                                   #Output x hidden
db2 = np.sum(dZ2, axis=1, keepdims=True)/m              #Output x 1
dZ1 = W2.T.dot(dZ2)*dt(Z1)     # Hidden x N_samples
dW1 = dZ1.dot(X)/m                                      # Hidden x N_Features
db1 = np.sum(dZ1, axis=1, keepdims=True)/m              # Hidden x 1


In [102]:
# Update
lr = 0.01
W2 -= lr*dW2
b2 -= lr*db2
W1 -= lr*dW1
b1 -= lr*db1

In [103]:
cost = cross_entropy(y_train_cat, A2)

In [104]:
cost

0.23973236540442144

# Testing Adam

In [21]:
from sklearn import datasets
iris = datasets.load_iris()
data = iris.data
target = iris.target

from tensorflow.keras.utils import to_categorical
t = to_categorical(target)

M = 5
D = data.shape[1]
K = len(set(target))
beta1 = 0.9   
beta2 = 0.999
X_train, X_test, y_train, y_test = train_test_split(data ,target ,test_size=0.25)
y_train_cat = to_categorical(y_train).T
y_test_cat = to_categorical(y_test).T

In [22]:
Vdw1 = np.zeros((M, D ))
Vdw2 = np.zeros((K, M))
Vdb1 = np.zeros((M, 1 ))
Vdb2 = np.zeros((K, 1 ))
Sdw1 = np.zeros((M, D))
Sdw2 = np.zeros((K, M))
Sdb1 = np.zeros((M, 1 ))
Sdb2 = np.zeros((K, 1 ))


In [29]:
def forward(X):
    Z1 = W1.dot(X.T) + b1 # Hidden x N_samples
    A1 = tanh(Z1)      # Hidden x N_samples
    Z2 = W2.dot(A1) + b2  # Output x N_samples
    A2 = softmax(Z2)      #Output x N_samples
    return A2, Z2, A1, Z1

In [30]:
def backpropADAM(X, target):
    # Forward prop
    A2, Z2, A1, Z1 = forward(X)
    # Compute cost
    cost = cross_entropy(target, A2)
    # N samples
    m = X.shape[0]
    # deltas
    dZ2 = A2 - target                                       #Output x N_samples
    dW2 = dZ2.dot(A1.T)/m                                   #Output x hidden
    db2 = np.sum(dZ2, axis=1, keepdims=True)/m              #Output x 1
    dZ1 = W2.T.dot(dZ2)*(1-tanh(Z1)**2)     # Hidden x N_samples
    dW1 = dZ1.dot(X)/m                                      # Hidden x N_Features
    db1 = np.sum(dZ1, axis=1, keepdims=True)/m              # Hidden x 1
    # Adam updates
    beta1 = beta1
    beta2 = beta2
    # V
    Vdw1 = beta1*Vdw1 + (1-beta1)*dW1
    Vdw2 = beta1*Vdw2 + (1-beta1)*dW2
    Vdb1 = beta1*Vdb1 + (1-beta1)*db1
    Vdb2 = beta1*Vdb2 + (1-beta1)*db2
    # S
    Sdw1 = beta2*Sdw1 + (1-beta2)*dW1**2
    Sdw2 = beta2*Sdw2 + (1-beta2)*dW2**2
    Sdb1 = beta2*Sdb1 + (1-beta2)*db1**2
    Sdb2 = beta2*Sdb2 + (1-beta2)*db2**2    
    # Update
    lr = learning_rate
    W2 -= lr * Vdw2 / (np.sqrt(Sdw2)+1e-8)
    b2 -= lr * Vdb2 / (np.sqrt(Sdb2)+1e-8)
    W1 -= lr * Vdw1 / (np.sqrt(Sdw1)+1e-8)
    b1 -= lr * Vdb1 / (np.sqrt(Sdb1)+1e-8)
    return cost  