## Content:
- [Part 1](#part1)- Importing the libraries, packages
- [Part 2](#part2)- Useful Functions
- [Part 3](#part3) -  One Hidden Layer Class
- [Part 4](#part4) -  Two Hidden Layers Class 
- [Part 5](#part5) -  Loading Fashion MNIST
- [Part 6](#part6) -  Fashion MNIST One Hidden Layer
- [Part 7](#part7) -  Fashion MNIST Two Hidden Layers
- [Part 8](#part8) -  Results 
- [Part 9](#part9) -  --
- [Part 10](#part10) -  --
- [Part 11](#part11) -  --

[Back to top](#Content:)


<a id='part1'></a>

### Part 1 -   Importing the libraries, packages

In [112]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import base64
import os
import io
import requests
import random

from sklearn import preprocessing
from sklearn import metrics
from sklearn.model_selection import train_test_split

from scipy.special import expit as activation_function
from scipy.stats import truncnorm

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras import datasets

[Back to top](#Content:)


<a id='part2'></a>

### Part 2 -   Useful Functions

In [113]:
rng = np.random.default_rng() 

In [114]:
def truncated_normal(mean=0, sd=1, low=0, upp=10):
    return truncnorm(
        (low - mean) / sd, (upp - mean) / sd, loc=mean, scale=sd)

def softmax(X):
    e = np.exp(X - np.max(X))
    return e / e.sum(axis=0, keepdims=True)


def cross_entropy(target, output):
    return -np.mean(target*np.log(output))

def cross_entropy_matrix(output, target):
    target = np.array(target)
    output = np.array(output)
    product = target*np.log(output)
    errors = -np.sum(product, axis=1)
    m = len(errors)
    errors = np.sum(errors) / m
    return errors

def sigmoid(x):
    return 1/(1+np.exp(-x))

def ds(x):
    return sigmoid(x)*(1-sigmoid(x))

def relu(x):
    return np.maximum(x,0)
  

def dr(x):
    dr = (np.sign(x) + 1) / 2
    return dr

def tanh(x):
    a = np.exp(x)
    b = np.exp(-x)
    return (a-b)/(a+b)

def dt(x):
    return 1-tanh(x)**2
    
def leaky(x,a):
    leaky = np.maximum(x,0)*x + a*np.minimum(x,0)
    return leaky

def dl(x,a):
    dl = (np.sign(x)+1)/2 - a*(np.sign(x)-1)/2
    return dl

def derivative(f):
    if f == sigmoid :
        return ds
    if f == tanh :
        return dt
    if f == relu :
        return dr
    if f == leaky :
        return dl
    return None

def y2indicator(y, K):
    N = len(y)
    ind = np.zeros((N,K))
    for i in range(N):
        ind[i][y[i]]=1
    return ind

def classification_rate(Y, P):
    return np.mean(Y==P)

[Back to top](#Content:)


<a id='part3'></a>

### Part 3 -   One Hidden Layer Class

# One Hidden Layer

# Variables :

- **X**     : N_Samples x N_features
- **W1**    : Hidden x N_features
- **b1**    : Hidden
- **W2**    : Output x Hidden
- **b2**    : Output

In [134]:
class HiddenOne:
     
    def __init__(self, 
                 input_nodes, 
                 output_nodes, 
                 hidden_nodes,
                 activation_hidden,
                 learning_rate=0.01,
                 optimizer = None,
                 beta1 = 0.9,   #ADAM optimization parameter, default value taken from practical experience
                 beta2 = 0.999, #ADAM optimization parameter, default value taken from practical experience
                 batch_size = None,
                 delta_stop = None,
                 patience = 1,
                 leaky_intercept=0.01
                ):         
        # Initializations
        self.input_nodes = input_nodes
        self.output_nodes = output_nodes       
        self.hidden_nodes = hidden_nodes          
        self.learning_rate = learning_rate 
        self.activation_hidden = activation_hidden
        self.hidden_derivative = derivative(self.activation_hidden)
        self.beta1 = beta1
        self.beta2 = beta2
        self.optimizer = optimizer
        self.batch_size = batch_size
        self.delta_stop = delta_stop
        self.patience = patience
        self.leaky_intercept = leaky_intercept
        self.create_weight_matrices()
        self.create_biases()
        self.reset_adam()
             
    def create_weight_matrices(self):
        tn = truncated_normal(mean=2, sd=1, low=-0.5, upp=0.5) 
        # W1 of size hidden x features
        n = self.input_nodes * self.hidden_nodes
        self.W1 = tn.rvs(n).reshape((self.hidden_nodes, self.input_nodes )) # hidden x features
        # W2 of size output x hidden
        m = self.hidden_nodes  * self.output_nodes
        self.W2 = tn.rvs(m).reshape((self.output_nodes, self.hidden_nodes )) # output x hidden
    
    def create_biases(self):    
        tn = truncated_normal(mean=2, sd=1, low=-0.5, upp=0.5)
        self.b1 = tn.rvs(self.hidden_nodes).reshape(-1,1) 
        self.b2 = tn.rvs(self.output_nodes).reshape(-1,1) 
          
    def reset_adam(self):
        '''
        Creates Adam optimizations variables
        '''
        self.Vdw1 = np.zeros((self.hidden_nodes, self.input_nodes ))
        self.Vdw2 = np.zeros((self.output_nodes, self.hidden_nodes ))
        self.Vdb1 = np.zeros((self.hidden_nodes, 1 ))
        self.Vdb2 = np.zeros((self.output_nodes, 1 ))
        self.Sdw1 = np.zeros((self.hidden_nodes, self.input_nodes ))
        self.Sdw2 = np.zeros((self.output_nodes, self.hidden_nodes ))
        self.Sdb1 = np.zeros((self.hidden_nodes, 1 ))
        self.Sdb2 = np.zeros((self.output_nodes, 1 ))
        
        
    def forward(self, X):
        Z1 = self.W1.dot(X.T) + self.b1 # Hidden x N_samples
        A1 = self.activation_hidden(Z1)      # Hidden x N_samples
        Z2 = self.W2.dot(A1) + self.b2  # Output x N_samples
        A2 = softmax(Z2)      #Output x N_samples
        return A2, Z2, A1, Z1
    
    
    def backprop(self, X, target):
        # Forward prop
        A2, Z2, A1, Z1 = self.forward(X)
        # Compute cost
        cost = cross_entropy(target, A2)
        # N samples
        m = X.shape[0]
        # deltas
        dZ2 = A2 - target                                       #Output x N_samples
        dW2 = dZ2.dot(A1.T)/m                                   #Output x hidden
        db2 = np.sum(dZ2, axis=1, keepdims=True)/m              #Output x 1
        dZ1 = self.W2.T.dot(dZ2)*self.hidden_derivative(Z1)     # Hidden x N_samples
        dW1 = dZ1.dot(X)/m                                      # Hidden x N_Features
        db1 = np.sum(dZ1, axis=1, keepdims=True)/m              # Hidden x 1
        # Update
        lr = self.learning_rate
        self.W2 -= lr*dW2
        self.b2 -= lr*db2
        self.W1 -= lr*dW1
        self.b1 -= lr*db1
        return cost
        
    def backpropSGD(self, X, target):
        m = X.shape[0]                  #N_samples
        X_SGD = X.copy()
        u = rng.shuffle(np.arange(m))
        X_SGD = X_SGD[u,:].squeeze()    # N_samples x N_Features
        target_SGD = target[:,u].squeeze() # Output x N_samples
        cost = 0
        for i in range(m) :
            # Forward prop
            x = X_SGD[i,:].reshape(1,-1)                   # 1 x N_features
            a2, z2, a1, z1 = self.forward(x)
            # cost update
            cost = cost + cross_entropy(target_SGD[:,i].reshape(-1,1), a2)/m
            # deltas
            dz2 = a2 - target[:,i].reshape(-1,1)                    #Output x 1
            dW2 = dz2.dot(a1.T)                                     #Output x hidden
            db2 = dz2                                               #Output x 1
            dz1 = self.W2.T.dot(dz2)*self.hidden_derivative(z1)     # Hidden x 1
            dW1 = dz1.dot(x)                                        # Hidden x N_Features
            db1 = dz1                                               # Hidden x 1
            # Update
            lr = self.learning_rate
            self.W2 -= lr*dW2
            self.b2 -= lr*db2
            self.W1 -= lr*dW1
            self.b1 -= lr*db1
        return cost
        
    def backprop_minibatch(self, X, target):
        n = X.shape[1]               # N_features
        batch_size = X.shape[0]      # N_samples
        if self.batch_size == None :
            batch_size = self.minibatch_size(batch_size)
        else :
            batch_size = self.batch_size
            
        X_SGD = X.copy()
        u = rng.shuffle(np.arange(X.shape[0] ))
        X_SGD = X_SGD[u,:].squeeze()    # N_samples x N_Features
        target_SGD = target[:,u].squeeze() # Output x N_samples
        cost = 0
        
        pass_length = int(X.shape[0]/batch_size)
        for i in range(pass_length) :
            k = i*batch_size
            # Forward prop
            X = X_SGD[k:k+batch_size,:].reshape(batch_size,-1)              #batch_size x N_features
            A2, Z2, A1, Z1 = self.forward(X)
            # cost update
            cost = cost + cross_entropy(target_SGD[:,k:k+batch_size].reshape(-1,batch_size), A2)/pass_length
            # deltas
            dZ2 = A2 - target_SGD[:,k:k+batch_size].reshape(-1,batch_size)   #Output x batch_size
            dW2 = dZ2.dot(A1.T)/batch_size                                   #Output x hidden
            db2 = np.sum(dZ2, axis=1, keepdims=True)/batch_size              #Output x 1
            dZ1 = self.W2.T.dot(dZ2)*self.hidden_derivative(Z1)              # Hidden x batch_size
            dW1 = dZ1.dot(X)/batch_size                                      # Hidden x N_Features
            db1 = np.sum(dZ1, axis=1, keepdims=True)/batch_size              #Hidden x1                                            # Hidden x 1
            # Update
            lr = self.learning_rate
            self.W2 -= lr*dW2
            self.b2 -= lr*db2
            self.W1 -= lr*dW1
            self.b1 -= lr*db1
        return cost
    
    def backpropADAM(self, X, target):
        # Forward prop
        A2, Z2, A1, Z1 = self.forward(X)
        # Compute cost
        cost = cross_entropy(target, A2)
        # N samples
        m = X.shape[0]
        # deltas
        dZ2 = A2 - target                                       #Output x N_samples
        dW2 = dZ2.dot(A1.T)/m                                   #Output x hidden
        db2 = np.sum(dZ2, axis=1, keepdims=True)/m              #Output x 1
        dZ1 = self.W2.T.dot(dZ2)*self.hidden_derivative(Z1)     # Hidden x N_samples
        dW1 = dZ1.dot(X)/m                                      # Hidden x N_Features
        db1 = np.sum(dZ1, axis=1, keepdims=True)/m              # Hidden x 1
        # Adam updates
        beta1 = self.beta1
        beta2 = self.beta2
        # V
        self.Vdw1 = beta1*self.Vdw1 + (1-beta1)*dW1
        self.Vdw2 = beta1*self.Vdw2 + (1-beta1)*dW2
        self.Vdb1 = beta1*self.Vdb1 + (1-beta1)*db1
        self.Vdb2 = beta1*self.Vdb2 + (1-beta1)*db2
        # S
        self.Sdw1 = beta2*self.Sdw1 + (1-beta2)*dW1**2
        self.Sdw2 = beta2*self.Sdw2 + (1-beta2)*dW2**2
        self.Sdb1 = beta2*self.Sdb1 + (1-beta2)*db1**2
        self.Sdb2 = beta2*self.Sdb2 + (1-beta2)*db2**2    
        # Update
        lr = self.learning_rate
        self.W2 -= lr * self.Vdw2 / (np.sqrt(self.Sdw2)+1e-8)
        self.b2 -= lr * self.Vdb2 / (np.sqrt(self.Sdb2)+1e-8)
        self.W1 -= lr * self.Vdw1 / (np.sqrt(self.Sdw1)+1e-8)
        self.b1 -= lr * self.Vdb1 / (np.sqrt(self.Sdb1)+1e-8)
        return cost  
    
    def predict(self, X_predict):
        A2, Z2, A1, Z1 = self.forward(X_predict)
        return A2
    
    def predict_class(self, X_predict):
        A2, Z2, A1, Z1 = self.forward(X_predict)
        y_pred = np.argmax(A2, axis=0)
        return y_pred
                   
    def run(self, X_train, target, epochs=10):
        costs = [1e-10]
        if self.delta_stop == None : 
            if self.optimizer == 'adam':
                self.reset_adam()
                for i in range(epochs):
                    cost = self.backpropADAM(X_train, target)
                    print(f'Loss after epoch {i} : {cost}')
                    costs.append(cost)
                    if i%100 == 0 and i>0 :
                        print(f'Loss after epoch {i} : {cost}')
                print(f'Loss after 1epoch {len(costs)} : {costs[-1]}')        
                costs.pop(0)
                return costs
                    
            elif self.optimizer == 'SGD' :
                for i in range(epochs):
                    cost = self.backpropSGD(X_train, target)
                    costs.append(cost)
                    if i%100 == 0 and i>0 :
                        print(f'Loss after epoch {i} : {cost}')
                print(f'Loss after 2epoch {len(costs)} : {costs[-1]}')        
                costs.pop(0)
                return costs
                
            elif self.optimizer == 'minibatch' :
                for i in range(epochs):
                    cost = self.backprop_minibatch(X_train, target)
                    costs.append(cost)
                    if i%100 == 0 and i>0 :
                        print(f'Loss after epoch {i} : {cost}')
                print(f'Loss after 3epoch {len(costs)} : {costs[-1]}')        
                costs.pop(0)
                return costs
                
            else :
                for i in range(epochs):  
                    cost = self.backprop(X_train, target)
                    costs.append(cost)
                    if i%100 == 0 and i>0 :
                        print(f'Loss after epoch {i} : {cost}')
                print(f'Loss after 4epoch {len(costs)} : {costs[-1]}')        
                costs.pop(0)
                return costs
                
            
        else :
            counter = 0
            if self.optimizer == 'adam':
                self.reset_adam()
                for i in range(epochs):
                    cost = self.backpropADAM(X_train, target)
                    costs.append(cost)
                    n = len(costs)
                    delta = np.abs(costs[n-1]/costs[n-2]-1)
                    if(delta < self.delta_stop) :
                        counter+=1
                        if(counter>=self.patience):
                            print(f'Early stop at epoch {i}, the cost is : {cost}')
                            costs.pop(0)
                            return costs
                    else :
                        counter =0
                    if i%100 == 0 and i>0 :
                        print(f'Loss after epoch {i} : {cost}')
                print(f'Loss after 5epoch {len(costs)} : {costs[-1]}')        
                costs.pop(0)
                return costs
                    
            elif self.optimizer == 'SGD' :
                for i in range(epochs):
                    cost = self.backpropSGD(X_train, target)
                    costs.append(cost)
                    n = len(costs)
                    delta = np.abs(costs[n-1]/costs[n-2]-1)
                    if(delta < self.delta_stop) :
                        counter+=1
                        if(counter>=self.patience):
                            print(f'Early stop at epoch {i}, the cost is : {cost}')
                            costs.pop(0)
                            return costs
                    else :
                        counter =0
                    if i%100 == 0 and i>0 :
                        print(f'Loss after epoch {i} : {cost}')
                print(f'Loss after 6epoch {len(costs)} : {costs[-1]}')        
                costs.pop(0)
                return costs
                
            elif self.optimizer == 'minibatch' :
                for i in range(epochs):
                    cost = self.backprop_minibatch(X_train, target)
                    costs.append(cost)
                    n = len(costs)
                    delta = np.abs(costs[n-1]/costs[n-2]-1)
                    if(delta < self.delta_stop) :
                        counter+=1
                        if(counter>=self.patience):
                            print(f'Early stop at epoch {i}, the cost is : {cost}')
                            costs.pop(0)
                            return costs
                    else :
                        counter =0
                    if i%100 == 0 and i>0 :
                        print(f'Loss after epoch {i} : {cost}')
                print(f'Loss after 7epoch {len(costs)} : {costs[-1]}')        
                costs.pop(0)
                return costs
                
            else :  
                for i in range(epochs): 
                    cost = self.backprop(X_train, target)
                    costs.append(cost)
                    n = len(costs)
                    delta = np.abs(costs[n-1]/costs[n-2]-1)
                    if(delta < self.delta_stop) :
                        counter+=1
                        if(counter>=self.patience):
                            print(f'Early stop at epoch {i}, the cost is : {cost}')
                            costs.pop(0)
                            return costs
                        else :
                            counter =0
                    if i%100 == 0 and i>0 :
                        print(f'Loss after epoch {i} : {cost}')        
                print(f'Loss after 8epoch {len(costs)} : {costs[-1]}')        
                costs.pop(0)
                return costs
          
            
        
       
    def evaluate(self, X_evaluate, target):
        '''
        return accuracy score, target must be the classes and not the hot encoded target
        '''
        
        y_pred = self.predict_class(X_evaluate)
        accuracy = classification_rate(y_pred, target)
        print('Accuracy :', accuracy)
        return accuracy
        
       
    def minibatch_size(self, n_samples):
        '''
        Compute minibatch size in case its not provided
        '''
        if n_samples < 2000:
            return n_samples
        if n_samples < 12800:
            return 64
        if n_samples < 25600:
            return 128
        if n_samples < 51200:
            return 256
        if n_samples < 102400:
            return 512
        return 1024
    
        
        
            

# Testing with Iris Dataset

## Loading and preparing Data

In [116]:
from sklearn import datasets
iris = datasets.load_iris()
data = iris.data
target = iris.target

In [117]:
from tensorflow.keras.utils import to_categorical
t = to_categorical(target)

In [138]:
M = 5
D = data.shape[1]
K = len(set(target))
X_train, X_test, y_train, y_test = train_test_split(data ,target ,test_size=0.25)
y_train_cat = to_categorical(y_train).T
y_test_cat = to_categorical(y_test).T



In [139]:
from sklearn import datasets
iris = datasets.load_iris()
data = iris.data
target = iris.target

from tensorflow.keras.utils import to_categorical
t = to_categorical(target)

M = 5
D = data.shape[1]
K = len(set(target))
X_train, X_test, y_train, y_test = train_test_split(data ,target ,test_size=0.25)
y_train_cat = to_categorical(y_train).T
y_test_cat = to_categorical(y_test).T

## One Hidden Layer 
### Activation Function Tests :


#### Sigmoid

In [142]:
nn_sigmoid = HiddenOne(input_nodes = D, 
               output_nodes = K, 
               hidden_nodes = M,
               learning_rate = 0.01,
               activation_hidden = sigmoid,
               #optimizer='minibatch',
               #batch_size = 28,
                #delta_stop = 1e-3,
                #patience = 5,
              )

In [143]:
c=nn_sigmoid.run(X_train, y_train_cat, epochs=1000 );

Loss after epoch 100 : 0.35676017500207446
Loss after epoch 200 : 0.3472682711296739
Loss after epoch 300 : 0.33854210728765455
Loss after epoch 400 : 0.3294690112848555
Loss after epoch 500 : 0.31985351610787616
Loss after epoch 600 : 0.30983931166269574
Loss after epoch 700 : 0.2996361829855944
Loss after epoch 800 : 0.28943582049763994
Loss after epoch 900 : 0.2793878076310757
Loss after 4epoch 1001 : 0.26969608896776515


In [144]:
acc = nn_sigmoid.evaluate(X_test, y_test)

Accuracy : 0.631578947368421


#### tanh

In [145]:
nn_tanh = HiddenOne(input_nodes = D, 
               output_nodes = K, 
               hidden_nodes = M,
               learning_rate = 0.01,
               activation_hidden = tanh,
               #optimizer='minibatch',
               #batch_size = 28,
                #delta_stop = 1e-3,
                #patience = 5,
              )

In [146]:
c=nn_tanh.run(X_train, y_train_cat, epochs=1000 );

Loss after epoch 100 : 0.2744357421754285
Loss after epoch 200 : 0.22683344791989082
Loss after epoch 300 : 0.1972871159734545
Loss after epoch 400 : 0.17864683118690947
Loss after epoch 500 : 0.16575669158759046
Loss after epoch 600 : 0.1557300948682545
Loss after epoch 700 : 0.1470532213499895
Loss after epoch 800 : 0.13896702172314118
Loss after epoch 900 : 0.1311374054753101
Loss after 4epoch 1001 : 0.12354874274310496


In [147]:
acc = nn_sigmoid.evaluate(X_test, y_test)

Accuracy : 0.631578947368421


#### ReLU

In [148]:
nn_relu = HiddenOne(input_nodes = D, 
               output_nodes = K, 
               hidden_nodes = M,
               learning_rate = 0.01,
               activation_hidden = relu,
               #optimizer='minibatch',
               #batch_size = 28,
                #delta_stop = 1e-3,
                #patience = 5,
              )

In [149]:
c=nn_relu.run(X_train, y_train_cat, epochs=1000 );

Loss after epoch 100 : 0.3241253156822885
Loss after epoch 200 : 0.2894202518695222
Loss after epoch 300 : 0.2617697618570672
Loss after epoch 400 : 0.24017915755583838
Loss after epoch 500 : 0.22313809221428157
Loss after epoch 600 : 0.20956649456747894
Loss after epoch 700 : 0.1986137938463622
Loss after epoch 800 : 0.1896541243394405
Loss after epoch 900 : 0.18221320452530698
Loss after 4epoch 1001 : 0.1759931220690625


In [150]:
acc = nn_relu.evaluate(X_test, y_test)

Accuracy : 0.6578947368421053


### Conclusion :

ReLU works better, need to confirm that later with Fashion-MNIST + multiple tests

### Optimizer  Tests :

### SGD :

Doing 1 sample each time

In [151]:
nn_SGD = HiddenOne(input_nodes = D, 
               output_nodes = K, 
               hidden_nodes = M,
               learning_rate = 0.01,
               activation_hidden = relu,
               optimizer='SGD',
               #batch_size = 28,
                #delta_stop = 1e-3,
                #patience = 5,
              )

In [152]:
c=nn_SGD.run(X_train, y_train_cat, epochs=1000 );

Loss after epoch 100 : 0.03827870972564428
Loss after epoch 200 : 0.03618489968071316
Loss after epoch 300 : 0.03341739996126434
Loss after epoch 400 : 0.030471604346987674
Loss after epoch 500 : 0.02878433407857863
Loss after epoch 600 : 0.027608701361995336
Loss after epoch 700 : 0.026784484092169845
Loss after epoch 800 : 0.026105913799405033
Loss after epoch 900 : 0.02550563780422803
Loss after 2epoch 1001 : 0.02496559944908063


In [153]:
acc = nn_SGD.evaluate(X_test, y_test)

Accuracy : 0.9736842105263158


In [154]:
nn_adam = HiddenOne(input_nodes = D, 
               output_nodes = K, 
               hidden_nodes = M,
               learning_rate = 0.01,
               activation_hidden = tanh,
               optimizer='adam',
               #batch_size = 28,
                #delta_stop = 1e-3,
                #patience = 5,
              )

In [155]:
c=nn_adam.run(X_train, y_train_cat, epochs=1000 );

Loss after epoch 0 : 0.40682387171799034
Loss after epoch 1 : 0.3829398441394841
Loss after epoch 2 : 0.3609550589014274
Loss after epoch 3 : 0.3497270430641767
Loss after epoch 4 : 0.34750346341525057
Loss after epoch 5 : 0.3403243537931096
Loss after epoch 6 : 0.3252660275570091
Loss after epoch 7 : 0.30335429251045526
Loss after epoch 8 : 0.2785379921944592
Loss after epoch 9 : 0.25355752281034544
Loss after epoch 10 : 0.23528971354772685
Loss after epoch 11 : 0.21860961788822308
Loss after epoch 12 : 0.2052060248479696
Loss after epoch 13 : 0.19583745591975235
Loss after epoch 14 : 0.18472669140056347
Loss after epoch 15 : 0.1730269104569562
Loss after epoch 16 : 0.16516311340565826
Loss after epoch 17 : 0.15597321446753562
Loss after epoch 18 : 0.1481242514677245
Loss after epoch 19 : 0.14264421381798462
Loss after epoch 20 : 0.1358787110883372
Loss after epoch 21 : 0.12981567229237945
Loss after epoch 22 : 0.12466891321449515
Loss after epoch 23 : 0.11758602509877214
Loss after e

Loss after epoch 784 : 0.016560603141897313
Loss after epoch 785 : 0.01657703934231696
Loss after epoch 786 : 0.01660622535746788
Loss after epoch 787 : 0.016629840257530375
Loss after epoch 788 : 0.016628380356321495
Loss after epoch 789 : 0.016574770316826474
Loss after epoch 790 : 0.01651102189355972
Loss after epoch 791 : 0.016488139090638447
Loss after epoch 792 : 0.016511749399734592
Loss after epoch 793 : 0.01653710822926187
Loss after epoch 794 : 0.01652140879839141
Loss after epoch 795 : 0.016479634712216223
Loss after epoch 796 : 0.016456487535797756
Loss after epoch 797 : 0.01646739445970464
Loss after epoch 798 : 0.01648068114200846
Loss after epoch 799 : 0.01646631877765051
Loss after epoch 800 : 0.016438324282598567
Loss after epoch 800 : 0.016438324282598567
Loss after epoch 801 : 0.016426598000353603
Loss after epoch 802 : 0.016433732523502807
Loss after epoch 803 : 0.016435977524846372
Loss after epoch 804 : 0.01642032669040346
Loss after epoch 805 : 0.0164019242730511

In [156]:
acc = nn_adam.evaluate(X_test, y_test)

Accuracy : 0.9736842105263158


### SGD :

Testing different minibatch sizes

#### Minibatch size = 2

In [113]:
nn_mini2 = HiddenOne(input_nodes = D, 
               output_nodes = K, 
               hidden_nodes = M,
               learning_rate = 0.01,
               activation_hidden = relu,
               optimizer='minibatch',
               batch_size = 2,
                #delta_stop = 1e-3,
                #patience = 5,
              )

In [114]:
c=nn_mini2.run(X_train, y_train_cat, epochs=1000 );

Loss after epoch 100 : 0.039968910926929654
Loss after epoch 200 : 0.03776559212464175
Loss after epoch 300 : 0.03546285589429685
Loss after epoch 400 : 0.03364404483304866
Loss after epoch 500 : 0.03226520398508366
Loss after epoch 600 : 0.03119983447174907
Loss after epoch 700 : 0.030373181868311932
Loss after epoch 800 : 0.0296842286398998
Loss after epoch 900 : 0.02910116205558126
Loss after epoch 1001 : 0.02860504375086494


In [115]:
acc = nn_mini2.evaluate(X_test, y_test)

Accuracy : 1.0


#### Minibatch size = 8


In [116]:
nn_mini8 = HiddenOne(input_nodes = D, 
               output_nodes = K, 
               hidden_nodes = M,
               learning_rate = 0.01,
               activation_hidden = tanh,
               optimizer='minibatch',
               batch_size = 8,
                #delta_stop = 1e-3,
                #patience = 5,
              )
c=nn_mini8.run(X_train, y_train_cat, epochs=1000 );

Loss after epoch 100 : 0.11888176773455536
Loss after epoch 200 : 0.06132759649653567
Loss after epoch 300 : 0.04623305383163878
Loss after epoch 400 : 0.04046450039785886
Loss after epoch 500 : 0.03746791143051225
Loss after epoch 600 : 0.03558795780184025
Loss after epoch 700 : 0.034264738269116496
Loss after epoch 800 : 0.03326249394273867
Loss after epoch 900 : 0.03246390753995277
Loss after epoch 1001 : 0.03180929064900143


In [117]:
acc = nn_mini8.evaluate(X_test, y_test)

Accuracy : 1.0


#### minibatch size 16

In [118]:
nn_mini16 = HiddenOne(input_nodes = D, 
               output_nodes = K, 
               hidden_nodes = M,
               learning_rate = 0.01,
               activation_hidden = sigmoid,
               optimizer='minibatch',
               batch_size = 16,
                #delta_stop = 1e-3,
                #patience = 5,
              )
c=nn_mini16.run(X_train, y_train_cat, epochs=1000 );

Loss after epoch 100 : 0.3243897917934674
Loss after epoch 200 : 0.2561084343985056
Loss after epoch 300 : 0.2136039045778299
Loss after epoch 400 : 0.18848324876452216
Loss after epoch 500 : 0.17213766425768745
Loss after epoch 600 : 0.16027435713105817
Loss after epoch 700 : 0.1496555757990457
Loss after epoch 800 : 0.13870465628954357
Loss after epoch 900 : 0.1274764365402452
Loss after epoch 1001 : 0.11588206617477194


In [120]:
acc = nn_mini16.evaluate(X_test, y_test)

Accuracy : 1.0


#### Minibatch size 32

In [121]:
nn_mini32 = HiddenOne(input_nodes = D, 
               output_nodes = K, 
               hidden_nodes = M,
               learning_rate = 0.01,
               activation_hidden = relu,
               optimizer='minibatch',
               batch_size = 32,
                #delta_stop = 1e-3,
                #patience = 5,
              )
c=nn_mini32.run(X_train, y_train_cat, epochs=1000 );

Loss after epoch 100 : 0.24423174555697824
Loss after epoch 200 : 0.10864390175002289
Loss after epoch 300 : 0.07464147547420144
Loss after epoch 400 : 0.05800286097313573
Loss after epoch 500 : 0.04877020258397195
Loss after epoch 600 : 0.04319664204297226
Loss after epoch 700 : 0.03958215983536185
Loss after epoch 800 : 0.03710860405512004
Loss after epoch 900 : 0.03531851438494933
Loss after epoch 1001 : 0.03398528894579797


In [122]:
acc = nn_mini32.evaluate(X_test, y_test)

Accuracy : 0.9736842105263158


#### Minibatch size 64

In [123]:
nn_mini64 = HiddenOne(input_nodes = D, 
               output_nodes = K, 
               hidden_nodes = M,
               learning_rate = 0.01,
               activation_hidden = relu,
               optimizer='minibatch',
               batch_size = 64,
                #delta_stop = 1e-3,
                #patience = 5,
              )
c=nn_mini64.run(X_train, y_train_cat, epochs=1000 );

Loss after epoch 100 : 0.26834528254850826
Loss after epoch 200 : 0.2058438974711505
Loss after epoch 300 : 0.1689162735929759
Loss after epoch 400 : 0.14561402715011804
Loss after epoch 500 : 0.12773139351036716
Loss after epoch 600 : 0.11274114420052123
Loss after epoch 700 : 0.09998996333689264
Loss after epoch 800 : 0.08925000737231119
Loss after epoch 900 : 0.0803031409095878
Loss after epoch 1001 : 0.07295611140430698


In [124]:
acc = nn_mini64.evaluate(X_test, y_test)

Accuracy : 1.0


#### Minibactch size not specified

In [125]:
nn_mini = HiddenOne(input_nodes = D, 
               output_nodes = K, 
               hidden_nodes = M,
               learning_rate = 0.01,
               activation_hidden = sigmoid,
               optimizer='minibatch',
               #batch_size = 64,
                #delta_stop = 1e-3,
                #patience = 5,
              )
c=nn_mini.run(X_train, y_train_cat, epochs=1000 );

Loss after epoch 100 : 0.362080412007166
Loss after epoch 200 : 0.3553161314697229
Loss after epoch 300 : 0.3485307292971733
Loss after epoch 400 : 0.3399950463358105
Loss after epoch 500 : 0.33008496343476673
Loss after epoch 600 : 0.3192536157965336
Loss after epoch 700 : 0.30788814053331465
Loss after epoch 800 : 0.29630978638834077
Loss after epoch 900 : 0.2848041400661859
Loss after epoch 1001 : 0.273725451778514


In [126]:
acc = nn_mini.evaluate(X_test, y_test)

Accuracy : 0.8157894736842105


## To add :
- Stopping test


[Back to top](#Content:)


<a id='part4'></a>

### Part 4 -   Two Hidden Layers Class

# Two Hidden Layers

# Variables :

- **X**     : N_Samples x N_features
- **W1**    : Hidden1 x N_features
- **b1**    : Hidden1
- **W2**    : Hidden2 x Hidden1
- **b2**    : Hidden2
- **W3**    : Output x Hidden
- **b3**    : Output

In [194]:
class HiddenTwo:
     
    def __init__(self, 
                 input_nodes, 
                 output_nodes, 
                 hidden_nodes_1,
                 hidden_nodes_2,
                 activation_hidden_1,
                 activation_hidden_2,
                 learning_rate=0.01,
                 optimizer = None,
                 beta1 = 0.9,   #ADAM optimization parameter, default value taken from practical experience
                 beta2 = 0.999, #ADAM optimization parameter, default value taken from practical experience
                 batch_size = None,
                 delta_stop = None,
                 patience = 1,
                 leaky_intercept=0.01
                 
                ):         
        # Initializations
        self.input_nodes = input_nodes
        self.output_nodes = output_nodes       
        self.hidden_nodes_1 = hidden_nodes_1    
        self.hidden_nodes_2 = hidden_nodes_2    
        self.learning_rate = learning_rate 
        self.activation_hidden_1 = activation_hidden_1
        self.activation_hidden_2 = activation_hidden_2
        self.hidden_derivative_1 = derivative(self.activation_hidden_1)
        self.hidden_derivative_2 = derivative(self.activation_hidden_2)
        self.beta1 = beta1
        self.beta2 = beta2
        self.optimizer = optimizer
        self.batch_size = batch_size
        self.delta_stop = delta_stop
        self.patience = patience
        self.leaky_intercept = leaky_intercept
        self.create_weight_matrices()
        self.create_biases()
        self.reset_adam()
             
    def create_weight_matrices(self):
        tn = truncated_normal(mean=2, sd=1, low=-0.5, upp=0.5) 
        # W1 of size hidden x features
        n1 = self.input_nodes * self.hidden_nodes_1
        self.W1 = tn.rvs(n1).reshape((self.hidden_nodes_1, self.input_nodes )) # hidden1 x features
        # W2 of size hidden2 x hidden1
        n2 = self.hidden_nodes_2 * self.hidden_nodes_1
        self.W2 = tn.rvs(n2).reshape((self.hidden_nodes_2, self.hidden_nodes_1 )) # hidden1 x features
        # W3 of size output x hidden2
        n3 = self.hidden_nodes_2  * self.output_nodes
        self.W3 = tn.rvs(n3).reshape((self.output_nodes, self.hidden_nodes_2 )) # output x hidden
    
    def create_biases(self):    
        tn = truncated_normal(mean=2, sd=1, low=-0.5, upp=0.5)
        self.b1 = tn.rvs(self.hidden_nodes_1).reshape(-1,1) 
        self.b2 = tn.rvs(self.hidden_nodes_2).reshape(-1,1) 
        self.b3 = tn.rvs(self.output_nodes).reshape(-1,1) 
        
    def reset_adam(self):
        '''
        Creates Adam optimizations variables
        '''
        self.Vdw1 = np.zeros((self.hidden_nodes_1, self.input_nodes ))
        self.Vdw2 = np.zeros((self.hidden_nodes_2, self.hidden_nodes_1 ))
        self.Vdw3 = np.zeros((self.output_nodes, self.hidden_nodes_2))
       
        self.Vdb1 = np.zeros((self.hidden_nodes_1, 1 ))
        self.Vdb2 = np.zeros((self.hidden_nodes_2, 1 ))
        self.Vdb3 = np.zeros((self.output_nodes, 1 ))
        
        self.Sdw1 = np.zeros((self.hidden_nodes_1, self.input_nodes ))
        self.Sdw2 = np.zeros((self.hidden_nodes_2, self.hidden_nodes_1 ))
        self.Sdw3 = np.zeros((self.output_nodes, self.hidden_nodes_2))
       
        self.Sdb1 = np.zeros((self.hidden_nodes_1, 1 ))
        self.Sdb2 = np.zeros((self.hidden_nodes_2, 1 ))
        self.Sdb3 = np.zeros((self.output_nodes, 1 ))
                
    def forward(self, X):
        Z1 = self.W1.dot(X.T) + self.b1      # Hidden1 x N_samples
        A1 = self.activation_hidden_1(Z1)      # Hidden1 x N_samples
        Z2 = self.W2.dot(A1) + self.b2      # Hidden2 x N_samples
        A2 = self.activation_hidden_2(Z2)      # Hidden2 x N_samples
        Z3 = self.W3.dot(A2) + self.b3       # Output x N_samples
        A3 = softmax(Z3)                     #Output x N_samples
        return A3, Z3, A2, Z2, A1, Z1
    
    def backprop(self, X, target):
        # Forward prop
        A3, Z3, A2, Z2, A1, Z1 = self.forward(X)
        # Compute cost
        cost = cross_entropy(target, A3)
        # N_samples
        m = X.shape[0]
        # deltas
        dZ3 = A3 - target                                      #Output x N_samples
        dW3 = dZ3.dot(A2.T)/m                                  #Output x Hidden_2
        db3 = np.sum(dZ3, axis=1, keepdims=True)/m             #Output x 1
        dZ2 = self.W3.T.dot(dZ3)*self.hidden_derivative_2(Z2)    # Hidden2 x N_samples
        dW2 = dZ2.dot(A1.T)/m                                     # Hidden2 x Hidden1 
        db2 = np.sum(dZ2, axis=1, keepdims=True)/m             # Hidden2 x 1
        dZ1 = self.W2.T.dot(dZ2)*self.hidden_derivative_1(Z1)     # Hidden x N_samples
        dW1 = dZ1.dot(X)/m                                      # Hidden x N_Features
        db1 = np.sum(dZ1, axis=1, keepdims=True)/m              # Hidden x 1
     
        # Update
        lr = self.learning_rate
        self.W3 -= lr*dW3
        self.b3 -= lr*db3
        self.W2 -= lr*dW2
        self.b2 -= lr*db2
        self.W1 -= lr*dW1
        self.b1 -= lr*db1
        
        return cost
        
    
    def backprop_minibatch(self, X, target):
        n = X.shape[1]               # N_features
        batch_size = X.shape[0]      # N_samples
        if self.batch_size == None :
            batch_size = self.minibatch_size(batch_size)
        else :
            batch_size = self.batch_size
            
        X_SGD = X.copy()
        u = rng.shuffle(np.arange(X.shape[0] ))
        X_SGD = X_SGD[u,:].squeeze()    # N_samples x N_Features
        target_SGD = target[:,u].squeeze() # Output x N_samples
        cost = 0
        
        pass_length = int(X.shape[0]/batch_size)
        for i in range(pass_length) :
            k = i*batch_size
            # Forward prop
            X = X_SGD[k:k+batch_size,:].reshape(batch_size,-1)              #batch_size x N_features
            A3, Z3, A2, Z2, A1, Z1 = self.forward(X)
            # cost update
            cost = cost + cross_entropy(target_SGD[:,k:k+batch_size].reshape(-1,batch_size), A3)/pass_length
            # deltas
            dZ3 = A3 - target_SGD[:,k:k+batch_size].reshape(-1,batch_size)   #Output x batch_size
            dW3 = dZ3.dot(A2.T)/batch_size                                   #Output x hidden_2
            db3 = np.sum(dZ3, axis=1, keepdims=True)/batch_size              #Output x 1
            dZ2 = self.W3.T.dot(dZ3)*self.hidden_derivative_2(Z2)            # Hidden2 x batch_size
            dW2 = dZ2.dot(A1.T)/batch_size                                   # Hidden2 x Hidden1 
            db2 = np.sum(dZ2, axis=1, keepdims=True)/batch_size              # Hidden2 x 1
            dZ1 = self.W2.T.dot(dZ2)*self.hidden_derivative_1(Z1)            # Hidden x batch_size
            dW1 = dZ1.dot(X)/batch_size                                      # Hidden x N_Features
            db1 = np.sum(dZ1, axis=1, keepdims=True)/batch_size              # Hidden x 1                        
            # Update
            lr = self.learning_rate
            self.W3 -= lr*dW3
            self.b3 -= lr*db3
            self.W2 -= lr*dW2
            self.b2 -= lr*db2
            self.W1 -= lr*dW1
            self.b1 -= lr*db1
        return cost
    
    def backpropADAM(self, X, target):
        # Forward prop
        A3, Z3, A2, Z2, A1, Z1 = self.forward(X)
        # Compute cost
        cost = cross_entropy(target, A3)
        # N samples
        m = X.shape[0]   
        # deltas
        dZ3 = A3 - target                                      #Output x N_samples
        dW3 = dZ3.dot(A2.T)/m                                  #Output x Hidden_2
        db3 = np.sum(dZ3, axis=1, keepdims=True)/m             #Output x 1
        dZ2 = self.W3.T.dot(dZ3)*self.hidden_derivative_2(Z2)    # Hidden2 x N_samples
        dW2 = dZ2.dot(A1.T)/m                                     # Hidden2 x Hidden1 
        db2 = np.sum(dZ2, axis=1, keepdims=True)/m             # Hidden2 x 1
        dZ1 = self.W2.T.dot(dZ2)*self.hidden_derivative_1(Z1)     # Hidden x N_samples
        dW1 = dZ1.dot(X)/m                                      # Hidden x N_Features
        db1 = np.sum(dZ1, axis=1, keepdims=True)/m              # Hidden x 1
        # Adam updates
        beta1 = self.beta1
        beta2 = self.beta2
        # V
        self.Vdw1 = beta1*self.Vdw1 + (1-beta1)*dW1
        self.Vdw2 = beta1*self.Vdw2 + (1-beta1)*dW2
        self.Vdw3 = beta1*self.Vdw3 + (1-beta1)*dW3
        self.Vdb1 = beta1*self.Vdb1 + (1-beta1)*db1
        self.Vdb2 = beta1*self.Vdb2 + (1-beta1)*db2
        self.Vdb3 = beta1*self.Vdb3 + (1-beta1)*db3
        # S
        self.Sdw1 = beta2*self.Sdw1 + (1-beta2)*dW1**2
        self.Sdw2 = beta2*self.Sdw2 + (1-beta2)*dW2**2
        self.Sdw3 = beta2*self.Sdw3 + (1-beta2)*dW3**2
        self.Sdb1 = beta2*self.Sdb1 + (1-beta2)*db1**2
        self.Sdb2 = beta2*self.Sdb2 + (1-beta2)*db2**2
        self.Sdb3 = beta2*self.Sdb3 + (1-beta2)*db3**2  
        # Update
        lr = self.learning_rate
        self.W3 -= lr * self.Vdw3 / (np.sqrt(self.Sdw3)+1e-8)
        self.b3 -= lr * self.Vdb3 / (np.sqrt(self.Sdb3)+1e-8)
        self.W2 -= lr * self.Vdw2 / (np.sqrt(self.Sdw2)+1e-8)
        self.b2 -= lr * self.Vdb2 / (np.sqrt(self.Sdb2)+1e-8)
        self.W1 -= lr * self.Vdw1 / (np.sqrt(self.Sdw1)+1e-8)
        self.b1 -= lr * self.Vdb1 / (np.sqrt(self.Sdb1)+1e-8)
        return cost  
    
      
    def predict(self, X_predict):
        A3, Z3, A2, Z2, A1, Z1 = self.forward(X_predict)
        return A3
    
    def predict_class(self, X_predict):
        A3, Z3, A2, Z2, A1, Z1 = self.forward(X_predict)
        y_pred = np.argmax(A3, axis=0)
        return y_pred
                   
    def xrun(self, X_train, target, epochs=10):
        costs = []
        for i in range(epochs):
            A3, Z3, A2, Z2, A1, Z1 = self.forward(X_train)
            cost = cross_entropy(target, A3)
            costs.append(cost)
            if i%100 == 0:
                print(f'Loss after epoch {i} : {cost}')
            self.backprop(X_train, target)
        return costs  
         
    def run(self, X_train, target, epochs=10):
        costs = [1e-10]
        if self.delta_stop == None : 
            if self.optimizer == 'adam':
                self.reset_adam()
                for i in range(epochs):
                    cost = self.backpropADAM(X_train, target)
                    print(f'Loss after epoch {i} : {cost}')
                    costs.append(cost)
                    if i%100 == 0 and i>0 :
                        print(f'Loss after epoch {i} : {cost}')
                print(f'Loss after 1epoch {len(costs)} : {costs[-1]}')        
                costs.pop(0)
                return costs
                    
            elif self.optimizer == 'SGD' :
                for i in range(epochs):
                    cost = self.backpropSGD(X_train, target)
                    costs.append(cost)
                    if i%100 == 0 and i>0 :
                        print(f'Loss after epoch {i} : {cost}')
                print(f'Loss after 2epoch {len(costs)} : {costs[-1]}')        
                costs.pop(0)
                return costs
                
            elif self.optimizer == 'minibatch' :
                for i in range(epochs):
                    cost = self.backprop_minibatch(X_train, target)
                    costs.append(cost)
                    if i%100 == 0 and i>0 :
                        print(f'Loss after epoch {i} : {cost}')
                print(f'Loss after 3epoch {len(costs)} : {costs[-1]}')        
                costs.pop(0)
                return costs
                
            else :
                for i in range(epochs):  
                    cost = self.backprop(X_train, target)
                    costs.append(cost)
                    if i%100 == 0 and i>0 :
                        print(f'Loss after epoch {i} : {cost}')
                print(f'Loss after 4epoch {len(costs)} : {costs[-1]}')        
                costs.pop(0)
                return costs
                
            
        else :
            counter = 0
            if self.optimizer == 'adam':
                self.reset_adam()
                for i in range(epochs):
                    cost = self.backpropADAM(X_train, target)
                    costs.append(cost)
                    n = len(costs)
                    delta = np.abs(costs[n-1]/costs[n-2]-1)
                    if(delta < self.delta_stop) :
                        counter+=1
                        if(counter>=self.patience):
                            print(f'Early stop at epoch {i}, the cost is : {cost}')
                            costs.pop(0)
                            return costs
                    else :
                        counter =0
                    if i%100 == 0 and i>0 :
                        print(f'Loss after epoch {i} : {cost}')
                print(f'Loss after 5epoch {len(costs)} : {costs[-1]}')        
                costs.pop(0)
                return costs
                    
            elif self.optimizer == 'SGD' :
                for i in range(epochs):
                    cost = self.backpropSGD(X_train, target)
                    costs.append(cost)
                    n = len(costs)
                    delta = np.abs(costs[n-1]/costs[n-2]-1)
                    if(delta < self.delta_stop) :
                        counter+=1
                        if(counter>=self.patience):
                            print(f'Early stop at epoch {i}, the cost is : {cost}')
                            costs.pop(0)
                            return costs
                    else :
                        counter =0
                    if i%100 == 0 and i>0 :
                        print(f'Loss after epoch {i} : {cost}')
                print(f'Loss after 6epoch {len(costs)} : {costs[-1]}')        
                costs.pop(0)
                return costs
                
            elif self.optimizer == 'minibatch' :
                for i in range(epochs):
                    cost = self.backprop_minibatch(X_train, target)
                    costs.append(cost)
                    n = len(costs)
                    delta = np.abs(costs[n-1]/costs[n-2]-1)
                    if(delta < self.delta_stop) :
                        counter+=1
                        if(counter>=self.patience):
                            print(f'Early stop at epoch {i}, the cost is : {cost}')
                            costs.pop(0)
                            return costs
                    else :
                        counter =0
                    if i%100 == 0 and i>0 :
                        print(f'Loss after epoch {i} : {cost}')
                print(f'Loss after 7epoch {len(costs)} : {costs[-1]}')        
                costs.pop(0)
                return costs
                
            else :  
                for i in range(epochs): 
                    cost = self.backprop(X_train, target)
                    costs.append(cost)
                    n = len(costs)
                    delta = np.abs(costs[n-1]/costs[n-2]-1)
                    if(delta < self.delta_stop) :
                        counter+=1
                        if(counter>=self.patience):
                            print(f'Early stop at epoch {i}, the cost is : {cost}')
                            costs.pop(0)
                            return costs
                        else :
                            counter =0
                    if i%100 == 0 and i>0 :
                        print(f'Loss after epoch {i} : {cost}')        
                print(f'Loss after 8epoch {len(costs)} : {costs[-1]}')        
                costs.pop(0)
                return costs
          
            
       
    def evaluate(self, X_evaluate, target):
        '''
        return accuracy score, target must be the classes and not the hot encoded target
        '''
        
        y_pred = self.predict_class(X_evaluate)
        accuracy = classification_rate(y_pred, target)
        print('Accuracy :', accuracy)
        return accuracy
    
    def minibatch_size(self, n_samples):
        '''
        Compute minibatch size in case its not provided
        '''
        if n_samples < 2000:
            return n_samples
        if n_samples < 12800:
            return 64
        if n_samples < 25600:
            return 128
        if n_samples < 51200:
            return 256
        if n_samples < 102400:
            return 512
        return 1024
        
        
        
        
            

In [191]:
nn = HiddenTwo(input_nodes = D, 
               output_nodes = K, 
               hidden_nodes_1 = M,
               hidden_nodes_2 = M-1,
               learning_rate = 0.01,
               activation_hidden_1 = relu,
               activation_hidden_2 = relu)


In [192]:
nn.run(X_train, y_train_cat, epochs=1000 )

Loss after epoch 0 : 1.233171162332996
Loss after epoch 100 : 0.21597558644323173
Loss after epoch 200 : 0.20028702353854805
Loss after epoch 300 : 0.1869508340775123
Loss after epoch 400 : 0.15817168500537604
Loss after epoch 500 : 0.14096785701019318
Loss after epoch 600 : 0.13359149740797818
Loss after epoch 700 : 0.1273766388234808
Loss after epoch 800 : 0.12225297351401736
Loss after epoch 900 : 0.11798193686301858


[1.233171162332996,
 1.0249252712277452,
 0.7202910852378479,
 0.5210404751424742,
 0.6043331680601531,
 0.26305352151937117,
 0.2429786069297435,
 0.23868718338907358,
 0.23648939443658523,
 0.23528441132509573,
 0.23456695947894565,
 0.23407789196230452,
 0.23369525776347358,
 0.23336250273311468,
 0.23305723366877573,
 0.23277026554252842,
 0.23249479451062963,
 0.23222888540560277,
 0.23197088509871053,
 0.23171946063335602,
 0.23147408152418353,
 0.23123465165915696,
 0.23100072188221152,
 0.23077271409728117,
 0.23055115605667006,
 0.23033615241504443,
 0.2301263551636262,
 0.22992124959263174,
 0.22971997703263794,
 0.2295215798755038,
 0.22932447456614924,
 0.2291273009964786,
 0.22892893999897793,
 0.22872848811272906,
 0.22852499774299626,
 0.22831825901967226,
 0.22810915671130716,
 0.22789830757102314,
 0.2276857328511048,
 0.2274723671593447,
 0.2272601303827865,
 0.2270492609006314,
 0.22683985074388754,
 0.22663194642386325,
 0.22642575440701077,
 0.22622108763449278,
 0

In [193]:
acc = nn.evaluate(X_test, y_test)

Accuracy : 0.5174


[Back to top](#Content:)


<a id='part5'></a>

### Part 5 -  Loading Fashion MNIST

In [157]:
from tensorflow.keras.datasets import fashion_mnist


In [158]:
fashion = fashion_mnist.load_data()

In [159]:
(X_train, y_train),(X_test, y_test) = fashion

In [160]:
print(X_train.shape)

(60000, 28, 28)


In [161]:
M = X_train.shape[1]
N_train = X_train.shape[0]
N_test = X_test.shape[0]

In [162]:
X_train = X_train.reshape(N_train, M*M, 1).squeeze()
X_test = X_test.reshape(N_test, M*M, 1).squeeze()

# Fashion MNIST with 1 hidden layer

In [163]:
y_train_cat = to_categorical(y_train).T
y_test_cat = to_categorical(y_test).T

In [164]:
print(X_train.shape)

(60000, 784)


In [165]:
D = X_train.shape[1]
K = y_train_cat.shape[0]
M=5
nn = HiddenOne(input_nodes = D, 
               output_nodes = K, 
               hidden_nodes = M,
               learning_rate = 0.01,
               activation_hidden = tanh)

In [166]:
MAX = 255
X_train = X_train/ MAX
X_test =X_test/ MAX

In [167]:
X_train.shape

(60000, 784)

In [176]:
D = X_train.shape[1]
K = y_train_cat.shape[0]
M=5
nn = HiddenOne(input_nodes = D, 
               output_nodes = K, 
               hidden_nodes = M,
               learning_rate = 0.01,
               activation_hidden = relu,
               optimizer='minibatch',
                delta_stop = 1e-7,
                patience = 5,
              )

In [177]:
c = nn.run(X_train, y_train_cat, epochs=200 )

Loss after epoch 100 : 0.058933279699759554
Loss after 7epoch 201 : 0.053734693616789286


In [178]:
acc = nn.evaluate(X_test, y_test)

Accuracy : 0.799


### SGD

In [171]:
nn_SGD = HiddenOne(input_nodes = D, 
               output_nodes = K, 
               hidden_nodes = M,
               learning_rate = 0.01,
               activation_hidden = relu,
               optimizer='SGD',
               #batch_size = 28,
                #delta_stop = 1e-3,
                #patience = 5,
              )

In [172]:
c = nn_SGD.run(X_train, y_train_cat, epochs=300 )

Loss after epoch 100 : 0.05166687255871911
Loss after epoch 200 : 0.0519095842593376
Loss after 2epoch 301 : 0.05161651827328838


In [173]:
acc = nn.evaluate(X_test, y_test)

Accuracy : 0.8019


# ADAM

In [179]:
nn_adam = HiddenOne(input_nodes = D, 
               output_nodes = K, 
               hidden_nodes = M,
               learning_rate = 0.01,
               activation_hidden = relu,
               optimizer='adam',
               #batch_size = 28,
                #delta_stop = 1e-3,
                #patience = 5,
              )

In [180]:
c = nn_adam.run(X_train, y_train_cat, epochs=300 )

Loss after epoch 0 : 3.2217382596449964
Loss after epoch 1 : 2.016094800870134
Loss after epoch 2 : 0.8334400666002773
Loss after epoch 3 : 0.6489400468550212
Loss after epoch 4 : 0.4847626263412157
Loss after epoch 5 : 0.31312675858433453
Loss after epoch 6 : 0.22652070062418284
Loss after epoch 7 : 0.2310943814192747
Loss after epoch 8 : 0.23151645106743207
Loss after epoch 9 : 0.23187100720981477
Loss after epoch 10 : 0.23219146863839502
Loss after epoch 11 : 0.23246233230605784
Loss after epoch 12 : 0.2326751100799611
Loss after epoch 13 : 0.2328266577928193
Loss after epoch 14 : 0.2329174611060478
Loss after epoch 15 : 0.23294982194050157
Loss after epoch 16 : 0.23292833213103503
Loss after epoch 17 : 0.23285896208261053
Loss after epoch 18 : 0.23274942869402696
Loss after epoch 19 : 0.23260678766768073
Loss after epoch 20 : 0.23243856229710835
Loss after epoch 21 : 0.2322525300155777
Loss after epoch 22 : 0.2320554668081128
Loss after epoch 23 : 0.23185295750386228
Loss after epo

Loss after epoch 196 : 0.17261708488557384
Loss after epoch 197 : 0.1723337713577593
Loss after epoch 198 : 0.1720582110326311
Loss after epoch 199 : 0.1717935099727183
Loss after epoch 200 : 0.1715395109307802
Loss after epoch 200 : 0.1715395109307802
Loss after epoch 201 : 0.17130442638492654
Loss after epoch 202 : 0.17108599151222426
Loss after epoch 203 : 0.17088283613209343
Loss after epoch 204 : 0.17068983835851842
Loss after epoch 205 : 0.17050161868140704
Loss after epoch 206 : 0.1703184498648945
Loss after epoch 207 : 0.17014119009398904
Loss after epoch 208 : 0.16997239700996924
Loss after epoch 209 : 0.16981278082002035
Loss after epoch 210 : 0.16966045691870263
Loss after epoch 211 : 0.16951430112721788
Loss after epoch 212 : 0.169372385279318
Loss after epoch 213 : 0.1692343912017149
Loss after epoch 214 : 0.1691008165677335
Loss after epoch 215 : 0.1689711313563907
Loss after epoch 216 : 0.16884537502049257
Loss after epoch 217 : 0.16872368416678404
Loss after epoch 218 :

In [181]:
acc = nn.evaluate(X_test, y_test)

Accuracy : 0.799


# 2 layers

In [195]:
nn = HiddenTwo(input_nodes = D, 
               output_nodes = K, 
               hidden_nodes_1 = M,
               hidden_nodes_2 = M,
               learning_rate = 0.01,
               activation_hidden_1 = relu,
               activation_hidden_2 = relu)


In [196]:
nn.run(X_train, y_train_cat, epochs=10000 )

Loss after epoch 100 : 0.23209225137947403
Loss after epoch 200 : 0.22986403893323532
Loss after epoch 300 : 0.22702817902183567
Loss after epoch 400 : 0.2224261825710151
Loss after epoch 500 : 0.21352530665882163
Loss after epoch 600 : 0.2067784009860351
Loss after epoch 700 : 0.2024421780077031
Loss after epoch 800 : 0.19911217753688473


KeyboardInterrupt: 

In [170]:
acc = nn.evaluate(X_test, y_test)

Accuracy : 0.7163


In [93]:
tn = truncated_normal(mean=2, sd=1, low=-0.5, upp=0.5) 
# W1 of size hidden x features
n = D * M
W1 = tn.rvs(n).reshape((M, D )) # hidden x features
# W2 of size output x hidden
m = M  * K
W2 = tn.rvs(m).reshape((K, M)) # output x hidden
b1 = tn.rvs(M).reshape(-1,1) 
b2 = tn.rvs(K).reshape(-1,1) 
                

In [95]:
X = X_train
Z1 = W1.dot(X.T) + b1 # Hidden x N_samples
A1 = tanh(Z1)      # Hidden x N_samples
Z2 = W2.dot(A1) + b2  # Output x N_samples
A2 = softmax(Z2)      #Output x N_samples

      

In [98]:
print(A2.shape)

(10, 60000)


In [99]:
print(y_train_cat.shape)

(10, 60000)


In [101]:
m = X.shape[0]
# deltas
dZ2 = A2 - y_train_cat                                   #Output x N_samples
dW2 = dZ2.dot(A1.T)/m                                   #Output x hidden
db2 = np.sum(dZ2, axis=1, keepdims=True)/m              #Output x 1
dZ1 = W2.T.dot(dZ2)*dt(Z1)     # Hidden x N_samples
dW1 = dZ1.dot(X)/m                                      # Hidden x N_Features
db1 = np.sum(dZ1, axis=1, keepdims=True)/m              # Hidden x 1


In [102]:
# Update
lr = 0.01
W2 -= lr*dW2
b2 -= lr*db2
W1 -= lr*dW1
b1 -= lr*db1

In [103]:
cost = cross_entropy(y_train_cat, A2)

In [104]:
cost

0.23973236540442144

# Testing Adam

In [21]:
from sklearn import datasets
iris = datasets.load_iris()
data = iris.data
target = iris.target

from tensorflow.keras.utils import to_categorical
t = to_categorical(target)

M = 5
D = data.shape[1]
K = len(set(target))
beta1 = 0.9   
beta2 = 0.999
X_train, X_test, y_train, y_test = train_test_split(data ,target ,test_size=0.25)
y_train_cat = to_categorical(y_train).T
y_test_cat = to_categorical(y_test).T

In [22]:
Vdw1 = np.zeros((M, D ))
Vdw2 = np.zeros((K, M))
Vdb1 = np.zeros((M, 1 ))
Vdb2 = np.zeros((K, 1 ))
Sdw1 = np.zeros((M, D))
Sdw2 = np.zeros((K, M))
Sdb1 = np.zeros((M, 1 ))
Sdb2 = np.zeros((K, 1 ))


In [29]:
def forward(X):
    Z1 = W1.dot(X.T) + b1 # Hidden x N_samples
    A1 = tanh(Z1)      # Hidden x N_samples
    Z2 = W2.dot(A1) + b2  # Output x N_samples
    A2 = softmax(Z2)      #Output x N_samples
    return A2, Z2, A1, Z1

In [30]:
def backpropADAM(X, target):
    # Forward prop
    A2, Z2, A1, Z1 = forward(X)
    # Compute cost
    cost = cross_entropy(target, A2)
    # N samples
    m = X.shape[0]
    # deltas
    dZ2 = A2 - target                                       #Output x N_samples
    dW2 = dZ2.dot(A1.T)/m                                   #Output x hidden
    db2 = np.sum(dZ2, axis=1, keepdims=True)/m              #Output x 1
    dZ1 = W2.T.dot(dZ2)*(1-tanh(Z1)**2)     # Hidden x N_samples
    dW1 = dZ1.dot(X)/m                                      # Hidden x N_Features
    db1 = np.sum(dZ1, axis=1, keepdims=True)/m              # Hidden x 1
    # Adam updates
    beta1 = beta1
    beta2 = beta2
    # V
    Vdw1 = beta1*Vdw1 + (1-beta1)*dW1
    Vdw2 = beta1*Vdw2 + (1-beta1)*dW2
    Vdb1 = beta1*Vdb1 + (1-beta1)*db1
    Vdb2 = beta1*Vdb2 + (1-beta1)*db2
    # S
    Sdw1 = beta2*Sdw1 + (1-beta2)*dW1**2
    Sdw2 = beta2*Sdw2 + (1-beta2)*dW2**2
    Sdb1 = beta2*Sdb1 + (1-beta2)*db1**2
    Sdb2 = beta2*Sdb2 + (1-beta2)*db2**2    
    # Update
    lr = learning_rate
    W2 -= lr * Vdw2 / (np.sqrt(Sdw2)+1e-8)
    b2 -= lr * Vdb2 / (np.sqrt(Sdb2)+1e-8)
    W1 -= lr * Vdw1 / (np.sqrt(Sdw1)+1e-8)
    b1 -= lr * Vdb1 / (np.sqrt(Sdb1)+1e-8)
    return cost  

In [39]:
tn = truncated_normal(mean=2, sd=1, low=-0.5, upp=0.5) 
# W1 of size hidden x features
n = D * M
global W1 
W1 = tn.rvs(n).reshape((M, D )) # hidden x features
# W2 of size output x hidden
m = M  * K
global W2
W2 = tn.rvs(m).reshape((K, M)) # output x hidden
global b1
b1 = tn.rvs(M).reshape(-1,1) 
global b2 
b2 = tn.rvs(K).reshape(-1,1) 

epochs = 20
for i in range(epochs):
    # Bias correction for V
    Vdw1 = Vdw1 / (1-beta1**i)
    Vdw2 = Vdw2 / (1-beta1**i)
    Vdb1 = Vdb1 / (1-beta1**i)
    Vdb2 = Vdb2 / (1-beta1**i)
    # S
    Sdw1 = Sdw1 / (1-beta2**i)
    Sdw2 = Sdw2 / (1-beta2**i)
    Sdb1 = Sdb1 / (1-beta2**i)
    Sdb2 = Sdb2 / (1-beta2**i)
    
    
    # Forward prop
    A2, Z2, A1, Z1 = forward(X_train)
    # Compute cost
    cost = cross_entropy(y_train, A2)
    # N samples
    m = X_train
    # deltas
    dZ2 = A2 - y_train                                       #Output x N_samples
    dW2 = dZ2.dot(A1.T)/m                                   #Output x hidden
    db2 = np.sum(dZ2, axis=1, keepdims=True)/m              #Output x 1
    dZ1 = W2.T.dot(dZ2)*(1-tanh(Z1)**2)     # Hidden x N_samples
    dW1 = dZ1.dot(X)/m                                      # Hidden x N_Features
    db1 = np.sum(dZ1, axis=1, keepdims=True)/m              # Hidden x 1
   
    # V
    Vdw1 = beta1*Vdw1 + (1-beta1)*dW1
    Vdw2 = beta1*Vdw2 + (1-beta1)*dW2
    Vdb1 = beta1*Vdb1 + (1-beta1)*db1
    Vdb2 = beta1*Vdb2 + (1-beta1)*db2
    # S
    Sdw1 = beta2*Sdw1 + (1-beta2)*dW1**2
    Sdw2 = beta2*Sdw2 + (1-beta2)*dW2**2
    Sdb1 = beta2*Sdb1 + (1-beta2)*db1**2
    Sdb2 = beta2*Sdb2 + (1-beta2)*db2**2    
    # Update
    lr = learning_rate
    W2 -= lr * Vdw2 / (np.sqrt(Sdw2)+1e-8)
    b2 -= lr * Vdb2 / (np.sqrt(Sdb2)+1e-8)
    W1 -= lr * Vdw1 / (np.sqrt(Sdw1)+1e-8)
    b1 -= lr * Vdb1 / (np.sqrt(Sdb1)+1e-8)
    
    
    print(f'Loss after epoch {i} : {cost}')
    costs.append(cost)
    if i%100 == 0 and i>0 :
        print(f'Loss after epoch {i} : {cost}')
    print(f'Loss after 1epoch {len(costs)} : {costs[-1]}')        
    costs.pop(0)
    

ValueError: operands could not be broadcast together with shapes (3,5) (112,4) 

In [40]:
A2, Z2, A1, Z1 = forward(X_train)


(112,)

In [None]:
cost = cross_entropy(y_train_cat, A2)

In [16]:
i = 5
beta1**i

TypeError: unsupported operand type(s) for ** or pow(): 'tuple' and 'int'

In [33]:
W2

array([[-1.72788095e-01,  4.10790325e-01,  4.57705823e-01,
         1.16515290e-01,  1.55025966e-01],
       [ 2.73742100e-01, -1.42290534e-01,  4.54997999e-01,
         3.48680215e-01,  4.83674311e-01],
       [ 7.20732218e-02,  4.60749837e-01,  9.71750370e-03,
         4.71877878e-01, -2.05773248e-04]])