In [157]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import base64
import os
import io
import requests
import random

from sklearn import preprocessing
from sklearn import metrics
from sklearn.model_selection import train_test_split

from scipy.special import expit as activation_function
from scipy.stats import truncnorm

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras import datasets

In [158]:
# Fixed seed so every draw from `rng` (e.g. the per-epoch shuffles) is
# reproducible under Restart Kernel -> Run All.
SEED = 42
rng = np.random.default_rng(SEED)

In [159]:
def truncated_normal(mean=0, sd=1, low=0, upp=10):
    """Return a frozen normal(mean, sd) distribution truncated to [low, upp].

    ``truncnorm`` expects its clipping points in units of standard deviations
    from the mean, so the bounds are standardised before being passed on.
    """
    lower_std = (low - mean) / sd
    upper_std = (upp - mean) / sd
    return truncnorm(lower_std, upper_std, loc=mean, scale=sd)

def softmax(X):
    """Softmax along axis 0 (each column of ``X`` is one sample's scores).

    Every column is shifted by its own maximum before exponentiating. This
    leaves the result unchanged mathematically, but keeps a column whose
    scores are far below the global maximum from underflowing to 0/0 = NaN.
    """
    shifted = X - np.max(X, axis=0, keepdims=True)
    e = np.exp(shifted)
    return e / e.sum(axis=0, keepdims=True)


def cross_entropy(target, output):
    """Cross-entropy between one-hot ``target`` and predicted ``output``.

    The value is averaged over *all* entries of the arrays (classes x samples),
    not over samples only, which is the scale the rest of the notebook reports.

    Probabilities are floored at a tiny epsilon before the log so that an
    exact 0 in ``output`` yields a large finite loss (or 0 where the target is
    0) instead of ``0 * -inf = NaN``.
    """
    eps = 1e-15
    safe_output = np.maximum(output, eps)
    return -np.mean(target*np.log(safe_output))

def cross_entropy_matrix(output, target):
    """Mean per-row cross-entropy; rows are samples, columns are classes.

    Note the argument order is (output, target), the reverse of
    ``cross_entropy``.

    Probabilities are floored at a tiny epsilon before the log so that an
    exact 0 in ``output`` cannot turn the result into NaN via ``0 * -inf``.
    """
    eps = 1e-15
    target = np.array(target)
    output = np.maximum(np.array(output), eps)
    # Sum over classes (axis 1) gives one loss per sample.
    per_sample = -np.sum(target*np.log(output), axis=1)
    return np.sum(per_sample) / len(per_sample)

def sigmoid(x):
    """Logistic function 1 / (1 + exp(-x)), applied element-wise."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator

def ds(x):
    """Derivative of the sigmoid, s(x) * (1 - s(x)), applied element-wise."""
    s = sigmoid(x)
    return s * (1 - s)

def relu(x):
    """Rectified linear unit: element-wise max(x, 0)."""
    rectified = np.maximum(x, 0)
    return rectified
  

def dr(x):
    """Derivative of ReLU: 0 for x < 0, 1 for x > 0 and 0.5 at exactly 0."""
    return 0.5 * (np.sign(x) + 1)

def tanh(x):
    """Hyperbolic tangent, applied element-wise.

    Delegates to ``np.tanh``: the explicit (e^x - e^-x) / (e^x + e^-x) form
    overflows to inf/inf = NaN once |x| is large, whereas ``np.tanh``
    saturates cleanly at +/-1.
    """
    return np.tanh(x)

def dt(x):
    """Derivative of tanh: 1 - tanh(x)^2, applied element-wise."""
    th = tanh(x)
    return 1 - th**2
    
def leaky(x, a=0.01):
    """Leaky ReLU: x for x > 0 and a * x for x < 0, applied element-wise.

    Parameters
    ----------
    x : array-like or scalar
    a : float, default 0.01
        Slope on the negative side. The default lets ``leaky`` be called
        with a single argument, which is how the network classes invoke
        their hidden activation.

    The positive part is ``np.maximum(x, 0)`` on its own; multiplying it by
    ``x`` again (as the previous version did) returned x**2 for x > 0.
    """
    return np.maximum(x, 0) + a*np.minimum(x, 0)

def dl(x, a=0.01):
    """Derivative of leaky ReLU: 1 for x > 0, a for x < 0, (1 + a) / 2 at 0.

    Parameters
    ----------
    x : array-like or scalar
    a : float, default 0.01
        Slope on the negative side. The default lets ``dl`` be called with a
        single argument, which is how the network classes invoke the hidden
        derivative.
    """
    s = np.sign(x)
    # (s + 1) / 2 selects the positive side, -(s - 1) / 2 the negative side.
    return (s + 1)/2 - a*(s - 1)/2

def derivative(f):
    """Return the derivative function that belongs to activation ``f``.

    Known activations are sigmoid, tanh, relu and leaky; for anything else
    the result is None.
    """
    known_pairs = (
        (sigmoid, ds),
        (tanh, dt),
        (relu, dr),
        (leaky, dl),
    )
    for activation, gradient in known_pairs:
        if f == activation:
            return gradient
    return None

def y2indicator(y, K):
    """One-hot encode the integer labels ``y`` into a len(y) x K float matrix."""
    n_samples = len(y)
    indicator = np.zeros((n_samples, K))
    for row in range(n_samples):
        indicator[row, y[row]] = 1
    return indicator

def classification_rate(Y, P):
    """Fraction of positions at which ``Y`` and ``P`` agree."""
    hits = Y == P
    return np.mean(hits)

# One Hidden Layer

# Variables :

- **X**     : N_Samples x N_features
- **W1**    : Hidden x N_features
- **b1**    : Hidden
- **W2**    : Output x Hidden
- **b2**    : Output

In [175]:
class HiddenOne:
    """Feed-forward classifier with one hidden layer and a softmax output.

    Shape conventions (N samples, D features, H hidden units, K classes):

    - X  : N x D
    - W1 : H x D,  b1 : H x 1
    - W2 : K x H,  b2 : K x 1
    - one-hot targets : K x N
    """

    def __init__(self,
                 input_nodes,
                 output_nodes,
                 hidden_nodes,
                 learning_rate,
                 activation_hidden,
                 optimizer = None,
                 batch_size = None,
                 delta_stop = None,
                 patience = 1,
                ):
        """Store the hyper-parameters and draw the initial weights and biases.

        Parameters
        ----------
        input_nodes, output_nodes, hidden_nodes : int
            Layer widths (D, K and H).
        learning_rate : float
            Step size of every gradient update.
        activation_hidden : callable
            Hidden activation; its derivative is looked up with `derivative`.
        optimizer : {None, 'SGD', 'minibatch'}
            'SGD' updates after every sample, 'minibatch' after every batch,
            anything else once per pass over the whole training set.
        batch_size : int or None
            Mini-batch size; None lets `minibatch_size` choose it.
        delta_stop : float or None
            Early-stopping threshold on the relative change of the epoch
            cost; None disables early stopping.
        patience : int
            Consecutive epochs below `delta_stop` required before stopping.
        """
        self.input_nodes = input_nodes
        self.output_nodes = output_nodes
        self.hidden_nodes = hidden_nodes
        self.learning_rate = learning_rate
        self.activation_hidden = activation_hidden
        self.hidden_derivative = derivative(self.activation_hidden)
        self.optimizer = optimizer
        self.batch_size = batch_size
        self.delta_stop = delta_stop
        self.patience = patience
        self.create_weight_matrices()
        self.create_biases()

    def create_weight_matrices(self):
        """Draw W1 (H x D) and W2 (K x H) from a normal truncated to [-0.5, 0.5]."""
        # Centred on 0: the former mean=2 lay outside the truncation interval
        # and skewed the initial weights toward +0.5.
        tn = truncated_normal(mean=0, sd=1, low=-0.5, upp=0.5)
        self.W1 = tn.rvs(size=(self.hidden_nodes, self.input_nodes))    # hidden x features
        self.W2 = tn.rvs(size=(self.output_nodes, self.hidden_nodes))   # output x hidden

    def create_biases(self):
        """Draw the column vectors b1 (H x 1) and b2 (K x 1)."""
        tn = truncated_normal(mean=0, sd=1, low=-0.5, upp=0.5)
        self.b1 = tn.rvs(self.hidden_nodes).reshape(-1,1)
        self.b2 = tn.rvs(self.output_nodes).reshape(-1,1)

    def forward(self, X):
        """Forward pass on ``X`` (N x D); returns (A2, Z2, A1, Z1), samples in columns."""
        Z1 = self.W1.dot(X.T) + self.b1      # Hidden x N_samples
        A1 = self.activation_hidden(Z1)      # Hidden x N_samples
        Z2 = self.W2.dot(A1) + self.b2       # Output x N_samples
        A2 = softmax(Z2)                     # Output x N_samples
        return A2, Z2, A1, Z1

    def _gradient_step(self, X, target):
        """One forward/backward pass on a batch followed by a parameter update.

        ``X`` is batch x D and ``target`` is K x batch. Returns the batch
        cost measured *before* the update.
        """
        A2, Z2, A1, Z1 = self.forward(X)
        cost = cross_entropy(target, A2)
        m = X.shape[0]
        # Softmax + cross-entropy gives the simple output delta A2 - target.
        dZ2 = A2 - target                                       # Output x batch
        dW2 = dZ2.dot(A1.T)/m                                   # Output x hidden
        db2 = np.sum(dZ2, axis=1, keepdims=True)/m              # Output x 1
        dZ1 = self.W2.T.dot(dZ2)*self.hidden_derivative(Z1)     # Hidden x batch
        dW1 = dZ1.dot(X)/m                                      # Hidden x N_features
        db1 = np.sum(dZ1, axis=1, keepdims=True)/m              # Hidden x 1
        lr = self.learning_rate
        self.W2 -= lr*dW2
        self.b2 -= lr*db2
        self.W1 -= lr*dW1
        self.b1 -= lr*db1
        return cost

    def backprop(self, X, target):
        """Full-batch gradient descent step; returns the cost before the update."""
        return self._gradient_step(X, target)

    def backpropSGD(self, X, target):
        """One epoch of per-sample SGD in a fresh random order.

        Returns the mean of the per-sample costs, each measured just before
        that sample's update.
        """
        m = X.shape[0]
        # permutation() returns the shuffled indices; shuffle() works in
        # place and returns None, which silently left the data unshuffled.
        order = rng.permutation(m)
        cost = 0
        for idx in order:
            x = X[idx:idx+1, :]              # 1 x N_features
            t = target[:, idx:idx+1]         # Output x 1, same sample as x
            cost = cost + self._gradient_step(x, t)/m
        return cost

    def backprop_minibatch(self, X, target):
        """One epoch of mini-batch gradient descent in a fresh random order.

        Every sample is visited once; a final batch smaller than the batch
        size is still used. Returns the sample-weighted mean of the batch
        costs.
        """
        n_samples = X.shape[0]
        if self.batch_size is None:
            batch_size = self.minibatch_size(n_samples)
        else:
            batch_size = self.batch_size

        order = rng.permutation(n_samples)
        cost = 0
        for start in range(0, n_samples, batch_size):
            batch_idx = order[start:start+batch_size]
            X_batch = X[batch_idx, :]            # batch x N_features
            t_batch = target[:, batch_idx]       # Output x batch
            batch_cost = self._gradient_step(X_batch, t_batch)
            cost = cost + batch_cost*len(batch_idx)/n_samples
        return cost

    def predict(self, X_predict):
        """Return the class probabilities (K x N) for ``X_predict``."""
        A2, Z2, A1, Z1 = self.forward(X_predict)
        return A2

    def predict_class(self, X_predict):
        """Return the most probable class index for every row of ``X_predict``."""
        A2, Z2, A1, Z1 = self.forward(X_predict)
        y_pred = np.argmax(A2, axis=0)
        return y_pred

    def _train_epoch(self, X_train, target):
        """Run one epoch with the configured optimizer and return its cost."""
        if self.optimizer == 'SGD':
            return self.backpropSGD(X_train, target)
        if self.optimizer == 'minibatch':
            return self.backprop_minibatch(X_train, target)
        return self.backprop(X_train, target)

    @staticmethod
    def _relative_change(previous, current):
        """|current / previous - 1|, without dividing by a zero previous cost."""
        if previous == 0:
            return 0.0 if current == 0 else float('inf')
        return np.abs(current/previous - 1)

    def run(self, X_train, target, epochs=10):
        """Train for up to ``epochs`` epochs and return the list of epoch costs.

        With ``delta_stop`` set, training stops once the relative change of
        the cost has stayed below ``delta_stop`` for ``patience`` consecutive
        epochs. Progress is printed every 100 epochs.
        """
        costs = []
        stalled_epochs = 0
        for i in range(epochs):
            cost = self._train_epoch(X_train, target)
            previous = costs[-1] if costs else None
            costs.append(cost)

            if self.delta_stop is None:
                if i%100 == 0:
                    print(f'Loss after epoch {i+1} : {cost}')
                continue

            # The first epoch has nothing to compare against.
            if previous is not None and self._relative_change(previous, cost) < self.delta_stop:
                stalled_epochs += 1
                if stalled_epochs >= self.patience:
                    print(f'Early stop at epoch {i}, the cost is : {cost}')
                    return costs
            else:
                stalled_epochs = 0

            if i%100 == 0 and i>0 :
                print(f'Loss after epoch {i} : {cost}')
        return costs

    def evaluate(self, X_evaluate, target):
        '''
        return accuracy score, target must be the classes and not the hot encoded target
        '''
        y_pred = self.predict_class(X_evaluate)
        accuracy = classification_rate(y_pred, target)
        print('Accuracy :', accuracy)
        return accuracy

    def minibatch_size(self, n_samples):
        """Default batch size: the whole set below 2000 samples, then 64 up to 1024."""
        if n_samples < 2000:
            return n_samples
        if n_samples < 12800:
            return 64
        if n_samples < 25600:
            return 128
        if n_samples < 51200:
            return 256
        if n_samples < 102400:
            return 512
        return 1024
    
        
        
            

In [176]:
# NOTE: this rebinds `datasets`, which the import cell bound to
# tensorflow.keras.datasets; from here on `datasets` is sklearn's module.
from sklearn import datasets
iris = datasets.load_iris()
data = iris.data        # feature matrix, one row per sample
target = iris.target    # integer class labels

In [177]:
from tensorflow.keras.utils import to_categorical
# One-hot encoding of all labels.
# NOTE(review): `t` does not appear to be used by any later cell.
t = to_categorical(target)

In [178]:
M = 5                     # hidden units
D = data.shape[1]         # input features
K = len(set(target))      # classes
# Fixed random_state -> the same split on every run; stratify keeps every
# class present in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    data, target, test_size=0.25, random_state=42, stratify=target)
# num_classes pins the one-hot width to K even if a split lacked the highest
# label; .T puts samples in columns (K x N), as the network classes expect.
y_train_cat = to_categorical(y_train, num_classes=K).T
y_test_cat = to_categorical(y_test, num_classes=K).T



In [224]:
# One-hidden-layer tanh network for iris: mini-batches of 28 samples, early
# stopping once the relative cost change stays below 1e-3 for 5 epochs.
nn = HiddenOne(
    input_nodes=D,
    output_nodes=K,
    hidden_nodes=M,
    learning_rate=0.01,
    activation_hidden=tanh,
    optimizer='minibatch',
    batch_size=28,
    delta_stop=1e-3,
    patience=5,
)

In [225]:
# Train for at most 1000 epochs; `c` holds the per-epoch costs.
c = nn.run(X_train, y_train_cat, epochs=1000);

Loss after epoch 100 : 0.21743216742422897
Loss after epoch 200 : 0.1481239853712665
Loss after epoch 300 : 0.1091663266324776
Loss after epoch 400 : 0.0824858137141755
Loss after epoch 500 : 0.06512065566979974
Loss after epoch 600 : 0.053591334831718976
Loss after epoch 700 : 0.04564935085155455
Loss after epoch 800 : 0.03996679904068107
Loss after epoch 900 : 0.0357263409385219
Early stop at epoch 928, the cost is : 0.03472671301758435


In [226]:
# Test-set accuracy; `evaluate` takes the integer labels, not the one-hot matrix.
acc = nn.evaluate(X_test, y_test)

Accuracy : 0.9736842105263158


In [67]:
# Scratch cell: the same initialisation as HiddenOne, but at module level so
# that a single update can be stepped through by hand in the cells below.
tn = truncated_normal(mean=2, sd=1, low=-0.5, upp=0.5)
n = M * D                                # number of entries in W1
m = M * K                                # number of entries in W2
W1 = tn.rvs(n).reshape((M, D))           # hidden x features
W2 = tn.rvs(m).reshape((K, M))           # output x hidden
b1 = tn.rvs(M).reshape(-1, 1)            # hidden x 1
b2 = tn.rvs(K).reshape(-1, 1)            # output x 1

In [68]:
def forward(X):
    """Forward pass with the module-level W1, b1, W2, b2 and a tanh hidden layer.

    Returns (A2, Z2, A1, Z1) with samples in columns.
    """
    hidden_pre = W1.dot(X.T) + b1            # Hidden x N_samples
    hidden_act = tanh(hidden_pre)            # Hidden x N_samples
    output_pre = W2.dot(hidden_act) + b2     # Output x N_samples
    probabilities = softmax(output_pre)      # Output x N_samples
    return probabilities, output_pre, hidden_act, hidden_pre

In [69]:
# Number of samples is axis 0 of X_train (axis 1 is the feature count).
m = X_train.shape[0]
# permutation() returns the shuffled indices; shuffle() works in place and
# returns None, which used to leave the data in its original order.
u = rng.permutation(m)
X_SGD = X_train[u, :]             # N_samples x N_features, shuffled copy
target_SGD = y_train_cat[:, u]    # Output x N_samples, same order as X_SGD
cost = 0

x = X_SGD[0,:].reshape(1,-1)      # first shuffled sample, 1 x N_features

In [110]:
# A single target column is a 1-D slice.
target_SGD[:,0].shape

(3,)

In [111]:
# Forward pass on the single sample `x` with the module-level weights.
a2, z2, a1, z1 = forward(x)


In [112]:
# Shape check of the output activations for the single sample.
a2.shape

(3, 1)

In [72]:
# cost update: compare the single-sample prediction a2 with that sample's own
# target column (as a column vector). Passing the whole target matrix would
# broadcast a2 against every sample's target.
cost = cost + cross_entropy(target_SGD[:,0].reshape(-1,1), a2)/m

In [73]:
# deltas for the single sample x = X_SGD[0]; the target column must come
# from target_SGD so that it belongs to the same (shuffled) sample.
dz2 = a2 - target_SGD[:,0].reshape(-1,1)    # Output x 1
dW2 = dz2.dot(a1.T)                         # Output x hidden
db2 = dz2                                   # Output x 1
dz1 = W2.T.dot(dz2)*dt(z1)                  # Hidden x 1
dW1 = dz1.dot(x)                            # Hidden x N_Features
db1 = dz1                                   # Hidden x 1

In [54]:
# Shape check of the output activations.
a2.shape

(3, 5)

In [23]:
# Shape of one column of the one-hot training targets (1-D slice).
y_train_cat[:,0].shape

(3,)

In [47]:
# Update: in-place gradient step on the module-level parameters. Each `-=`
# needs a gradient that broadcasts to the parameter's own shape.
# NOTE(review): the ValueError recorded below presumably comes from `b2`
# (K x 1) being updated with a stale, multi-sample `db2` - confirm.
lr = 0.01
W2 -= lr*dW2
b2 -= lr*db2
W1 -= lr*dW1
b1 -= lr*db1


ValueError: non-broadcastable output operand with shape (3,1) doesn't match the broadcast shape (3,112)

In [78]:
# Same iris network without an optimizer argument, i.e. full-batch
# gradient descent and no early stopping.
nn = HiddenOne(
    input_nodes=D,
    output_nodes=K,
    hidden_nodes=M,
    learning_rate=0.01,
    activation_hidden=tanh,
)

In [238]:
# Train for 10000 epochs. The call is the cell's last expression, so the
# returned list of per-epoch costs is echoed in the output as well.
nn.run(X_train, y_train_cat, epochs=10000 )

Loss after epoch 1 : 0.39340275517626755
Loss after epoch 101 : 0.36777361307845496
Loss after epoch 201 : 0.36661290475548086
Loss after epoch 301 : 0.3661962524130979
Loss after epoch 401 : 0.3659397587127295
Loss after epoch 501 : 0.365741506670213
Loss after epoch 601 : 0.3655567086226289
Loss after epoch 701 : 0.3653487364645902
Loss after epoch 801 : 0.3650597632911135
Loss after epoch 901 : 0.3645320214913403
Loss after epoch 1001 : 0.36297644349737057
Loss after epoch 1101 : 0.34395644028092526
Loss after epoch 1201 : 0.26754056115410707
Loss after epoch 1301 : 0.23192893710945078
Loss after epoch 1401 : 0.2125558471381393
Loss after epoch 1501 : 0.2000465512968387
Loss after epoch 1601 : 0.19130126055064564
Loss after epoch 1701 : 0.18486981270985564
Loss after epoch 1801 : 0.17995621611570123
Loss after epoch 1901 : 0.17608455955445507
Loss after epoch 2001 : 0.1729531482154863
Loss after epoch 2101 : 0.17036158605379784
Loss after epoch 2201 : 0.16817121098738386
Loss after 

[0.39340275517626755,
 0.39103774004457104,
 0.3891193337393632,
 0.38752212382079987,
 0.3861637019731666,
 0.38498791359293855,
 0.38395526590993173,
 0.38303720974673927,
 0.3822126042108011,
 0.38146545877743454,
 0.3807834488771659,
 0.3801569145174276,
 0.3795781690127707,
 0.3790410118238531,
 0.37854037878767294,
 0.37807208672673887,
 0.37763264409643427,
 0.377219108622404,
 0.3768289788910356,
 0.37646011082070463,
 0.3761106526031477,
 0.3757789935208342,
 0.37546372330481775,
 0.3751635995819277,
 0.3748775215897283,
 0.3746045087913122,
 0.37434368335258067,
 0.3740942556881465,
 0.3738555124630946,
 0.3736268065738086,
 0.3734075487340463,
 0.37319720037108733,
 0.37299526759728757,
 0.3728012960692982,
 0.37261486658382753,
 0.3724355912876059,
 0.37226311040197607,
 0.3720970893806298,
 0.37193721643349803,
 0.37178320036144824,
 0.3716347686558602,
 0.37149166582479964,
 0.37135365191375913,
 0.371220501194047,
 0.37109200099612955,
 0.37096795066871263,
 0.3708481606

In [240]:
# Test-set accuracy against the integer class labels.
acc = nn.evaluate(X_test, y_test)

Accuracy : 0.9473684210526315


In [84]:
# Same iris network, trained with per-sample stochastic gradient descent.
nn = HiddenOne(
    input_nodes=D,
    output_nodes=K,
    hidden_nodes=M,
    learning_rate=0.01,
    activation_hidden=tanh,
    optimizer='SGD',
)

In [85]:
# Train for 10000 epochs; the returned list of per-epoch costs is echoed
# because the call is the cell's last expression.
nn.run(X_train, y_train_cat, epochs=10000 )

Loss after epoch 1 : 0.37051348344015045
Loss after epoch 101 : 0.6677007794137935
Loss after epoch 201 : 0.8553523359797114
Loss after epoch 301 : 0.9666741815933942
Loss after epoch 401 : 1.0431338104557264
Loss after epoch 501 : 1.100806638454739
Loss after epoch 601 : 1.146925503540364
Loss after epoch 701 : 1.1852735418839784
Loss after epoch 801 : 1.2180563406607332
Loss after epoch 901 : 1.2466654037350589
Loss after epoch 1001 : 1.272032346145685
Loss after epoch 1101 : 1.294810312439
Loss after epoch 1201 : 1.3154743541910485
Loss after epoch 1301 : 1.3343804927590948
Loss after epoch 1401 : 1.3518022573434605
Loss after epoch 1501 : 1.3679542469897388
Loss after epoch 1601 : 1.3830078654852453
Loss after epoch 1701 : 1.397102147464834
Loss after epoch 1801 : 1.4103514014045275
Loss after epoch 1901 : 1.4228507278951557
Loss after epoch 2001 : 1.4346800832853643
Loss after epoch 2101 : 1.4459073249431658
Loss after epoch 2201 : 1.456590529247101
Loss after epoch 2301 : 1.46677

[0.37051348344015045,
 0.36824438095734136,
 0.3739646022432506,
 0.3811633737158022,
 0.3876282636343944,
 0.39327122620557237,
 0.39824938804557836,
 0.4026937876007915,
 0.4067063583824954,
 0.4103765428244649,
 0.41379008593733424,
 0.4170297324301385,
 0.42017088618155796,
 0.42327596703607195,
 0.42639063490234597,
 0.42954333406257994,
 0.43274773590660126,
 0.4360066492807263,
 0.4393159996393379,
 0.4426680535747708,
 0.4460536496313569,
 0.44946354911514536,
 0.45288914692374815,
 0.45632277541007654,
 0.45975777695347964,
 0.4631884597321083,
 0.46661000376227313,
 0.4700183529167935,
 0.47341011000433164,
 0.4767824418428593,
 0.4801329961873737,
 0.48345983009599874,
 0.4867613484717043,
 0.49003625134082324,
 0.4932834885323715,
 0.49650222061554367,
 0.4996917851544096,
 0.5028516675183822,
 0.5059814756375678,
 0.5090809182161625,
 0.5121497860202071,
 0.5151879359440624,
 0.518195277638063,
 0.5211717625520915,
 0.5241173753195536,
 0.5270321274757048,
 0.5299160535748

In [77]:
# Test-set accuracy against the integer class labels.
acc = nn.evaluate(X_test, y_test)

Accuracy : 0.6052631578947368


# Two Hidden Layers

# Variables :

- **X**     : N_Samples x N_features
- **W1**    : Hidden1 x N_features
- **b1**    : Hidden1
- **W2**    : Hidden2 x Hidden1
- **b2**    : Hidden2
- **W3**    : Output x Hidden2
- **b3**    : Output

In [70]:
class HiddenTwo:
    """Feed-forward classifier with two hidden layers and a softmax output.

    Shape conventions (N samples, D features, H1/H2 hidden units, K classes):

    - X  : N x D
    - W1 : H1 x D,   b1 : H1 x 1
    - W2 : H2 x H1,  b2 : H2 x 1
    - W3 : K x H2,   b3 : K x 1
    - one-hot targets : K x N
    """

    def __init__(self,
                 input_nodes,
                 output_nodes,
                 hidden_nodes_1,
                 hidden_nodes_2,
                 learning_rate,
                 activation_hidden_1,
                 activation_hidden_2,
                ):
        """Store the hyper-parameters and draw the initial weights and biases.

        Parameters
        ----------
        input_nodes, output_nodes : int
            Number of input features (D) and classes (K).
        hidden_nodes_1, hidden_nodes_2 : int
            Widths of the first and second hidden layer.
        learning_rate : float
            Step size of every gradient update.
        activation_hidden_1, activation_hidden_2 : callable
            Hidden activations; their derivatives are looked up with
            `derivative`.
        """
        self.input_nodes = input_nodes
        self.output_nodes = output_nodes
        self.hidden_nodes_1 = hidden_nodes_1
        self.hidden_nodes_2 = hidden_nodes_2
        self.learning_rate = learning_rate
        self.activation_hidden_1 = activation_hidden_1
        self.activation_hidden_2 = activation_hidden_2
        self.hidden_derivative_1 = derivative(self.activation_hidden_1)
        self.hidden_derivative_2 = derivative(self.activation_hidden_2)
        self.create_weight_matrices()
        self.create_biases()

    def create_weight_matrices(self):
        """Draw W1, W2 and W3 from a normal truncated to [-0.5, 0.5]."""
        # Centred on 0: the former mean=2 lay outside the truncation interval
        # and skewed the initial weights toward +0.5.
        tn = truncated_normal(mean=0, sd=1, low=-0.5, upp=0.5)
        self.W1 = tn.rvs(size=(self.hidden_nodes_1, self.input_nodes))      # hidden1 x features
        self.W2 = tn.rvs(size=(self.hidden_nodes_2, self.hidden_nodes_1))   # hidden2 x hidden1
        self.W3 = tn.rvs(size=(self.output_nodes, self.hidden_nodes_2))     # output x hidden2

    def create_biases(self):
        """Draw the column vectors b1, b2 and b3."""
        tn = truncated_normal(mean=0, sd=1, low=-0.5, upp=0.5)
        self.b1 = tn.rvs(self.hidden_nodes_1).reshape(-1,1)
        self.b2 = tn.rvs(self.hidden_nodes_2).reshape(-1,1)
        self.b3 = tn.rvs(self.output_nodes).reshape(-1,1)

    def forward(self, X):
        """Forward pass on ``X`` (N x D); returns (A3, Z3, A2, Z2, A1, Z1)."""
        Z1 = self.W1.dot(X.T) + self.b1        # Hidden1 x N_samples
        A1 = self.activation_hidden_1(Z1)      # Hidden1 x N_samples
        Z2 = self.W2.dot(A1) + self.b2         # Hidden2 x N_samples
        A2 = self.activation_hidden_2(Z2)      # Hidden2 x N_samples
        Z3 = self.W3.dot(A2) + self.b3         # Output x N_samples
        A3 = softmax(Z3)                       # Output x N_samples
        return A3, Z3, A2, Z2, A1, Z1

    def backprop(self, X, target):
        """Full-batch gradient descent step.

        Returns the cost measured before the update (as HiddenOne.backprop
        does), so that callers do not need a second forward pass to log it.
        """
        A3, Z3, A2, Z2, A1, Z1 = self.forward(X)
        cost = cross_entropy(target, A3)
        m = X.shape[0]
        # Softmax + cross-entropy gives the simple output delta A3 - target.
        dZ3 = A3 - target                                         # Output x N_samples
        dW3 = dZ3.dot(A2.T)/m                                     # Output x Hidden2
        db3 = np.sum(dZ3, axis=1, keepdims=True)/m                # Output x 1
        dZ2 = self.W3.T.dot(dZ3)*self.hidden_derivative_2(Z2)     # Hidden2 x N_samples
        dW2 = dZ2.dot(A1.T)/m                                     # Hidden2 x Hidden1
        db2 = np.sum(dZ2, axis=1, keepdims=True)/m                # Hidden2 x 1
        dZ1 = self.W2.T.dot(dZ2)*self.hidden_derivative_1(Z1)     # Hidden1 x N_samples
        dW1 = dZ1.dot(X)/m                                        # Hidden1 x N_features
        db1 = np.sum(dZ1, axis=1, keepdims=True)/m                # Hidden1 x 1

        lr = self.learning_rate
        self.W3 -= lr*dW3
        self.b3 -= lr*db3
        self.W2 -= lr*dW2
        self.b2 -= lr*db2
        self.W1 -= lr*dW1
        self.b1 -= lr*db1
        return cost

    def predict(self, X_predict):
        """Return the class probabilities (K x N) for ``X_predict``."""
        A3, Z3, A2, Z2, A1, Z1 = self.forward(X_predict)
        return A3

    def predict_class(self, X_predict):
        """Return the most probable class index for every row of ``X_predict``."""
        A3, Z3, A2, Z2, A1, Z1 = self.forward(X_predict)
        y_pred = np.argmax(A3, axis=0)
        return y_pred

    def run(self, X_train, target, epochs=10):
        """Train full-batch for ``epochs`` epochs; return the list of epoch costs.

        Each recorded cost is the one measured before that epoch's update.
        Progress is printed every 100 epochs.
        """
        costs = []
        for i in range(epochs):
            cost = self.backprop(X_train, target)
            costs.append(cost)
            if i%100 == 0:
                print(f'Loss after epoch {i} : {cost}')
        return costs

    def evaluate(self, X_evaluate, target):
        '''
        return accuracy score, target must be the classes and not the hot encoded target
        '''
        y_pred = self.predict_class(X_evaluate)
        accuracy = classification_rate(y_pred, target)
        print('Accuracy :', accuracy)
        return accuracy
        
        
        
        
            

In [71]:
# Two-hidden-layer tanh network for iris with M and M-1 hidden units.
nn = HiddenTwo(
    input_nodes=D,
    output_nodes=K,
    hidden_nodes_1=M,
    hidden_nodes_2=M - 1,
    learning_rate=0.01,
    activation_hidden_1=tanh,
    activation_hidden_2=tanh,
)


In [72]:
# Full-batch training for 10000 epochs; the returned list of per-epoch costs
# is echoed because the call is the cell's last expression.
nn.run(X_train, y_train_cat, epochs=10000 )

Loss after epoch 0 : 0.43052450075533344
Loss after epoch 100 : 0.37957020204320085
Loss after epoch 200 : 0.35210309707940923
Loss after epoch 300 : 0.3188208696201698
Loss after epoch 400 : 0.27298965046000545
Loss after epoch 500 : 0.2314611247316099
Loss after epoch 600 : 0.20380904135917347
Loss after epoch 700 : 0.18670910956380002
Loss after epoch 800 : 0.17531041770056072
Loss after epoch 900 : 0.16647347546663915
Loss after epoch 1000 : 0.15848237654815536
Loss after epoch 1100 : 0.15051364355502048
Loss after epoch 1200 : 0.1422375164659015
Loss after epoch 1300 : 0.13358871541430414
Loss after epoch 1400 : 0.12466753232923737
Loss after epoch 1500 : 0.11568834497515186
Loss after epoch 1600 : 0.10691950381148614
Loss after epoch 1700 : 0.09861300736259554
Loss after epoch 1800 : 0.09095137989637579
Loss after epoch 1900 : 0.08402941976684392
Loss after epoch 2000 : 0.07786546693216173
Loss after epoch 2100 : 0.07242566397931635
Loss after epoch 2200 : 0.06764772945342407
Los

[0.43052450075533344,
 0.42980217747232324,
 0.4290849853891115,
 0.428372899211334,
 0.42766589358092827,
 0.42696394308231145,
 0.42626702224861635,
 0.4255751055679585,
 0.4248881674897048,
 0.4242061824307125,
 0.4235291247815108,
 0.4228569689123941,
 0.42218968917939914,
 0.42152725993013734,
 0.42086965550945254,
 0.4202168502648793,
 0.4195688185518741,
 0.4189255347387933,
 0.4182869732115942,
 0.4176531083782351,
 0.41702391467275357,
 0.41639936655900006,
 0.41577943853401,
 0.41516410513099494,
 0.4145533409219375,
 0.41394712051977656,
 0.41334541858016877,
 0.4127482098028172,
 0.4121554689323582,
 0.41156717075879934,
 0.41098329011750456,
 0.41040380188872383,
 0.409828680996666,
 0.40925790240811794,
 0.4086914411306114,
 0.4081292722101447,
 0.40757137072846705,
 0.40701771179993174,
 0.4064682705679339,
 0.4059230222009423,
 0.40538194188814164,
 0.40484500483470026,
 0.4043121862566824,
 0.40378346137562376,
 0.40325880541278974,
 0.4027381935831403,
 0.402221601089

# Loading fashion mnist

In [235]:
from tensorflow.keras.datasets import fashion_mnist


In [236]:
# Load Fashion-MNIST; the result is unpacked into train and test parts in the next cell.
fashion = fashion_mnist.load_data()

In [237]:
# NOTE: this overwrites the iris X_train / y_train / X_test / y_test from above.
(X_train, y_train),(X_test, y_test) = fashion

In [238]:
# Raw training images, before flattening.
print(X_train.shape)

(60000, 28, 28)


In [239]:
# NOTE: `M` is reused here for the image side length; further down it is
# reset to the number of hidden units.
M = X_train.shape[1]
N_train = X_train.shape[0]   # number of training images
N_test = X_test.shape[0]     # number of test images

In [240]:
# Flatten every image into one row (N x pixels). Using -1 avoids depending on
# the current value of `M` (rebound later) and, unlike reshape(..., 1).squeeze(),
# keeps the sample axis even when there is only a single image.
X_train = X_train.reshape(N_train, -1)
X_test = X_test.reshape(N_test, -1)

# Fashion MNIST with 1 hidden layer

In [241]:
# One-hot targets, transposed so that samples are in columns (K x N).
y_train_cat = to_categorical(y_train).T
y_test_cat = to_categorical(y_test).T

In [242]:
# Training matrix after flattening: one row per image.
print(X_train.shape)

(60000, 784)


In [243]:
# Network dimensions for Fashion-MNIST: D input pixels, K classes, M hidden units.
D = X_train.shape[1]
K = y_train_cat.shape[0]
M = 5
# Full-batch tanh network.
nn = HiddenOne(
    input_nodes=D,
    output_nodes=K,
    hidden_nodes=M,
    learning_rate=0.01,
    activation_hidden=tanh,
)

In [244]:
# Rescale the pixel values by 1/255.
# NOTE: X_train / X_test are overwritten from themselves, so re-running this
# cell divides by 255 again - run it exactly once per kernel session.
MAX = 255
X_train = X_train/ MAX
X_test =X_test/ MAX

In [245]:
# Rescaling leaves the shape unchanged.
X_train.shape

(60000, 784)

In [258]:
# Network dimensions for Fashion-MNIST: D input pixels, K classes, M hidden units.
D = X_train.shape[1]
K = y_train_cat.shape[0]
M = 5
# ReLU network trained with mini-batches (batch size chosen by the class) and
# early stopping once the relative cost change stays below 1e-7 for 5 epochs.
nn = HiddenOne(
    input_nodes=D,
    output_nodes=K,
    hidden_nodes=M,
    learning_rate=0.01,
    activation_hidden=relu,
    optimizer='minibatch',
    delta_stop=1e-7,
    patience=5,
)

In [259]:
# Train for at most 200 epochs; `c` holds the per-epoch costs.
c = nn.run(X_train, y_train_cat, epochs=200 )

Loss after epoch 100 : 0.05975585039746326


In [260]:
# Test-set accuracy against the integer class labels.
acc = nn.evaluate(X_test, y_test)

Accuracy : 0.7943


In [166]:
# Two-hidden-layer ReLU network for Fashion-MNIST, M units in each hidden layer.
nn = HiddenTwo(
    input_nodes=D,
    output_nodes=K,
    hidden_nodes_1=M,
    hidden_nodes_2=M,
    learning_rate=0.01,
    activation_hidden_1=relu,
    activation_hidden_2=relu,
)


In [169]:
# Full-batch training for 10000 epochs; the returned list of per-epoch costs
# is echoed because the call is the cell's last expression.
nn.run(X_train, y_train_cat, epochs=10000 )

Loss after epoch 0 : 0.18843736743928718
Loss after epoch 100 : 0.16592613469823828
Loss after epoch 200 : 0.13875456927553473
Loss after epoch 300 : 0.12651550170459827
Loss after epoch 400 : 0.11982636770117769
Loss after epoch 500 : 0.11525604453050124
Loss after epoch 600 : 0.11176647256412618
Loss after epoch 700 : 0.10896720689308095
Loss after epoch 800 : 0.10668667748392244
Loss after epoch 900 : 0.1047578062852407
Loss after epoch 1000 : 0.10314623283878621
Loss after epoch 1100 : 0.1017866798824543
Loss after epoch 1200 : 0.1006526212316377
Loss after epoch 1300 : 0.09967661168776303
Loss after epoch 1400 : 0.09883177916143256
Loss after epoch 1500 : 0.09807348522416472
Loss after epoch 1600 : 0.09738700272392299
Loss after epoch 1700 : 0.09675913008143594
Loss after epoch 1800 : 0.09614779454789515
Loss after epoch 1900 : 0.09557841675297843
Loss after epoch 2000 : 0.09504176539328933
Loss after epoch 2100 : 0.09455086220246572
Loss after epoch 2200 : 0.09404775998001495
Los

[0.18843736743928718,
 0.18345465176131273,
 0.17854443977547052,
 0.17731379869242403,
 0.17912367607988655,
 0.1784227857420287,
 0.18495501790625385,
 0.18166183388831336,
 0.18667794158128753,
 0.1795715014498672,
 0.17882431947870614,
 0.17847113642184476,
 0.18559499226651674,
 0.1797201622667603,
 0.181297993392288,
 0.1818614678226376,
 0.18692276367246194,
 0.1823598412557701,
 0.17754359666743783,
 0.17641766188693117,
 0.17897136491084922,
 0.17538635597730332,
 0.17963376212126506,
 0.18158344422580788,
 0.18983756832842197,
 0.18326430418254028,
 0.17770226264189778,
 0.17692717107994757,
 0.17995604496518755,
 0.17433475779403082,
 0.17496609060707932,
 0.17525938591261372,
 0.1814073868464169,
 0.1760611235950174,
 0.17845453332493888,
 0.17873404298119872,
 0.1833151182737271,
 0.17844336623526083,
 0.17478081326114908,
 0.17184479426594557,
 0.1727221246159429,
 0.17223072749895332,
 0.18126414685808784,
 0.17938232325883538,
 0.19358416965003802,
 0.17565988047569966,

In [170]:
# Test-set accuracy against the integer class labels.
acc = nn.evaluate(X_test, y_test)

Accuracy : 0.7163


In [93]:
# Scratch cell: the same initialisation as HiddenOne, at module level, for
# stepping through one full-batch update on Fashion-MNIST by hand.
tn = truncated_normal(mean=2, sd=1, low=-0.5, upp=0.5)
n = D * M                                # number of entries in W1
m = M * K                                # number of entries in W2
W1 = tn.rvs(n).reshape((M, D))           # hidden x features
W2 = tn.rvs(m).reshape((K, M))           # output x hidden
b1 = tn.rvs(M).reshape(-1, 1)            # hidden x 1
b2 = tn.rvs(K).reshape(-1, 1)            # output x 1
                

In [95]:
# Manual forward pass over the whole training set with the module-level
# weights (same steps as HiddenOne.forward with a tanh hidden layer).
X = X_train
Z1 = W1.dot(X.T) + b1 # Hidden x N_samples
A1 = tanh(Z1)      # Hidden x N_samples
Z2 = W2.dot(A1) + b2  # Output x N_samples
A2 = softmax(Z2)      #Output x N_samples

      

In [98]:
# Predictions are classes x samples ...
print(A2.shape)

(10, 60000)


In [99]:
# ... and the one-hot targets must have the same layout for `A2 - y_train_cat`.
print(y_train_cat.shape)

(10, 60000)


In [101]:
# Manual backward pass (same formulas as HiddenOne.backprop).
# NOTE: `m` is rebound here to the number of samples.
m = X.shape[0]
# deltas
dZ2 = A2 - y_train_cat                                   # Output x N_samples
dW2 = dZ2.dot(A1.T)/m                                   # Output x hidden
db2 = np.sum(dZ2, axis=1, keepdims=True)/m              # Output x 1
dZ1 = W2.T.dot(dZ2)*dt(Z1)     # Hidden x N_samples
dW1 = dZ1.dot(X)/m                                      # Hidden x N_Features
db1 = np.sum(dZ1, axis=1, keepdims=True)/m              # Hidden x 1


In [102]:
# Update: in-place gradient step on the module-level parameters.
# NOTE: not idempotent - re-running this cell applies the same gradients again.
lr = 0.01
W2 -= lr*dW2
b2 -= lr*db2
W1 -= lr*dW1
b1 -= lr*db1

In [103]:
# Cost of the forward pass computed above; A2 has not been recomputed since
# the update, so this is the cost of the pre-update weights.
cost = cross_entropy(y_train_cat, A2)

In [104]:
# Display the cost.
cost

0.23973236540442144

In [None]:

    def run(self, X_train, target, epochs=10):
        """Full-batch training loop: log the cost, then take one `backprop` step.

        Returns the list of per-epoch costs, each measured before that
        epoch's update.

        NOTE(review): this cell was never executed (``In [None]``) and is
        indented like a method although no class header precedes it in the
        cell, so it cannot run as-is. It looks like an earlier draft of
        ``HiddenOne.run`` - confirm and delete if no longer needed.
        """
        costs = []
        for i in range(epochs):
            # Cost of the current weights, before this epoch's update.
            A2, Z2, A1, Z1 = self.forward(X_train)
            cost = cross_entropy(target, A2)
            costs.append(cost)
            if i%100 == 0:
                print(f'Loss after epoch {i} : {cost}')
            self.backprop(X_train, target)
        return costs  