In [28]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import base64
import os
import io
import requests
import random

from sklearn import preprocessing
from sklearn import metrics
from sklearn.model_selection import train_test_split

from scipy.special import expit as activation_function
from scipy.stats import truncnorm

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras import datasets

In [29]:
def truncated_normal(mean=0, sd=1, low=0, upp=10):
    return truncnorm(
        (low - mean) / sd, (upp - mean) / sd, loc=mean, scale=sd)

def softmax(X):
    e = np.exp(X)
    return e / e.sum(axis=0, keepdims=True)


def cross_entropy(target, output):
    return -np.mean(target*np.log(output))

def cross_entropy_matrix(output, target):
    target = np.array(target)
    output = np.array(output)
    product = target*np.log(output)
    errors = -np.sum(product, axis=1)
    m = len(errors)
    errors = np.sum(errors) / m
    return errors

def sigmoid(x):
    return 1/(1+np.exp(-x))

def ds(x):
    return sigmoid(x)*(1-sigmoid(x))

def relu(x):
    return np.maximum(x,0)
  

def dr(x):
    dr = (np.sign(x) + 1) / 2
    return dr

def tanh(x):
    a = np.exp(x)
    b = np.exp(-x)
    return (a-b)/(a+b)

def dt(x):
    return 1-tanh(x)**2
    
def leaky(x,a):
    leaky = np.maximum(x,0)*x + a*np.minimum(x,0)
    return leaky

def dl(x,a):
    dl = (np.sign(x)+1)/2 - a*(np.sign(x)-1)/2
    return dl

def derivative(f):
    if f == sigmoid :
        return ds
    if f == tanh :
        return dt
    if f == relu :
        return dr
    if f == leaky :
        return dl
    return None

def y2indicator(y, K):
    N = len(y)
    ind = np.zeros((N,K))
    for i in range(N):
        ind[i][y[i]]=1
    return ind

def classification_rate(Y, P):
    return np.mean(Y==P)

# One Hidden Layer

# Variables :

- **X**     : N_Samples x N_features
- **W1**    : Hidden x N_features
- **b1**    : Hidden
- **W2**    : Output x Hidden
- **b2**    : Output

In [30]:
class HiddenOne:
     
    def __init__(self, 
                 input_nodes, 
                 output_nodes, 
                 hidden_nodes,
                 learning_rate,
                 activation_hidden,
                ):         
        # Initializations
        self.input_nodes = input_nodes
        self.output_nodes = output_nodes       
        self.hidden_nodes = hidden_nodes          
        self.learning_rate = learning_rate 
        self.activation_hidden = activation_hidden
        self.hidden_derivative = derivative(self.activation_hidden)
        self.create_weight_matrices()
        self.create_biases()
             
    def create_weight_matrices(self):
        tn = truncated_normal(mean=2, sd=1, low=-0.5, upp=0.5) 
        # W1 of size hidden x features
        n = self.input_nodes * self.hidden_nodes
        self.W1 = tn.rvs(n).reshape((self.hidden_nodes, self.input_nodes )) # hidden x features
        # W2 of size output x hidden
        m = self.hidden_nodes  * self.output_nodes
        self.W2 = tn.rvs(m).reshape((self.output_nodes, self.hidden_nodes )) # output x hidden
    
    def create_biases(self):    
        tn = truncated_normal(mean=2, sd=1, low=-0.5, upp=0.5)
        self.b1 = tn.rvs(self.hidden_nodes).reshape(-1,1) 
        self.b2 = tn.rvs(self.output_nodes).reshape(-1,1) 
                
    def forward(self, X):
        Z1 = self.W1.dot(X.T) + self.b1 # Hidden x N_samples
        A1 = self.activation_hidden(Z1)      # Hidden x N_samples
        Z2 = self.W2.dot(A1) + self.b2  # Output x N_samples
        A2 = softmax(Z2)      #Output x N_samples
        return A2, Z2, A1, Z1
    
    def backprop(self, X, target):
        # Forward prop
        A2, Z2, A1, Z1 = self.forward(X)
        # N samples
        m = X.shape[0]
        # deltas
        dZ2 = A2 - target                                       #Output x N_samples
        dW2 = dZ2.dot(A1.T)/m                                   #Output x hidden
        db2 = np.sum(dZ2, axis=1, keepdims=True)/m              #Output x 1
        dZ1 = self.W2.T.dot(dZ2)*self.hidden_derivative(Z1)     # Hidden x N_samples
        dW1 = dZ1.dot(X)/m                                      # Hidden x N_Features
        db1 = np.sum(dZ1, axis=1, keepdims=True)/m              # Hidden x 1
        # Update
        lr = self.learning_rate
        self.W2 -= lr*dW2
        self.b2 -= lr*db2
        self.W1 -= lr*dW1
        self.b1 -= lr*db1
      
    def predict(self, X_predict):
        A2, Z2, A1, Z1 = self.forward(X_predict)
        return A2
    
    def predict_class(self, X_predict):
        A2, Z2, A1, Z1 = self.forward(X_predict)
        y_pred = np.argmax(A2, axis=0)
        return y_pred
                   
    def run(self, X_train, target, epochs=10):
        costs = []
        for i in range(epochs):
            A2, Z2, A1, Z1 = self.forward(X_train)
            cost = cross_entropy(target, A2)
            costs.append(cost)
            if i%100 == 0:
                print(f'Loss after epoch {i} : {cost}')
            self.backprop(X_train, target)
        return costs  
          
       
    def evaluate(self, X_evaluate, target):
        '''
        return accuracy score, target must be the classes and not the hot encoded target
        '''
        
        y_pred = self.predict_class(X_evaluate)
        accuracy = classification_rate(y_pred, target)
        print('Accuracy :', accuracy)
        return accuracy
        
        
        
        
            

In [31]:
from sklearn import datasets
iris = datasets.load_iris()
data = iris.data
target = iris.target

In [32]:
from tensorflow.keras.utils import to_categorical
t = to_categorical(target)

In [33]:
M = 5
D = data.shape[1]
K = len(set(target))
X_train, X_test, y_train, y_test = train_test_split(data ,target ,test_size=0.25)
y_train_cat = to_categorical(y_train).T
y_test_cat = to_categorical(y_test).T



In [34]:
nn = HiddenOne(input_nodes = D, 
               output_nodes = K, 
               hidden_nodes = M,
               learning_rate = 0.01,
               activation_hidden = tanh)

In [35]:
nn.run(X_train, y_train_cat, epochs=10000 )

Loss after epoch 0 : 0.4295985468606374
Loss after epoch 100 : 0.35763336643409993
Loss after epoch 200 : 0.3308229675963746
Loss after epoch 300 : 0.2917264996003141
Loss after epoch 400 : 0.2579042218749367
Loss after epoch 500 : 0.23424945986305068
Loss after epoch 600 : 0.2181545066272989
Loss after epoch 700 : 0.20691041742435223
Loss after epoch 800 : 0.19876179185123113
Loss after epoch 900 : 0.1926434384759031
Loss after epoch 1000 : 0.18790076830365446
Loss after epoch 1100 : 0.18411780564161348
Loss after epoch 1200 : 0.1810193113863488
Loss after epoch 1300 : 0.17841466478211357
Loss after epoch 1400 : 0.17616452510078565
Loss after epoch 1500 : 0.17416020135071975
Loss after epoch 1600 : 0.17231086088091818
Loss after epoch 1700 : 0.17053672134122133
Loss after epoch 1800 : 0.16876735988424446
Loss after epoch 1900 : 0.16694338185750768
Loss after epoch 2000 : 0.16501898327925502
Loss after epoch 2100 : 0.16296380248326892
Loss after epoch 2200 : 0.1607635379712266
Loss aft

[0.4295985468606374,
 0.4277784236016454,
 0.4259986450731243,
 0.42425816720714404,
 0.4225559791039296,
 0.4208911014092863,
 0.41926258478955236,
 0.4176695085132471,
 0.4161109791482783,
 0.4145861293833041,
 0.4130941169815817,
 0.4116341238753498,
 0.41020535540838937,
 0.4088070397338637,
 0.4074384273737427,
 0.40609879094499457,
 0.40478742505618537,
 0.4035036463760455,
 0.4022467938728653,
 0.4010162292201497,
 0.3998113373597434,
 0.3986315272085771,
 0.3974762324892883,
 0.39634491265831207,
 0.39523705389775915,
 0.3941521701297497,
 0.393089804004198,
 0.39204952780381386,
 0.3910309442038563,
 0.3900336868195954,
 0.3890574204721959,
 0.38810184110451074,
 0.3871666752826654,
 0.38625167922776854,
 0.38535663733479836,
 0.38448136015257567,
 0.3836256818192653,
 0.38278945697115996,
 0.3819725571673808,
 0.3811748668980182,
 0.3803962792664328,
 0.37963669145621615,
 0.37889600010807356,
 0.37817409674039837,
 0.37747086334877095,
 0.3767861683138129,
 0.376119862734150

In [36]:
acc = nn.evaluate(X_test, y_test)

Accuracy : 1.0


# Two Hidden Layers

# Variables :

- **X**     : N_Samples x N_features
- **W1**    : Hidden1 x N_features
- **b1**    : Hidden1
- **W2**    : Hidden2 x Hidden1
- **b2**    : Hidden2
- **W3**    : Output x Hidden
- **b3**    : Output

In [40]:
class HiddenTwo:
     
    def __init__(self, 
                 input_nodes, 
                 output_nodes, 
                 hidden_nodes_1,
                 hidden_nodes_2,
                 learning_rate,
                 activation_hidden_1,
                 activation_hidden_2,
                ):         
        # Initializations
        self.input_nodes = input_nodes
        self.output_nodes = output_nodes       
        self.hidden_nodes_1 = hidden_nodes_1    
        self.hidden_nodes_2 = hidden_nodes_2    
        self.learning_rate = learning_rate 
        self.activation_hidden_1 = activation_hidden_1
        self.activation_hidden_2 = activation_hidden_2
        self.hidden_derivative_1 = derivative(self.activation_hidden_1)
        self.hidden_derivative_2 = derivative(self.activation_hidden_2)
        self.create_weight_matrices()
        self.create_biases()
             
    def create_weight_matrices(self):
        tn = truncated_normal(mean=2, sd=1, low=-0.5, upp=0.5) 
        # W1 of size hidden x features
        n1 = self.input_nodes * self.hidden_nodes_1
        self.W1 = tn.rvs(n1).reshape((self.hidden_nodes_1, self.input_nodes )) # hidden1 x features
        # W2 of size hidden2 x hidden1
        n2 = self.hidden_nodes_2 * self.hidden_nodes_1
        self.W2 = tn.rvs(n2).reshape((self.hidden_nodes_2, self.hidden_nodes_1 )) # hidden1 x features
        # W3 of size output x hidden2
        n3 = self.hidden_nodes_2  * self.output_nodes
        self.W3 = tn.rvs(n3).reshape((self.output_nodes, self.hidden_nodes_2 )) # output x hidden
    
    def create_biases(self):    
        tn = truncated_normal(mean=2, sd=1, low=-0.5, upp=0.5)
        self.b1 = tn.rvs(self.hidden_nodes_1).reshape(-1,1) 
        self.b2 = tn.rvs(self.hidden_nodes_2).reshape(-1,1) 
        self.b3 = tn.rvs(self.output_nodes).reshape(-1,1) 
                
    def forward(self, X):
        Z1 = self.W1.dot(X.T) + self.b1      # Hidden1 x N_samples
        A1 = self.activation_hidden_1(Z1)      # Hidden1 x N_samples
        Z2 = self.W2.dot(A1) + self.b2      # Hidden2 x N_samples
        A2 = self.activation_hidden_2(Z2)      # Hidden2 x N_samples
        Z3 = self.W3.dot(A2) + self.b3       # Output x N_samples
        A3 = softmax(Z3)                     #Output x N_samples
        return A3, Z3, A2, Z2, A1, Z1
    
    def backprop(self, X, target):
        # Forward prop
        A3, Z3, A2, Z2, A1, Z1 = self.forward(X)
        # N_samples
        m = X.shape[0]
        # deltas
        dZ3 = A3 - target                                      #Output x N_samples
        dW3 = dZ3.dot(A2.T)/m                                  #Output x Hidden_2
        db3 = np.sum(dZ3, axis=1, keepdims=True)/m             #Output x 1
        dZ2 = self.W3.T.dot(dZ3)*self.hidden_derivative_2(Z2)    # Hidden2 x N_samples
        dW2 = dZ2.dot(A1.T)/m                                     # Hidden2 x Hidden1 
        db2 = np.sum(dZ2, axis=1, keepdims=True)/m             # Hidden2 x 1
        dZ1 = self.W2.T.dot(dZ2)*self.hidden_derivative_1(Z1)     # Hidden x N_samples
        dW1 = dZ1.dot(X)/m                                      # Hidden x N_Features
        db1 = np.sum(dZ1, axis=1, keepdims=True)/m              # Hidden x 1
     
        # Update
        lr = self.learning_rate
        self.W3 -= lr*dW3
        self.b3 -= lr*db3
        self.W2 -= lr*dW2
        self.b2 -= lr*db2
        self.W1 -= lr*dW1
        self.b1 -= lr*db1
      
    def predict(self, X_predict):
        A3, Z3, A2, Z2, A1, Z1 = self.forward(X_predict)
        return A3
    
    def predict_class(self, X_predict):
        A3, Z3, A2, Z2, A1, Z1 = self.forward(X_predict)
        y_pred = np.argmax(A3, axis=0)
        return y_pred
                   
    def run(self, X_train, target, epochs=10):
        costs = []
        for i in range(epochs):
            A3, Z3, A2, Z2, A1, Z1 = self.forward(X_train)
            cost = cross_entropy(target, A3)
            costs.append(cost)
            if i%100 == 0:
                print(f'Loss after epoch {i} : {cost}')
            self.backprop(X_train, target)
        return costs  
          
       
    def evaluate(self, X_evaluate, target):
        '''
        return accuracy score, target must be the classes and not the hot encoded target
        '''
        
        y_pred = self.predict_class(X_evaluate)
        accuracy = classification_rate(y_pred, target)
        print('Accuracy :', accuracy)
        return accuracy
        
        
        
        
            

In [41]:
nn = HiddenTwo(input_nodes = D, 
               output_nodes = K, 
               hidden_nodes_1 = M,
               hidden_nodes_2 = M-1,
               learning_rate = 0.01,
               activation_hidden_1 = tanh,
               activation_hidden_2 = tanh)


In [42]:
nn.run(X_train, y_train_cat, epochs=10000 )

Loss after epoch 0 : 0.39792433600152016
Loss after epoch 100 : 0.36829270539086983
Loss after epoch 200 : 0.3651986158446728
Loss after epoch 300 : 0.3634953946878887
Loss after epoch 400 : 0.35728790025103846
Loss after epoch 500 : 0.3451282320050664
Loss after epoch 600 : 0.32449201194988786
Loss after epoch 700 : 0.2900905866779301
Loss after epoch 800 : 0.25191331352951046
Loss after epoch 900 : 0.22228716715759944
Loss after epoch 1000 : 0.2028661974995158
Loss after epoch 1100 : 0.19062205276814803
Loss after epoch 1200 : 0.18274861774670292
Loss after epoch 1300 : 0.1774739525561994
Loss after epoch 1400 : 0.17377706049494826
Loss after epoch 1500 : 0.17107394128828735
Loss after epoch 1600 : 0.1690219396444291
Loss after epoch 1700 : 0.16741183495398712
Loss after epoch 1800 : 0.16610957184970288
Loss after epoch 1900 : 0.1650239018260414
Loss after epoch 2000 : 0.16408674730480205
Loss after epoch 2100 : 0.1632383490077512
Loss after epoch 2200 : 0.16241118043947278
Loss afte

[0.39792433600152016,
 0.3970762002356644,
 0.39625195276777253,
 0.3954508832594413,
 0.3946723024411096,
 0.39391554156529957,
 0.39317995186558735,
 0.3924649040221143,
 0.39176978763434345,
 0.3910940107016774,
 0.3904369991124694,
 0.3897981961418801,
 0.38917706195896296,
 0.38857307314329387,
 0.38798572221140665,
 0.3874145171532353,
 0.38685898097872046,
 0.38631865127469134,
 0.3857930797720944,
 0.3852818319236044,
 0.3847844864916207,
 0.3843006351466248,
 0.3838298820758466,
 0.38337184360216575,
 0.3829261478131543,
 0.38249243420014895,
 0.38207035330722455,
 0.38165956638992987,
 0.3812597450836305,
 0.3808705710812997,
 0.380491735820584,
 0.3801229401799668,
 0.37976389418384676,
 0.3794143167163426,
 0.3790739352436328,
 0.3787424855446365,
 0.37841971144983944,
 0.378105364588068,
 0.37779920414101437,
 0.3775009966053166,
 0.37721051556199564,
 0.3769275414530564,
 0.37665186136505924,
 0.3763832688194705,
 0.37612156356960236,
 0.37586655140396,
 0.375618043955806

In [3]:
from tensorflow.keras.datasets import fashion_mnist


In [4]:
fashion = fashion_mnist.load_data()

In [5]:
type(fashion)

tuple

In [7]:
fashion.keys


AttributeError: 'tuple' object has no attribute 'keys'