## Artificial Neural Network from scratch

- Fully connected neural network


In [539]:
import numpy as np

In [540]:
# activation functions and their derivatives

def sigmoid(z):
    """Logistic sigmoid ``1 / (1 + exp(-z))``, evaluated without overflow.

    The exponential is only ever taken of a non-positive number, so very
    negative inputs no longer trigger NumPy overflow warnings.

    Parameters
    ----------
    z : scalar or array_like
        Real-valued input.

    Returns
    -------
    float or numpy.ndarray
        Element-wise sigmoid with the same shape as ``z``.
    """
    z = np.asarray(z, dtype=float)
    # exp(-|z|) lies in (0, 1]: it may underflow to 0 but can never overflow.
    decay = np.exp(-np.abs(z))
    # z >= 0: 1 / (1 + e^-z);  z < 0: e^z / (1 + e^z) -- algebraically the same.
    result = np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Indexing with an empty tuple turns a 0-d result back into a scalar.
    return result[()]

def relu(z):
    """Rectified linear unit: element-wise ``max(0, z)``."""
    floor = 0
    rectified = np.maximum(floor, z)
    return rectified

def sigmoid_derivative(z):
    """Derivative of the logistic sigmoid, ``s(z) * (1 - s(z))``, at ``z``."""
    activation = sigmoid(z)
    complement = 1 - activation
    return activation * complement

def relu_derivative(z):
    """Slope of ReLU: 1 where ``z`` is strictly positive, 0 everywhere else."""
    is_active = z > 0
    return np.where(is_active, 1, 0)

def identity(z):
    """Linear (pass-through) activation: hand ``z`` back untouched."""
    unchanged = z
    return unchanged

def identity_derivative(z):
    """Slope of the identity activation: an array of ones shaped like ``z``."""
    unit_slope = np.ones_like(z)
    return unit_slope


In [None]:
# main neural net class block

class NeuralNet():
    '''Fully connected feed-forward neural network for regression.

    Hidden layers use ReLU, the output layer is linear (identity), and
    training is full-batch gradient descent on the half mean-squared error.
    '''

    def __init__(self, layers, seed=None):
        '''Allocate weights and biases for every layer after the input.

        Parameters
        ----------
        layers : sequence of int
            Layer widths, input first. ``[5, 6, 3, 1]`` means 5 input
            features, hidden layers of 6 and 3 neurons, and 1 output.
        seed : int or None, optional
            Seed for a private ``numpy.random.default_rng`` used to draw the
            initial weights. ``None`` (default) keeps the previous behaviour
            of drawing from NumPy's global random state.

        Raises
        ------
        ValueError
            If fewer than two layer sizes are given or a size is not positive.
        '''
        if len(layers) < 2:
            raise ValueError("layers must contain at least an input size and an output size")
        if any(width <= 0 for width in layers):
            raise ValueError("every layer size must be a positive integer")

        self.layers = layers
        self.input_len = self.layers[0]

        # Layer l maps the n_in outputs of the previous layer onto n_out
        # neurons: weights are (n_in, n_out) and biases (1, n_out). The input
        # layer has no parameters of its own.
        shapes = list(zip(self.layers[:-1], self.layers[1:]))
        if seed is None:
            self.weights = [np.random.randn(n_in, n_out) for n_in, n_out in shapes]
        else:
            rng = np.random.default_rng(seed)
            self.weights = [rng.standard_normal((n_in, n_out)) for n_in, n_out in shapes]
        self.biases = [np.zeros((1, n_out)) for _, n_out in shapes]

        print("Initialized weights and biases for layers:", self.layers)

    @staticmethod
    def _align_targets(y_true, y_pred):
        '''Return ``y_true`` as an array shaped exactly like ``y_pred``.

        A 1-D target vector is accepted for single-output networks and turned
        into a column. Any other mismatch raises ``ValueError`` rather than
        letting NumPy broadcast ``(m, 1) - (m,)`` into an ``(m, m)`` matrix.
        '''
        y_true = np.asarray(y_true)
        if y_true.ndim == 1 and y_pred.shape[1] == 1:
            y_true = y_true.reshape(-1, 1)
        if y_true.shape != y_pred.shape:
            raise ValueError(
                f"Targets of shape {y_true.shape} do not match predictions of shape {y_pred.shape}"
            )
        return y_true

    def forward(self, x):
        '''Run a forward pass, caching each layer's Z and A for backprop.

        Parameters
        ----------
        x : array_like, shape (n_obs, n_features)
            Input batch; ``n_features`` must equal the input layer size.

        Returns
        -------
        numpy.ndarray, shape (n_obs, layers[-1])
            Output of the final (linear) layer.

        Raises
        ------
        ValueError
            If ``x`` is not 2-D or its feature count does not match the network.
        '''
        x = np.asarray(x)
        if x.ndim != 2 or x.shape[1] != self.input_len:
            raise ValueError(
                "Input features not matching the input into the NN: "
                f"expected shape (n_obs, {self.input_len}), got {x.shape}"
            )

        A = x
        # A_cache[0] is the input itself, so A_cache[l] is the activation that
        # feeds weight matrix l, and A_cache[l + 1] is that layer's output.
        self.A_cache = [x]
        self.Z_cache = []

        last = len(self.weights) - 1
        for i, (W, b) in enumerate(zip(self.weights, self.biases)):
            Z = np.dot(A, W) + b
            # ReLU on hidden layers, identity on the output layer.
            A = relu(Z) if i < last else identity(Z)
            self.Z_cache.append(Z)
            self.A_cache.append(A)

        return A

    def backward(self, x, y_true, lr):
        '''Do one forward pass, backpropagate, and apply one gradient step.

        Parameters
        ----------
        x : array_like, shape (n_obs, n_features)
            Input batch.
        y_true : array_like, shape (n_obs, n_outputs)
            Targets. A 1-D vector is accepted when the network has one output.
        lr : float
            Learning rate of the gradient-descent update.

        Raises
        ------
        ValueError
            If the batch is empty or the targets do not match the predictions.
        '''
        y_pred = self.forward(x)  # refreshes A_cache / Z_cache
        y_true = self._align_targets(y_true, y_pred)

        L = len(self.weights)   # number of layers excluding the input
        m = y_true.shape[0]     # batch size
        if m == 0:
            raise ValueError("Cannot run backpropagation on an empty batch")

        dW = [None] * L
        dB = [None] * L

        # Output layer: for the loss 0.5 * mean((y_pred - y_true)**2) the
        # gradient w.r.t. the prediction is (y_pred - y_true); the 1/m factor
        # is applied when forming dW and dB below.
        dZ = (y_pred - y_true) * identity_derivative(self.Z_cache[L - 1])

        # Walk from the last layer back to the first. All gradients are
        # computed before any weight is modified, so the weights[l + 1] used
        # here are still the pre-update values.
        for l in range(L - 1, -1, -1):
            if l < L - 1:
                dZ = np.matmul(dZ, self.weights[l + 1].T) * relu_derivative(self.Z_cache[l])
            dW[l] = np.matmul(self.A_cache[l].T, dZ) / m
            dB[l] = np.sum(dZ, axis=0, keepdims=True) / m

        # Gradient-descent update.
        for l in range(L):
            self.weights[l] -= lr * dW[l]
            self.biases[l] -= lr * dB[l]

    def train(self, X, Y, epochs=1000, lr=0.001, log_every=10):
        '''Train with full-batch gradient descent.

        Parameters
        ----------
        X : array_like, shape (n_obs, n_features)
            Training features.
        Y : array_like, shape (n_obs, n_outputs)
            Training targets.
        epochs : int, optional
            Number of gradient steps.
        lr : float, optional
            Learning rate.
        log_every : int, optional
            Print the training RMSE every ``log_every`` epochs (default 10,
            matching the previous behaviour). ``0`` or ``None`` disables logging.
        '''
        for epoch in range(epochs):
            self.backward(X, Y, lr)

            if log_every and epoch % log_every == 0:
                # The loss is measured after the update made in this epoch.
                y_pred = self.forward(X)
                targets = self._align_targets(Y, y_pred)
                loss = (np.mean((y_pred - targets)**2))**0.5  # RMSE
                print(f"Epoch {epoch}, RMSE Loss={loss:.6f}")

    def predict(self, x):
        '''Return the network output for ``x`` (a forward pass; caches are overwritten).'''
        return self.forward(x)



In [542]:
# Smoke test: push a random batch through an untrained network and show the output.
smoke_layers = [10, 8, 5, 2]
nn = NeuralNet(layers=smoke_layers)

# Five random observations whose feature count matches the input layer.
xin = np.random.randn(5, smoke_layers[0])

nn.predict(xin)

Initialized weights and biases for layers: [10, 8, 5, 2]


array([[  5.95671399, -10.35520634],
       [ 10.55129928, -13.30340221],
       [-11.93356644,   3.65934233],
       [-18.60742097,  -1.92548912],
       [ -3.01294174,  -1.8834433 ]])

In [543]:
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split

In [544]:
# Load the scikit-learn diabetes regression dataset.
data = load_diabetes()

# Features as-is; targets reshaped into a single column so they line up
# with the network's (n_obs, 1) output.
X, Y = data.data, data.target.reshape(-1, 1)

print("Dataset shape:", X.shape, Y.shape)

Dataset shape: (442, 10) (442, 1)


In [545]:
# Preview the first three rows of the feature matrix.
data.data[:3]

array([[ 0.03807591,  0.05068012,  0.06169621,  0.02187239, -0.0442235 ,
        -0.03482076, -0.04340085, -0.00259226,  0.01990749, -0.01764613],
       [-0.00188202, -0.04464164, -0.05147406, -0.02632753, -0.00844872,
        -0.01916334,  0.07441156, -0.03949338, -0.06833155, -0.09220405],
       [ 0.08529891,  0.05068012,  0.04445121, -0.00567042, -0.04559945,
        -0.03419447, -0.03235593, -0.00259226,  0.00286131, -0.02593034]])

In [546]:
# Preview the first 20 target values.
data.target[:20]

array([151.,  75., 141., 206., 135.,  97., 138.,  63., 110., 310., 101.,
        69., 179., 185., 118., 171., 166., 144.,  97., 168.])

In [547]:
# Hold out 20% of the observations for testing; the fixed random_state keeps the split repeatable.
split = train_test_split(X, Y, test_size=0.2, random_state=42)
X_train, X_test, Y_train, Y_test = split

for label, features, targets in (("Train:", X_train, Y_train), ("Test:", X_test, Y_test)):
    print(label, features.shape, targets.shape)

Train: (353, 10) (353, 1)
Test: (89, 10) (89, 1)


In [548]:
# Regression network: 10 input features -> hidden layers of 32 and 16 neurons -> 1 output.
hidden_sizes = [32, 16]
nn = NeuralNet(layers=[10, *hidden_sizes, 1])

Initialized weights and biases for layers: [10, 32, 16, 1]


In [549]:
# Fit on the training split; the training RMSE is printed periodically by train().
train_settings = {"epochs": 2000, "lr": 0.0005}
nn.train(X_train, Y_train, **train_settings)

Epoch 0, RMSE Loss=171.284835
Epoch 10, RMSE Loss=76.902106
Epoch 20, RMSE Loss=74.232961
Epoch 30, RMSE Loss=71.507275
Epoch 40, RMSE Loss=69.214399
Epoch 50, RMSE Loss=66.818747
Epoch 60, RMSE Loss=64.620350
Epoch 70, RMSE Loss=62.657642
Epoch 80, RMSE Loss=130.139153
Epoch 90, RMSE Loss=63.853522
Epoch 100, RMSE Loss=78.452829
Epoch 110, RMSE Loss=66.613420
Epoch 120, RMSE Loss=69.562127
Epoch 130, RMSE Loss=68.598190
Epoch 140, RMSE Loss=67.241611
Epoch 150, RMSE Loss=67.826387
Epoch 160, RMSE Loss=66.443108
Epoch 170, RMSE Loss=66.448088
Epoch 180, RMSE Loss=65.793237
Epoch 190, RMSE Loss=65.207356
Epoch 200, RMSE Loss=64.815206
Epoch 210, RMSE Loss=64.326169
Epoch 220, RMSE Loss=64.001289
Epoch 230, RMSE Loss=63.553613
Epoch 240, RMSE Loss=63.276766
Epoch 250, RMSE Loss=62.871199
Epoch 260, RMSE Loss=62.520816
Epoch 270, RMSE Loss=62.190143
Epoch 280, RMSE Loss=61.913476
Epoch 290, RMSE Loss=61.586373
Epoch 300, RMSE Loss=61.350232
Epoch 310, RMSE Loss=61.100339
Epoch 320, RMSE L

In [550]:
# Evaluate on the held-out split: root-mean-squared error of the predictions.
Y_test_pred = nn.predict(X_test)
test_residuals = Y_test_pred - Y_test
test_mse = np.mean(test_residuals**2)
test_loss = test_mse**0.5  # RMSE
print(f"RMSE Test Loss={test_loss:.6f}")

RMSE Test Loss=52.360070
