## Artificial Neural Network from scratch

- Fully connected neural network


In [570]:
import numpy as np

In [571]:
# activation functions and their derivatives

def sigmoid(z):
    """Logistic sigmoid ``1 / (1 + exp(-z))``, evaluated without overflow.

    The naive formula computes ``exp(-z)``, which overflows for large
    negative ``z``. Here ``exp`` is only ever applied to ``-|z|`` (always
    <= 0), and the algebraically equivalent form ``exp(z) / (1 + exp(z))``
    is used for the negative half.

    Args:
        z: scalar or array-like of pre-activations.

    Returns:
        Element-wise sigmoid with the same shape as ``z`` (a NumPy scalar
        for scalar input).
    """
    z = np.asarray(z)
    decay = np.exp(-np.abs(z))  # in (0, 1], can underflow to 0 but never overflow
    out = np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Indexing with () turns a 0-d result back into a scalar and is a no-op view otherwise.
    return out[()]

def relu(z):
    """Rectified linear unit: element-wise maximum of 0 and ``z``."""
    floor = 0
    rectified = np.maximum(floor, z)
    return rectified

def sigmoid_derivative(z):
    """Derivative of the sigmoid at ``z``: ``s * (1 - s)`` with ``s = sigmoid(z)``."""
    s = sigmoid(z)
    complement = 1 - s
    return s * complement

def relu_derivative(z):
    """Derivative of ReLU: 1 where ``z`` is strictly positive, 0 elsewhere.

    The value at exactly ``z == 0`` is taken to be 0.
    """
    is_active = np.greater(z, 0)
    return np.where(is_active, 1, 0)

def identity(z):
    """Linear (identity) activation: hand the input back unchanged."""
    passthrough = z
    return passthrough

def identity_derivative(z):
    """Derivative of the identity activation: an array of ones shaped like ``z``."""
    return np.full_like(z, 1)

def softmax(z, axis=-1):
    """Numerically stable softmax along ``axis``.

    The maximum and the normalising sum are taken along ``axis`` only, so a
    2-D batch of shape (n_obs, n_classes) is normalised row by row instead of
    across the whole batch. For 1-D input the result is the same as
    normalising over the entire vector.

    Args:
        z: scalar or array-like of logits.
        axis: axis holding the class scores; defaults to the last axis.

    Returns:
        Array with the same shape as ``z`` whose entries along ``axis`` are
        non-negative and sum to 1.
    """
    z = np.asarray(z, dtype=float)
    if z.ndim == 0:
        axis = None  # a lone scalar has no axis to reduce over
    # numerical stability: subtract the max so the largest exponent is exp(0)
    shifted = z - np.max(z, axis=axis, keepdims=True)
    e_z = np.exp(shifted)
    return e_z / np.sum(e_z, axis=axis, keepdims=True)

def softmax_derivative(z):
    """Jacobian of softmax, ``diag(s) - s s^T``, for one vector or a batch.

    The softmax is taken along the last axis. It is computed inline (with the
    usual max-subtraction for stability) so that every vector in a batch is
    normalised on its own.

    Args:
        z: array-like of logits with shape (..., n).

    Returns:
        Array of shape (..., n, n). For 1-D input this is the single (n, n)
        Jacobian; for batched input there is one Jacobian per leading index.
    """
    z = np.asarray(z, dtype=float)
    shifted = z - np.max(z, axis=-1, keepdims=True)
    e_z = np.exp(shifted)
    s = e_z / np.sum(e_z, axis=-1, keepdims=True)

    n = s.shape[-1]
    # s[..., :, None] * I places s on the diagonal; the second product is the outer product s s^T.
    diag_part = s[..., :, None] * np.eye(n)
    outer_part = s[..., :, None] * s[..., None, :]
    return diag_part - outer_part

def tanh(z):
    """Hyperbolic tangent activation, applied element-wise via NumPy."""
    squashed = np.tanh(z)
    return squashed

def tanh_derivative(z):
    """Derivative of tanh at ``z``: ``1 - tanh(z)**2``."""
    t = np.tanh(z)
    return 1 - t**2




In [572]:
# main neural net class block

class NeuralNet():
    '''Fully connected feed-forward network trained with (mini-batch) gradient descent.

    Hidden layers use ReLU and the output layer uses the identity activation.
    The backward pass uses ``y_pred - y_true`` as the output-layer error
    signal, i.e. the gradient of half the squared error.

    Attributes set by the constructor:
        layers:    the layer sizes as passed in, input width first.
        input_len: number of input features (``layers[0]``).
        weights:   list of arrays; ``weights[i]`` has shape (layers[i], layers[i+1]).
        biases:    list of arrays; ``biases[i]`` has shape (1, layers[i+1]).

    Attributes set by every ``forward`` call:
        A_cache:   activations per layer, with the input itself at index 0.
        Z_cache:   pre-activations per layer.
    '''

    def __init__(self, layers):
        """Create random weights and zero biases for the given layer sizes.

        Args:
            layers: sequence of layer widths, e.g. ``[5, 6, 3, 1]`` means
                5 inputs -> hidden layer of 6 -> hidden layer of 3 -> 1 output.

        Raises:
            ValueError: if fewer than two layers are given or a width is < 1.
        """
        if len(layers) < 2:
            raise ValueError(
                f"Need at least an input and an output layer, got layers={layers}"
            )
        if any(width < 1 for width in layers):
            raise ValueError(f"Every layer needs at least one unit, got layers={layers}")

        self.layers = layers
        self.input_len = self.layers[0]

        # The input layer has no weights of its own, so there is one weight
        # matrix / bias row per pair of consecutive layers.
        # weights[i]: (outputs of layer i) x (neurons in layer i+1); biases[i]: 1 x (neurons in layer i+1)
        n_links = len(self.layers) - 1
        self.weights = [np.random.randn(self.layers[i], self.layers[i+1]) for i in range(n_links)]
        self.biases = [np.zeros((1, self.layers[i+1])) for i in range(n_links)]

        print("Initialized weights and biases for layers:", self.layers)

    def _as_input_matrix(self, x):
        """Return ``x`` as a 2-D array, checking that it has ``input_len`` columns."""
        x = np.asarray(x)
        if x.ndim != 2 or x.shape[1] != self.input_len:
            raise ValueError(
                "Input features not matching the input into the NN: "
                f"expected shape (n_obs, {self.input_len}), got {x.shape}"
            )
        return x

    def _as_target_matrix(self, y, n_rows):
        """Return ``y`` as an (n_rows, n_outputs) array.

        A 1-D target vector is turned into a column. Anything else must already
        have the exact expected shape; without this check a 1-D ``y`` would
        broadcast against (n_rows, 1) predictions into an (n_rows, n_rows) matrix.
        """
        y = np.asarray(y)
        if y.ndim == 1:
            y = y.reshape(-1, 1)
        expected = (n_rows, self.layers[-1])
        if y.shape != expected:
            raise ValueError(f"Targets must have shape {expected}, got {y.shape}")
        return y

    def forward(self, x):
        """Run a forward pass and cache A and Z for use in the backward pass.

        Args:
            x: array of shape (n_obs, n_features); n_features must equal ``input_len``.

        Returns:
            Output-layer activations, shape (n_obs, layers[-1]).

        Raises:
            ValueError: if ``x`` is not 2-D with ``input_len`` columns.
        """
        x = self._as_input_matrix(x)

        A = x
        self.A_cache = [x]       # A_cache[0] is the input itself
        self.Z_cache = []

        last = len(self.weights) - 1
        for i in range(len(self.weights)):
            Z = np.dot(A, self.weights[i]) + self.biases[i]
            # relu for hidden layers, identity for the output layer
            A = relu(Z) if i < last else identity(Z)

            # store intermediate Z and A for use in backpropagation
            self.Z_cache.append(Z)
            self.A_cache.append(A)

        return A

    def backward(self, x, y_true, lr):
        """Run one gradient-descent step on the batch ``(x, y_true)``.

        Performs a forward pass, backpropagates the error and updates
        ``weights`` and ``biases`` in place.

        Args:
            x: inputs, shape (m, input_len).
            y_true: targets, shape (m, layers[-1]) or a 1-D vector of length m
                when the network has a single output.
            lr: learning rate.

        Raises:
            ValueError: if the batch is empty or the shapes do not match.
        """
        y_pred = self.forward(x)

        m = y_pred.shape[0]                # batch size
        if m == 0:
            raise ValueError("Cannot backpropagate through an empty batch")
        y_true = self._as_target_matrix(y_true, m)

        L = len(self.weights)              # number of layers excluding the input

        # Gradient containers, one slot per weight matrix
        dW = [None] * L
        dB = [None] * L
        dZ = [None] * L

        # ---------- LAST LAYER ----------
        # dZ[L] = dA * g'(Z[L]); with half squared error, dA = y_pred - y_true
        dA = (y_pred - y_true)
        dZ[L-1] = dA * identity_derivative(self.Z_cache[L-1])

        # A_cache is shifted by one because index 0 holds the input, so
        # A_cache[L-1] is the activation feeding the last weight matrix.
        dW[L-1] = np.matmul(self.A_cache[L-1].T, dZ[L-1]) / m  # average over the batch
        dB[L-1] = np.sum(dZ[L-1], axis=0, keepdims=True) / m

        # ---------- HIDDEN LAYERS ----------
        for l in range(L-2, -1, -1):
            # dZ[l] = (dZ[l+1] . W[l+1]^T) * g'(Z[l])
            dZ[l] = np.matmul(dZ[l+1], self.weights[l+1].T) * relu_derivative(self.Z_cache[l])

            dW[l] = np.matmul(self.A_cache[l].T, dZ[l]) / m
            dB[l] = np.sum(dZ[l], axis=0, keepdims=True) / m

        # ------------ Gradient Descent Update ------------
        # All gradients are computed above from the pre-update weights.
        for l in range(L):
            self.weights[l] -= lr * dW[l]
            self.biases[l]  -= lr * dB[l]

    def train(self, X, Y, lr=0.01, batch_size=0, epochs=100):
        """
        Train the neural network using mini-batch gradient descent.

        After every epoch the RMSE over the full ``(X, Y)`` is printed.

        batch_size:
            0 -> full batch
            1 -> stochastic
            k < N -> mini batch
            k >= N -> full batch

        Raises:
            ValueError: if the dataset is empty, ``batch_size`` is negative,
                or the shapes of ``X`` / ``Y`` do not fit the network.
        """
        X = self._as_input_matrix(X)
        N = X.shape[0]
        if N == 0:
            raise ValueError("Cannot train on an empty dataset")
        Y = self._as_target_matrix(Y, N)

        if batch_size < 0:
            # range() with a negative step would silently yield no batches at all
            raise ValueError(f"batch_size must be >= 0, got {batch_size}")

        # Full batch training --> treat whole dataset as one batch
        if batch_size == 0 or batch_size >= N:
            batch_size = N

        for epoch in range(epochs):
            # Shuffle data at the start of each epoch
            indices = np.random.permutation(N)
            X_shuffled = X[indices]
            Y_shuffled = Y[indices]

            # Iterate over mini-batches; slicing past the end simply yields
            # a shorter final batch.
            for start in range(0, N, batch_size):
                end = start + batch_size
                X_batch = X_shuffled[start:end]
                Y_batch = Y_shuffled[start:end]

                # One gradient-descent step per batch
                self.backward(X_batch, Y_batch, lr)

            print(f"Epoch {epoch+1}/{epochs} completed.")

            y_pred = self.forward(X)
            loss = (np.mean((y_pred - Y)**2))**0.5  # RMSE
            # Same 1-based epoch number as the line above
            print(f"Epoch {epoch+1}, RMSE Loss={loss:.6f}")

    def predict(self, x):
        """Return the network's predictions for ``x`` (a plain forward pass)."""
        return self.forward(x)



In [573]:
# Smoke test: build a small 10 -> 8 -> 5 -> 2 network and run one forward pass.
nn = NeuralNet(layers = [10, 8, 5, 2])

# 5 observations x 10 features, matching the network's input width of 10
xin = np.random.randn(5, 10)

# The network has not been trained, so these are outputs of the random initial weights.
# NOTE(review): no random seed is set in the visible cells, so the values differ on every run.
nn.predict(xin)

Initialized weights and biases for layers: [10, 8, 5, 2]


array([[  0.70670593,   0.64098616],
       [ -2.3349557 ,  -1.91482866],
       [-12.96582124,   5.49498647],
       [  5.28973398,   1.15409753],
       [ -0.35204961,  -0.18871631]])

In [574]:
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split

In [575]:
# Load scikit-learn's diabetes dataset and pull out features and targets.
data = load_diabetes()
X = data.data  # feature matrix
Y = data.target.reshape(-1, 1)  # convert to column vector

# Sanity-check the shapes before splitting
print("Dataset shape:", X.shape, Y.shape)

Dataset shape: (442, 10) (442, 1)


In [576]:
# Peek at the first three feature rows
data.data[:3]

array([[ 0.03807591,  0.05068012,  0.06169621,  0.02187239, -0.0442235 ,
        -0.03482076, -0.04340085, -0.00259226,  0.01990749, -0.01764613],
       [-0.00188202, -0.04464164, -0.05147406, -0.02632753, -0.00844872,
        -0.01916334,  0.07441156, -0.03949338, -0.06833155, -0.09220405],
       [ 0.08529891,  0.05068012,  0.04445121, -0.00567042, -0.04559945,
        -0.03419447, -0.03235593, -0.00259226,  0.00286131, -0.02593034]])

In [577]:
# Peek at the first twenty target values
data.target[:20]

array([151.,  75., 141., 206., 135.,  97., 138.,  63., 110., 310., 101.,
        69., 179., 185., 118., 171., 166., 144.,  97., 168.])

In [578]:
# Hold out 20% of the rows for testing; random_state fixes the split.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Confirm the split sizes
print("Train:", X_train.shape, Y_train.shape)
print("Test:", X_test.shape, Y_test.shape)

Train: (353, 10) (353, 1)
Test: (89, 10) (89, 1)


In [579]:
# Regression network: 10 inputs -> 32 -> 16 -> 1 output. Rebinds `nn` from the smoke test above.
nn = NeuralNet(layers=[10, 32, 16, 1])   # input = 10 features

Initialized weights and biases for layers: [10, 32, 16, 1]


In [580]:
# Mini-batch training: 10 epochs, batches of 30 rows; train() prints the training RMSE after each epoch.
nn.train(X_train, Y_train, epochs=10, batch_size= 30, lr=0.0005)

Epoch 1/10 completed.
Epoch 0, RMSE Loss=74.834424
Epoch 2/10 completed.
Epoch 1, RMSE Loss=81.898185
Epoch 3/10 completed.
Epoch 2, RMSE Loss=69.199672
Epoch 4/10 completed.
Epoch 3, RMSE Loss=73.588714
Epoch 5/10 completed.
Epoch 4, RMSE Loss=68.416735
Epoch 6/10 completed.
Epoch 5, RMSE Loss=72.477679
Epoch 7/10 completed.
Epoch 6, RMSE Loss=75.998826
Epoch 8/10 completed.
Epoch 7, RMSE Loss=65.096622
Epoch 9/10 completed.
Epoch 8, RMSE Loss=71.481783
Epoch 10/10 completed.
Epoch 9, RMSE Loss=60.642082


In [582]:
# Evaluate on the held-out rows using the same RMSE formula that train() prints.
Y_test_pred = nn.predict(X_test)
test_loss = (np.mean((Y_test_pred - Y_test)**2))**0.5  # RMSE
print(f"RMSE Test Loss={test_loss:.6f}")

RMSE Test Loss=57.245478
