* Take two hidden layers. The first hidden layer has 7 to 10 neurons and the second hidden layer has 5 to 7 neurons.
* Compute the squared error at the output layer.
* Perform backpropagation using stochastic gradient descent.
* Update the weight vectors at each layer for two iterations.

In [4]:
import numpy as np

class NeuralNetwork:
    """Fully connected feed-forward network with sigmoid activations.

    Layers are appended one at a time with ``build_layer``. Every layer,
    including the last one, applies a sigmoid, so predictions always lie
    between 0 and 1.

    Training (``fit``) is full-batch gradient descent on the squared error:
    each epoch runs one forward pass and one backward pass over all of
    ``self.X`` and applies a single update to every layer.

    ``X_test`` and ``y_test`` are stored on the instance but are not read by
    any method of this class; callers pass them to ``predict`` /
    ``calculate_MAE`` themselves.
    """

    def __init__(self, X, y, X_test, y_test, alpha, epochs):
        """Store the data and hyper-parameters; no layers exist yet.

        Args:
            X: Training inputs, indexed as ``(samples, features)``.
            y: Training targets, compared element-wise with the network output.
            X_test: Held-out inputs (stored only).
            y_test: Held-out targets (stored only).
            alpha: Learning rate used to scale every weight/bias update.
            epochs: Number of full-batch updates performed by ``fit``.
        """
        self.X = X
        self.y = y
        self.X_test = X_test
        self.y_test = y_test
        self.alpha = alpha
        self.epochs = epochs

        # layer_sizes[0] is the input width; build_layer appends to it.
        self.layer_sizes = [X.shape[1]]
        self.weights = []
        self.biases = []

    def build_layer(self, neuron_count):
        """Append a dense layer with ``neuron_count`` units.

        Weights are drawn from a standard normal distribution and scaled by
        ``sqrt(1 / input_size)``; biases start at zero.

        Raises:
            ValueError: If ``neuron_count`` is smaller than 1.
        """
        if neuron_count < 1:
            raise ValueError(
                f"neuron_count must be a positive integer, got {neuron_count!r}"
            )

        input_size = self.layer_sizes[-1]
        self.layer_sizes.append(neuron_count)

        weight_matrix = np.random.randn(input_size, neuron_count) * np.sqrt(1 / input_size)
        bias_vector = np.zeros((1, neuron_count))

        self.weights.append(weight_matrix)
        self.biases.append(bias_vector)

    def sigmoid(self, z):
        """Return the logistic function ``1 / (1 + exp(-z))`` element-wise.

        Only ``exp`` of a non-positive number is ever evaluated, so large
        negative inputs cannot overflow the exponential.
        """
        # exp(-|z|) is always in (0, 1]. For z >= 0 the result is 1 / (1 + e);
        # for z < 0 the algebraically equal form e / (1 + e) is used.
        e = np.exp(-np.abs(z))
        return np.where(z >= 0, 1.0, e) / (1.0 + e)

    def sigmoid_derivative(self, z):
        """Return the sigmoid derivative expressed through its output.

        NOTE: despite the parameter name, ``z`` must be the *activation*
        ``sigmoid(pre_activation)``, not the pre-activation itself; this is
        how ``backward_pass`` calls it.
        """
        return z * (1 - z)

    def forward_pass(self, X):
        """Propagate ``X`` through all layers.

        Returns:
            A list whose first element is the input and whose following
            elements are the activations of each layer, in order.
        """
        activations = [X]
        for W, b in zip(self.weights, self.biases):
            X = self.sigmoid(np.dot(X, W) + b)
            activations.append(X)
        return activations

    def compute_loss(self, y_pred):
        """Return the mean squared error between ``y_pred`` and ``self.y``."""
        return np.mean(np.square(y_pred - self.y))

    def backward_pass(self, activations):
        """Back-propagate the output error and update all layers in place.

        Args:
            activations: The list returned by ``forward_pass(self.X)``.
        """
        error = activations[-1] - self.y

        for i in reversed(range(len(self.weights))):
            delta = error * self.sigmoid_derivative(activations[i + 1])
            # The error for the previous layer must be computed with the
            # weights as they were during the forward pass, i.e. before
            # this layer's update below.
            error = np.dot(delta, self.weights[i].T)

            self.weights[i] -= self.alpha * np.dot(activations[i].T, delta)
            self.biases[i] -= self.alpha * np.sum(delta, axis=0, keepdims=True)

    def fit(self):
        """Train for ``self.epochs`` epochs, printing the loss of each epoch.

        The printed loss is measured before that epoch's update is applied.
        """
        for epoch in range(self.epochs):
            activations = self.forward_pass(self.X)
            loss = self.compute_loss(activations[-1])
            self.backward_pass(activations)

            print(f"Epoch {epoch+1}/{self.epochs}, Loss: {loss:.6f}")

    def predict(self, X):
        """Return the output-layer activations for ``X``."""
        return self.forward_pass(X)[-1]

    def calculate_MAE(self, y_true, y_pred, tolerance=0.1):
        """Return the mean absolute error between ``y_true`` and ``y_pred``.

        ``tolerance`` is accepted for backward compatibility and is not used.
        """
        absolute_errors = np.abs(y_true - y_pred)
        return np.mean(absolute_errors)



In [5]:

# Toy regression data: one input feature and one target per sample.
X = np.array([[1], [3], [2], [4]])
y = np.array([[0.4], [0.3], [0.4], [0.1]])
# The evaluation set reuses the same four samples as the training set,
# so the error reported below is a training error.
X_test = np.array([[1], [3], [2], [4]])
y_test = np.array([[0.4], [0.3], [0.4], [0.1]])


model = NeuralNetwork(X, y, X_test, y_test, alpha=0.01, epochs=500)


# Two hidden layers (8 and 5 neurons) followed by a single output neuron.
model.build_layer(8)
model.build_layer(5)
model.build_layer(1)


model.fit()

y_pred = model.predict(X_test)
print("\nPredictions:", y_pred.flatten())

# MAE is an absolute error in the units of the targets, not an accuracy
# and not a percentage, so it is reported as a plain number.
mae = model.calculate_MAE(y_test, y_pred)
print(f"Model MEAN ABSOLUTE ERROR : {mae:.4f}")


Epoch 1/500, Loss: 0.034520
Epoch 2/500, Loss: 0.034278
Epoch 3/500, Loss: 0.034040
Epoch 4/500, Loss: 0.033804
Epoch 5/500, Loss: 0.033572
Epoch 6/500, Loss: 0.033342
Epoch 7/500, Loss: 0.033116
Epoch 8/500, Loss: 0.032892
Epoch 9/500, Loss: 0.032671
Epoch 10/500, Loss: 0.032453
Epoch 11/500, Loss: 0.032237
Epoch 12/500, Loss: 0.032025
Epoch 13/500, Loss: 0.031815
Epoch 14/500, Loss: 0.031607
Epoch 15/500, Loss: 0.031402
Epoch 16/500, Loss: 0.031200
Epoch 17/500, Loss: 0.031001
Epoch 18/500, Loss: 0.030803
Epoch 19/500, Loss: 0.030609
Epoch 20/500, Loss: 0.030417
Epoch 21/500, Loss: 0.030227
Epoch 22/500, Loss: 0.030039
Epoch 23/500, Loss: 0.029854
Epoch 24/500, Loss: 0.029671
Epoch 25/500, Loss: 0.029491
Epoch 26/500, Loss: 0.029313
Epoch 27/500, Loss: 0.029137
Epoch 28/500, Loss: 0.028963
Epoch 29/500, Loss: 0.028791
Epoch 30/500, Loss: 0.028622
Epoch 31/500, Loss: 0.028455
Epoch 32/500, Loss: 0.028289
Epoch 33/500, Loss: 0.028126
Epoch 34/500, Loss: 0.027965
Epoch 35/500, Loss: 0.0

* Scaling the initial weights by `sqrt(1 / input_size)`, as done in `build_layer`, helps avoid vanishing/exploding gradient issues.

* Vanishing gradients: As gradients are propagated back through the network, they can get smaller and smaller, eventually becoming so tiny that the weights stop updating. This makes it difficult for the model to learn, especially in deep networks.

* Exploding gradients: Conversely, gradients can grow exponentially as they are propagated back. This can cause extremely large weight updates, leading to unstable training and making the model's learning unpredictable.


