In [1]:
import pandas as pd
import numpy as np

In [2]:
# Creating a small dataset of 6 samples with 2 features and 1 o/p

# Features:
x1 = [0.5, 1.5, 3.0, 2.0, 0.1, 4.0]
x2 = [1.0, 2.0, 1.0, 3.0, 0.5, 2.0]

# Output: a sample is labelled 1 when x1 + x2 > 3, otherwise 0.
# zip() pairs the two feature lists, so the sample count is not hard-coded,
# and no loop variable named `sum` is left behind to shadow the builtin.
y = [1 if a + b > 3 else 0 for a, b in zip(x1, x2)]

print(y)

[0, 1, 1, 1, 0, 1]


In [3]:
# Combine both feature lists and the labels into one table (one row per sample).
df = pd.DataFrame(dict(Feature1=x1, Feature2=x2, y=y))

df

Unnamed: 0,Feature1,Feature2,y
0,0.5,1.0,0
1,1.5,2.0,1
2,3.0,1.0,1
3,2.0,3.0,1
4,0.1,0.5,0
5,4.0,2.0,1


### 🧠 Neural Network Architecture:

- **Problem Type:** Classification  
- **Input Features:** 2 (`x1`, `x2`)  
- **Architecture:**
  - **Input Layer:** 2 neurons (for 2 input features)
  - **Hidden Layer:** 2 neurons  
  - **Output Layer:** 1 neuron  
- **Activation Function:**  
  - Hidden Layer: `Sigmoid`  
  - Output Layer: `Sigmoid`  
- **Loss Function:** Binary Cross Entropy 
- **Optimization Algorithm:** Gradient Descent (Manually implemented)  

---

In [4]:
# Initializing parameters:

def initialize_param(layer_dim):
    """Create the weight and bias arrays of a fully connected network.

    Parameters
    ----------
    layer_dim : list of int
        Neuron count per layer, input layer first. For example ``[2, 2, 1]``
        means 2 input neurons, 2 neurons in hidden layer 1 and 1 output neuron.

    Returns
    -------
    dict
        For every layer ``i`` in ``1 .. len(layer_dim) - 1``:

        * ``'w<i>'`` - array of shape ``(layer_dim[i-1], layer_dim[i])``
          (rows: neurons of the previous layer, columns: neurons of this
          layer) with every entry equal to 0.1;
        * ``'b<i>'`` - column vector of shape ``(layer_dim[i], 1)`` of zeros.

    Notes
    -----
    All weights start at the same constant, so the neurons of a layer compute
    identical outputs at initialization.
    """
    params = {}

    # range(1, L) visits every non-input layer. Iterating over the tuple
    # (1, L-1) would only touch the first and the last layer.
    for i in range(1, len(layer_dim)):
        fan_in, fan_out = layer_dim[i - 1], layer_dim[i]
        params['w' + str(i)] = np.ones((fan_in, fan_out)) * 0.1
        params['b' + str(i)] = np.zeros((fan_out, 1))

    return params

In [5]:
# Initializing parameters for the neural n/w:

# Layer sizes, input first: 2 input features, 2 hidden neurons, 1 output neuron.
layer_dim = [2,2,1]

# `params` is the one dict of weights/biases that update_parameters() later
# modifies in place; re-run this cell to reset the network to its initial state.
params = initialize_param(layer_dim)

print(params)

{'w1': array([[0.1, 0.1],
       [0.1, 0.1]]), 'b1': array([[0.],
       [0.]]), 'w2': array([[0.1],
       [0.1]]), 'b2': array([[0.]])}


In [6]:
# Activation Function

def sigmoid(z):
    """Logistic sigmoid 1 / (1 + e^(-z)), applied element-wise to `z`."""
    return 1/(1 + np.exp(-z))

In [7]:
# Forward propagation for one sample (calculates the output of every layer for one sample)

def forward_pass(x, params, layer_dim):
    """Run one input through the network and keep every layer's activation.

    Parameters
    ----------
    x : input column vector that is multiplied by ``params['w1'].T``.
    params : dict holding ``'w<i>'`` / ``'b<i>'`` for each layer ``i``.
    layer_dim : list of layer sizes (input layer included); only its length
        is used, to decide how many layers to evaluate.

    Returns
    -------
    dict mapping ``'l<i>'`` to the sigmoid activation of layer ``i``; the
    entry with the highest index is the network's prediction.
    """
    n_layers = len(layer_dim)

    # Layer 1 is fed by the raw input rather than by a previous activation.
    pre_activation = np.dot(params['w1'].T, x) + params['b1']
    activations = {'l1': sigmoid(pre_activation)}

    # Layers 2 .. n_layers-1 each consume the activation of the layer before.
    for k in range(2, n_layers):
        weights, bias = params[f'w{k}'], params[f'b{k}']
        pre_activation = np.dot(weights.T, activations[f'l{k - 1}']) + bias
        activations[f'l{k}'] = sigmoid(pre_activation)

    return activations

In [8]:
# Loss Function (Binary Cross Entropy) for 1 sample:

def loss(y, y_hat, eps=1e-15):
    """Binary cross-entropy between a true label and a predicted probability.

    Parameters
    ----------
    y : true label (0 or 1).
    y_hat : predicted probability (scalar or array, e.g. the 1x1 output of
        the last layer).
    eps : probabilities are clipped to ``[eps, 1 - eps]`` before the
        logarithm is taken.

    Returns
    -------
    ``-(y*log(p) + (1-y)*log(1-p))`` with ``p`` the clipped prediction; same
    shape as ``y_hat``.
    """
    # Without clipping, a saturated prediction (exactly 0 or 1) evaluates
    # 0 * log(0) = 0 * -inf, which is nan, or returns inf.
    p = np.clip(y_hat, eps, 1 - eps)

    return -(y * np.log(p) + (1 - y) * np.log(1 - p))

### ✅ All 9 Gradients of this Neural Network are:

| Gradient                           | Formula                                                                 |
|------------------------------------|-------------------------------------------------------------------------|
| $\frac{\partial L}{\partial W^{[2]}_{11}}$ | $-(y - \hat{y}) \cdot o_{11}$                                                  |
| $\frac{\partial L}{\partial W^{[2]}_{21}}$ | $-(y - \hat{y}) \cdot o_{12}$                                                  |
| $\frac{\partial L}{\partial b_{21}}$       | $-(y - \hat{y})$                                                             |
| $\frac{\partial L}{\partial W^{[1]}_{11}}$ | $-(y - \hat{y}) \cdot W^{[2]}_{11} \cdot o_{11}(1 - o_{11}) \cdot x_1$        |
| $\frac{\partial L}{\partial W^{[1]}_{12}}$ | $-(y - \hat{y}) \cdot W^{[2]}_{21} \cdot o_{12}(1 - o_{12}) \cdot x_1$        |
| $\frac{\partial L}{\partial W^{[1]}_{21}}$ | $-(y - \hat{y}) \cdot W^{[2]}_{11} \cdot o_{11}(1 - o_{11}) \cdot x_2$        |
| $\frac{\partial L}{\partial W^{[1]}_{22}}$ | $-(y - \hat{y}) \cdot W^{[2]}_{21} \cdot o_{12}(1 - o_{12}) \cdot x_2$        |
| $\frac{\partial L}{\partial b_{11}}$       | $-(y - \hat{y}) \cdot W^{[2]}_{11} \cdot o_{11}(1 - o_{11})$                 |
| $\frac{\partial L}{\partial b_{12}}$       | $-(y - \hat{y}) \cdot W^{[2]}_{21} \cdot o_{12}(1 - o_{12})$                 |

🔹 Where:
- $\hat{y} = o_{21}$ : predicted output (after sigmoid)  
- $y$ : true label (0 or 1)  
- $o_{11}, o_{12}$ : outputs from hidden layer neurons  
- $x_1, x_2$ : input features

---
### ✅ Gradient Descent Formula

For any parameter $\theta$:

$$
\theta := \theta - \alpha \cdot \frac{\partial L}{\partial \theta}
$$

Where:

- $\alpha$: learning rate (let's take it as 0.001)
- $\frac{\partial L}{\partial \theta}$: gradient of loss with respect to that parameter

---
### ✅ Parameter Update Formulas, by substituting Gradient value in Gradient Descent formula (In Same Order as Code)

| Python Code Line             | Parameters             | Update Formula                                                                                      |
|------------------------------|------------------------|------------------------------------------------------------------------------------------------------|
| `params['w2'][0][0]`         | $W^{[2]}_{11}$         | $ W^{[2]}_{11} = W^{[2]}_{11} - \alpha \cdot [-(y - \hat{y}) \cdot o_{11}]$                          |
| `params['w2'][1][0]`         | $W^{[2]}_{21}$         | $ W^{[2]}_{21} = W^{[2]}_{21} - \alpha \cdot [-(y - \hat{y}) \cdot o_{12}]$                          |
| `params['b2'][0][0]`         | $b_{21}$               | $ b_{21} = b_{21} - \alpha \cdot [-(y - \hat{y})]$                                                   |
| `params['w1'][0][0]`         | $W^{[1]}_{11}$         | $ W^{[1]}_{11} = W^{[1]}_{11} - \alpha \cdot [-(y - \hat{y}) \cdot W^{[2]}_{11} \cdot o_{11}(1 - o_{11}) \cdot x_1]$ |
| `params['w1'][0][1]`         | $W^{[1]}_{12}$         | $ W^{[1]}_{12} = W^{[1]}_{12} - \alpha \cdot [-(y - \hat{y}) \cdot W^{[2]}_{21} \cdot o_{12}(1 - o_{12}) \cdot x_1]$ |
| `params['w1'][1][0]`         | $W^{[1]}_{21}$         | $ W^{[1]}_{21} = W^{[1]}_{21} - \alpha \cdot [-(y - \hat{y}) \cdot W^{[2]}_{11} \cdot o_{11}(1 - o_{11}) \cdot x_2]$ |
| `params['w1'][1][1]`         | $W^{[1]}_{22}$         | $ W^{[1]}_{22} = W^{[1]}_{22} - \alpha \cdot [-(y - \hat{y}) \cdot W^{[2]}_{21} \cdot o_{12}(1 - o_{12}) \cdot x_2]$ |
| `params['b1'][0][0]`         | $b_{11}$               | $ b_{11} = b_{11} - \alpha \cdot [-(y - \hat{y}) \cdot W^{[2]}_{11} \cdot o_{11}(1 - o_{11})]$        |
| `params['b1'][1][0]`         | $b_{12}$               | $ b_{12} = b_{12} - \alpha \cdot [-(y - \hat{y}) \cdot W^{[2]}_{21} \cdot o_{12}(1 - o_{12})]$        |


In [9]:
# Update parameter values by gradient descent for 1 sample:

def update_parameters(params, y, layer_output, X, lr=0.001):
    """Apply one gradient-descent step for a single sample to a 2-2-1 network.

    Parameters
    ----------
    params : dict with ``'w1'`` (2x2), ``'b1'`` (2x1), ``'w2'`` (2x1) and
        ``'b2'`` (1x1). The arrays are modified in place.
    y : true label of the sample.
    layer_output : dict returned by the forward pass; ``'l1'`` holds the
        hidden activations (o11, o12) and ``'l2'`` the prediction y_hat.
    X : 2x1 column vector with the sample's features (x1, x2).
    lr : learning rate (alpha); defaults to 0.001.

    Returns
    -------
    The same ``params`` dict that was passed in, after the update.

    Notes
    -----
    Every gradient has the form ``-(y - y_hat) * ...``, so subtracting
    ``lr * gradient`` is written below as adding ``lr * (y - y_hat) * ...``.
    """
    y_hat = layer_output['l2']              # output of layer 2 (prediction)
    A1 = layer_output['l1']                 # output of layer 1 (hidden activations)

    error = (y - y_hat).item()              # convert the 1x1 array to a scalar
    step = lr * error                       # factor shared by all nine updates

    # Read the output-layer weights BEFORE they are updated: the hidden-layer
    # gradients must be computed with the weights used in the forward pass.
    w2_11 = params['w2'][0][0].item()
    w2_21 = params['w2'][1][0].item()

    o11 = A1[0][0].item()
    o12 = A1[1][0].item()

    x1 = X[0][0].item()
    x2 = X[1][0].item()

    # Output layer weights and bias
    params['w2'][0][0] += step * o11                              # W[2]11
    params['w2'][1][0] += step * o12                              # W[2]21
    params['b2'][0][0] += step                                    # b21

    # Hidden layer weights (w1[i][j]: input i -> hidden neuron j)
    params['w1'][0][0] += step * w2_11 * o11*(1-o11) * x1         # W[1]11
    params['w1'][0][1] += step * w2_21 * o12*(1-o12) * x1         # W[1]12
    params['w1'][1][0] += step * w2_11 * o11*(1-o11) * x2         # W[1]21
    params['w1'][1][1] += step * w2_21 * o12*(1-o12) * x2         # W[1]22

    # Hidden layer biases
    params['b1'][0][0] += step * w2_11 * o11*(1-o11)              # b11
    params['b1'][1][0] += step * w2_21 * o12*(1-o12)              # b12

    return params

---

## Predicting output and updating parameter for 1st sample:

In [10]:
# Creating x (features) and y (target) for the 1st input sample:

# NOTE: from here on `x` and `y` are notebook-level names for the *current*
# sample; `y` no longer refers to the list of all labels that was used to
# build `df`.
x = df[['Feature1', 'Feature2']].values[0].reshape(2,1)      # Column vector of shape (number of features, 1)
y = df[['y']].values[0][0]                                   # Label of row 0, taken out of the 2-D values array

print(x)
print(y)
print(params)

[[0.5]
 [1. ]]
0
{'w1': array([[0.1, 0.1],
       [0.1, 0.1]]), 'b1': array([[0.],
       [0.]]), 'w2': array([[0.1],
       [0.1]]), 'b2': array([[0.]])}


In [11]:
# Prediction for 1st input sample:

# forward_pass returns a dict of every layer's activation, so `y1_hat` holds
# both 'l1' (hidden layer) and 'l2'; the prediction itself is y1_hat['l2'].
y1_hat = forward_pass(x, params, layer_dim)

print(y1_hat)

{'l1': array([[0.53742985],
       [0.53742985]]), 'l2': array([[0.52684565]])}


In [12]:
# Loss of 1st sample:

# loss() returns an array shaped like the prediction; .item() extracts the scalar.
print(loss(y , y1_hat['l2']).item())

0.7483336246288533


In [13]:
# Updating parameters for 1st input sample:

# update_parameters modifies `params` in place (its return value is not needed
# here), so every re-run of this cell applies one more gradient step.
update_parameters(params, y, y1_hat, x)

print(params)

{'w1': array([[0.09999345, 0.09999345],
       [0.0999869 , 0.0999869 ]]), 'b1': array([[-1.30973306e-05],
       [-1.30973306e-05]]), 'w2': array([[0.09971686],
       [0.09971686]]), 'b2': array([[-0.00052685]])}


In [14]:
# Checking loss after parameter update for 1st sample:

# Same input, updated weights: compare the loss printed here with the loss
# printed before the update.
y1_new = forward_pass(x, params, layer_dim)
print('Predicted o/p after paramter update:\n', y1_new)
print('Loss after paramter update:\n', loss(y, y1_new['l2']).item())

Predicted o/p after paramter update:
 {'l1': array([[0.53742252],
       [0.53742252]]), 'l2': array([[0.52663809]])}
Loss after paramter update:
 0.7478950355194686


## Predicting output and updating parameter for 2nd sample:

In [15]:
# One full training step (forward pass, loss, update, re-check) on the 2nd sample (row index 1).
# `params` carries over the values left by the previous update.
x = df[['Feature1', 'Feature2']].values[1].reshape(2,1)      # column vector of the two features
y = df[['y']].values[1][0]

print('X:\n', x)
print('Y:\n', y)
print('Initial Paramters:\n', params)

# Output prediction of all layers (dict with 'l1' and 'l2')
y2_hat = forward_pass(x, params, layer_dim)
print('Output of layers:\n', y2_hat)

# Loss before the update
print('Loss: ', loss(y, y2_hat['l2']).item())

# Parameters update (modifies `params` in place)
update_parameters(params, y, y2_hat, x)
print('Updated paramters:\n', params)

# Checking loss after parameter update:
y2_new = forward_pass(x, params, layer_dim)
print('Predicted o/p after paramter update:\n', y2_new)
print('Loss after paramter update:\n', loss(y, y2_new['l2']).item())

X:
 [[1.5]
 [2. ]]
Y:
 1
Initial Paramters:
 {'w1': array([[0.09999345, 0.09999345],
       [0.0999869 , 0.0999869 ]]), 'b1': array([[-1.30973306e-05],
       [-1.30973306e-05]]), 'w2': array([[0.09971686],
       [0.09971686]]), 'b2': array([[-0.00052685]])}
Output of layers:
 {'l1': array([[0.58660567],
       [0.58660567]]), 'l2': array([[0.52908266]])}
Loss:  0.636610599922258
Updated paramters:
 {'w1': array([[0.10001053, 0.10001053],
       [0.10000968, 0.10000968]]), 'b1': array([[-1.70994469e-06],
       [-1.70994469e-06]]), 'w2': array([[0.0999931],
       [0.0999931]]), 'b2': array([[-5.59283123e-05]])}
Predicted o/p after paramter update:
 {'l1': array([[0.58662569],
       [0.58662569]]), 'l2': array([[0.52928173]])}
Loss after paramter update:
 0.6362344107188425


## Predicting output and updating parameter for 3rd sample:

In [16]:
# One full training step (forward pass, loss, update, re-check) on the 3rd sample (row index 2).
# `params` carries over the values left by the previous update.
x = df[['Feature1', 'Feature2']].values[2].reshape(2,1)      # column vector of the two features
y = df[['y']].values[2][0]

print('X:\n', x)
print('Y:\n', y)
print('Initial Paramters:\n', params)

# Output prediction of all layers (dict with 'l1' and 'l2')
y3_hat = forward_pass(x, params, layer_dim)
print('Output of layers:\n', y3_hat)

# Loss before the update
print('Loss: ', loss(y, y3_hat['l2']).item())

# Parameters update (modifies `params` in place)
update_parameters(params, y, y3_hat, x)
print('Updated paramters:\n', params)

# Checking loss after parameter update:
y3_new = forward_pass(x, params, layer_dim)
print('Predicted o/p after paramter update:\n', y3_new)
print('Loss after paramter update:\n', loss(y, y3_new['l2']).item())

X:
 [[3.]
 [1.]]
Y:
 1
Initial Paramters:
 {'w1': array([[0.10001053, 0.10001053],
       [0.10000968, 0.10000968]]), 'b1': array([[-1.70994469e-06],
       [-1.70994469e-06]]), 'w2': array([[0.0999931],
       [0.0999931]]), 'b2': array([[-5.59283123e-05]])}
Output of layers:
 {'l1': array([[0.59869717],
       [0.59869717]]), 'l2': array([[0.52988315]])}
Loss:  0.6350987621509052
Updated paramters:
 {'w1': array([[0.10004441, 0.10004441],
       [0.10002097, 0.10002097]]), 'b1': array([[9.58425019e-06],
       [9.58425019e-06]]), 'w2': array([[0.10027456],
       [0.10027456]]), 'b2': array([[0.00041419]])}
Predicted o/p after paramter update:
 {'l1': array([[0.59872701],
       [0.59872701]]), 'l2': array([[0.5300857]])}
Loss after paramter update:
 0.6347165837031522


---

In [17]:
# Epoch Implementation:
# Each epoch visits every sample once, in row order, and updates the
# parameters after each sample (per-sample gradient descent).

n_epochs = 10

# Take the feature matrix and the label vector out of the DataFrame once,
# instead of re-slicing `df` for every sample of every epoch.
X_train = df[['Feature1', 'Feature2']].to_numpy()
y_train = df['y'].to_numpy()

for j in range(n_epochs):

    sample_loss = []

    for i in range(len(X_train)):

        x = X_train[i].reshape(2,1)          # column vector of the two features
        y = y_train[i]

        y_hat = forward_pass(x, params, layer_dim)

        # Loss is recorded BEFORE this sample's update is applied.
        sample_loss.append(loss(y, y_hat['l2']).item())

        update_parameters(params, y, y_hat, x)   # modifies `params` in place

    print('Epoch ', str(j), ':')
    print('Parameters:', params)
    print('Sample loss:', sample_loss)
    print('Epoch Loss: ', np.mean(sample_loss))
    print('------------------------------------------------------------------------------------------------------------------------')

Epoch  0 :
Parameters: {'w1': array([[0.10015275, 0.10015275],
       [0.10009007, 0.10009007]]), 'b1': array([[2.77479323e-05],
       [2.77479323e-05]]), 'w2': array([[0.1008713],
       [0.1008713]]), 'b2': array([[0.00123748]])}
Sample loss: [0.7487087651614454, 0.6362613759815511, 0.6347461323725118, 0.6321218418316483, 0.7471115857934999, 0.6299477926944006]
Epoch Loss:  0.6714829156391762
------------------------------------------------------------------------------------------------------------------------
Epoch  1 :
Parameters: {'w1': array([[0.10026161, 0.10026161],
       [0.10015948, 0.10015948]]), 'b1': array([[4.5959641e-05],
       [4.5959641e-05]]), 'w2': array([[0.10146687],
       [0.10146687]]), 'b2': array([[0.00205847]])}
Sample loss: [0.7494847148744956, 0.6355375199886034, 0.6340149291475079, 0.63137931072389, 0.7478690592246311, 0.6291917830206483]
Epoch Loss:  0.6712462194966293
-----------------------------------------------------------------------------------