In [1]:
import pandas as pd
import numpy as np

In [2]:
# Creating a small dataset of 6 samples with 2 features and 1 o/p

# Features:
x1 = [0.5, 1.5, 3.0, 2.0, 0.1, 4.0]
x2 = [1.0, 2.0, 1.0, 3.0, 0.5, 2.0]

# Output:
y =  []
for i in range(6):
    y.append(1.5*x1[i] + 2*x2[i] + 3)

print(y)

[5.75, 9.25, 9.5, 12.0, 4.15, 13.0]


In [3]:
# Creating a dataframe of the features and output

df = pd.DataFrame({
    'Feature1' : x1,
    'Feature2' : x2,
    'y' : y
})

df

Unnamed: 0,Feature1,Feature2,y
0,0.5,1.0,5.75
1,1.5,2.0,9.25
2,3.0,1.0,9.5
3,2.0,3.0,12.0
4,0.1,0.5,4.15
5,4.0,2.0,13.0


### 🧠 Neural Network Architecture:

- **Problem Type:** Regression  
- **Input Features:** 2 (`x1`, `x2`)  
- **Architecture:**
  - **Input Layer:** 2 neurons (for 2 input features)
  - **Hidden Layer:** 2 neurons  
  - **Output Layer:** 1 neuron  
- **Activation Function:**  
  - Hidden Layer: `Linear`  
  - Output Layer: `Linear`  
- **Loss Function:** Mean Squared Error (MSE)  
- **Optimization Algorithm:** Gradient Descent (Manually implemented)  

---

In [4]:
# Intializing parameters:

def initialize_param(layer_dim):         # i.e- Layer_dim = [2,2,1] means 2 i/p neurons, 2 neurons in hidden layer 1, 1 neuron in o/p layer
    
    params = {}                       # A dictionary to store parameters (weight and bias)     

    L = len(layer_dim)
    
    for i in (1, L-1):

        # Weight matrix: number of rows: layer_dim[i-1] (number of neurons in the previous layer)
        # number of columns: layer_dim[i] (number of neurons in the current layer)
        params['w' + str(i)] = np.ones((layer_dim[i-1], layer_dim[i])) * 0.1    # np.ones((shape)): np.ones((2,3)) gives a matrix of 1 of size {2,3}
                                                                                # & np.ones(...) * 0.1 means all values in this matrix are 0.1
        params['b' + str(i)] = np.zeros((layer_dim[i], 1))                      # it gives a column matrix (shape: {rows, 1}) of 0 
        
    return params

In [5]:
# Initializing parameters for the neural n/w:

layer_dim = [2,2,1]

params = initialize_param(layer_dim)

print(params)

{'w1': array([[0.1, 0.1],
       [0.1, 0.1]]), 'b1': array([[0.],
       [0.]]), 'w2': array([[0.1],
       [0.1]]), 'b2': array([[0.]])}


In [6]:
# Forward propagation for one sample (Calculates layer-wise o/p for one sample)

def forward_pass(x, params, layer_dim):

    layer_output = {}
    
    layer_count = len(layer_dim)
    
    # Output for first layer:
    layer_output['l' + str(1)] = np.dot(params['w1'].T , x) + params['b1']
    
    for i in range(2, layer_count):                        # loop from 2nd layer to last layer- [2, last_layer + 1), last_layer + 1 means layer_count

        layer_w = params['w' + str(i)]
        layer_b = params['b' + str(i)]
        A_prev  = layer_output['l' + str(i-1)]
        
        layer_output['l' + str(i)] = np.dot(layer_w.T , A_prev) + layer_b

    return layer_output

In [7]:
# Loss Function(MSE) for 1 sample:

def loss(y, y_hat):

    loss = (y-y_hat)**2
    
    return loss

### ✅ All 9 Gradients of this neural network are:

| Gradient              | Formula                                                |
|-----------------------|--------------------------------------------------------|
| $\frac{\partial L}{\partial W^{[2]}_{11}}$ | $-2(y - \hat{y}) \cdot O_{11}$                      |
| $\frac{\partial L}{\partial W^{[2]}_{21}}$ | $-2(y - \hat{y}) \cdot O_{12}$                      |
| $\frac{\partial L}{\partial b_{21}}$       | $-2(y - \hat{y})$                                   |
| $\frac{\partial L}{\partial W^{[1]}_{11}}$ | $-2(y - \hat{y}) \cdot W^{[2]}_{11} \cdot x_1$      |
| $\frac{\partial L}{\partial W^{[1]}_{12}}$ | $-2(y - \hat{y}) \cdot W^{[2]}_{21} \cdot x_1$      |
| $\frac{\partial L}{\partial W^{[1]}_{21}}$ | $-2(y - \hat{y}) \cdot W^{[2]}_{11} \cdot x_2$      |
| $\frac{\partial L}{\partial W^{[1]}_{22}}$ | $-2(y - \hat{y}) \cdot W^{[2]}_{21} \cdot x_2$      |
| $\frac{\partial L}{\partial b_{11}}$       | $-2(y - \hat{y}) \cdot W^{[2]}_{11}$                |
| $\frac{\partial L}{\partial b_{12}}$       | $-2(y - \hat{y}) \cdot W^{[2]}_{21}$                |

---
### ✅ Gradient Descent Formula

For any parameter $\theta$:

$$
\theta := \theta - \alpha \cdot \frac{\partial L}{\partial \theta}
$$

Where:

- $\alpha$: learning rate (let's take it as 0.001)
- $\frac{\partial L}{\partial \theta}$: gradient of loss with respect to that parameter

---
### ✅ Parameter Update Formulas, by substituting Gradient value in Gradient Descent formula (In Same Order as Code)

| Python Code Line             | Parameters             | Update Formula                                                                 |
|------------------------------|------------------------|---------------------------------------------------------------------------------|
| `parameters['W2'][0][0]`     | $W^{[2]}_{11}$         | $ W^{[2]}_{11} - \alpha \cdot [-2(y - \hat{y}) \cdot O_{11}]$    |
| `parameters['W2'][1][0]`     | $W^{[2]}_{21}$         | $ W^{[2]}_{21} - \alpha \cdot [-2(y - \hat{y}) \cdot O_{12}]$    |
| `parameters['b2'][0][0]`     | $b_{21}$               | $ b_{21} - \alpha \cdot [-2(y - \hat{y})]$                             |
| `parameters['W1'][0][0]`     | $W^{[1]}_{11}$         | $ W^{[1]}_{11} - \alpha \cdot [-2(y - \hat{y}) \cdot W^{[2]}_{11} \cdot x_1]$ |
| `parameters['W1'][0][1]`     | $W^{[1]}_{12}$         | $ W^{[1]}_{12} - \alpha \cdot [-2(y - \hat{y}) \cdot W^{[2]}_{11} \cdot x_2]$ |
| `parameters['b1'][0][0]`     | $b_{11}$               | $ b_{11} - \alpha \cdot [-2(y - \hat{y}) \cdot W^{[2]}_{11}]$          |
| `parameters['W1'][1][0]`     | $W^{[1]}_{21}$         | $ W^{[1]}_{21} - \alpha \cdot [-2(y - \hat{y}) \cdot W^{[2]}_{21} \cdot x_1]$ |
| `parameters['W1'][1][1]`     | $W^{[1]}_{22}$         | $ W^{[1]}_{22} - \alpha \cdot [-2(y - \hat{y}) \cdot W^{[2]}_{21} \cdot x_2]$ |
| `parameters['b1'][1][0]`     | $b_{12}$               | $ b_{12} - \alpha \cdot [-2(y - \hat{y}) \cdot W^{[2]}_{21}]$          |



In [8]:
# Update parameter value by gradient descent algorithm for 1 sample:

def update_parameters(params, y, layer_output, X):

    y_hat = layer_output['l2']              # output from layer 2
    A1 = layer_output['l1']                 # Output from layer 1

    lr = 0.001
    error = (y - y_hat).item()   # convert to scalar
    
    w211 = params['w2'][0][0].item()
    w221 = params['w2'][1][0].item()

    a11 = A1[0][0].item()
    a12 = A1[1][0].item()

    x1 = X[0][0].item()
    x2 = X[1][0].item()

    # Update output layer weights and bias
    params['w2'][0][0] += lr * 2 * error * a11    # W[2]11
    params['w2'][1][0] += lr * 2 * error * a12    # W[2]21
    params['b2'][0][0] += lr * 2 * error          # b21

    # Update first hidden layer weights and biases
    params['w1'][0][0] += lr * 2 * error * w211 * x1   # W[1]11
    params['w1'][0][1] += lr * 2 * error * w211 * x2   # W[1]12
    params['b1'][0][0] += lr * 2 * error * w211        # b11

    params['w1'][1][0] += lr * 2 * error * w221 * x1   # W[1]21
    params['w1'][1][1] += lr * 2 * error * w221 * x2   # W[1]22
    params['b1'][1][0] += lr * 2 * error * w221        # b12

    return params

---

## Predicting output and updating parameter for 1st sample:

In [9]:
# Creating x(features) and y(target) array for 1st input sample:

x = df[['Feature1', 'Feature2']].values[0].reshape(2,1)      # Shape(no of features, no. of training example)
y = df[['y']].values[0][0]

print(x)
print(y)
print(params)

[[0.5]
 [1. ]]
5.75
{'w1': array([[0.1, 0.1],
       [0.1, 0.1]]), 'b1': array([[0.],
       [0.]]), 'w2': array([[0.1],
       [0.1]]), 'b2': array([[0.]])}


In [10]:
# Prediction for 1st input sample:

y1_hat = forward_pass(x, params, layer_dim)

print(y1_hat)

{'l1': array([[0.15],
       [0.15]]), 'l2': array([[0.03]])}


In [11]:
# Loss of 1st sample:

print(loss(y , y1_hat['l2']).item())

32.718399999999995


In [12]:
# Updating parmaters for 1st input sample:

update_parameters(params, y, y1_hat, x)

print(params)

{'w1': array([[0.100572, 0.101144],
       [0.100572, 0.101144]]), 'b1': array([[0.001144],
       [0.001144]]), 'w2': array([[0.101716],
       [0.101716]]), 'b2': array([[0.01144]])}


In [13]:
# Checking loss after parameter update for 1st sample:

y1_new = forward_pass(x, params, layer_dim)
print('Predicted o/p after paramter update:\n', y1_new)
print('Loss after paramter update:\n', loss(y, y1_new['l2']).item())

Predicted o/p after paramter update:
 {'l1': array([[0.152002],
       [0.15286 ]]), 'l2': array([[0.04244934]])}
Loss after paramter update:
 32.57613450002943


## Predicting output and updating parameter for 2nd sample:

In [14]:
x = df[['Feature1', 'Feature2']].values[1].reshape(2,1)      
y = df[['y']].values[1][0]

print('X:\n', x)
print('Y:\n', y)
print('Initial Paramters:\n', params)

# output prediction of layers
y2_hat = forward_pass(x, params, layer_dim)
print('Output of layers:\n', y2_hat)

# Loss
print('Loss: ', loss(y, y2_hat['l2']).item())

# Paramters update
update_parameters(params, y, y2_hat, x)
print('Updated paramters:\n', params)

# Checking loss after parameter update:
y2_new = forward_pass(x, params, layer_dim)
print('Predicted o/p after paramter update:\n', y2_new)
print('Loss after paramter update:\n', loss(y, y2_new['l2']).item())

X:
 [[1.5]
 [2. ]]
Y:
 9.25
Initial Paramters:
 {'w1': array([[0.100572, 0.101144],
       [0.100572, 0.101144]]), 'b1': array([[0.001144],
       [0.001144]]), 'w2': array([[0.101716],
       [0.101716]]), 'b2': array([[0.01144]])}
Output of layers:
 {'l1': array([[0.353146],
       [0.355148]]), 'l2': array([[0.08348483]])}
Loss:  84.02500031593422
Updated paramters:
 {'w1': array([[0.10336914, 0.10487353],
       [0.10336914, 0.10487353]]), 'b1': array([[0.00300876],
       [0.00300876]]), 'w2': array([[0.10819024],
       [0.10822694]]), 'b2': array([[0.02977303]])}
Predicted o/p after paramter update:
 {'l1': array([[0.36480077],
       [0.3700661 ]]), 'l2': array([[0.10929203]])}
Loss after paramter update:
 83.55254214429553


## Predicting output and updating parameter for 3rd sample:

In [15]:
x = df[['Feature1', 'Feature2']].values[2].reshape(2,1)      
y = df[['y']].values[2][0]

print('X:\n', x)
print('Y:\n', y)
print('Initial Paramters:\n', params)

# output prediction of layers
y3_hat = forward_pass(x, params, layer_dim)
print('Output of layers:\n', y3_hat)

# Loss
print('Loss: ', loss(y, y3_hat['l2']).item())

# Paramters update
update_parameters(params, y, y3_hat, x)
print('Updated paramters:\n', params)

# Checking loss after parameter update:
y3_new = forward_pass(x, params, layer_dim)
print('Predicted o/p after paramter update:\n', y3_new)
print('Loss after paramter update:\n', loss(y, y3_new['l2']).item())

X:
 [[3.]
 [1.]]
Y:
 9.5
Initial Paramters:
 {'w1': array([[0.10336914, 0.10487353],
       [0.10336914, 0.10487353]]), 'b1': array([[0.00300876],
       [0.00300876]]), 'w2': array([[0.10819024],
       [0.10822694]]), 'b2': array([[0.02977303]])}
Output of layers:
 {'l1': array([[0.41648534],
       [0.42250286]]), 'l2': array([[0.12055887]])}
Loss:  87.97391592985298
Updated paramters:
 {'w1': array([[0.10945773, 0.10690305],
       [0.10945979, 0.10690374]]), 'b1': array([[0.00503829],
       [0.00503898]]), 'w2': array([[0.11600304],
       [0.11615262]]), 'b2': array([[0.04853191]])}
Predicted o/p after paramter update:
 {'l1': array([[0.44287127],
       [0.43265188]]), 'l2': array([[0.15015997]])}
Loss after paramter update:
 87.4195085227457


---

In [16]:
# Epoch Implementation:

for j in range(10):
    
    sample_loss = []

    for i in range(df.shape[0]):

        x = df[['Feature1', 'Feature2']].values[i].reshape(2,1)
        y = df[['y']].values[i][0]
        
        y_hat = forward_pass(x, params, layer_dim)
        
        sample_loss.append(loss(y, y_hat['l2']).item())
        
        update_parameters(params, y, y_hat, x)

    print('Epoch ', str(j), ':')
    print('Parameters:', params)
    print('Sample loss:', sample_loss)
    print('Epoch Loss: ', np.mean(sample_loss))
    print('------------------------------------------------------------------------------------------------------------------------')

Epoch  0 :
Parameters: {'w1': array([[0.14175789, 0.132368  ],
       [0.14168484, 0.13232879]]), 'b1': array([[0.01891096],
       [0.01888898]]), 'w2': array([[0.16835665],
       [0.1673425 ]]), 'b2': array([[0.1534483]])}
Sample loss: [32.065336303339684, 82.78257360671338, 86.61765584837131, 137.906121227915, 16.027971503281233, 159.85499715889136]
Epoch Loss:  85.87577594141867
------------------------------------------------------------------------------------------------------------------------
Epoch  1 :
Parameters: {'w1': array([[0.1869137 , 0.1680447 ],
       [0.18623189, 0.16756766]]), 'b1': array([[0.03833816],
       [0.03807976]]), 'w2': array([[0.23644479],
       [0.23139497]]), 'b2': array([[0.25597381]])}
Sample loss: [30.483658570703867, 79.44841876965553, 82.86227320800639, 132.32531002101257, 15.001163935183307, 152.49508999389968]
Epoch Loss:  82.10265241641022
------------------------------------------------------------------------------------------------------