In [None]:
import numpy as np

# Helper function: Activation functions and their derivatives
def sigmoid(Z):
    """Return the element-wise logistic sigmoid of Z.

    The naive form ``1 / (1 + exp(-Z))`` overflows in ``exp`` for large
    negative inputs.  This version only ever exponentiates non-positive
    numbers, so it never overflows.

    Args:
        Z: Scalar or array-like of pre-activation values.

    Returns:
        NumPy array with the same shape as Z holding values in [0, 1].
    """
    Z = np.asarray(Z)
    # exp(-|Z|) is always in (0, 1], so it cannot overflow.
    decay = np.exp(-np.abs(Z))
    # Z >= 0: 1 / (1 + exp(-Z));  Z < 0: exp(Z) / (1 + exp(Z)).
    return np.where(Z >= 0, 1 / (1 + decay), decay / (1 + decay))

def sigmoid_derivative(Z):
    """Return the element-wise derivative of the sigmoid evaluated at Z.

    Computed as ``s * (1 - s)`` where ``s = sigmoid(Z)``.
    """
    activation = sigmoid(Z)
    return activation * (1 - activation)

def relu(Z):
    """Return the element-wise maximum of 0 and Z (rectified linear unit)."""
    floor = 0
    return np.maximum(floor, Z)

def relu_derivative(Z):
    """Return the element-wise ReLU derivative: 1 where Z > 0, otherwise 0."""
    is_positive = Z > 0
    return np.where(is_positive, 1, 0)

# Helper function: Initialize parameters for a deep neural network
def initialize_parameters(layers_dims, seed=1, scale=0.01):
    """Create the weight matrices and bias vectors for every layer.

    Args:
        layers_dims: Sequence of layer sizes, input layer first.  Layer ``k``
            (1-based) gets ``W{k}`` of shape
            ``(layers_dims[k], layers_dims[k - 1])`` and ``b{k}`` of shape
            ``(layers_dims[k], 1)``.
        seed: Value passed to ``np.random.seed`` before drawing the weights.
            Pass ``None`` to leave the global random state untouched.  The
            default of 1 reproduces the previously hard-coded behaviour.
        scale: Factor applied to the standard-normal weight samples.  The
            default of 0.01 reproduces the previously hard-coded behaviour.

    Returns:
        Dict mapping ``"W1"``, ``"b1"``, ..., to NumPy arrays.  Weights are
        scaled standard-normal samples; biases are zeros.
    """
    if seed is not None:
        np.random.seed(seed)

    parameters = {}
    # Pair each layer's fan-in with its fan-out; numbering starts at 1
    # because index 0 of layers_dims is the input layer.
    for idx, (fan_in, fan_out) in enumerate(
        zip(layers_dims[:-1], layers_dims[1:]), start=1
    ):
        parameters['W' + str(idx)] = np.random.randn(fan_out, fan_in) * scale
        parameters['b' + str(idx)] = np.zeros((fan_out, 1))

    return parameters

# Helper function: Forward propagation
def forward_propagation(X, parameters):
    """Run the network forward: ReLU on every hidden layer, sigmoid on the last.

    Args:
        X: Input passed to the first layer.
        parameters: Dict holding ``"W{k}"`` and ``"b{k}"`` for each layer k.

    Returns:
        Tuple ``(AL, caches)`` where ``AL`` is the sigmoid output of the last
        layer and ``caches`` is a list with one ``(A_prev, Z, W, b)`` tuple
        per layer, in forward order.
    """
    depth = len(parameters) // 2  # each layer contributes one W and one b
    caches = []
    activation = X

    # Hidden layers: linear step followed by ReLU.
    for idx in range(1, depth):
        weights = parameters['W' + str(idx)]
        bias = parameters['b' + str(idx)]
        layer_input = activation
        pre_activation = np.dot(weights, layer_input) + bias
        activation = relu(pre_activation)
        caches.append((layer_input, pre_activation, weights, bias))

    # Output layer: linear step followed by sigmoid.
    out_weights = parameters['W' + str(depth)]
    out_bias = parameters['b' + str(depth)]
    out_pre_activation = np.dot(out_weights, activation) + out_bias
    AL = sigmoid(out_pre_activation)
    caches.append((activation, out_pre_activation, out_weights, out_bias))

    return AL, caches

# Helper function: Compute the cost
def compute_cost(AL, Y, eps=1e-15):
    """Return the mean binary cross-entropy between predictions and labels.

    Predictions are clipped to ``[eps, 1 - eps]`` before taking logarithms so
    that a saturated output (exactly 0 or 1) yields a finite cost instead of
    ``nan``/``inf`` from ``log(0)``.

    Args:
        AL: Predicted probabilities; the example count is read from axis 1
            of ``Y``.
        Y: True labels with the same shape as ``AL``.
        eps: Clipping margin.  The default is sized for float64 inputs.

    Returns:
        The cost with all length-1 axes squeezed away.
    """
    m = Y.shape[1]
    # Keep predictions strictly inside (0, 1) so both logs are finite.
    clipped = np.clip(AL, eps, 1 - eps)
    log_likelihood = Y * np.log(clipped) + (1 - Y) * np.log(1 - clipped)
    cost = -(1 / m) * np.sum(log_likelihood)
    return np.squeeze(cost)

# Helper function: Backward propagation
def backward_propagation(AL, Y, caches):
    """Compute cost gradients for every layer's weights and biases.

    The output layer is sigmoid with a binary cross-entropy cost, so the
    gradient with respect to its pre-activation is simply ``AL - Y``.  Using
    that closed form avoids dividing by ``AL`` and ``1 - AL``, which produces
    ``nan``/``inf`` gradients as soon as an output saturates to 0 or 1.

    Args:
        AL: Output-layer activations; the example count is ``AL.shape[1]``.
        Y: True labels with the same shape as ``AL``.
        caches: List of ``(A_prev, Z, W, b)`` tuples, one per layer, in
            forward order.

    Returns:
        Dict with ``"dW{k}"`` and ``"db{k}"`` for every layer k (1-based).
    """
    grads = {}
    num_layers = len(caches)
    m = AL.shape[1]

    # Output layer: d(cost)/dZ for sigmoid + cross-entropy is AL - Y.
    A_prev, _, W, _ = caches[-1]
    dZ = AL - Y
    grads["dW" + str(num_layers)] = np.dot(dZ, A_prev.T) / m
    grads["db" + str(num_layers)] = np.sum(dZ, axis=1, keepdims=True) / m

    # Hidden layers, last to first.  At the top of each iteration W and dZ
    # still belong to the layer above, which is what the chain rule needs.
    # Computing dA here (rather than after each layer) also skips the unused
    # gradient with respect to the network input.
    for idx in range(num_layers - 1, 0, -1):
        dA = np.dot(W.T, dZ)
        A_prev, Z, W, _ = caches[idx - 1]
        dZ = dA * relu_derivative(Z)
        grads["dW" + str(idx)] = np.dot(dZ, A_prev.T) / m
        grads["db" + str(idx)] = np.sum(dZ, axis=1, keepdims=True) / m

    return grads

# Helper function: Update parameters using gradient descent
def update_parameters(parameters, grads, learning_rate):
    """Apply one gradient-descent step to every weight and bias.

    Each array in ``parameters`` is updated in place by subtracting
    ``learning_rate`` times the matching entry of ``grads``.

    Args:
        parameters: Dict holding ``"W{k}"`` and ``"b{k}"`` for each layer k.
        grads: Dict holding ``"dW{k}"`` and ``"db{k}"`` for each layer k.
        learning_rate: Step size multiplying each gradient.

    Returns:
        The same ``parameters`` dict that was passed in.
    """
    layer_count = len(parameters) // 2  # each layer contributes one W and one b

    for idx in range(1, layer_count + 1):
        for prefix in ("W", "b"):
            key = prefix + str(idx)
            parameters[key] -= learning_rate * grads["d" + key]

    return parameters

# Training the neural network
def model(X, Y, layers_dims, learning_rate=0.01, num_iterations=10000):
    """Train the network with plain gradient descent and return its parameters.

    Each iteration runs a forward pass, evaluates the cost, runs a backward
    pass and applies one parameter update.  The cost is printed on every
    1000th iteration, starting with iteration 0.

    Args:
        X: Training inputs, passed to ``forward_propagation``.
        Y: Training labels, passed to ``compute_cost`` and
            ``backward_propagation``.
        layers_dims: Layer sizes, passed to ``initialize_parameters``.
        learning_rate: Step size for ``update_parameters``.
        num_iterations: Number of gradient-descent iterations to run.

    Returns:
        The parameter dict after the final update.
    """
    np.random.seed(1)
    parameters = initialize_parameters(layers_dims)

    for i in range(num_iterations):
        # Forward pass and cost for the current parameters.
        AL, caches = forward_propagation(X, parameters)
        cost = compute_cost(AL, Y)

        # Backward pass followed by one gradient-descent step.
        grads = backward_propagation(AL, Y, caches)
        parameters = update_parameters(parameters, grads, learning_rate)

        # Report the cost measured before this iteration's update.
        if not i % 1000:
            print(f"Cost after iteration {i}: {cost}")

    return parameters

# Example: train a 3-layer network (two hidden layers of 4 units, 1 output unit)
n_features, n_examples = 3, 5
layers_dims = [n_features, 4, 4, 1]

# Example inputs (one column per example) and their binary labels
X = np.random.randn(n_features, n_examples)
Y = np.array([[1, 0, 1, 0, 1]])

# Train the model
parameters = model(
    X,
    Y,
    layers_dims,
    learning_rate=0.01,
    num_iterations=10000,
)


https://chatgpt.com/share/6715bf8f-7ea4-8008-9cfa-39a2c50c5503

### Backpropagation Algorithm for Training a Deep Neural Network (DNN)

**Introduction**:  
Backpropagation is a key algorithm for training deep neural networks (DNNs), which allows the model to minimize its error by adjusting the network's weights and biases through gradient descent. A DNN consists of multiple layers, including an input layer, two or more hidden layers, and an output layer. Each layer applies an activation function to produce non-linear outputs, allowing the network to learn complex patterns.

### Architecture:
- **Input Layer**: This layer receives input features.
- **Hidden Layers**: At least two hidden layers with non-linear activation functions like ReLU (Rectified Linear Unit) are used to introduce non-linearity into the model.
- **Output Layer**: For binary classification tasks, the sigmoid function is applied to the output layer to convert raw scores into probabilities between 0 and 1.

### Forward Propagation:
1. **Initialization**: Weights (W) are initialized to small random values, and biases (b) are initialized to zeros.
2. **Feedforward**: Input data is passed through each layer:
   - Hidden layers apply the activation function (e.g., ReLU).
   - The final output layer uses the sigmoid function to output probabilities for classification.
   
   \[
   Z = W \cdot A_{\text{prev}} + b
   \]
   \[
   A = \text{activation}(Z)
   \]

### Cost Function:
To measure the model's performance, a loss function (e.g., binary cross-entropy for classification) is computed after forward propagation:
\[
\text{Cost} = -\frac{1}{m} \sum \left( Y \cdot \log(A) + (1-Y) \cdot \log(1-A) \right)
\]
where \(Y\) is the true label, \(A\) is the predicted output, and \(m\) is the number of training examples.

### Backward Propagation:
This is the key part of training a DNN. It computes the gradients of the cost function with respect to the network parameters (weights and biases); those gradients are then used to update the parameters with gradient descent.
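1. **Calculate output layer gradients**: With a sigmoid output and the binary cross-entropy cost, the derivative of the cost with respect to the output pre-activation \(Z\) simplifies to \(A - Y\):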
   \[
   dZ = A - Y
   \]
   \[
   dW = \frac{1}{m} \cdot dZ \cdot A_{\text{prev}}^T
   \]
   \[
   db = \frac{1}{m} \cdot \sum dZ
   \]
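2. **Propagate backwards through the hidden layers**: Use the chain rule to compute gradients for hidden layers, applying the derivative of the ReLU function (1 where \(Z > 0\), otherwise 0). For each hidden layer \(l\), working from the last hidden layer back to the first:
   \[
   dA^{[l]} = \left(W^{[l+1]}\right)^T \cdot dZ^{[l+1]}, \quad dZ^{[l]} = dA^{[l]} \odot \text{ReLU}'\left(Z^{[l]}\right)
   \]
   \(dW^{[l]}\) and \(db^{[l]}\) are then computed from \(dZ^{[l]}\) exactly as for the output layer.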
3. **Update Parameters**: Update weights and biases using the learning rate:
   \[
   W = W - \alpha \cdot dW, \quad b = b - \alpha \cdot db
   \]

### Diagram:
Here’s a simplified diagram of the process:

```
Input Layer --> [ W1, b1 ] --> Hidden Layer 1 (ReLU) --> [ W2, b2 ] --> Hidden Layer 2 (ReLU) --> [ W3, b3 ] --> Output Layer (Sigmoid) --> Prediction

                        <-- Backpropagation (Calculate Gradients) <--                            
```

### Conclusion:
The backpropagation algorithm, combined with gradient descent, iteratively reduces the error of a DNN by adjusting its weights and biases. By applying this technique, a DNN with at least two hidden layers can be trained effectively for classification tasks, with the network learning to map inputs to outputs through multiple layers of abstraction.