In [1]:
import numpy as np

In [2]:
def initialize_parameters(layer_dims, seed=1, scale=0.01):
    """Create the weight matrices and bias vectors for every layer.

    Args:
        layer_dims: Sequence of layer sizes, input layer first. Layer ``l``
            (1-based) gets ``W{l}`` of shape
            ``(layer_dims[l], layer_dims[l - 1])`` and ``b{l}`` of shape
            ``(layer_dims[l], 1)``.
        seed: Seed for the private random generator used to draw the weights.
            The default (1) yields the same values as seeding the global
            NumPy RNG with 1 and drawing with ``np.random.randn``.
        scale: Factor applied to the standard-normal draws for the weights.

    Returns:
        dict mapping ``"W1"``, ``"b1"``, ..., ``"WL"``, ``"bL"`` to arrays.
        Weights are scaled Gaussian draws; biases are all zeros. The dict is
        empty when ``layer_dims`` has fewer than two entries.
    """
    # A private generator keeps the draws reproducible without reseeding the
    # process-wide np.random state that the caller may also be relying on.
    rng = np.random.RandomState(seed)
    parameters = {}
    L = len(layer_dims) - 1  # number of layers (the input layer is not counted)

    for l in range(1, L + 1):
        parameters[f"W{l}"] = rng.randn(layer_dims[l], layer_dims[l - 1]) * scale
        parameters[f"b{l}"] = np.zeros((layer_dims[l], 1))
    return parameters


In [3]:
def sigmoid(Z):
    """Element-wise logistic function 1 / (1 + exp(-Z)), computed stably.

    Args:
        Z: Scalar or array of pre-activations.

    Returns:
        Values in [0, 1] with the same shape as ``Z``.
    """
    # Only ever exponentiate a non-positive number, so exp() cannot overflow
    # for large-magnitude inputs (exp(-Z) would overflow for Z << 0).
    decay = np.exp(-np.abs(Z))
    # Z >= 0: 1 / (1 + exp(-Z));  Z < 0: exp(Z) / (1 + exp(Z)) -- same function.
    result = np.where(Z >= 0, 1 / (1 + decay), decay / (1 + decay))
    # Indexing with () turns a 0-d result back into a scalar and is a no-op
    # for arrays of higher rank.
    return result[()]

def relu(Z):
    """Rectified linear unit: element-wise maximum of 0 and ``Z``."""
    floor = 0
    rectified = np.maximum(floor, Z)
    return rectified

def forward_propagation(X, parameters):
    """Run the forward pass through all layers.

    Layers 1 .. L-1 apply ``relu`` to their affine output; layer L applies
    ``sigmoid``. L is taken as half the number of entries in ``parameters``
    (one ``W{l}`` and one ``b{l}`` per layer).

    Args:
        X: Input fed to the first layer.
        parameters: dict holding ``W{l}`` and ``b{l}`` for each layer.

    Returns:
        ``(AL, caches)`` where ``AL`` is the sigmoid output of the last layer
        and ``caches`` maps ``Z{l}`` / ``A{l}`` to each layer's pre-activation
        and activation, for l = 1 .. L.
    """
    depth = len(parameters) // 2
    caches = {}
    activation = X

    # Hidden layers: affine transform followed by ReLU.
    for idx in range(1, depth):
        pre_activation = np.dot(parameters[f"W{idx}"], activation) + parameters[f"b{idx}"]
        activation = relu(pre_activation)
        caches.update({f"Z{idx}": pre_activation, f"A{idx}": activation})

    # Output layer: affine transform followed by sigmoid.
    out_pre = np.dot(parameters[f"W{depth}"], activation) + parameters[f"b{depth}"]
    out_act = sigmoid(out_pre)
    caches.update({f"Z{depth}": out_pre, f"A{depth}": out_act})
    return out_act, caches


In [4]:
def compute_cost(AL, Y, eps=1e-15):
    """Binary cross-entropy cost averaged over the examples.

    Args:
        AL: Predicted probabilities, same shape as ``Y``.
        Y: Labels; ``Y.shape[1]`` is used as the number of examples.
        eps: Predictions are clipped into ``[eps, 1 - eps]`` before taking
            logarithms so a saturated prediction of exactly 0 or 1 cannot
            produce ``log(0)``.

    Returns:
        The cost, passed through ``np.squeeze``.
    """
    m = Y.shape[1]
    # Without clipping, AL == 0 or AL == 1 yields -inf from log(), and a
    # 0 * -inf product turns the whole sum into nan.
    AL_safe = np.clip(AL, eps, 1 - eps)
    cost = -(1 / m) * np.sum(Y * np.log(AL_safe) + (1 - Y) * np.log(1 - AL_safe))
    return np.squeeze(cost)


In [5]:
def relu_derivative(Z):
    """Return 1 where ``Z`` is strictly positive and 0 elsewhere."""
    is_active = Z > 0
    return np.where(is_active, 1, 0)

def backward_propagation(parameters, caches, X, Y):
    """Compute the gradients of the cost for every ``W{l}`` and ``b{l}``.

    Args:
        parameters: dict holding ``W{l}`` and ``b{l}`` for each layer; the
            number of layers L is half its length.
        caches: dict holding ``Z{l}`` and ``A{l}`` for l = 1 .. L.
        X: Network input; ``X.shape[1]`` is used as the number of examples
            and ``X`` serves as the input activation of layer 1.
        Y: Labels, same shape as ``caches["A{L}"]``.

    Returns:
        dict mapping ``dW{l}`` and ``db{l}`` to the gradients, for l = 1 .. L.
    """
    grads = {}
    m = X.shape[1]
    L = len(parameters) // 2

    # Output layer: the error term is simply prediction minus label.
    dZ = caches[f"A{L}"] - Y

    for l in range(L, 0, -1):
        if l < L:
            # Hidden layer: push the upstream gradient through the ReLU.
            dZ = dA_prev * relu_derivative(caches[f"Z{l}"])
        # The caches contain A1..AL only, so layer 1's input must come from X.
        # Doing this lookup for every layer (including the output layer) is
        # what makes a single-layer network work instead of raising KeyError
        # on a non-existent "A0" entry.
        A_prev = X if l == 1 else caches[f"A{l - 1}"]
        grads[f"dW{l}"] = (1 / m) * np.dot(dZ, A_prev.T)
        grads[f"db{l}"] = (1 / m) * np.sum(dZ, axis=1, keepdims=True)
        if l > 1:
            # Gradient with respect to the previous layer's activation.
            dA_prev = np.dot(parameters[f"W{l}"].T, dZ)
    return grads


In [6]:
def update_parameters(parameters, grads, learning_rate):
    """Apply one gradient-descent step to every weight and bias.

    Each ``W{l}`` / ``b{l}`` entry of ``parameters`` has
    ``learning_rate * grads["dW{l}"]`` / ``learning_rate * grads["db{l}"]``
    subtracted from it with an augmented assignment, so the passed-in dict
    is modified (and NumPy array values are updated in place).

    Returns:
        The same ``parameters`` dict that was passed in.
    """
    layer_count = len(parameters) // 2
    for idx in range(1, layer_count + 1):
        for prefix in ("W", "b"):
            key = f"{prefix}{idx}"
            parameters[key] -= learning_rate * grads[f"d{key}"]
    return parameters


In [7]:
def model(X, Y, layer_dims, learning_rate=0.01, iterations=1000):
    """Train the network with plain gradient descent.

    Every iteration runs a forward pass, evaluates the cost, runs a backward
    pass and applies one parameter update. The cost measured before the
    update is printed on iterations 0, 100, 200, ...

    Args:
        X: Training inputs.
        Y: Training labels.
        layer_dims: Layer sizes handed to ``initialize_parameters``.
        learning_rate: Step size handed to ``update_parameters``.
        iterations: Number of gradient-descent iterations.

    Returns:
        The parameters dict after the final update.
    """
    params = initialize_parameters(layer_dims)
    for i in range(iterations):
        predictions, caches = forward_propagation(X, params)
        cost = compute_cost(predictions, Y)
        params = update_parameters(
            params,
            backward_propagation(params, caches, X, Y),
            learning_rate,
        )

        should_report = not i % 100
        if should_report:
            print(f"Iteration {i}, cost: {cost:.6f}")
    return params


In [8]:
# Example data: a tiny synthetic binary-classification problem.
# Seed the global NumPy RNG first so the draws for X and Y are repeatable.
np.random.seed(2)
X = np.random.randn(4, 5)  # 4 features (rows), 5 examples (columns)
Y = (np.random.randn(1, 5) > 0).astype(int)  # binary labels (0 or 1), one per example

# 4 input neurons → 3 hidden layers (3 units each) → 1 output neuron
layer_dims = [4, 3, 3, 3, 1]

# Train the model; model() prints the cost every 100 iterations.
trained_parameters = model(X, Y, layer_dims, learning_rate=0.05, iterations=500)


Iteration 0, cost: 0.693147
Iteration 100, cost: 0.523339
Iteration 200, cost: 0.504098
Iteration 300, cost: 0.501077
Iteration 400, cost: 0.500532
