# In [None]:
import numpy as np

# -------------------------
# Activation functions
# -------------------------
def sigmoid(x):
    """Return the logistic sigmoid ``1 / (1 + exp(-x))`` element-wise.

    The value is computed from ``exp(-|x|)``, which never exceeds 1, so
    large-magnitude inputs cannot overflow the exponential. The naive
    formula overflows ``exp(-x)`` for very negative ``x`` and triggers a
    RuntimeWarning.

    Args:
        x: Scalar or array-like of real numbers.

    Returns:
        The sigmoid of ``x`` with the same shape as the input; a NumPy
        scalar when ``x`` is a scalar.
    """
    x = np.asarray(x)
    decay = np.exp(-np.abs(x))  # always in [0, 1], so it cannot overflow
    # For x >= 0 this is the textbook formula; for x < 0 it is the
    # algebraically equal form exp(x) / (1 + exp(x)).
    out = np.where(x >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    return out[()]  # unwrap 0-d results so scalar input yields a scalar

def sigmoid_derivative_from_output(sigmoid_out):
    """Return ``s * (1 - s)`` for an already-computed sigmoid output ``s``.

    Args:
        sigmoid_out: Value(s) produced by the sigmoid, not the raw
            pre-activation.

    Returns:
        The element-wise product of ``sigmoid_out`` and its complement.
    """
    complement = 1 - sigmoid_out
    return sigmoid_out * complement

def relu(x):
    """Rectified linear unit: the element-wise maximum of 0 and ``x``.

    Args:
        x: Scalar or array of pre-activations.

    Returns:
        ``x`` with every negative entry replaced by zero.
    """
    floor = 0
    return np.maximum(floor, x)

def relu_derivative_from_preact(z):
    """Return the ReLU slope evaluated at pre-activation ``z``.

    Args:
        z: Array of pre-activation values.

    Returns:
        A float array holding 1.0 where ``z`` is strictly positive and
        0.0 everywhere else (including exactly zero).
    """
    is_active = z > 0
    return is_active.astype(float)

# -------------------------
# Loss (MSE) and derivative
# -------------------------
def mse_loss(y_true, y_pred):
    """Mean squared error between targets and predictions.

    Args:
        y_true: Array of target values.
        y_pred: Array of predicted values, broadcastable against
            ``y_true``.

    Returns:
        The mean of the squared residuals taken over every element.
    """
    residual = y_true - y_pred
    return np.mean(np.square(residual))

def mse_loss_derivative(y_true, y_pred):
    """Gradient of the mean squared error with respect to ``y_pred``.

    The mean squared error averages over every element of the residual,
    so the gradient is normalised by the total element count. Dividing
    only by the number of rows would overstate the gradient by the number
    of output columns whenever targets have more than one column. For
    single-column targets the two normalisations are identical.

    Args:
        y_true: Array-like of target values.
        y_pred: Array-like of predictions, broadcastable against
            ``y_true``.

    Returns:
        Array of ``2 * (y_pred - y_true) / n`` where ``n`` is the number
        of elements in the residual.
    """
    residual = np.asarray(y_pred, dtype=float) - np.asarray(y_true, dtype=float)
    return (2.0 / residual.size) * residual

# -------------------------
# Deep Neural Network
# -------------------------
class DeepNeuralNetwork:
    """Fully connected feed-forward network trained by full-batch gradient descent.

    Hidden layers use ReLU, the output layer uses a sigmoid, and training
    minimises the mean squared error via ``mse_loss`` and
    ``mse_loss_derivative``.
    """

    def __init__(self, layer_sizes, learning_rate=0.1, seed=42):
        """Create the weight matrices and bias rows for every layer.

        Args:
            layer_sizes: Sequence of unit counts, input layer first and
                output layer last.
            learning_rate: Step size used for the gradient-descent update.
            seed: Seed for the weight initialisation.
        """
        # A private RandomState draws the same numbers as np.random.seed(seed)
        # followed by np.random.uniform, without reseeding the global
        # generator that other code may rely on.
        rng = np.random.RandomState(seed)
        self.layer_sizes = layer_sizes
        self.L = len(layer_sizes) - 1  # number of weight layers
        self.lr = learning_rate
        self.W, self.b = [], []
        for in_dim, out_dim in zip(layer_sizes[:-1], layer_sizes[1:]):
            # Glorot/Xavier uniform range: sqrt(6 / (fan_in + fan_out)).
            limit = np.sqrt(6.0 / (in_dim + out_dim))
            self.W.append(rng.uniform(-limit, limit, (in_dim, out_dim)))
            self.b.append(np.zeros((1, out_dim)))

    def forward(self, X):
        """Run a forward pass and cache the intermediate values.

        Args:
            X: Input array whose last dimension matches ``layer_sizes[0]``.

        Returns:
            The sigmoid output of the final layer. Activations and
            pre-activations of every layer are stored on the instance for
            use by ``backward``.
        """
        activations, pre_acts = [X], []
        for layer in range(self.L):
            z = activations[-1].dot(self.W[layer]) + self.b[layer]
            pre_acts.append(z)
            a = sigmoid(z) if layer == self.L - 1 else relu(z)
            activations.append(a)
        self.activations, self.pre_acts = activations, pre_acts
        return activations[-1]

    def backward(self, X, y):
        """Back-propagate the loss and apply one gradient-descent step.

        Uses the activations cached by the most recent ``forward`` call.

        Args:
            X: Unused; the cached input activation is used instead. Kept
                so existing callers keep working.
            y: Target array matching the shape of the network output.
        """
        grads_W, grads_b = [None] * self.L, [None] * self.L
        dA = mse_loss_derivative(y, self.activations[-1])
        for layer in reversed(range(self.L)):
            z = self.pre_acts[layer]
            a = self.activations[layer + 1]
            a_prev = self.activations[layer]
            if layer == self.L - 1:
                dZ = dA * sigmoid_derivative_from_output(a)
            else:
                dZ = dA * relu_derivative_from_preact(z)
            # mse_loss_derivative already carries the batch-averaging factor,
            # so the parameter gradients are plain sums over the batch.
            # Dividing by the batch size again would shrink every step.
            grads_W[layer] = a_prev.T.dot(dZ)
            grads_b[layer] = np.sum(dZ, axis=0, keepdims=True)
            # Propagate with the pre-update weights; updates happen below.
            dA = dZ.dot(self.W[layer].T)
        for layer in range(self.L):
            self.W[layer] -= self.lr * grads_W[layer]
            self.b[layer] -= self.lr * grads_b[layer]

    def train(self, X, y, epochs=1000, print_every=100):
        """Train on the full batch for a fixed number of epochs.

        Args:
            X: Training inputs.
            y: Training targets.
            epochs: Number of forward/backward passes to run.
            print_every: Log the loss on the first epoch and on every
                multiple of this value; ``0`` or ``None`` disables logging.
        """
        for epoch in range(1, epochs + 1):
            y_pred = self.forward(X)
            loss = mse_loss(y, y_pred)
            self.backward(X, y)
            # Guard against print_every == 0, which would otherwise raise
            # ZeroDivisionError in the modulo below.
            if print_every and (epoch % print_every == 0 or epoch == 1):
                print(f"Epoch {epoch}/{epochs} - Loss: {loss:.6f}")

    def predict_proba(self, X):
        """Return the raw sigmoid outputs of the network for ``X``."""
        return self.forward(X)

    def predict(self, X, threshold=0.5):
        """Return integer labels: 1 where the output is >= ``threshold``, else 0."""
        return (self.predict_proba(X) >= threshold).astype(int)

# -------------------------
# Example: XOR problem
# -------------------------
# Inputs are the four rows of the XOR truth table; targets form a column vector.
X = np.array(
    [
        [0, 0],
        [0, 1],
        [1, 0],
        [1, 1],
    ],
    dtype=float,
)
y = np.array(
    [
        [0],
        [1],
        [1],
        [0],
    ],
    dtype=float,
)

# Two inputs, one hidden layer of four units, one output.
net = DeepNeuralNetwork(layer_sizes=[2, 4, 1], learning_rate=0.1, seed=1)
net.train(X, y, epochs=5000, print_every=500)

# Show the raw network output, the thresholded label and the target per row.
print("\nPredictions:")
preds_proba = net.predict_proba(X)
preds = net.predict(X)
for xi, p, lab, true in zip(X, preds_proba.ravel(), preds.ravel(), y.ravel()):
    print(f"{xi} -> {p:.4f} (label={lab}, true={int(true)})")
