In [1]:
import numpy as np
import pandas as pd

Sigmoid -> squashes any real number into the range between 0 and 1

In [2]:
def sigmoid(z):
    """Logistic sigmoid, 1 / (1 + exp(-z)), evaluated without overflow.

    Parameters
    ----------
    z : scalar or array-like
        Pre-activation value(s).

    Returns
    -------
    numpy scalar or ndarray
        Element-wise sigmoid of ``z``, between 0 and 1.
    """
    # 1 / (1 + exp(-z)) == exp(-log(1 + exp(-z))) == exp(-logaddexp(0, -z)).
    # logaddexp never forms exp(-z) on its own, so a very negative z quietly
    # underflows to 0.0 instead of overflowing to inf with a RuntimeWarning.
    return np.exp(-np.logaddexp(0, np.negative(z)))

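Sigmoid Derivative ->
We need the derivative for backpropagation. Note that the function below takes the sigmoid *output* a = sigmoid(z), not z itself: sigmoid'(z) = a · (1 − a).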

In [3]:
def sigmoid_derivative(a):
    """Derivative of the sigmoid, expressed through its output.

    Parameters
    ----------
    a : scalar or ndarray
        Sigmoid activation(s), i.e. the value already returned by the
        sigmoid, not the pre-activation.

    Returns
    -------
    Same kind as ``a``: the element-wise product ``a * (1 - a)``.
    """
    complement = 1 - a
    return a * complement

ReLU (Rectified Linear Unit) ->
ReLU keeps positive numbers and replaces negatives with 0, which avoids some of the problems sigmoid has (for example, vanishing gradients).

In [4]:
def relu(z):
    """Rectified Linear Unit: element-wise ``max(0, z)``.

    Parameters
    ----------
    z : scalar or array-like
        Pre-activation value(s).

    Returns
    -------
    numpy scalar or ndarray
        ``z`` with every negative entry replaced by 0.
    """
    floor = 0
    return np.maximum(floor, z)

ReLU Derivative ->
The derivative is 1 for positive values and 0 otherwise (at exactly z = 0 the derivative is undefined; we use 0 there).

In [5]:
def relu_derivative(z):
    """Derivative of ReLU with respect to its input.

    Parameters
    ----------
    z : scalar or array-like
        Pre-activation value(s).

    Returns
    -------
    numpy float or ndarray of float
        1.0 where ``z > 0`` and 0.0 elsewhere (including at ``z == 0``).
    """
    # np.asarray lets Python scalars and lists through: a bare `z > 0` on a
    # float gives a plain bool (no .astype) and on a list raises TypeError.
    return (np.asarray(z) > 0).astype(float)

# Initialize Parameters

We randomly set initial weights and set biases to zero.

W1 → connects input layer → hidden layer

b1 → bias for hidden layer

W2 → connects hidden layer → output layer

b2 → bias for output layer

In [6]:
def init_params(input_dim, hidden_dim, seed=42, scale=0.01):
    """Create initial weights and biases for a one-hidden-layer network.

    Parameters
    ----------
    input_dim : int
        Number of input features.
    hidden_dim : int
        Number of hidden units.
    seed : int or None, default 42
        Seed for the weight initialisation. ``None`` draws a fresh,
        non-reproducible seed.
    scale : float, default 0.01
        Factor applied to the standard-normal draws so the weights start small.

    Returns
    -------
    tuple
        ``(W1, b1, W2, b2)`` with shapes ``(input_dim, hidden_dim)``,
        ``(1, hidden_dim)``, ``(hidden_dim, 1)`` and ``(1, 1)``. Weights are
        scaled normal samples; biases are zeros.
    """
    # A private RandomState yields exactly the numbers np.random.seed(seed)
    # followed by np.random.randn would, but leaves NumPy's global random
    # state untouched, so calling this never perturbs other random code.
    rng = np.random.RandomState(seed)
    W1 = rng.randn(input_dim, hidden_dim) * scale
    b1 = np.zeros((1, hidden_dim))
    W2 = rng.randn(hidden_dim, 1) * scale
    b2 = np.zeros((1, 1))
    return W1, b1, W2, b2

# Forward Propagation

We feed input data through the network to get predictions.

Z1 = X·W1 + b1

A1 = ReLU(Z1) → hidden layer activation

Z2 = A1·W2 + b2

A2 = Sigmoid(Z2) → output layer activation (probability)

In [7]:
def forward(X, W1, b1, W2, b2):
    """Run one forward pass through the two-layer network.

    Computes ``Z1 = X·W1 + b1``, ``A1 = relu(Z1)``, ``Z2 = A1·W2 + b2`` and
    ``A2 = sigmoid(Z2)``.

    Parameters
    ----------
    X : ndarray
        Input batch; presumably (n_samples, input_dim) so that it matches the
        (input_dim, hidden_dim) weights built by ``init_params``.
    W1, b1 : ndarray
        Hidden-layer weights and bias.
    W2, b2 : ndarray
        Output-layer weights and bias.

    Returns
    -------
    tuple
        ``(Z1, A1, Z2, A2)``: the pre-activations and activations of both
        layers, in that order.
    """
    # Hidden layer: affine map, then ReLU
    hidden_pre = np.dot(X, W1) + b1
    hidden_act = relu(hidden_pre)

    # Output layer: affine map, then sigmoid
    out_pre = np.dot(hidden_act, W2) + b2
    out_prob = sigmoid(out_pre)

    return hidden_pre, hidden_act, out_pre, out_prob

Loss Function — Binary Cross Entropy ->
Measures how well our predictions match the labels.

$$\text{Loss} = -\frac{1}{m} \sum \left[\, y \log(A2) + (1 - y) \log(1 - A2) \,\right]$$

In [8]:
def compute_loss(y, A2, eps=1e-8):
    """Mean binary cross-entropy between labels and predicted probabilities.

    Parameters
    ----------
    y : array-like
        True labels (0 or 1).
    A2 : array-like
        Predicted probabilities.
    eps : float, default 1e-8
        Small constant added inside each logarithm so ``log(0)`` is never
        evaluated.

    Returns
    -------
    numpy float
        ``-mean(y * log(A2 + eps) + (1 - y) * log(1 - A2 + eps))``.
    """
    y = np.asarray(y, dtype=float)
    A2 = np.asarray(A2, dtype=float)

    # A flat (m,) label vector against an (m, 1) prediction column would
    # broadcast to an (m, m) matrix and silently average the wrong terms.
    # When both hold the same number of entries, line the labels up first.
    if y.shape != A2.shape and y.size == A2.size:
        y = y.reshape(A2.shape)

    per_sample = y * np.log(A2 + eps) + (1 - y) * np.log(1 - A2 + eps)
    return -np.mean(per_sample)

**We add 1e-8 to avoid log(0) errors**

# Backward Propagation

We compute the gradients of the loss so we know how to update the weights.

Output layer error:

$$dZ2 = A2 - y$$


Then:

$$dW2 = \frac{1}{m} \, A1^{T} \cdot dZ2$$

$$db2 = \frac{1}{m} \sum dZ2$$


Hidden layer error:

$$dZ1 = \left( dZ2 \cdot W2^{T} \right) \odot \text{ReLU}'(Z1)$$

(Here $\odot$ is element-wise multiplication.)


$$dW1 = \frac{1}{m} \, X^{T} \cdot dZ1$$

$$db1 = \frac{1}{m} \sum dZ1$$

In [9]:
def backward(X, y, Z1, A1, A2, W2):
    """Backpropagate through the two-layer network.

    Parameters
    ----------
    X : ndarray
        Input batch; its first dimension is used as the sample count ``m``.
    y : ndarray
        True labels; assumed to have the same shape as ``A2`` so that
        ``A2 - y`` is an element-wise difference (TODO confirm at call sites).
    Z1 : ndarray
        Hidden-layer pre-activations from the forward pass.
    A1 : ndarray
        Hidden-layer activations from the forward pass.
    A2 : ndarray
        Output-layer activations (predictions) from the forward pass.
    W2 : ndarray
        Output-layer weights, used to carry the error back to the hidden
        layer.

    Returns
    -------
    tuple
        ``(dW1, db1, dW2, db2)``: gradients for both layers, each averaged
        over the ``m`` samples.
    """
    n_samples = X.shape[0]

    # Output layer: error term, then bias and weight gradients
    out_err = A2 - y
    grad_b2 = np.sum(out_err, axis=0, keepdims=True) / n_samples
    grad_W2 = np.dot(A1.T, out_err) / n_samples

    # Hidden layer: push the error back through W2 and gate it by ReLU'
    hidden_err = np.dot(out_err, W2.T) * relu_derivative(Z1)
    grad_b1 = np.sum(hidden_err, axis=0, keepdims=True) / n_samples
    grad_W1 = np.dot(X.T, hidden_err) / n_samples

    return grad_W1, grad_b1, grad_W2, grad_b2

**Update Parameters (Gradient Descent)**

We move weights against the gradient to minimize loss.

In [10]:
def update_params(W1, b1, W2, b2, dW1, db1, dW2, db2, lr):
    """Take one gradient-descent step and return the updated parameters.

    Parameters
    ----------
    W1, b1, W2, b2 : ndarray
        Current weights and biases.
    dW1, db1, dW2, db2 : ndarray
        Gradients of the loss with respect to the matching parameter.
    lr : float
        Learning rate (step size).

    Returns
    -------
    tuple
        ``(W1, b1, W2, b2)`` as new arrays, each equal to
        ``param - lr * grad``.
    """
    # Build new arrays rather than using `-=`: the in-place form overwrites
    # the caller's arrays (so earlier parameter values cannot be kept or
    # compared) and raises a casting error on integer-typed parameters.
    W1 = W1 - lr * dW1
    b1 = b1 - lr * db1
    W2 = W2 - lr * dW2
    b2 = b2 - lr * db2
    return W1, b1, W2, b2

**Training Loop** ->
We connect all pieces to train our network.

In [11]:
# Toy example: train the network on the two-moons dataset
from sklearn.datasets import make_moons

# Hyperparameters
lr = 0.1
epochs = 1000

# Data: 200 noisy two-moons points. Labels are turned into a column vector
# so they line up with the single-column network output.
X, y = make_moons(n_samples=200, noise=0.2, random_state=42)
y = y.reshape(-1, 1)

# Parameters: 2 input features, 4 hidden units
W1, b1, W2, b2 = init_params(input_dim=2, hidden_dim=4)

# Full-batch gradient descent
for i in range(epochs):
    # Forward pass, then the loss at the current parameters
    Z1, A1, Z2, A2 = forward(X, W1, b1, W2, b2)
    loss = compute_loss(y, A2)

    # Backpropagation, followed by one descent step
    dW1, db1, dW2, db2 = backward(X, y, Z1, A1, A2, W2)
    W1, b1, W2, b2 = update_params(W1, b1, W2, b2, dW1, db1, dW2, db2, lr)

    # Report progress every 100th epoch
    if not i % 100:
        print(f"Epoch {i}, Loss: {loss:.4f}")

Epoch 0, Loss: 0.6932
Epoch 100, Loss: 0.6925
Epoch 200, Loss: 0.6542
Epoch 300, Loss: 0.3816
Epoch 400, Loss: 0.3235
Epoch 500, Loss: 0.3145
Epoch 600, Loss: 0.3127
Epoch 700, Loss: 0.3121
Epoch 800, Loss: 0.3119
Epoch 900, Loss: 0.3117


**END  ;)**