In [4]:
import numpy as np  # importing NumPy

np.random.seed(42)

input_nodes = 5  # nodes in each layer
hidden_1_nodes = 3
hidden_2_nodes = 5
output_nodes = 4

x = np.random.randint(1, 100, size=(input_nodes, 1)) / 100
x

y = np.random.randint(1, 100, size=(output_nodes, 1)) / 100
y


def relu(x, leak=0):  # ReLU
    return np.where(x <= 0, leak * x, x)


def relu_dash(x, leak=0):  # ReLU derivative
    return np.where(x <= 0, leak, 1)


def sig(x):  # Sigmoid
    return 1 / (1 + np.exp(-x))


def sig_dash(x):  # Sigmoid derivative
    return sig(x) * (1 - sig(x))


def mse(y_true, y_pred):  # MSE
    return np.mean((y_true - y_pred) ** 2)


def mse_grad(y_true, y_pred):  # MSE derivative
    N = y_true.shape[0]
    return -2 * (y_true - y_pred) / N


w1 = np.random.random(size=(hidden_1_nodes, input_nodes))  # w1
b1 = np.zeros(shape=(hidden_1_nodes, 1))  # b1

w2 = np.random.random(size=(hidden_2_nodes, hidden_1_nodes))  # w2
b2 = np.zeros(shape=(hidden_2_nodes, 1))  # b2

w3 = np.random.random(size=(output_nodes, hidden_2_nodes))  # w3
b3 = np.zeros(shape=(output_nodes, 1))  # b3

in_hidden_1 = w1.dot(x) + b1  # forward feed
out_hidden_1 = relu(in_hidden_1, leak=0.1)
in_hidden_2 = w2.dot(out_hidden_1) + b2
out_hidden_2 = sig(in_hidden_2)
in_output_layer = w3.dot(out_hidden_2) + b3
y_hat = sig(in_output_layer)
print("y_hat")  # y_hat
print(y_hat)  # y_hat
print("y")  # y
print(y)  # y
print("mse(y, y_hat)")  # MSE loss
print(mse(y, y_hat))  # MSE loss

learning_rate = 0.01
epochs = 10000

for epoch in range(epochs):
    # ----------------------Forward Propagation--------------------------

    in_hidden_1 = w1.dot(x) + b1
    out_hidden_1 = relu(in_hidden_1, leak=0.1)
    in_hidden_2 = w2.dot(out_hidden_1) + b2
    out_hidden_2 = sig(in_hidden_2)
    in_output_layer = w3.dot(out_hidden_2) + b3
    y_hat = sig(in_output_layer)

    loss = mse(y, y_hat)
    print(f"loss before training is {loss} -- epoch number {epoch + 1}")
    print("\n")

    # -------------------------- Gradient Calculation via Backpropagation ------------------------------ #

    grad_w3 = mse_grad(y, y_hat) * sig_dash(in_output_layer).dot(
        out_hidden_2.T
    )  # grad_w3

    grad_b3 = mse_grad(y, y_hat) * sig_dash(in_output_layer)  # grad_b3

    error_grad_upto_H2 = np.sum(
        mse_grad(y, y_hat) * sig_dash(in_output_layer) * w3, axis=0
    ).reshape((-1, 1))
    # error grad upto H2

    grad_w2 = error_grad_upto_H2 * sig_dash(in_hidden_2).dot(out_hidden_1.T)  # grad w2

    grad_b2 = error_grad_upto_H2 * sig_dash(in_hidden_2)  # grad b2

    error_grad_upto_H1 = np.sum(
        error_grad_upto_H2 * sig_dash(in_hidden_2) * w2, axis=0
    ).reshape((-1, 1))
    # error grad upto H1

    grad_w1 = error_grad_upto_H1 * relu_dash(in_hidden_1, leak=0.1).dot(x.T)
    # grad w1

    grad_b1 = error_grad_upto_H1 * relu_dash(in_hidden_1, leak=0.1)
    # grad b1

    update_w1 = -learning_rate * grad_w1
    w1 += update_w1  # w1

    update_b1 = -learning_rate * grad_b1
    b1 += update_b1  # b1

    update_w2 = -learning_rate * grad_w2
    w2 += update_w2  # w2

    update_b2 = -learning_rate * grad_b2
    b2 += update_b2  # b2

    update_w3 = -learning_rate * grad_w3
    w3 += update_w3  # w3

    update_b3 = -learning_rate * grad_b3
    b3 += update_b3  # b3



y_hat
[[0.83237553]
 [0.89655717]
 [0.87337397]
 [0.92904704]]
y
[[0.21]
 [0.83]
 [0.87]
 [0.75]]
mse(y, y_hat)
0.1059625955371147
loss before training is 0.1059625955371147 -- epoch number 1


loss before training is 0.10585968196673604 -- epoch number 2


loss before training is 0.10575657334190687 -- epoch number 3


loss before training is 0.10565326961304952 -- epoch number 4


loss before training is 0.10554977073417839 -- epoch number 5


loss before training is 0.10544607666292939 -- epoch number 6


loss before training is 0.10534218736058998 -- epoch number 7


loss before training is 0.10523810279212893 -- epoch number 8


loss before training is 0.10513382292622661 -- epoch number 9


loss before training is 0.10502934773530453 -- epoch number 10


loss before training is 0.10492467719555569 -- epoch number 11


loss before training is 0.10481981128697457 -- epoch number 12


loss before training is 0.10471474999338735 -- epoch number 13


loss before training is 0.10460949

In [5]:
print("N y_hat")  # y_hat
print(y_hat)  # y_hat
print("N y")  # y
print(y)  # y
print("N mse(y, y_hat)")  # MSE loss
print(mse(y, y_hat))  # MSE loss

N y_hat
[[0.21021091]
 [0.83087915]
 [0.86950084]
 [0.75024423]]
N y
[[0.21]
 [0.83]
 [0.87]
 [0.75]]
N mse(y, y_hat)
2.815469181909573e-07
