<h3>Neural network, math experimentation</h3>

In [1]:
import math

# activation functions
# NOTE: despite the "ReLu" in its name, this function applies the hyperbolic
# tangent (tanh); it does not filter out negative numbers the way ReLU does
def activation_ReLu(x):
    """Return tanh(x), a value between -1 and 1 with the same sign as x."""
    squashed = math.tanh(x)
    return squashed
    
# derivative of the activation function, needed later in backpropagation
# NOTE: the activation is tanh-based, so the derivative is 1 - tanh(x) squared
# (it is not the constant 1 that ReLU would give for positive inputs)
def activation_ReLu_partial_derivative(x):
    """Return the derivative of tanh at x, i.e. 1 - tanh(x) squared."""
    squashed = math.tanh(x)
    return 1 - squashed ** 2

In [2]:
# initialize weights and biases with fixed values
# (in Keras etc. these are usually randomized in the beginning)
# w1/w3 and bias1 feed node 1, w2/w4 and bias2 feed node 2,
# w5/w6 and bias3 feed node 3 (the output node)
w1 = 1
w2 = 0.5
w3 = 1
w4 = -0.5
w5 = 1
w6 = 1
bias1 = 0.5
bias2 = 0
bias3 = 0.5

# our training data: a single sample
# input1 and input2 are the network inputs, true_value is the target
input1 = 1
input2 = 0
true_value = 2

# our learning rate (scales every gradient descent update below)
LR = 0.01

<b>FORWARD PASS</b>

In [3]:
# NODE 1 OUTPUT
# weighted sum of both inputs plus bias1, passed straight through the activation
node_1_output = activation_ReLu(input1 * w1 + input2 * w3 + bias1)
node_1_output

0.9051482536448664

In [4]:
# NODE 2 OUTPUT
# weighted sum of both inputs plus bias2, passed straight through the activation
node_2_output = activation_ReLu(input1 * w2 + input2 * w4 + bias2)
node_2_output

0.46211715726000974

In [5]:
# NODE 3 OUTPUT
# node 1 and node 2 outputs already carry the effect of the first-layer
# weights, so they are used directly as the inputs of node 3
# NOTE(review): the activation is tanh-based, so this value stays within
# (-1, 1) and cannot reach the true_value of 2 -- confirm that the output
# node is really meant to be activated
node_3_output = activation_ReLu(node_1_output * w5 + node_2_output * w6 + bias3)
node_3_output

0.95334556310551

In [6]:
# compare predicted value (the activated output of node 3) with true value
print(f"Predicted: {node_3_output} --> True value: {true_value}")

Predicted: 0.95334556310551 --> True value: 2


In [7]:
# LOSS FUNCTION - squared error of this single training sample
# (MSE, mean squared error, reduces to this when there is only one sample)
# formula: LOSS = (predicted_value - true_value) ** 2
predicted_value = node_3_output
loss = (predicted_value - true_value) ** 2
loss

1.0954855102709222

<h3>BACKPROPAGATION - update the weights and biases while traversing the network BACKWARDS</h3>

In [8]:
# partial derivative of the loss function with respect to weight 5:
# 2 * node 1 output (the input attached to w5) * (weighted sum of node 3 - true value)
# NOTE(review): the error term uses the weighted sum of node 3 before the
# activation and has no activation-derivative factor, while the forward pass
# and the loss use the activated node_3_output -- confirm which is intended
deriv_L_w5 = 2 * node_1_output * (node_1_output * w5 + node_2_output * w6 + bias3 - true_value)
deriv_L_w5

-0.2402889630354407

In [9]:
# gradient descent update: step against the derivative, scaled by the
# learning rate (a negative derivative therefore increases the weight)
# this is basically our optimizer + learning rate
new_w5 = w5 - LR * deriv_L_w5
new_w5

1.0024028896303545

In [10]:
# partial derivative of the loss function with respect to weight 6
# same form as for weight 5, but multiplied by node 2 output (the input attached to w6)
deriv_L_w6 = 2 * node_2_output * (node_1_output * w5 + node_2_output * w6 + bias3 - true_value)
deriv_L_w6

-0.1226778619654283

In [11]:
# calculate new value for weight 6 with the same gradient descent step
new_w6 = w6 - LR * deriv_L_w6
new_w6

1.0012267786196543

In [12]:
# partial derivative of the loss function with respect to bias3
# NOTE: the * 1 is the derivative of the weighted sum with respect to the bias,
# which enters the sum unscaled (like differentiating x, which gives 1)
deriv_L_b3 = 2 * 1 * (node_1_output * w5 + node_2_output * w6 + bias3 - true_value)
deriv_L_b3

-0.26546917819024785

In [13]:
# update bias 3 with a gradient descent step using its derivative
new_b3 = bias3 - LR * deriv_L_b3
new_b3

0.5026546917819025

<b>To access the first layer, we need to use chain rule, in order to calculate new values for w1-w4 and bias1/2</b>

In [14]:
# see materials for how we need to split this calculation into two parts
# here we solve the left and right sides separately

# left side: same form as the derivatives of w5 and w6, but multiplied by w5
# (the weight that carries node 1 output into node 3)
deriv_L_w1_left = 2 * w5 * (node_1_output * w5 + node_2_output * w6 + bias3 - true_value)

# right side: use the activation derivative and match the correct weights,
# inputs and bias with the weight being differentiated
# COMPARE THE OTHER ORIGINAL PICTURE IN THE MATERIALS
# in the case of w1 => use w1 and w3 inside the activation derivative, because
# these weights are connected to node 1 (which is connected to w1)
# also use bias1, since it's part of node 1
# finally, multiply all with input1, because it is connected to w1
deriv_L_w1_right = activation_ReLu_partial_derivative(input1 * w1 + input2 * w3 + bias1) * input1
# chain rule: multiply both sides, then take the gradient descent step
deriv_L_w1 = deriv_L_w1_left * deriv_L_w1_right
new_w1 = w1 - LR * deriv_L_w1
new_w1

1.000479720429286

In [15]:
# use the same logic as above, but now from the point of view of w2
# notice how we use w6 on the left side and w2/w4 and bias2 on the right side:
# w2 belongs to node 2, and w6 carries node 2 output into node 3
deriv_L_w2_left = 2 * w6 * (node_1_output * w5 + node_2_output * w6 + bias3 - true_value)
deriv_L_w2_right = activation_ReLu_partial_derivative(input1 * w2 + input2 * w4 + bias2) * input1
deriv_L_w2 = deriv_L_w2_left * deriv_L_w2_right
new_w2 = w2 - LR * deriv_L_w2
new_w2

0.5020877763336005

In [16]:
# same formula again for w3: the sides are built as for w1, but the right
# side is multiplied by input2, the input that w3 is attached to
# input2 is 0 for this sample, so the derivative is 0 and w3 stays unchanged
deriv_L_w3_left = 2 * w5 * (node_1_output * w5 + node_2_output * w6 + bias3 - true_value)
deriv_L_w3_right = activation_ReLu_partial_derivative(input1 * w1 + input2 * w3 + bias1) * input2
deriv_L_w3 = deriv_L_w3_left * deriv_L_w3_right
new_w3 = w3 - LR * deriv_L_w3
new_w3

1.0

In [17]:
# same formula again for w4: the sides are built as for w2, but the right
# side is multiplied by input2, the input that w4 is attached to
# input2 is 0 for this sample, so the derivative is 0 and w4 stays unchanged
deriv_L_w4_left = 2 * w6 * (node_1_output * w5 + node_2_output * w6 + bias3 - true_value)
deriv_L_w4_right = activation_ReLu_partial_derivative(input1 * w2 + input2 * w4 + bias2) * input2
deriv_L_w4 = deriv_L_w4_left * deriv_L_w4_right
new_w4 = w4 - LR * deriv_L_w4
new_w4

-0.5

In [18]:
# otherwise the same formula as for w1, but the right side is multiplied by
# just 1 instead of an input, because the derivative of the weighted sum
# with respect to the bias term is 1 (like differentiating x)
deriv_L_b1_left = 2 * w5 * (node_1_output * w5 + node_2_output * w6 + bias3 - true_value)
deriv_L_b1_right = activation_ReLu_partial_derivative(input1 * w1 + input2 * w3 + bias1) * 1
deriv_L_b1 = deriv_L_b1_left * deriv_L_b1_right
new_b1 = bias1 - LR * deriv_L_b1
new_b1

0.5004797204292858

In [19]:
# similar derivation for bias 2: w6 on the left side, the weighted sum of
# node 2 inside the activation derivative on the right side, multiplied by 1
deriv_L_b2_left = 2 * w6 * (node_1_output * w5 + node_2_output * w6 + bias3 - true_value)
deriv_L_b2_right = activation_ReLu_partial_derivative(input1 * w2 + input2 * w4 + bias2) * 1
deriv_L_b2 = deriv_L_b2_left * deriv_L_b2_right
new_b2 = bias2 - LR * deriv_L_b2
new_b2

0.0020877763336004823

<b>Everything should be okay now, let's compare the results</b>

In [20]:
# show the values this step started from next to the values it produced
print("ORIGINAL WEIGHTS AND BIASES")
print(f"w1: {w1}")
print(f"w2: {w2}")
print(f"w3: {w3}")
print(f"w4: {w4}")
print(f"w5: {w5}")
print(f"w6: {w6}")
print(f"b1: {bias1}")
print(f"b2: {bias2}")
print(f"b3: {bias3}")

print("\n\n######################################")

# values after one gradient descent update
print("NEW WEIGHTS AND BIASES")
print(f"w1: {new_w1}")
print(f"w2: {new_w2}")
print(f"w3: {new_w3}")
print(f"w4: {new_w4}")
print(f"w5: {new_w5}")
print(f"w6: {new_w6}")
print(f"b1: {new_b1}")
print(f"b2: {new_b2}")
print(f"b3: {new_b3}")



ORIGINAL WEIGHTS AND BIASES
w1: 1
w2: 0.5
w3: 1
w4: -0.5
w5: 1
w6: 1
b1: 0.5
b2: 0
b3: 0.5


######################################
NEW WEIGHTS AND BIASES
w1: 1.000479720429286
w2: 0.5020877763336005
w3: 1.0
w4: -0.5
w5: 1.0024028896303545
w6: 1.0012267786196543
b1: 0.5004797204292858
b2: 0.0020877763336004823
b3: 0.5026546917819025


<b>Trying with new values and increased learning rate</b>

In [21]:
# start this step from the weights and biases produced by the previous
# gradient descent step (nothing is re-initialized or randomized here)
w1 = new_w1
w2 = new_w2
w3 = new_w3
w4 = new_w4
w5 = new_w5
w6 = new_w6
bias1 = new_b1
bias2 = new_b2
bias3 = new_b3

# our training data (the same single sample as before)
# input1 and input2 are the network inputs, true_value is the target
input1 = 1
input2 = 0
true_value = 2

# our learning rate, raised compared to the previous step
LR = 0.05

<b>FORWARD PASS</b>

In [22]:
# NODE 1 OUTPUT
# weighted sum of both inputs plus bias1, passed straight through the activation
node_1_output = activation_ReLu(input1 * w1 + input2 * w3 + bias1)
node_1_output

0.9053214804880472

In [23]:
# NODE 2 OUTPUT
# weighted sum of both inputs plus bias2, passed straight through the activation
node_2_output = activation_ReLu(input1 * w2 + input2 * w4 + bias2)
node_2_output

0.4653946678744216

In [24]:
# NODE 3 OUTPUT
# node 1 and node 2 outputs already carry the effect of the first-layer
# weights, so they are used directly as the inputs of node 3
# NOTE(review): the activation is tanh-based, so this value stays within
# (-1, 1) and cannot reach the true_value of 2 -- confirm that the output
# node is really meant to be activated
node_3_output = activation_ReLu(node_1_output * w5 + node_2_output * w6 + bias3)
node_3_output

0.9541454720197019

In [25]:
# compare predicted value (the activated output of node 3) with true value
print(f"Predicted: {node_3_output} --> True value: {true_value}")

Predicted: 0.9541454720197019 --> True value: 2


In [26]:
# LOSS FUNCTION - squared error of this single training sample
# (MSE, mean squared error, reduces to this when there is only one sample)
# formula: LOSS = (predicted_value - true_value) ** 2
predicted_value = node_3_output
loss = (predicted_value - true_value) ** 2
loss

1.093811693696892

<h3>BACKPROPAGATION - update the weights and biases while traversing the network BACKWARDS</h3>

In [27]:
# partial derivative of the loss function with respect to weight 5:
# 2 * node 1 output (the input attached to w5) * (weighted sum of node 3 - true value)
# NOTE(review): the error term uses the weighted sum of node 3 before the
# activation and has no activation-derivative factor, while the forward pass
# and the loss use the activated node_3_output -- confirm which is intended
deriv_L_w5 = 2 * node_1_output * (node_1_output * w5 + node_2_output * w6 + bias3 - true_value)
deriv_L_w5

-0.22430758504307224

In [28]:
# gradient descent update: step against the derivative, scaled by the
# learning rate (a negative derivative therefore increases the weight)
# this is basically our optimizer + learning rate
new_w5 = w5 - LR * deriv_L_w5
new_w5

1.0136182688825082

In [29]:
# partial derivative of the loss function with respect to weight 6
# same form as for weight 5, but multiplied by node 2 output (the input attached to w6)
deriv_L_w6 = 2 * node_2_output * (node_1_output * w5 + node_2_output * w6 + bias3 - true_value)
deriv_L_w6

-0.1153088226588394

In [30]:
# calculate new value for weight 6 with the same gradient descent step
new_w6 = w6 - LR * deriv_L_w6
new_w6

1.0069922197525962

In [31]:
# partial derivative of the loss function with respect to bias3
# NOTE: the * 1 is the derivative of the weighted sum with respect to the bias,
# which enters the sum unscaled (like differentiating x, which gives 1)
deriv_L_b3 = 2 * 1 * (node_1_output * w5 + node_2_output * w6 + bias3 - true_value)
deriv_L_b3

-0.2477656720595549

In [32]:
# update bias 3 with a gradient descent step using its derivative
new_b3 = bias3 - LR * deriv_L_b3
new_b3

0.5150429753848802

<b>To access the first layer, we need to use chain rule, in order to calculate new values for w1-w4 and bias1/2</b>

In [33]:
# see materials for how we need to split this calculation into two parts
# here we solve the left and right sides separately

# left side: same form as the derivatives of w5 and w6, but multiplied by w5
# (the weight that carries node 1 output into node 3)
deriv_L_w1_left = 2 * w5 * (node_1_output * w5 + node_2_output * w6 + bias3 - true_value)

# right side: use the activation derivative and match the correct weights,
# inputs and bias with the weight being differentiated
# COMPARE THE OTHER ORIGINAL PICTURE IN THE MATERIALS
# in the case of w1 => use w1 and w3 inside the activation derivative, because
# these weights are connected to node 1 (which is connected to w1)
# also use bias1, since it's part of node 1
# finally, multiply all with input1, because it is connected to w1
deriv_L_w1_right = activation_ReLu_partial_derivative(input1 * w1 + input2 * w3 + bias1) * input1
# chain rule: multiply both sides, then take the gradient descent step
deriv_L_w1 = deriv_L_w1_left * deriv_L_w1_right
new_w1 = w1 - LR * deriv_L_w1
new_w1

1.002719850164749

In [34]:
# use the same logic as above, but now from the point of view of w2
# notice how we use w6 on the left side and w2/w4 and bias2 on the right side:
# w2 belongs to node 2, and w6 carries node 2 output into node 3
deriv_L_w2_left = 2 * w6 * (node_1_output * w5 + node_2_output * w6 + bias3 - true_value)
deriv_L_w2_right = activation_ReLu_partial_derivative(input1 * w2 + input2 * w4 + bias2) * input1
deriv_L_w2 = deriv_L_w2_left * deriv_L_w2_right
new_w2 = w2 - LR * deriv_L_w2
new_w2

0.5118047603576069

In [35]:
# same formula again for w3: the sides are built as for w1, but the right
# side is multiplied by input2, the input that w3 is attached to
# input2 is 0 for this sample, so the derivative is 0 and w3 stays unchanged
deriv_L_w3_left = 2 * w5 * (node_1_output * w5 + node_2_output * w6 + bias3 - true_value)
deriv_L_w3_right = activation_ReLu_partial_derivative(input1 * w1 + input2 * w3 + bias1) * input2
deriv_L_w3 = deriv_L_w3_left * deriv_L_w3_right
new_w3 = w3 - LR * deriv_L_w3
new_w3

1.0

In [36]:
# same formula again for w4: the sides are built as for w2, but the right
# side is multiplied by input2, the input that w4 is attached to
# input2 is 0 for this sample, so the derivative is 0 and w4 stays unchanged
deriv_L_w4_left = 2 * w6 * (node_1_output * w5 + node_2_output * w6 + bias3 - true_value)
deriv_L_w4_right = activation_ReLu_partial_derivative(input1 * w2 + input2 * w4 + bias2) * input2
deriv_L_w4 = deriv_L_w4_left * deriv_L_w4_right
new_w4 = w4 - LR * deriv_L_w4
new_w4

-0.5

In [37]:
# otherwise the same formula as for w1, but the right side is multiplied by
# just 1 instead of an input, because the derivative of the weighted sum
# with respect to the bias term is 1 (like differentiating x)
deriv_L_b1_left = 2 * w5 * (node_1_output * w5 + node_2_output * w6 + bias3 - true_value)
deriv_L_b1_right = activation_ReLu_partial_derivative(input1 * w1 + input2 * w3 + bias1) * 1
deriv_L_b1 = deriv_L_b1_left * deriv_L_b1_right
new_b1 = bias1 - LR * deriv_L_b1
new_b1

0.5027198501647488

In [38]:
# similar derivation for bias 2: w6 on the left side, the weighted sum of
# node 2 inside the activation derivative on the right side, multiplied by 1
deriv_L_b2_left = 2 * w6 * (node_1_output * w5 + node_2_output * w6 + bias3 - true_value)
deriv_L_b2_right = activation_ReLu_partial_derivative(input1 * w2 + input2 * w4 + bias2) * 1
deriv_L_b2 = deriv_L_b2_left * deriv_L_b2_right
new_b2 = bias2 - LR * deriv_L_b2
new_b2

0.01180476035760688

<b>Everything should be okay now, let's compare the results</b>

In [39]:
# show the values this step started from next to the values it produced
print("ORIGINAL WEIGHTS AND BIASES")
print(f"w1: {w1}")
print(f"w2: {w2}")
print(f"w3: {w3}")
print(f"w4: {w4}")
print(f"w5: {w5}")
print(f"w6: {w6}")
print(f"b1: {bias1}")
print(f"b2: {bias2}")
print(f"b3: {bias3}")

print("\n\n######################################")

# values after one gradient descent update
print("NEW WEIGHTS AND BIASES")
print(f"w1: {new_w1}")
print(f"w2: {new_w2}")
print(f"w3: {new_w3}")
print(f"w4: {new_w4}")
print(f"w5: {new_w5}")
print(f"w6: {new_w6}")
print(f"b1: {new_b1}")
print(f"b2: {new_b2}")
print(f"b3: {new_b3}")



ORIGINAL WEIGHTS AND BIASES
w1: 1.000479720429286
w2: 0.5020877763336005
w3: 1.0
w4: -0.5
w5: 1.0024028896303545
w6: 1.0012267786196543
b1: 0.5004797204292858
b2: 0.0020877763336004823
b3: 0.5026546917819025


######################################
NEW WEIGHTS AND BIASES
w1: 1.002719850164749
w2: 0.5118047603576069
w3: 1.0
w4: -0.5
w5: 1.0136182688825082
w6: 1.0069922197525962
b1: 0.5027198501647488
b2: 0.01180476035760688
b3: 0.5150429753848802


<b>Trying with new values and increased learning rate</b>

In [40]:
# start this step from the weights and biases produced by the previous
# gradient descent step (nothing is re-initialized or randomized here)
w1 = new_w1
w2 = new_w2
w3 = new_w3
w4 = new_w4
w5 = new_w5
w6 = new_w6
bias1 = new_b1
bias2 = new_b2
bias3 = new_b3

# our training data (the same single sample as before)
# input1 and input2 are the network inputs, true_value is the target
input1 = 1
input2 = 0
true_value = 2

# our learning rate, raised compared to the previous step
LR = 0.1

<b>FORWARD PASS</b>

In [41]:
# NODE 1 OUTPUT
# weighted sum of both inputs plus bias1, passed straight through the activation
node_1_output = activation_ReLu(input1 * w1 + input2 * w3 + bias1)
node_1_output

0.9061264177390772

In [42]:
# NODE 2 OUTPUT
# weighted sum of both inputs plus bias2, passed straight through the activation
node_2_output = activation_ReLu(input1 * w2 + input2 * w4 + bias2)
node_2_output

0.4804810426765055

In [43]:
# NODE 3 OUTPUT
# node 1 and node 2 outputs already carry the effect of the first-layer
# weights, so they are used directly as the inputs of node 3
# NOTE(review): the activation is tanh-based, so this value stays within
# (-1, 1) and cannot reach the true_value of 2 -- confirm that the output
# node is really meant to be activated
node_3_output = activation_ReLu(node_1_output * w5 + node_2_output * w6 + bias3)
node_3_output

0.9576984004128958

In [44]:
# compare predicted value (the activated output of node 3) with true value
print(f"Predicted: {node_3_output} --> True value: {true_value}")

Predicted: 0.9576984004128958 --> True value: 2


In [45]:
# LOSS FUNCTION - squared error of this single training sample
# (MSE, mean squared error, reduces to this when there is only one sample)
# formula: LOSS = (predicted_value - true_value) ** 2
predicted_value = node_3_output
loss = (predicted_value - true_value) ** 2
loss

1.0863926245018358

<h3>BACKPROPAGATION - update the weights and biases while traversing the network BACKWARDS</h3>

In [46]:
# partial derivative of the loss function with respect to weight 5:
# 2 * node 1 output (the input attached to w5) * (weighted sum of node 3 - true value)
# NOTE(review): the error term uses the weighted sum of node 3 before the
# activation and has no activation-derivative factor, while the forward pass
# and the loss use the activated node_3_output -- confirm which is intended
deriv_L_w5 = 2 * node_1_output * (node_1_output * w5 + node_2_output * w6 + bias3 - true_value)
deriv_L_w5

-0.14978280914647166

In [47]:
# gradient descent update: step against the derivative, scaled by the
# learning rate (a negative derivative therefore increases the weight)
# this is basically our optimizer + learning rate
new_w5 = w5 - LR * deriv_L_w5
new_w5

1.0285965497971554

In [48]:
# partial derivative of the loss function with respect to weight 6
# same form as for weight 5, but multiplied by node 2 output (the input attached to w6)
deriv_L_w6 = 2 * node_2_output * (node_1_output * w5 + node_2_output * w6 + bias3 - true_value)
deriv_L_w6

-0.07942357589935772

In [49]:
# calculate new value for weight 6 with the same gradient descent step
new_w6 = w6 - LR * deriv_L_w6
new_w6

1.014934577342532

In [50]:
# partial derivative of the loss function with respect to bias3
# NOTE: the * 1 is the derivative of the weighted sum with respect to the bias,
# which enters the sum unscaled (like differentiating x, which gives 1)
deriv_L_b3 = 2 * 1 * (node_1_output * w5 + node_2_output * w6 + bias3 - true_value)
deriv_L_b3

-0.1653001239277434

In [51]:
# update bias 3 with a gradient descent step using its derivative
new_b3 = bias3 - LR * deriv_L_b3
new_b3

0.5315729877776545

<b>To access the first layer, we need to use chain rule, in order to calculate new values for w1-w4 and bias1/2</b>

In [52]:
# see materials for how we need to split this calculation into two parts
# here we solve the left and right sides separately

# left side: same form as the derivatives of w5 and w6, but multiplied by w5
# (the weight that carries node 1 output into node 3)
deriv_L_w1_left = 2 * w5 * (node_1_output * w5 + node_2_output * w6 + bias3 - true_value)

# right side: use the activation derivative and match the correct weights,
# inputs and bias with the weight being differentiated
# COMPARE THE OTHER ORIGINAL PICTURE IN THE MATERIALS
# in the case of w1 => use w1 and w3 inside the activation derivative, because
# these weights are connected to node 1 (which is connected to w1)
# also use bias1, since it's part of node 1
# finally, multiply all with input1, because it is connected to w1
deriv_L_w1_right = activation_ReLu_partial_derivative(input1 * w1 + input2 * w3 + bias1) * input1
# chain rule: multiply both sides, then take the gradient descent step
deriv_L_w1 = deriv_L_w1_left * deriv_L_w1_right
new_w1 = w1 - LR * deriv_L_w1
new_w1

1.005717926594625

In [53]:
# use the same logic as above, but now from the point of view of w2
# notice how we use w6 on the left side and w2/w4 and bias2 on the right side:
# w2 belongs to node 2, and w6 carries node 2 output into node 3
deriv_L_w2_left = 2 * w6 * (node_1_output * w5 + node_2_output * w6 + bias3 - true_value)
deriv_L_w2_right = activation_ReLu_partial_derivative(input1 * w2 + input2 * w4 + bias2) * input1
deriv_L_w2 = deriv_L_w2_left * deriv_L_w2_right
new_w2 = w2 - LR * deriv_L_w2
new_w2

0.5246075185982385

In [54]:
# same formula again for w3: the sides are built as for w1, but the right
# side is multiplied by input2, the input that w3 is attached to
# input2 is 0 for this sample, so the derivative is 0 and w3 stays unchanged
deriv_L_w3_left = 2 * w5 * (node_1_output * w5 + node_2_output * w6 + bias3 - true_value)
deriv_L_w3_right = activation_ReLu_partial_derivative(input1 * w1 + input2 * w3 + bias1) * input2
deriv_L_w3 = deriv_L_w3_left * deriv_L_w3_right
new_w3 = w3 - LR * deriv_L_w3
new_w3

1.0

In [55]:
# same formula again for w4: the sides are built as for w2, but the right
# side is multiplied by input2, the input that w4 is attached to
# input2 is 0 for this sample, so the derivative is 0 and w4 stays unchanged
deriv_L_w4_left = 2 * w6 * (node_1_output * w5 + node_2_output * w6 + bias3 - true_value)
deriv_L_w4_right = activation_ReLu_partial_derivative(input1 * w2 + input2 * w4 + bias2) * input2
deriv_L_w4 = deriv_L_w4_left * deriv_L_w4_right
new_w4 = w4 - LR * deriv_L_w4
new_w4

-0.5

In [56]:
# otherwise the same formula as for w1, but the right side is multiplied by
# just 1 instead of an input, because the derivative of the weighted sum
# with respect to the bias term is 1 (like differentiating x)
deriv_L_b1_left = 2 * w5 * (node_1_output * w5 + node_2_output * w6 + bias3 - true_value)
deriv_L_b1_right = activation_ReLu_partial_derivative(input1 * w1 + input2 * w3 + bias1) * 1
deriv_L_b1 = deriv_L_b1_left * deriv_L_b1_right
new_b1 = bias1 - LR * deriv_L_b1
new_b1

0.5057179265946249

In [57]:
# similar derivation for bias 2: w6 on the left side, the weighted sum of
# node 2 inside the activation derivative on the right side, multiplied by 1
deriv_L_b2_left = 2 * w6 * (node_1_output * w5 + node_2_output * w6 + bias3 - true_value)
deriv_L_b2_right = activation_ReLu_partial_derivative(input1 * w2 + input2 * w4 + bias2) * 1
deriv_L_b2 = deriv_L_b2_left * deriv_L_b2_right
new_b2 = bias2 - LR * deriv_L_b2
new_b2

0.02460751859823847

<b>Everything should be okay now, let's compare the results</b>

In [58]:
# show the values this step started from next to the values it produced
print("ORIGINAL WEIGHTS AND BIASES")
print(f"w1: {w1}")
print(f"w2: {w2}")
print(f"w3: {w3}")
print(f"w4: {w4}")
print(f"w5: {w5}")
print(f"w6: {w6}")
print(f"b1: {bias1}")
print(f"b2: {bias2}")
print(f"b3: {bias3}")

print("\n\n######################################")

# values after one gradient descent update
print("NEW WEIGHTS AND BIASES")
print(f"w1: {new_w1}")
print(f"w2: {new_w2}")
print(f"w3: {new_w3}")
print(f"w4: {new_w4}")
print(f"w5: {new_w5}")
print(f"w6: {new_w6}")
print(f"b1: {new_b1}")
print(f"b2: {new_b2}")
print(f"b3: {new_b3}")



ORIGINAL WEIGHTS AND BIASES
w1: 1.002719850164749
w2: 0.5118047603576069
w3: 1.0
w4: -0.5
w5: 1.0136182688825082
w6: 1.0069922197525962
b1: 0.5027198501647488
b2: 0.01180476035760688
b3: 0.5150429753848802


######################################
NEW WEIGHTS AND BIASES
w1: 1.005717926594625
w2: 0.5246075185982385
w3: 1.0
w4: -0.5
w5: 1.0285965497971554
w6: 1.014934577342532
b1: 0.5057179265946249
b2: 0.02460751859823847
b3: 0.5315729877776545


<b>Trying with new values and increased learning rate</b>

In [59]:
# start this step from the weights and biases produced by the previous
# gradient descent step (nothing is re-initialized or randomized here)
w1 = new_w1
w2 = new_w2
w3 = new_w3
w4 = new_w4
w5 = new_w5
w6 = new_w6
bias1 = new_b1
bias2 = new_b2
bias3 = new_b3

# our training data (the same single sample as before)
# input1 and input2 are the network inputs, true_value is the target
input1 = 1
input2 = 0
true_value = 2

# our learning rate, raised compared to the previous step
LR = 0.2

<b>FORWARD PASS</b>

In [60]:
# NODE 1 OUTPUT
# weighted sum of both inputs plus bias1, passed straight through the activation
node_1_output = activation_ReLu(input1 * w1 + input2 * w3 + bias1)
node_1_output

0.9071935281512686

In [61]:
# NODE 2 OUTPUT
# weighted sum of both inputs plus bias2, passed straight through the activation
node_2_output = activation_ReLu(input1 * w2 + input2 * w4 + bias2)
node_2_output

0.4999316665341724

In [62]:
# NODE 3 OUTPUT
# node 1 and node 2 outputs already carry the effect of the first-layer
# weights, so they are used directly as the inputs of node 3
# NOTE(review): the activation is tanh-based, so this value stays within
# (-1, 1) and cannot reach the true_value of 2 -- confirm that the output
# node is really meant to be activated
node_3_output = activation_ReLu(node_1_output * w5 + node_2_output * w6 + bias3)
node_3_output

0.9620030056314516

In [63]:
# compare predicted value (the activated output of node 3) with true value
print(f"Predicted: {node_3_output} --> True value: {true_value}")

Predicted: 0.9620030056314516 --> True value: 2


In [64]:
# LOSS FUNCTION - squared error of this single training sample
# (MSE, mean squared error, reduces to this when there is only one sample)
# formula: LOSS = (predicted_value - true_value) ** 2
predicted_value = node_3_output
loss = (predicted_value - true_value) ** 2
loss

1.0774377603181406

<h3>BACKPROPAGATION - update the weights and biases while traversing the network BACKWARDS</h3>

In [65]:
# partial derivative of the loss function with respect to weight 5:
# 2 * node 1 output (the input attached to w5) * (weighted sum of node 3 - true value)
# NOTE(review): the error term uses the weighted sum of node 3 before the
# activation and has no activation-derivative factor, while the forward pass
# and the loss use the activated node_3_output -- confirm which is intended
deriv_L_w5 = 2 * node_1_output * (node_1_output * w5 + node_2_output * w6 + bias3 - true_value)
deriv_L_w5

-0.050608597451414654

In [66]:
# gradient descent update: step against the derivative, scaled by the
# learning rate (a negative derivative therefore increases the weight)
# this is basically our optimizer + learning rate
new_w5 = w5 - LR * deriv_L_w5
new_w5

1.0387182692874384

In [67]:
# partial derivative of the loss function with respect to weight 6
# same form as for weight 5, but multiplied by node 2 output (the input attached to w6)
deriv_L_w6 = 2 * node_2_output * (node_1_output * w5 + node_2_output * w6 + bias3 - true_value)
deriv_L_w6

-0.027889132450495225

In [68]:
# calculate new value for weight 6 with the same gradient descent step
new_w6 = w6 - LR * deriv_L_w6
new_w6

1.020512403832631

In [69]:
# partial derivative of the loss function with respect to bias3
# NOTE: the * 1 is the derivative of the weighted sum with respect to the bias,
# which enters the sum unscaled (like differentiating x, which gives 1)
deriv_L_b3 = 2 * 1 * (node_1_output * w5 + node_2_output * w6 + bias3 - true_value)
deriv_L_b3

-0.055785888987267995

In [70]:
# update bias 3 with a gradient descent step using its derivative
new_b3 = bias3 - LR * deriv_L_b3
new_b3

0.5427301655751081

<b>To access the first layer, we need to use chain rule, in order to calculate new values for w1-w4 and bias1/2</b>

In [71]:
# see materials for how we need to split this calculation into two parts
# here we solve the left and right sides separately

# left side: same form as the derivatives of w5 and w6, but multiplied by w5
# (the weight that carries node 1 output into node 3)
deriv_L_w1_left = 2 * w5 * (node_1_output * w5 + node_2_output * w6 + bias3 - true_value)

# right side: use the activation derivative and match the correct weights,
# inputs and bias with the weight being differentiated
# COMPARE THE OTHER ORIGINAL PICTURE IN THE MATERIALS
# in the case of w1 => use w1 and w3 inside the activation derivative, because
# these weights are connected to node 1 (which is connected to w1)
# also use bias1, since it's part of node 1
# finally, multiply all with input1, because it is connected to w1
deriv_L_w1_right = activation_ReLu_partial_derivative(input1 * w1 + input2 * w3 + bias1) * input1
# chain rule: multiply both sides, then take the gradient descent step
deriv_L_w1 = deriv_L_w1_left * deriv_L_w1_right
new_w1 = w1 - LR * deriv_L_w1
new_w1

1.0077492189975321

In [72]:
# use the same logic as above, but now from the point of view of w2
# notice how we use w6 on the left side and w2/w4 and bias2 on the right side:
# w2 belongs to node 2, and w6 carries node 2 output into node 3
deriv_L_w2_left = 2 * w6 * (node_1_output * w5 + node_2_output * w6 + bias3 - true_value)
deriv_L_w2_right = activation_ReLu_partial_derivative(input1 * w2 + input2 * w4 + bias2) * input1
deriv_L_w2 = deriv_L_w2_left * deriv_L_w2_right
new_w2 = w2 - LR * deriv_L_w2
new_w2

0.5331011464893863

In [73]:
# same formula again for w3: the sides are built as for w1, but the right
# side is multiplied by input2, the input that w3 is attached to
# input2 is 0 for this sample, so the derivative is 0 and w3 stays unchanged
deriv_L_w3_left = 2 * w5 * (node_1_output * w5 + node_2_output * w6 + bias3 - true_value)
deriv_L_w3_right = activation_ReLu_partial_derivative(input1 * w1 + input2 * w3 + bias1) * input2
deriv_L_w3 = deriv_L_w3_left * deriv_L_w3_right
new_w3 = w3 - LR * deriv_L_w3
new_w3

1.0

In [74]:
# same formula again for w4: the sides are built as for w2, but the right
# side is multiplied by input2, the input that w4 is attached to
# input2 is 0 for this sample, so the derivative is 0 and w4 stays unchanged
deriv_L_w4_left = 2 * w6 * (node_1_output * w5 + node_2_output * w6 + bias3 - true_value)
deriv_L_w4_right = activation_ReLu_partial_derivative(input1 * w2 + input2 * w4 + bias2) * input2
deriv_L_w4 = deriv_L_w4_left * deriv_L_w4_right
new_w4 = w4 - LR * deriv_L_w4
new_w4

-0.5

In [75]:
# otherwise the same formula as for w1, but the right side is multiplied by
# just 1 instead of an input, because the derivative of the weighted sum
# with respect to the bias term is 1 (like differentiating x)
deriv_L_b1_left = 2 * w5 * (node_1_output * w5 + node_2_output * w6 + bias3 - true_value)
deriv_L_b1_right = activation_ReLu_partial_derivative(input1 * w1 + input2 * w3 + bias1) * 1
deriv_L_b1 = deriv_L_b1_left * deriv_L_b1_right
new_b1 = bias1 - LR * deriv_L_b1
new_b1

0.507749218997532

In [76]:
# similar derivation for bias 2: w6 on the left side, the weighted sum of
# node 2 inside the activation derivative on the right side, multiplied by 1
deriv_L_b2_left = 2 * w6 * (node_1_output * w5 + node_2_output * w6 + bias3 - true_value)
deriv_L_b2_right = activation_ReLu_partial_derivative(input1 * w2 + input2 * w4 + bias2) * 1
deriv_L_b2 = deriv_L_b2_left * deriv_L_b2_right
new_b2 = bias2 - LR * deriv_L_b2
new_b2

0.03310114648938629

<b>Everything should be okay now, let's compare the results</b>

In [77]:
# show the values this step started from next to the values it produced
print("ORIGINAL WEIGHTS AND BIASES")
print(f"w1: {w1}")
print(f"w2: {w2}")
print(f"w3: {w3}")
print(f"w4: {w4}")
print(f"w5: {w5}")
print(f"w6: {w6}")
print(f"b1: {bias1}")
print(f"b2: {bias2}")
print(f"b3: {bias3}")

print("\n\n######################################")

# values after one gradient descent update
print("NEW WEIGHTS AND BIASES")
print(f"w1: {new_w1}")
print(f"w2: {new_w2}")
print(f"w3: {new_w3}")
print(f"w4: {new_w4}")
print(f"w5: {new_w5}")
print(f"w6: {new_w6}")
print(f"b1: {new_b1}")
print(f"b2: {new_b2}")
print(f"b3: {new_b3}")



ORIGINAL WEIGHTS AND BIASES
w1: 1.005717926594625
w2: 0.5246075185982385
w3: 1.0
w4: -0.5
w5: 1.0285965497971554
w6: 1.014934577342532
b1: 0.5057179265946249
b2: 0.02460751859823847
b3: 0.5315729877776545


######################################
NEW WEIGHTS AND BIASES
w1: 1.0077492189975321
w2: 0.5331011464893863
w3: 1.0
w4: -0.5
w5: 1.0387182692874384
w6: 1.020512403832631
b1: 0.507749218997532
b2: 0.03310114648938629
b3: 0.5427301655751081
