<h3>Neural network, math experimentation</h3>

In [838]:
# activation functions
# ReLu is very simple, it filters out all negative numbers
# this is a powerful activation function in reality
def activation_ReLu(number):
    if number > 0:
        return number
    else:
        return 0
    
# we also need a derived version of ReLu later
# otherwise the same than original, but instead of original value
# return 1 instead
def activation_ReLu_partial_derivative(number):
    if number > 0:
        return 1
    else:
        return 0

In [839]:
# initialize weights and biases
# in Keras etc. these are usually randomized in the beginning
w1 = 1
w2 = 0.5
w3 = 1
w4 = -0.5
w5 = 1
w6 = 1
w7 = 0
w8 = 1
w9 = 0.5
bias1 = 0.5
bias2 = 0
bias3 = 0.5
bias4 = 0.5

# our training data
# x1 = input1, x2 = input2, y = true_value
input1 = 1
input2 = 0
true_value = 2

# our learning rate
LR = 0.01

<b>FORWARD PASS</b>

In [840]:
# NODE 1 OUTPUT
node_1_output = input1 * w1 + input2 * w4 + bias1
node_1_output = activation_ReLu(node_1_output)
node_1_output

1.5

In [841]:
# NODE 2 OUTPUT
node_2_output = input1 * w2 + input2 * w5 + bias2
node_2_output = activation_ReLu(node_2_output)
node_2_output

0.5

In [842]:
node_3_output = input1 * w3 + input2 * w6 + bias3
node_3_output = activation_ReLu(node_3_output)
node_3_output

1.5

In [843]:
# NODE 3 OUTPUT
# we can just use Node 1 and 2 outputs, since they
# already contain the the previous weights
node_4_output = node_1_output * w7 + node_2_output * w8 + node_3_output * w9 + bias3
node_4_output = activation_ReLu(node_4_output)
node_4_output

1.75

In [844]:
# compare predicted value with true value
print(f"Predicted: {node_4_output} --> True value: {true_value}")

Predicted: 1.75 --> True value: 2


In [845]:
# LOSS FUNCTION - we are going to use MSE -> mean squared error
# MSE formula LOSS => (predicted_value - true_value) ^ 2
predicted_value = node_4_output
loss = (predicted_value - true_value) ** 2
loss

0.0625

<h3>BACKPROPAGATION - update the weights and biases while traversing the network BACKWARDS</h3>

In [846]:
# solving the partial derivative of the loss function with respect to weight 5
deriv_L_w7 = 2 * node_1_output * (node_1_output * w7 + node_2_output * w8 + node_3_output * w9 + bias4 - true_value)
deriv_L_w7

-0.75

In [847]:
# this decreases the weight value a little bit
# this is basically our optimizer + learning rate
# this optimizer is known as gradient descent
new_w7 = w7 - LR * deriv_L_w7
new_w7

0.0075

In [848]:
# solving the partial derivative of the loss function with respect to weight 6
deriv_L_w8 = 2 * node_2_output * (node_1_output * w7 + node_2_output * w8 + node_3_output * w9 + bias4 - true_value)
deriv_L_w8

-0.25

In [849]:
# calculate new value for weight 6
new_w8 = w8 - LR * deriv_L_w8
new_w8

1.0025

In [850]:
# solving the partial derivative of the loss function with respect to weight 6
deriv_L_w9 = 2 * node_3_output * (node_1_output * w7 + node_2_output * w8 + node_3_output * w9 + bias4 - true_value)
deriv_L_w9

-0.75

In [851]:
# calculate new value for weight 6
new_w9 = w9 - LR * deriv_L_w9
new_w9

0.5075

In [852]:
# solving the partial derivative of the loss function with respect to bias3
# NOTE: the * 1 comes from derivating the bias, which is same as derivating x, 
# which result in 1
deriv_L_b4 = 2 * 1 * (node_1_output * w7 + node_2_output * w8 + node_3_output * w9 + bias4 - true_value)
deriv_L_b4

-0.5

In [853]:
# update the bias 3 based on previous derivation
new_b4 = bias4 - LR * deriv_L_b4
new_b4

0.505

<b>To access the first layer, we need to use chain rule, in order to calculate new values for w1-w4 and bias1/2</b>

In [854]:
# see materials for how we need to split this calculation into two parts
# here we solve the left and right sides separately

# left side is mostly the same as derivating w5 and w6
deriv_L_w1_left = 2 * w7* (node_1_output * w7 + node_2_output * w8 + node_3_output * w9 + bias4- true_value)

# right side: use ReLu derivation and remember to match correct weights with correct inputs and biases
# based on which weight are you are derivating
# COMPARE THE OTHER ORIGINAL PICTURE IN THE MATERIALS
# in the case of w1 => use w1 and and w3 inside the Relu-derivation, because
# these weights are connected to node 1 (which is connected to w1)
# also use bias1, since it's part of node 1
# finally, multiply all with input1, because it is connected to w1
deriv_L_w1_right = activation_ReLu_partial_derivative(input1 * w1 + input2 * w4 + bias1) * input1
deriv_L_w1 = deriv_L_w1_left * deriv_L_w1_right
new_w1 = w1 - LR * deriv_L_w1
new_w1

1.0

In [855]:
# use the same logic as above, but now from the point of view of w2
# notice how we use w6 and w2/w4 and bias2 in the equation of right side
deriv_L_w2_left = 2 * w8 * (node_1_output * w7 + node_2_output * w8 + node_3_output * w9 + bias4 - true_value)
deriv_L_w2_right = activation_ReLu_partial_derivative(input1 * w2 + input2 * w5 + bias2) * input1
deriv_L_w2 = deriv_L_w2_left * deriv_L_w2_right
new_w2 = w2 - LR * deriv_L_w2
new_w2

0.505

In [856]:
# use the same formula again
deriv_L_w3_left = 2 * w9 * (node_1_output * w7 + node_2_output * w8 + node_3_output * w9 + bias4 - true_value)
deriv_L_w3_right = activation_ReLu_partial_derivative(input1 * w3 + input2 * w6 + bias1) * input2
deriv_L_w3 = deriv_L_w3_left * deriv_L_w3_right
new_w3 = w3 - LR * deriv_L_w3
new_w3

1.0

In [857]:
# use the same formula again
deriv_L_w4_left = 2 * w7 * (node_1_output * w7 + node_2_output * w8 + node_3_output * w9 + bias4 - true_value)
deriv_L_w4_right = activation_ReLu_partial_derivative(input1 * w1 + input2 * w4 + bias2) * input2
deriv_L_w4 = deriv_L_w4_left * deriv_L_w4_right
new_w4 = w4 - LR * deriv_L_w4
new_w4

-0.5

In [858]:
# use the same formula again
deriv_L_w5_left = 2 * w8 * (node_1_output * w7 + node_2_output * w8 + node_3_output * w9 + bias4 - true_value)
deriv_L_w5_right = activation_ReLu_partial_derivative(input1 * w2 + input2 * w5 + bias2) * input2
deriv_L_w5 = deriv_L_w5_left * deriv_L_w5_right
new_w5 = w5 - LR * deriv_L_w5
new_w5

1.0

In [859]:
# use the same formula again
deriv_L_w6_left = 2 * w9 * (node_1_output * w7 + node_2_output * w8 + node_3_output * w9 + bias4 - true_value)
deriv_L_w6_right = activation_ReLu_partial_derivative(input1 * w3 + input2 * w6 + bias2) * input2
deriv_L_w6 = deriv_L_w6_left * deriv_L_w6_right
new_w6 = w6 - LR * deriv_L_w6
new_w6

1.0

In [860]:
# otherwise the same formula, but we can multiply the right side function with just 1
# because it's a derivation of bias-term, which is the same as derivation of x, which results in 1
deriv_L_b1_left = 2 * w7 * (node_1_output * w7 + node_2_output * w8 + node_3_output * w9 + bias4 - true_value)
deriv_L_b1_right = activation_ReLu_partial_derivative(input1 * w1 + input2 * w4 + bias1) * 1
deriv_L_b1 = deriv_L_b1_left * deriv_L_b1_right
new_b1 = bias1 - LR * deriv_L_b1
new_b1

0.5

In [861]:
# similar derivation of bias 2
deriv_L_b2_left = 2 * w8 * (node_1_output * w7 + node_2_output * w8 + node_3_output * w9 + bias4 - true_value)
deriv_L_b2_right = activation_ReLu_partial_derivative(input1 * w2 + input2 * w5 + bias2) * 1
deriv_L_b2 = deriv_L_b2_left * deriv_L_b2_right
new_b2 = bias2 - LR * deriv_L_b2
new_b2

0.005

In [862]:
# otherwise the same formula, but we can multiply the right side function with just 1
# because it's a derivation of bias-term, which is the same as derivation of x, which results in 1
deriv_L_b3_left = 2 * w9 * (node_1_output * w7 + node_2_output * w8 + node_3_output * w9 + bias4 - true_value)
deriv_L_b3_right = activation_ReLu_partial_derivative(input1 * w3 + input2 * w6 + bias1) * 1
deriv_L_b3 = deriv_L_b3_left * deriv_L_b3_right
new_b3 = bias3 - LR * deriv_L_b3
new_b3

0.5025

<b>Everything should be okay now, let's compare the results</b>

In [863]:
print("ORIGINAL WEIGHTS AND BIASES")
print(f"w1: {w1}")
print(f"w2: {w2}")
print(f"w3: {w3}")
print(f"w4: {w4}")
print(f"w5: {w5}")
print(f"w6: {w6}")
print(f"w7: {w7}")
print(f"w8: {w8}")
print(f"w9: {w9}")
print(f"b1: {bias1}")
print(f"b2: {bias2}")
print(f"b3: {bias3}")
print(f"b4: {bias4}")

print("\n\n######################################")

print("NEW WEIGHTS AND BIASES")
print(f"w1: {new_w1}")
print(f"w2: {new_w2}")
print(f"w3: {new_w3}")
print(f"w4: {new_w4}")
print(f"w5: {new_w5}")
print(f"w6: {new_w6}")
print(f"w7: {new_w7}")
print(f"w8: {new_w8}")
print(f"w9: {new_w9}")
print(f"b1: {new_b1}")
print(f"b2: {new_b2}")
print(f"b3: {new_b3}")
print(f"b4: {new_b4}")

ORIGINAL WEIGHTS AND BIASES
w1: 1
w2: 0.5
w3: 1
w4: -0.5
w5: 1
w6: 1
w7: 0
w8: 1
w9: 0.5
b1: 0.5
b2: 0
b3: 0.5
b4: 0.5


######################################
NEW WEIGHTS AND BIASES
w1: 1.0
w2: 0.505
w3: 1.0
w4: -0.5
w5: 1.0
w6: 1.0
w7: 0.0075
w8: 1.0025
w9: 0.5075
b1: 0.5
b2: 0.005
b3: 0.5025
b4: 0.505


<b>Trying with new values and increased learning rate

In [864]:
# initialize weights and biases
# in Keras etc. these are usually randomized in the beginning
w1 = new_w1
w2 = new_w2
w3 = new_w3
w4 = new_w4
w5 = new_w5
w6 = new_w6
w7 = new_w7
w8 = new_w8
w9 = new_w9
bias1 = new_b1
bias2 = new_b2
bias3 = new_b3
bias4 = new_b4

# our training data
# x1 = input1, x2 = input2, y = true_value
input1 = 1
input2 = 0
true_value = 2

# our learning rate
LR = 0.05

<b>FORWARD PASS</b>

In [865]:
# NODE 1 OUTPUT
node_1_output = input1 * w1 + input2 * w4 + bias1
node_1_output = activation_ReLu(node_1_output)
node_1_output

1.5

In [866]:
# NODE 2 OUTPUT
node_2_output = input1 * w2 + input2 * w5 + bias2
node_2_output = activation_ReLu(node_2_output)
node_2_output

0.51

In [867]:
node_3_output = input1 * w3 + input2 * w6 + bias3
node_3_output = activation_ReLu(node_3_output)
node_3_output

1.5025

In [868]:
# NODE 3 OUTPUT
# we can just use Node 1 and 2 outputs, since they
# already contain the the previous weights
node_4_output = node_1_output * w7 + node_2_output * w8 + node_3_output * w9 + bias3
node_4_output = activation_ReLu(node_4_output)
node_4_output

1.7875437499999998

In [869]:
# compare predicted value with true value
print(f"Predicted: {node_4_output} --> True value: {true_value}")

Predicted: 1.7875437499999998 --> True value: 2


In [870]:
# LOSS FUNCTION - we are going to use MSE -> mean squared error
# MSE formula LOSS => (predicted_value - true_value) ^ 2
predicted_value = node_4_output
loss = (predicted_value - true_value) ** 2
loss

0.0451376581640626

<h3>BACKPROPAGATION - update the weights and biases while traversing the network BACKWARDS</h3>

In [871]:
# solving the partial derivative of the loss function with respect to weight 5
deriv_L_w7 = 2 * node_1_output * (node_1_output * w7 + node_2_output * w8 + node_3_output * w9 + bias4 - true_value)
deriv_L_w7

-0.6298687500000009

In [872]:
# this decreases the weight value a little bit
# this is basically our optimizer + learning rate
# this optimizer is known as gradient descent
new_w7 = w7 - LR * deriv_L_w7
new_w7

0.03899343750000005

In [873]:
# solving the partial derivative of the loss function with respect to weight 6
deriv_L_w8 = 2 * node_2_output * (node_1_output * w7 + node_2_output * w8 + node_3_output * w9 + bias4 - true_value)
deriv_L_w8

-0.2141553750000003

In [874]:
# calculate new value for weight 6
new_w8 = w8 - LR * deriv_L_w8
new_w8

1.01320776875

In [875]:
# solving the partial derivative of the loss function with respect to weight 6
deriv_L_w9 = 2 * node_3_output * (node_1_output * w7 + node_2_output * w8 + node_3_output * w9 + bias4 - true_value)
deriv_L_w9

-0.6309185312500009

In [876]:
# calculate new value for weight 6
new_w9 = w9 - LR * deriv_L_w9
new_w9

0.5390459265624999

In [877]:
# solving the partial derivative of the loss function with respect to bias3
# NOTE: the * 1 comes from derivating the bias, which is same as derivating x, 
# which result in 1
deriv_L_b4 = 2 * 1 * (node_1_output * w7 + node_2_output * w8 + node_3_output * w9 + bias4 - true_value)
deriv_L_b4

-0.4199125000000006

In [878]:
# update the bias 3 based on previous derivation
new_b4 = bias4 - LR * deriv_L_b4
new_b4

0.5259956250000001

<b>To access the first layer, we need to use chain rule, in order to calculate new values for w1-w4 and bias1/2</b>

In [879]:
# see materials for how we need to split this calculation into two parts
# here we solve the left and right sides separately

# left side is mostly the same as derivating w5 and w6
deriv_L_w1_left = 2 * w7* (node_1_output * w7 + node_2_output * w8 + node_3_output * w9 + bias4- true_value)

# right side: use ReLu derivation and remember to match correct weights with correct inputs and biases
# based on which weight are you are derivating
# COMPARE THE OTHER ORIGINAL PICTURE IN THE MATERIALS
# in the case of w1 => use w1 and and w3 inside the Relu-derivation, because
# these weights are connected to node 1 (which is connected to w1)
# also use bias1, since it's part of node 1
# finally, multiply all with input1, because it is connected to w1
deriv_L_w1_right = activation_ReLu_partial_derivative(input1 * w1 + input2 * w4 + bias1) * input1
deriv_L_w1 = deriv_L_w1_left * deriv_L_w1_right
new_w1 = w1 - LR * deriv_L_w1
new_w1

1.0001574671875

In [880]:
# use the same logic as above, but now from the point of view of w2
# notice how we use w6 and w2/w4 and bias2 in the equation of right side
deriv_L_w2_left = 2 * w8 * (node_1_output * w7 + node_2_output * w8 + node_3_output * w9 + bias4 - true_value)
deriv_L_w2_right = activation_ReLu_partial_derivative(input1 * w2 + input2 * w5 + bias2) * input1
deriv_L_w2 = deriv_L_w2_left * deriv_L_w2_right
new_w2 = w2 - LR * deriv_L_w2
new_w2

0.5260481140625001

In [881]:
# use the same formula again
deriv_L_w3_left = 2 * w9 * (node_1_output * w7 + node_2_output * w8 + node_3_output * w9 + bias4 - true_value)
deriv_L_w3_right = activation_ReLu_partial_derivative(input1 * w3 + input2 * w6 + bias1) * input2
deriv_L_w3 = deriv_L_w3_left * deriv_L_w3_right
new_w3 = w3 - LR * deriv_L_w3
new_w3

1.0

In [882]:
# use the same formula again
deriv_L_w4_left = 2 * w7 * (node_1_output * w7 + node_2_output * w8 + node_3_output * w9 + bias4 - true_value)
deriv_L_w4_right = activation_ReLu_partial_derivative(input1 * w1 + input2 * w4 + bias2) * input2
deriv_L_w4 = deriv_L_w4_left * deriv_L_w4_right
new_w4 = w4 - LR * deriv_L_w4
new_w4

-0.5

In [883]:
# use the same formula again
deriv_L_w5_left = 2 * w8 * (node_1_output * w7 + node_2_output * w8 + node_3_output * w9 + bias4 - true_value)
deriv_L_w5_right = activation_ReLu_partial_derivative(input1 * w2 + input2 * w5 + bias2) * input2
deriv_L_w5 = deriv_L_w5_left * deriv_L_w5_right
new_w5 = w5 - LR * deriv_L_w5
new_w5

1.0

In [884]:
# use the same formula again
deriv_L_w6_left = 2 * w9 * (node_1_output * w7 + node_2_output * w8 + node_3_output * w9 + bias4 - true_value)
deriv_L_w6_right = activation_ReLu_partial_derivative(input1 * w3 + input2 * w6 + bias2) * input2
deriv_L_w6 = deriv_L_w6_left * deriv_L_w6_right
new_w6 = w6 - LR * deriv_L_w6
new_w6

1.0

In [885]:
# otherwise the same formula, but we can multiply the right side function with just 1
# because it's a derivation of bias-term, which is the same as derivation of x, which results in 1
deriv_L_b1_left = 2 * w7 * (node_1_output * w7 + node_2_output * w8 + node_3_output * w9 + bias4 - true_value)
deriv_L_b1_right = activation_ReLu_partial_derivative(input1 * w1 + input2 * w4 + bias1) * 1
deriv_L_b1 = deriv_L_b1_left * deriv_L_b1_right
new_b1 = bias1 - LR * deriv_L_b1
new_b1

0.5001574671875

In [886]:
# similar derivation of bias 2
deriv_L_b2_left = 2 * w8 * (node_1_output * w7 + node_2_output * w8 + node_3_output * w9 + bias4 - true_value)
deriv_L_b2_right = activation_ReLu_partial_derivative(input1 * w2 + input2 * w5 + bias2) * 1
deriv_L_b2 = deriv_L_b2_left * deriv_L_b2_right
new_b2 = bias2 - LR * deriv_L_b2
new_b2

0.02604811406250003

In [887]:
# otherwise the same formula, but we can multiply the right side function with just 1
# because it's a derivation of bias-term, which is the same as derivation of x, which results in 1
deriv_L_b3_left = 2 * w9 * (node_1_output * w7 + node_2_output * w8 + node_3_output * w9 + bias4 - true_value)
deriv_L_b3_right = activation_ReLu_partial_derivative(input1 * w3 + input2 * w6 + bias1) * 1
deriv_L_b3 = deriv_L_b3_left * deriv_L_b3_right
new_b3 = bias3 - LR * deriv_L_b3
new_b3

0.5131552796875

<b>Everything should be okay now, let's compare the results</b>

In [888]:
print("ORIGINAL WEIGHTS AND BIASES")
print(f"w1: {w1}")
print(f"w2: {w2}")
print(f"w3: {w3}")
print(f"w4: {w4}")
print(f"w5: {w5}")
print(f"w6: {w6}")
print(f"w7: {w7}")
print(f"w8: {w8}")
print(f"w9: {w9}")
print(f"b1: {bias1}")
print(f"b2: {bias2}")
print(f"b3: {bias3}")
print(f"b4: {bias4}")

print("\n\n######################################")

print("NEW WEIGHTS AND BIASES")
print(f"w1: {new_w1}")
print(f"w2: {new_w2}")
print(f"w3: {new_w3}")
print(f"w4: {new_w4}")
print(f"w5: {new_w5}")
print(f"w6: {new_w6}")
print(f"w7: {new_w7}")
print(f"w8: {new_w8}")
print(f"w9: {new_w9}")
print(f"b1: {new_b1}")
print(f"b2: {new_b2}")
print(f"b3: {new_b3}")
print(f"b4: {new_b4}")

ORIGINAL WEIGHTS AND BIASES
w1: 1.0
w2: 0.505
w3: 1.0
w4: -0.5
w5: 1.0
w6: 1.0
w7: 0.0075
w8: 1.0025
w9: 0.5075
b1: 0.5
b2: 0.005
b3: 0.5025
b4: 0.505


######################################
NEW WEIGHTS AND BIASES
w1: 1.0001574671875
w2: 0.5260481140625001
w3: 1.0
w4: -0.5
w5: 1.0
w6: 1.0
w7: 0.03899343750000005
w8: 1.01320776875
w9: 0.5390459265624999
b1: 0.5001574671875
b2: 0.02604811406250003
b3: 0.5131552796875
b4: 0.5259956250000001


<b>Trying with new values

In [889]:
# initialize weights and biases
# in Keras etc. these are usually randomized in the beginning
w1 = new_w1
w2 = new_w2
w3 = new_w3
w4 = new_w4
w5 = new_w5
w6 = new_w6
w7 = new_w7
w8 = new_w8
w9 = new_w9
bias1 = new_b1
bias2 = new_b2
bias3 = new_b3
bias4 = new_b4

# our training data
# x1 = input1, x2 = input2, y = true_value
input1 = 1
input2 = 0
true_value = 2

# our learning rate
LR = 0.1

<b>FORWARD PASS</b>

In [890]:
# NODE 1 OUTPUT
node_1_output = input1 * w1 + input2 * w4 + bias1
node_1_output = activation_ReLu(node_1_output)
node_1_output

1.500314934375

In [891]:
# NODE 2 OUTPUT
node_2_output = input1 * w2 + input2 * w5 + bias2
node_2_output = activation_ReLu(node_2_output)
node_2_output

0.5520962281250001

In [892]:
node_3_output = input1 * w3 + input2 * w6 + bias3
node_3_output = activation_ReLu(node_3_output)
node_3_output

1.5131552796875

In [893]:
# NODE 3 OUTPUT
# we can just use Node 1 and 2 outputs, since they
# already contain the the previous weights
node_4_output = node_1_output * w7 + node_2_output * w8 + node_3_output * w9 + bias3
node_4_output = activation_ReLu(node_4_output)
node_4_output

1.9467060935172777

In [894]:
# compare predicted value with true value
print(f"Predicted: {node_4_output} --> True value: {true_value}")

Predicted: 1.9467060935172777 --> True value: 2


In [895]:
# LOSS FUNCTION - we are going to use MSE -> mean squared error
# MSE formula LOSS => (predicted_value - true_value) ^ 2
predicted_value = node_4_output
loss = (predicted_value - true_value) ** 2
loss

0.002840240468189145

<h3>BACKPROPAGATION - update the weights and biases while traversing the network BACKWARDS</h3>

In [896]:
# solving the partial derivative of the loss function with respect to weight 5
deriv_L_w7 = 2 * node_1_output * (node_1_output * w7 + node_2_output * w8 + node_3_output * w9 + bias4 - true_value)
deriv_L_w7

-0.12138616394467376

In [897]:
# this decreases the weight value a little bit
# this is basically our optimizer + learning rate
# this optimizer is known as gradient descent
new_w7 = w7 - LR * deriv_L_w7
new_w7

0.05113205389446743

In [898]:
# solving the partial derivative of the loss function with respect to weight 6
deriv_L_w8 = 2 * node_2_output * (node_1_output * w7 + node_2_output * w8 + node_3_output * w9 + bias4 - true_value)
deriv_L_w8

-0.044668517072607224

In [899]:
# calculate new value for weight 6
new_w8 = w8 - LR * deriv_L_w8
new_w8

1.0176746204572609

In [900]:
# solving the partial derivative of the loss function with respect to weight 6
deriv_L_w9 = 2 * node_3_output * (node_1_output * w7 + node_2_output * w8 + node_3_output * w9 + bias4 - true_value)
deriv_L_w9

-0.12242503933376575

In [901]:
# calculate new value for weight 6
new_w9 = w9 - LR * deriv_L_w9
new_w9

0.5512884304958765

In [902]:
# solving the partial derivative of the loss function with respect to bias3
# NOTE: the * 1 comes from derivating the bias, which is same as derivating x, 
# which result in 1
deriv_L_b4 = 2 * 1 * (node_1_output * w7 + node_2_output * w8 + node_3_output * w9 + bias4 - true_value)
deriv_L_b4

-0.08090712234044428

In [903]:
# update the bias 3 based on previous derivation
new_b4 = bias4 - LR * deriv_L_b4
new_b4

0.5340863372340445

<b>To access the first layer, we need to use chain rule, in order to calculate new values for w1-w4 and bias1/2</b>

In [904]:
# see materials for how we need to split this calculation into two parts
# here we solve the left and right sides separately

# left side is mostly the same as derivating w5 and w6
deriv_L_w1_left = 2 * w7* (node_1_output * w7 + node_2_output * w8 + node_3_output * w9 + bias4- true_value)

# right side: use ReLu derivation and remember to match correct weights with correct inputs and biases
# based on which weight are you are derivating
# COMPARE THE OTHER ORIGINAL PICTURE IN THE MATERIALS
# in the case of w1 => use w1 and and w3 inside the Relu-derivation, because
# these weights are connected to node 1 (which is connected to w1)
# also use bias1, since it's part of node 1
# finally, multiply all with input1, because it is connected to w1
deriv_L_w1_right = activation_ReLu_partial_derivative(input1 * w1 + input2 * w4 + bias1) * input1
deriv_L_w1 = deriv_L_w1_left * deriv_L_w1_right
new_w1 = w1 - LR * deriv_L_w1
new_w1

1.0004729518693287

In [905]:
# use the same logic as above, but now from the point of view of w2
# notice how we use w6 and w2/w4 and bias2 in the equation of right side
deriv_L_w2_left = 2 * w8 * (node_1_output * w7 + node_2_output * w8 + node_3_output * w9 + bias4 - true_value)
deriv_L_w2_right = activation_ReLu_partial_derivative(input1 * w2 + input2 * w5 + bias2) * input1
deriv_L_w2 = deriv_L_w2_left * deriv_L_w2_right
new_w2 = w2 - LR * deriv_L_w2
new_w2

0.5342456865527545

In [906]:
# use the same formula again
deriv_L_w3_left = 2 * w9 * (node_1_output * w7 + node_2_output * w8 + node_3_output * w9 + bias4 - true_value)
deriv_L_w3_right = activation_ReLu_partial_derivative(input1 * w3 + input2 * w6 + bias1) * input2
deriv_L_w3 = deriv_L_w3_left * deriv_L_w3_right
new_w3 = w3 - LR * deriv_L_w3
new_w3

1.0

In [907]:
# use the same formula again
deriv_L_w4_left = 2 * w7 * (node_1_output * w7 + node_2_output * w8 + node_3_output * w9 + bias4 - true_value)
deriv_L_w4_right = activation_ReLu_partial_derivative(input1 * w1 + input2 * w4 + bias2) * input2
deriv_L_w4 = deriv_L_w4_left * deriv_L_w4_right
new_w4 = w4 - LR * deriv_L_w4
new_w4

-0.5

In [908]:
# use the same formula again
deriv_L_w5_left = 2 * w8 * (node_1_output * w7 + node_2_output * w8 + node_3_output * w9 + bias4 - true_value)
deriv_L_w5_right = activation_ReLu_partial_derivative(input1 * w2 + input2 * w5 + bias2) * input2
deriv_L_w5 = deriv_L_w5_left * deriv_L_w5_right
new_w5 = w5 - LR * deriv_L_w5
new_w5

1.0

In [909]:
# use the same formula again
deriv_L_w6_left = 2 * w9 * (node_1_output * w7 + node_2_output * w8 + node_3_output * w9 + bias4 - true_value)
deriv_L_w6_right = activation_ReLu_partial_derivative(input1 * w3 + input2 * w6 + bias2) * input2
deriv_L_w6 = deriv_L_w6_left * deriv_L_w6_right
new_w6 = w6 - LR * deriv_L_w6
new_w6

1.0

In [910]:
# otherwise the same formula, but we can multiply the right side function with just 1
# because it's a derivation of bias-term, which is the same as derivation of x, which results in 1
deriv_L_b1_left = 2 * w7 * (node_1_output * w7 + node_2_output * w8 + node_3_output * w9 + bias4 - true_value)
deriv_L_b1_right = activation_ReLu_partial_derivative(input1 * w1 + input2 * w4 + bias1) * 1
deriv_L_b1 = deriv_L_b1_left * deriv_L_b1_right
new_b1 = bias1 - LR * deriv_L_b1
new_b1

0.5004729518693287

In [911]:
# similar derivation of bias 2
deriv_L_b2_left = 2 * w8 * (node_1_output * w7 + node_2_output * w8 + node_3_output * w9 + bias4 - true_value)
deriv_L_b2_right = activation_ReLu_partial_derivative(input1 * w2 + input2 * w5 + bias2) * 1
deriv_L_b2 = deriv_L_b2_left * deriv_L_b2_right
new_b2 = bias2 - LR * deriv_L_b2
new_b2

0.03424568655275451

In [912]:
# otherwise the same formula, but we can multiply the right side function with just 1
# because it's a derivation of bias-term, which is the same as derivation of x, which results in 1
deriv_L_b3_left = 2 * w9 * (node_1_output * w7 + node_2_output * w8 + node_3_output * w9 + bias4 - true_value)
deriv_L_b3_right = activation_ReLu_partial_derivative(input1 * w3 + input2 * w6 + bias1) * 1
deriv_L_b3 = deriv_L_b3_left * deriv_L_b3_right
new_b3 = bias3 - LR * deriv_L_b3
new_b3

0.517516545160251

<b>Everything should be okay now, let's compare the results</b>

In [913]:
print("ORIGINAL WEIGHTS AND BIASES")
print(f"w1: {w1}")
print(f"w2: {w2}")
print(f"w3: {w3}")
print(f"w4: {w4}")
print(f"w5: {w5}")
print(f"w6: {w6}")
print(f"w7: {w7}")
print(f"w8: {w8}")
print(f"w9: {w9}")
print(f"b1: {bias1}")
print(f"b2: {bias2}")
print(f"b3: {bias3}")
print(f"b4: {bias4}")

print("\n\n######################################")

print("NEW WEIGHTS AND BIASES")
print(f"w1: {new_w1}")
print(f"w2: {new_w2}")
print(f"w3: {new_w3}")
print(f"w4: {new_w4}")
print(f"w5: {new_w5}")
print(f"w6: {new_w6}")
print(f"w7: {new_w7}")
print(f"w8: {new_w8}")
print(f"w9: {new_w9}")
print(f"b1: {new_b1}")
print(f"b2: {new_b2}")
print(f"b3: {new_b3}")
print(f"b4: {new_b4}")

ORIGINAL WEIGHTS AND BIASES
w1: 1.0001574671875
w2: 0.5260481140625001
w3: 1.0
w4: -0.5
w5: 1.0
w6: 1.0
w7: 0.03899343750000005
w8: 1.01320776875
w9: 0.5390459265624999
b1: 0.5001574671875
b2: 0.02604811406250003
b3: 0.5131552796875
b4: 0.5259956250000001


######################################
NEW WEIGHTS AND BIASES
w1: 1.0004729518693287
w2: 0.5342456865527545
w3: 1.0
w4: -0.5
w5: 1.0
w6: 1.0
w7: 0.05113205389446743
w8: 1.0176746204572609
w9: 0.5512884304958765
b1: 0.5004729518693287
b2: 0.03424568655275451
b3: 0.517516545160251
b4: 0.5340863372340445


<b> Trying with new values

In [914]:
# initialize weights and biases
# in Keras etc. these are usually randomized in the beginning
w1 = new_w1
w2 = new_w2
w3 = new_w3
w4 = new_w4
w5 = new_w5
w6 = new_w6
w7 = new_w7
w8 = new_w8
w9 = new_w9
bias1 = new_b1
bias2 = new_b2
bias3 = new_b3
bias4 = new_b4

# our training data
# x1 = input1, x2 = input2, y = true_value
input1 = 1
input2 = 0
true_value = 2

# our learning rate
LR = 0.1

<b>FORWARD PASS</b>

In [915]:
# NODE 1 OUTPUT
node_1_output = input1 * w1 + input2 * w4 + bias1
node_1_output = activation_ReLu(node_1_output)
node_1_output

1.5009459037386574

In [916]:
# NODE 2 OUTPUT
node_2_output = input1 * w2 + input2 * w5 + bias2
node_2_output = activation_ReLu(node_2_output)
node_2_output

0.568491373105509

In [917]:
node_3_output = input1 * w3 + input2 * w6 + bias3
node_3_output = activation_ReLu(node_3_output)
node_3_output

1.517516545160251

In [918]:
# NODE 3 OUTPUT
# we can just use Node 1 and 2 outputs, since they
# already contain the the previous weights
node_4_output = node_1_output * w7 + node_2_output * w8 + node_3_output * w9 + bias3
node_4_output = activation_ReLu(node_4_output)
node_4_output

2.0093915487941922

In [919]:
# compare predicted value with true value
print(f"Predicted: {node_4_output} --> True value: {true_value}")

Predicted: 2.0093915487941922 --> True value: 2


In [920]:
# LOSS FUNCTION - we are going to use MSE -> mean squared error
# MSE formula LOSS => (predicted_value - true_value) ^ 2
predicted_value = node_4_output
loss = (predicted_value - true_value) ** 2
loss

8.820118875369371e-05

<h3>BACKPROPAGATION - update the weights and biases while traversing the network BACKWARDS</h3>

In [921]:
# solving the partial derivative of the loss function with respect to weight 5
deriv_L_w7 = 2 * node_1_output * (node_1_output * w7 + node_2_output * w8 + node_3_output * w9 + bias4 - true_value)
deriv_L_w7

0.07793313646273173

In [922]:
# this decreases the weight value a little bit
# this is basically our optimizer + learning rate
# this optimizer is known as gradient descent
new_w7 = w7 - LR * deriv_L_w7
new_w7

0.04333874024819426

In [923]:
# solving the partial derivative of the loss function with respect to weight 6
deriv_L_w8 = 2 * node_2_output * (node_1_output * w7 + node_2_output * w8 + node_3_output * w9 + bias4 - true_value)
deriv_L_w8

0.029517596635402505

In [924]:
# calculate new value for weight 6
new_w8 = w8 - LR * deriv_L_w8
new_w8

1.0147228607937206

In [925]:
# solving the partial derivative of the loss function with respect to weight 6
deriv_L_w9 = 2 * node_3_output * (node_1_output * w7 + node_2_output * w8 + node_3_output * w9 + bias4 - true_value)
deriv_L_w9

0.07879352860342603

In [926]:
# calculate new value for weight 6
new_w9 = w9 - LR * deriv_L_w9
new_w9

0.5434090776355339

In [927]:
# solving the partial derivative of the loss function with respect to bias3
# NOTE: the * 1 comes from derivating the bias, which is same as derivating x, 
# which result in 1
deriv_L_b4 = 2 * 1 * (node_1_output * w7 + node_2_output * w8 + node_3_output * w9 + bias4 - true_value)
deriv_L_b4

0.05192268173597103

In [928]:
# update the bias 3 based on previous derivation
new_b4 = bias4 - LR * deriv_L_b4
new_b4

0.5288940690604474

<b>To access the first layer, we need to use chain rule, in order to calculate new values for w1-w4 and bias1/2</b>

In [929]:
# see materials for how we need to split this calculation into two parts
# here we solve the left and right sides separately

# left side is mostly the same as derivating w5 and w6
deriv_L_w1_left = 2 * w7* (node_1_output * w7 + node_2_output * w8 + node_3_output * w9 + bias4- true_value)

# right side: use ReLu derivation and remember to match correct weights with correct inputs and biases
# based on which weight are you are derivating
# COMPARE THE OTHER ORIGINAL PICTURE IN THE MATERIALS
# in the case of w1 => use w1 and and w3 inside the Relu-derivation, because
# these weights are connected to node 1 (which is connected to w1)
# also use bias1, since it's part of node 1
# finally, multiply all with input1, because it is connected to w1
deriv_L_w1_right = activation_ReLu_partial_derivative(input1 * w1 + input2 * w4 + bias1) * input1
deriv_L_w1 = deriv_L_w1_left * deriv_L_w1_right
new_w1 = w1 - LR * deriv_L_w1
new_w1

1.0002074605332418

In [930]:
# use the same logic as above, but now from the point of view of w2
# notice how we use w6 and w2/w4 and bias2 in the equation of right side
deriv_L_w2_left = 2 * w8 * (node_1_output * w7 + node_2_output * w8 + node_3_output * w9 + bias4 - true_value)
deriv_L_w2_right = activation_ReLu_partial_derivative(input1 * w2 + input2 * w5 + bias2) * input1
deriv_L_w2 = deriv_L_w2_left * deriv_L_w2_right
new_w2 = w2 - LR * deriv_L_w2
new_w2

0.5289616470098768

In [931]:
# use the same formula again
deriv_L_w3_left = 2 * w9 * (node_1_output * w7 + node_2_output * w8 + node_3_output * w9 + bias4 - true_value)
deriv_L_w3_right = activation_ReLu_partial_derivative(input1 * w3 + input2 * w6 + bias1) * input2
deriv_L_w3 = deriv_L_w3_left * deriv_L_w3_right
new_w3 = w3 - LR * deriv_L_w3
new_w3

1.0

In [932]:
# use the same formula again
deriv_L_w4_left = 2 * w7 * (node_1_output * w7 + node_2_output * w8 + node_3_output * w9 + bias4 - true_value)
deriv_L_w4_right = activation_ReLu_partial_derivative(input1 * w1 + input2 * w4 + bias2) * input2
deriv_L_w4 = deriv_L_w4_left * deriv_L_w4_right
new_w4 = w4 - LR * deriv_L_w4
new_w4

-0.5

In [933]:
# use the same formula again
deriv_L_w5_left = 2 * w8 * (node_1_output * w7 + node_2_output * w8 + node_3_output * w9 + bias4 - true_value)
deriv_L_w5_right = activation_ReLu_partial_derivative(input1 * w2 + input2 * w5 + bias2) * input2
deriv_L_w5 = deriv_L_w5_left * deriv_L_w5_right
new_w5 = w5 - LR * deriv_L_w5
new_w5

1.0

In [934]:
# use the same formula again
deriv_L_w6_left = 2 * w9 * (node_1_output * w7 + node_2_output * w8 + node_3_output * w9 + bias4 - true_value)
deriv_L_w6_right = activation_ReLu_partial_derivative(input1 * w3 + input2 * w6 + bias2) * input2
deriv_L_w6 = deriv_L_w6_left * deriv_L_w6_right
new_w6 = w6 - LR * deriv_L_w6
new_w6

1.0

In [935]:
# otherwise the same formula, but we can multiply the right side function with just 1
# because it's a derivation of bias-term, which is the same as derivation of x, which results in 1
deriv_L_b1_left = 2 * w7 * (node_1_output * w7 + node_2_output * w8 + node_3_output * w9 + bias4 - true_value)
deriv_L_b1_right = activation_ReLu_partial_derivative(input1 * w1 + input2 * w4 + bias1) * 1
deriv_L_b1 = deriv_L_b1_left * deriv_L_b1_right
new_b1 = bias1 - LR * deriv_L_b1
new_b1

0.5002074605332418

In [936]:
# similar derivation of bias 2
deriv_L_b2_left = 2 * w8 * (node_1_output * w7 + node_2_output * w8 + node_3_output * w9 + bias4 - true_value)
deriv_L_b2_right = activation_ReLu_partial_derivative(input1 * w2 + input2 * w5 + bias2) * 1
deriv_L_b2 = deriv_L_b2_left * deriv_L_b2_right
new_b2 = bias2 - LR * deriv_L_b2
new_b2

0.028961647009876766

In [937]:
# otherwise the same formula, but we can multiply the right side function with just 1
# because it's a derivation of bias-term, which is the same as derivation of x, which results in 1
deriv_L_b3_left = 2 * w9 * (node_1_output * w7 + node_2_output * w8 + node_3_output * w9 + bias4 - true_value)
deriv_L_b3_right = activation_ReLu_partial_derivative(input1 * w3 + input2 * w6 + bias1) * 1
deriv_L_b3 = deriv_L_b3_left * deriv_L_b3_right
new_b3 = bias3 - LR * deriv_L_b3
new_b3

0.514654107788115

<b>Everything should be okay now, let's compare the results</b>

In [938]:
print("ORIGINAL WEIGHTS AND BIASES")
print(f"w1: {w1}")
print(f"w2: {w2}")
print(f"w3: {w3}")
print(f"w4: {w4}")
print(f"w5: {w5}")
print(f"w6: {w6}")
print(f"w7: {w7}")
print(f"w8: {w8}")
print(f"w9: {w9}")
print(f"b1: {bias1}")
print(f"b2: {bias2}")
print(f"b3: {bias3}")
print(f"b4: {bias4}")

print("\n\n######################################")

print("NEW WEIGHTS AND BIASES")
print(f"w1: {new_w1}")
print(f"w2: {new_w2}")
print(f"w3: {new_w3}")
print(f"w4: {new_w4}")
print(f"w5: {new_w5}")
print(f"w6: {new_w6}")
print(f"w7: {new_w7}")
print(f"w8: {new_w8}")
print(f"w9: {new_w9}")
print(f"b1: {new_b1}")
print(f"b2: {new_b2}")
print(f"b3: {new_b3}")
print(f"b4: {new_b4}")

ORIGINAL WEIGHTS AND BIASES
w1: 1.0004729518693287
w2: 0.5342456865527545
w3: 1.0
w4: -0.5
w5: 1.0
w6: 1.0
w7: 0.05113205389446743
w8: 1.0176746204572609
w9: 0.5512884304958765
b1: 0.5004729518693287
b2: 0.03424568655275451
b3: 0.517516545160251
b4: 0.5340863372340445


######################################
NEW WEIGHTS AND BIASES
w1: 1.0002074605332418
w2: 0.5289616470098768
w3: 1.0
w4: -0.5
w5: 1.0
w6: 1.0
w7: 0.04333874024819426
w8: 1.0147228607937206
w9: 0.5434090776355339
b1: 0.5002074605332418
b2: 0.028961647009876766
b3: 0.514654107788115
b4: 0.5288940690604474
