In [2]:
import matplotlib.pyplot as plt
import numpy as np

from sklearn.datasets import make_gaussian_quantiles

In [3]:
# Toy dataset: five samples with three features each.
samples = np.array(
    [[1, 2, 3], [6, 7, 8], [7, 8, 9], [3, 4, 5], [4, 5, 6]]
)

# One boolean class label per row of `samples`.
targets = np.array([False, True, True, False, False])

# X is the feature matrix; it is the same array object as `samples`, not a copy.
X = samples

In [4]:
# np.unique returns the sorted distinct labels; with return_inverse=True it
# also returns, for every entry of `targets`, the position of its label
# inside `uniques`. Those positions serve as integer class ids below.
uniques, indices = np.unique(targets, return_inverse=True)
print(
    f"Original array : {targets}",
    f"Unique array : {uniques}",
    f"Indices : {indices}",
    sep="\n",
)

Original array : [False  True  True False False]
Unique array : [False  True]
Indices : [0 1 1 0 0]


In [5]:
# Allocate the one-hot target matrix: one row per sample, one column per
# class, initially all zeros (filled in a later cell).
n_samples = len(targets)
n_classes = uniques.size
y = np.zeros(shape=(n_samples, n_classes))
print(y)


[[0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]]


In [6]:
# Row positions (0..n_samples-1) and the class id of each sample; paired
# element-wise they address exactly one cell per row of `y`.
print(np.arange(n_samples), indices, sep="\n")

[0 1 2 3 4]
[0 1 1 0 0]


In [7]:
# Integer-array indexing: for every row i, set column indices[i] to 1.
# This writes into the existing `y` array in place.
y[np.arange(n_samples), indices] = 1.0
print(f"{targets} \n")
print("one hot encoding targets:", y, sep="\n")

[False  True  True False False] 

one hot encoding targets:
[[1. 0.]
 [0. 1.]
 [0. 1.]
 [1. 0.]
 [1. 0.]]


In [8]:
# Dataset dimensions: rows are samples, the last axis holds the features.
print(X.shape)
n_features = X.shape[-1]

print(
    f"Dataset size : {n_samples}",
    f"Features size : {n_features}",
    sep="\n",
)

(5, 3)
Dataset size : 5
Features size : 3


In [9]:
# Number of units in the hidden layer; it sets the number of columns of the
# hidden weight matrix Wh and the number of rows of the output weight matrix Wo.
n_hidden_units = 4

In [10]:
# Fix the global NumPy seed so the random initial weights are reproducible.
np.random.seed(10)

# Hidden layer parameters: weights drawn uniformly from [-0.5, 0.5) with one
# row per input feature and one column per hidden unit; biases start at zero.
Wh = np.random.uniform(-0.5, 0.5, (n_features, n_hidden_units))
bh = np.zeros(shape=(1, n_hidden_units))

In [11]:
# Show the randomly initialised hidden-layer weight matrix.
print(Wh)

[[ 0.27132064 -0.47924805  0.13364823  0.24880388]
 [-0.00149299 -0.27520335 -0.30193714  0.26053071]
 [-0.33088916 -0.41166019  0.18535982  0.45339335]]


In [12]:
# Shapes involved in the hidden-layer product X . Wh + bh.
print(
    f"input shape: {X.shape}",
    f"hidden weights shape : {Wh.shape}",
    f"hidden biases shape: {bh.shape}",
    sep="\n",
)

input shape: (5, 3)
hidden weights shape : (3, 4)
hidden biases shape: (1, 4)


In [13]:
# Wh has one column per hidden unit, so column 0 holds the weights that
# connect every input feature to the first hidden unit.
# This cell is for explanation only; it does not modify Wh.
print(Wh, '\n')
print("Weights of the first hidden unit: ")
# reshape(-1, 1) displays the column vertically and lets NumPy infer the row
# count, so the cell keeps working if the number of features changes.
print(Wh[:, 0].reshape(-1, 1))

[[ 0.27132064 -0.47924805  0.13364823  0.24880388]
 [-0.00149299 -0.27520335 -0.30193714  0.26053071]
 [-0.33088916 -0.41166019  0.18535982  0.45339335]] 

Weights of the first hidden unit: 
[[ 0.27132064]
 [-0.00149299]
 [-0.33088916]]


In [14]:
# Hidden-layer pre-activation: one row per sample, one column per hidden unit.
# bh has a single row and is broadcast across all samples.
h1 = X.dot(Wh) + bh
print(h1.shape, h1, sep="\n")

(5, 4)
[[-0.72433282 -2.26463532  0.08585342  2.13004535]
 [-1.02964036 -8.09519327  0.17120801  6.94368505]
 [-1.09070187 -9.26130486  0.18827893  7.90641299]
 [-0.84645584 -4.5968585   0.11999526  4.05550123]
 [-0.90751735 -5.76297009  0.13706617  5.01822917]]


In [15]:
# ReLU activation: negative pre-activations become 0, positive ones pass
# through unchanged. a1 is the output of the hidden layer.
a1 = np.maximum(0, h1)
print("before  ReLU (h1) :", f"{h1} \n", sep="\n")
print("After ReLU (a1): ", a1, sep="\n")

before  ReLU (h1) :
[[-0.72433282 -2.26463532  0.08585342  2.13004535]
 [-1.02964036 -8.09519327  0.17120801  6.94368505]
 [-1.09070187 -9.26130486  0.18827893  7.90641299]
 [-0.84645584 -4.5968585   0.11999526  4.05550123]
 [-0.90751735 -5.76297009  0.13706617  5.01822917]] 

After ReLU (a1): 
[[0.         0.         0.08585342 2.13004535]
 [0.         0.         0.17120801 6.94368505]
 [0.         0.         0.18827893 7.90641299]
 [0.         0.         0.11999526 4.05550123]
 [0.         0.         0.13706617 5.01822917]]


In [16]:
# Re-seed the global NumPy generator so the output-layer weights are
# reproducible as well.
np.random.seed(100)

# Output layer parameters: weights drawn uniformly from [-0.5, 0.5) with one
# row per hidden unit and one column per class; biases start at zero.
Wo = np.random.uniform(-0.5, 0.5, (n_hidden_units, n_classes))
bo = np.zeros(shape=(1, n_classes))


In [17]:
# Show the randomly initialised output-layer weight matrix.
print(Wo)

[[ 0.04340494 -0.22163061]
 [-0.07548241  0.34477613]
 [-0.49528114 -0.37843088]
 [ 0.17074908  0.32585276]]


In [18]:
# Shapes involved in the output-layer product a1 . Wo + bo.
print(
    f"Hidden layer output shape: {a1.shape}",
    f"Output weights shape: {Wo.shape}",
    f"Output biases shape: {bo.shape}",
    sep="\n",
)

Hidden layer output shape: (5, 4)
Output weights shape: (4, 2)
Output biases shape: (1, 2)


In [19]:
# Output-layer pre-activation (logits): one row per sample, one column per
# class. bo has a single row and is broadcast across all samples.
h2 = a1.dot(Wo) + bo
print(h2.shape, h2, sep="\n")

(5, 2)
[[0.32118171 0.66159156]
 [1.10083177 2.19782851]
 [1.25676178 2.5050759 ]
 [0.63304174 1.27608634]
 [0.78897175 1.58333373]]


In [20]:
# Softmax, step 1 (naive version): the numerators are the element-wise
# exponentials of the logits. A later cell recomputes e_x after subtracting
# each row's maximum.
e_x = np.exp(h2)
print(e_x)

[[ 1.3787561   1.93787412]
 [ 3.00666583  9.005437  ]
 [ 3.51402386 12.24448824]
 [ 1.88333047  3.5825912 ]
 [ 2.20113194  4.87116792]]


In [52]:
# Why the naive softmax is risky: exp grows so quickly that a large enough
# input overflows to inf (see the recorded output for exp(1000)).
print(*(np.exp(power) for power in (10, 100, 1000)), sep="\n")

22026.465794806718
2.6881171418161356e+43
inf


  print(np.exp(1000))


In [22]:
# Largest logit over the whole matrix (a single scalar, not one per row).
h2.max()

np.float64(2.5050758959413426)

In [23]:
# axis=1 reduces across the columns, giving one maximum per sample (row).
print(f"{h2} \n")
print("Maximum value from each row: ", h2.max(axis=1), sep="\n")

[[0.32118171 0.66159156]
 [1.10083177 2.19782851]
 [1.25676178 2.5050759 ]
 [0.63304174 1.27608634]
 [0.78897175 1.58333373]] 

Maximum value from each row: 
[0.66159156 2.19782851 2.5050759  1.27608634 1.58333373]


In [24]:
# Demonstration of a broadcasting pitfall: np.max(h2, axis=1) drops the
# reduced axis and returns a 1-D array with one value per row, which cannot
# be broadcast against the 2-D h2 here. The expected ValueError is caught and
# printed so the notebook still runs from top to bottom.
try:
    np.exp(h2 - np.max(h2, axis=1))
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: operands could not be broadcast together with shapes (5,2) (5,) 

In [27]:
# keepdims=True keeps the reduced axis with length 1, so the result is a
# column with one entry per row and can be broadcast against h2.
h2.max(axis=1, keepdims=True)

array([[0.66159156],
       [2.19782851],
       [2.5050759 ],
       [1.27608634],
       [1.58333373]])

In [28]:
# Numerically stable softmax numerators: subtract each row's maximum before
# exponentiating, so the largest exponent in every row is exp(0) = 1.
e_x = np.exp(h2 - h2.max(axis=1, keepdims=True))
print(e_x)

[[0.71147867 1.        ]
 [0.33387229 1.        ]
 [0.28698822 1.        ]
 [0.52568947 1.        ]
 [0.45186944 1.        ]]


In [29]:
# Softmax denominators: the sum of the numerators in each row, kept as a
# column so it can divide e_x row by row.
e_x.sum(axis=1, keepdims=True)

array([[1.71147867],
       [1.33387229],
       [1.28698822],
       [1.52568947],
       [1.45186944]])

In [30]:
# Softmax: divide every numerator by its row sum, so each row of y_hat sums
# to 1 and can be read as class probabilities.
y_hat = e_x / e_x.sum(axis=1, keepdims=True)
y_hat

array([[0.41570992, 0.58429008],
       [0.25030304, 0.74969696],
       [0.22299211, 0.77700789],
       [0.34455863, 0.65544137],
       [0.31123284, 0.68876716]])

In [31]:
# Inputs of the categorical cross-entropy (CCE) loss, full version:
# the one-hot targets y and the predicted probabilities y_hat.
print(y, y_hat, sep="\n")
  

[[1. 0.]
 [0. 1.]
 [0. 1.]
 [1. 0.]
 [1. 0.]]
[[0.41570992 0.58429008]
 [0.25030304 0.74969696]
 [0.22299211 0.77700789]
 [0.34455863 0.65544137]
 [0.31123284 0.68876716]]


In [32]:
# Per-sample cross entropy: multiply the one-hot targets element-wise with
# -log(y_hat), then sum over the class axis. Only the true class contributes.
(y * -np.log(y_hat)).sum(axis=1)

array([0.87776756, 0.2880862 , 0.25230477, 1.06549102, 1.16721398])

In [33]:
# Clip the probabilities into [eps, 1 - eps], where eps is the float machine
# epsilon, so that a probability of exactly 0 can never reach np.log
# (log(0) would be -inf).
y_hat_clipped = y_hat.clip(
    np.finfo(float).eps,
    1 - np.finfo(float).eps,
)
print(y_hat_clipped)

[[0.41570992 0.58429008]
 [0.25030304 0.74969696]
 [0.22299211 0.77700789]
 [0.34455863 0.65544137]
 [0.31123284 0.68876716]]


In [34]:
# Per-sample negative log-likelihood, computed from the clipped probabilities.
neg_logs = (y * -np.log(y_hat_clipped)).sum(axis=1)
neg_logs

array([0.87776756, 0.2880862 , 0.25230477, 1.06549102, 1.16721398])

In [35]:
# The CCE loss is the mean of the per-sample values.
# This completes the forward pass.
cce_loss = neg_logs.mean()
print(f"The loss after this forward pass is : {cce_loss}")

The loss after this forward pass is : 0.7301727079439226


In [36]:
# Targets and (clipped) predictions side by side, ahead of the backward pass.
print("y", f"{y} \n", "y-hat", y_hat_clipped, sep="\n")

y
[[1. 0.]
 [0. 1.]
 [0. 1.]
 [1. 0.]
 [1. 0.]] 

y-hat
[[0.41570992 0.58429008]
 [0.25030304 0.74969696]
 [0.22299211 0.77700789]
 [0.34455863 0.65544137]
 [0.31123284 0.68876716]]


In [37]:
# Gradient of the loss with respect to the logits h2: (y_hat - y).
# Dividing by n_samples matches the mean taken in the loss, so the gradient
# magnitude does not grow with the number of samples.
dloss_dh2 = np.subtract(y_hat, y) / n_samples
print(dloss_dh2)

[[-0.11685802  0.11685802]
 [ 0.05006061 -0.05006061]
 [ 0.04459842 -0.04459842]
 [-0.13108827  0.13108827]
 [-0.13775343  0.13775343]]


In [38]:
# h2 = a1 . Wo + bo, so the derivative of h2 with respect to Wo is the
# hidden-layer output a1.
dh2_dWo = a1
print(dh2_dWo)

[[0.         0.         0.08585342 2.13004535]
 [0.         0.         0.17120801 6.94368505]
 [0.         0.         0.18827893 7.90641299]
 [0.         0.         0.11999526 4.05550123]
 [0.         0.         0.13706617 5.01822917]]


In [39]:
# Next the two factors are multiplied; the gradient must end up with the same
# shape as Wo, so check that shape first.
print(f"Wo: {Wo.shape}")

Wo: (4, 2)


In [40]:
# Shapes of the two factors; as they stand they cannot be matrix-multiplied
# into something shaped like Wo.
print(
    f"dh2_dwo: {dh2_dWo.shape}",
    f"dloss_dh2: {dloss_dh2.shape}",
    sep="\n",
)

dh2_dwo: (5, 4)
dloss_dh2: (5, 2)


In [41]:
# Transposing the first factor makes the inner dimensions agree, and the
# product then has the same shape as Wo.
print(f'{dh2_dWo.T.shape} * {dloss_dh2.shape}') # shapes after taking the transpose

(4, 5) * (5, 2)


In [None]:
# Hidden-layer output, shown again for reference before the gradient product.
print(a1)

[[0.         0.         0.08585342 2.13004535]
 [0.         0.         0.17120801 6.94368505]
 [0.         0.         0.18827893 7.90641299]
 [0.         0.         0.11999526 4.05550123]
 [0.         0.         0.13706617 5.01822917]]


In [42]:
# Gradient of the loss with respect to the logits, shown again for reference.
print(dloss_dh2)

[[-0.11685802  0.11685802]
 [ 0.05006061 -0.05006061]
 [ 0.04459842 -0.04459842]
 [-0.13108827  0.13108827]
 [-0.13775343  0.13775343]]


In [43]:
# Re-bind dh2_dWo to the transpose of a1 so it can be the left factor of the
# product with dloss_dh2. Note that this replaces the untransposed value
# assigned in an earlier cell.
dh2_dWo = np.transpose(a1)
print(dh2_dWo)

[[0.         0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.        ]
 [0.08585342 0.17120801 0.18827893 0.11999526 0.13706617]
 [2.13004535 6.94368505 7.90641299 4.05550123 5.01822917]]


In [44]:
# Chain rule: dL/dWo = (dh2/dWo) . (dL/dh2), which is a1 transposed times
# the logit gradient. The result has the same shape as Wo.
dloss_dWo = dh2_dWo.dot(dloss_dh2)
print("The gradient for the outptut weights (Wo) :", dloss_dWo, sep="\n")

The gradient for the outptut weights (Wo) :
[[ 0.          0.        ]
 [ 0.          0.        ]
 [-0.02767625  0.02767625]
 [-0.77160119  0.77160119]]


In [45]:
# Gradient of the output biases: bo is added to every sample's logits, so its
# gradient is the logit gradient summed over the sample axis. keepdims=True
# keeps a single row, matching the shape of bo.
dloss_dbo = dloss_dh2.sum(axis=0, keepdims=True)
print("The gradient for the output biases (bo):", dloss_dbo, sep="\n")

The gradient for the output biases (bo):
[[-0.29104069  0.29104069]]


In [46]:
 #now we will calculate the gradient of the hidden weights and biases
# h2 = a1 . Wo + bo, so the derivative of h2 with respect to a1 is Wo; it is
# stored transposed so it can right-multiply dloss_dh2.
dh2_da1 = np.transpose(Wo)
print(dh2_da1.shape)

(2, 4)


In [47]:
# Inner dimensions of the two factors must agree for the product below.
print(dloss_dh2.shape, Wo.T.shape, sep="\n")


(5, 2)
(2, 4)


In [48]:
# Output-layer weights, shown again for reference: one row per hidden unit,
# one column per class.
print(Wo)

[[ 0.04340494 -0.22163061]
 [-0.07548241  0.34477613]
 [-0.49528114 -0.37843088]
 [ 0.17074908  0.32585276]]


In [49]:
# Row 0 of Wo holds the weights leaving the first hidden unit, one per class.
print("Weights between first hidden unit and each output unit:", Wo[0, :], sep="\n")

Weights between first hidden unit and each output unit:
[ 0.04340494 -0.22163061]


In [50]:
# Logit gradient, shown again before it is propagated back to the hidden layer.
print(dloss_dh2)

[[-0.11685802  0.11685802]
 [ 0.05006061 -0.05006061]
 [ 0.04459842 -0.04459842]
 [-0.13108827  0.13108827]
 [-0.13775343  0.13775343]]


In [51]:
# Chain rule: dL/da1 = (dL/dh2) . Wo^T. The result has one row per sample and
# one column per hidden unit, the same shape as a1.
dloss_da1 = dloss_dh2.dot(dh2_da1)
print(dloss_da1.shape, dloss_da1, sep="\n")

(5, 4)
[[-0.03097153  0.04911058  0.01365489  0.01812511]
 [ 0.01326784 -0.0210384  -0.0058496  -0.00776458]
 [ 0.01182017 -0.01874287 -0.00521134 -0.00691738]
 [-0.03474305  0.05509097  0.0153177   0.02033227]
 [-0.03650956  0.05789206  0.01609653  0.02136606]]


In [54]:
# Derivative of ReLU: 1 where the pre-activation is strictly positive and
# 0 everywhere else, stored as float32 with the same shape as h1.
da1_dh1 = (h1 > 0).astype(np.float32)
print(f"{h1} \n")
print(da1_dh1)

[[-0.72433282 -2.26463532  0.08585342  2.13004535]
 [-1.02964036 -8.09519327  0.17120801  6.94368505]
 [-1.09070187 -9.26130486  0.18827893  7.90641299]
 [-0.84645584 -4.5968585   0.11999526  4.05550123]
 [-0.90751735 -5.76297009  0.13706617  5.01822917]] 

[[0. 0. 1. 1.]
 [0. 0. 1. 1.]
 [0. 0. 1. 1.]
 [0. 0. 1. 1.]
 [0. 0. 1. 1.]]


In [55]:
# Both factors have identical shapes, so they combine element-wise.
print(
    f"dloss_da1: {dloss_da1.shape}",
    f"da1_dh1: {da1_dh1.shape}",
    sep="\n",
)

dloss_da1: (5, 4)
da1_dh1: (5, 4)


In [56]:
# Chain rule through ReLU: an element-wise product that zeroes the gradient
# wherever the hidden unit's pre-activation was not positive.
dloss_dh1 = np.multiply(da1_dh1, dloss_da1)
print(dloss_dh1)

[[-0.          0.          0.01365489  0.01812511]
 [ 0.         -0.         -0.0058496  -0.00776458]
 [ 0.         -0.         -0.00521134 -0.00691738]
 [-0.          0.          0.0153177   0.02033227]
 [-0.          0.          0.01609653  0.02136606]]


In [None]:
# h1 = X . Wh + bh, so the derivative of h1 with respect to the hidden
# weights is X; it is transposed so it can left-multiply dloss_dh1.
# NOTE: despite the `_dWo` suffix, this is the derivative with respect to
# Wh (the hidden weights); the name is kept because the next cell reads it.
dh1_dWo = X.T
# Print the shape so the cell produces the output recorded beneath it.
print(dh1_dWo.shape)


(3, 5)


In [None]:
# Chain rule: dL/dWh = X^T . (dL/dh1). `dh1_dWo` holds X.T (the derivative of
# h1 with respect to the hidden weights, despite its suffix).
dloss_dWh = np.dot(dh1_dWo, dloss_dh1)
# Print the shape so the cell produces the output recorded beneath it.
print(dloss_dWh.shape)


(3, 4)


In [63]:
# Sanity check: the gradient must have the same shape as the weights it
# updates. The second label names the array whose shape is actually printed.
print(f"Hidden weights: {Wh.shape}")
print(f"dloss_dWh: {dloss_dWh.shape}")

Hidden weights: (3, 4)
dh1_dWo: (3, 4)


In [66]:
# Gradient of the hidden biases: dloss_dh1 summed over the sample axis, kept
# as a single row so it matches the shape of bh.
dloss_dbh = dloss_dh1.sum(axis=0, keepdims=True)
dloss_dbh

array([[0.        , 0.        , 0.03400818, 0.04514148]])

In [None]:
# Learning rate: the step size of the gradient-descent update.
lr = 0.01

# One gradient-descent step on every parameter. The results are stored under
# new_* names, so the original parameters remain available for comparison.
new_Wo, new_bo = Wo - lr * dloss_dWo, bo - lr * dloss_dbo  # output layer
new_Wh, new_bh = Wh - lr * dloss_dWh, bh - lr * dloss_dbh  # hidden layer

# This completes our backward pass

In [None]:
# Repeat the forward pass with the updated parameters to check whether the
# loss went down. This re-binds h1, a1, h2, e_x, y_hat, y_hat_clipped and
# neg_logs from the first pass.
h1 = X.dot(new_Wh) + new_bh
a1 = np.maximum(0, h1)
h2 = a1.dot(new_Wo) + new_bo

# Softmax, with each row shifted by its maximum before exponentiating
e_x = np.exp(h2 - h2.max(axis=1, keepdims=True))
y_hat = e_x / e_x.sum(axis=1, keepdims=True)
y_hat_clipped = y_hat.clip(np.finfo(float).eps, 1 - np.finfo(float).eps)

# Cross entropy: per-sample negative log-likelihood, then the mean
neg_logs = (y * -np.log(y_hat_clipped)).sum(axis=1)
new_cce_loss = neg_logs.mean()

print(f'New loss: {new_cce_loss}', f'Previous loss: {cce_loss}', sep="\n")

# In the recorded run the new loss is lower than the previous one.

New loss: 0.7168031908508434
Previous loss: 0.7301727079439226


In [None]:
# But this manual approach is lengthy, confusing, and does not scale to large models,
# so in any big project we would use a framework for a faster implementation.