In [4]:
import numpy as np
from typing import Callable

Coding the fundamentals of DL - linear algebra and calculus - from scratch. \
Consider an input matrix $X$ of shape [3,3] and a weight matrix $W$ of shape [3,2]. The resultant matrix $N = XW$ of shape [3,2] is passed through an activation function $\sigma$: \
$S = \sigma (N)$. The result is passed through a summation function $\lambda$ which gives a scalar quantity $L$. \
The objective is to code the backward pass, computing the gradients $\frac{\partial{L}}{\partial{X}}$ and $\frac{\partial{L}}{\partial{W}}$.

In [5]:

def deriv(func : Callable[[np.ndarray], np.ndarray],
          input_ : np.ndarray,
          delta : float = 1e-3) -> np.ndarray:
    """Estimate the derivative of ``func`` at ``input_`` with a central difference.

    ``func`` is evaluated at ``input_ + delta`` and at ``input_ - delta``; the
    difference of the two results is divided by ``2 * delta``.

    Args:
        func: function to differentiate.
        input_: point(s) at which the derivative is estimated; ``delta`` is
            added to / subtracted from it directly.
        delta: half-width of the finite-difference stencil.

    Returns:
        ``(func(input_ + delta) - func(input_ - delta)) / (2 * delta)``.
    """
    value_ahead = func(input_ + delta)
    value_behind = func(input_ - delta)
    stencil_width = 2 * delta
    return (value_ahead - value_behind) / stencil_width

In [2]:
#forward pass
def matrix_forward_sum(X : np.ndarray, W : np.ndarray, sigma ) -> np.ndarray:
    """Forward pass: multiply ``X`` by ``W``, apply ``sigma`` and sum everything.

    Args:
        X: left operand of the matrix product.
        W: right operand of the matrix product; ``W.shape[0]`` must equal
            ``X.shape[1]``.
        sigma: activation function applied to the product ``np.dot(X, W)``.

    Returns:
        ``np.sum(sigma(np.dot(X, W)))`` - the sum over all entries of the
        activated product.

    Raises:
        AssertionError: if ``X.shape[1] != W.shape[0]``.
    """
    cols_of_X, rows_of_W = X.shape[1], W.shape[0]
    assert cols_of_X == rows_of_W # matrix multiplication shape requirement
    return np.sum(sigma(np.dot(X, W)))

Doing some math, we see that :
 $\frac{\partial{L}}{\partial{X}} = \left(\frac{\partial{L}}{\partial{\sigma}} \odot \frac{\partial{\sigma}}{\partial{N}}\right) \times W^{T}$, where $\odot$ is the elementwise product \
 This does not mean that $\frac{\partial{N}}{\partial{X}} = W^{T}$ on its own; rather, when the chain rule is carried through this activation and the matrix multiplication, the expression for the gradient simplifies to a right-multiplication by $W^{T}$. \
 Similarly we note that :
 $\frac{\partial{L}}{\partial{W}} = X^{T} \times \left(\frac{\partial{L}}{\partial{\sigma}} \odot \frac{\partial{\sigma}}{\partial{N}}\right)$, where $\odot$ is the elementwise product

In [6]:
def matrix_backward_sum(X : np.ndarray, W : np.ndarray, sigma) -> np.ndarray:
    """Gradient of ``L = np.sum(sigma(np.dot(X, W)))`` with respect to ``X``.

    The forward intermediates ``N = np.dot(X, W)`` and ``S = sigma(N)`` are
    recomputed, then the chain rule is applied step by step. The derivative
    of ``sigma`` is estimated numerically with ``deriv``.

    Args:
        X: left operand of the matrix product.
        W: right operand of the matrix product; ``W.shape[0]`` must equal
            ``X.shape[1]``.
        sigma: activation function applied to ``N``.

    Returns:
        ``np.dot(dL_dN, W^T)`` where ``dL_dN = ones_like(S) * deriv(sigma, N)``.

    Raises:
        AssertionError: if ``X.shape[1] != W.shape[0]``.
    """
    assert X.shape[1] == W.shape[0] # matrix multiplication shape requirement

    # forward pass: only the intermediates the backward pass needs
    # (the scalar L itself is not required to compute its gradient)
    N = np.dot(X, W)
    S = sigma(N)

    # backward pass
    dL_dS = np.ones_like(S)          # L is a plain sum of S, so each entry has gradient 1
    dS_dN = deriv(sigma, N)          # numerical derivative of the activation at N
    dL_dN = dL_dS * dS_dN            # elementwise chain rule through sigma
    dN_dX = np.transpose(W, (1, 0))  # chaining through N = X.W reduces to multiplying by W^T
    dL_dX = np.dot(dL_dN, dN_dX)
    return dL_dX

In [7]:
def sigmoid(X : np.ndarray) -> np.ndarray:
    """Elementwise logistic function ``1 / (1 + exp(-X))``, evaluated without overflow.

    The naive formula calls ``exp(-X)``, which overflows (with a
    RuntimeWarning) for large negative ``X``. Here only ``exp(-|X|)`` is
    computed, which always lies in ``[0, 1]``.

    Args:
        X: input value(s).

    Returns:
        The logistic function of ``X``, with the same shape as ``X``.
    """
    x = np.asarray(X)
    # exp of a non-positive number can never overflow.
    z = np.exp(-np.abs(x))
    # x >= 0:  1 / (1 + exp(-x))
    # x <  0:  exp(x) / (1 + exp(x))   (the same function, rewritten)
    out = np.where(x >= 0, 1 / (1 + z), z / (1 + z))
    # np.where yields a 0-d array for scalar input; [()] unwraps that to a
    # scalar and returns ordinary arrays unchanged.
    return out[()]

In [8]:
# Sanity check of the forward and backward pass on a small random example.
np.random.seed(42)  # fixed seed so the printed numbers are reproducible
X, W = np.random.randn(3, 3), np.random.randn(3, 2)  # X is drawn first, then W
print("X = ", X)
print("W = ", W)

# scalar loss produced by the forward pass
print("L = ", matrix_forward_sum(X, W, sigmoid))

# gradient of that loss with respect to every entry of X
print("dL_dX = ", matrix_backward_sum(X, W, sigmoid))




X =  [[ 0.49671415 -0.1382643   0.64768854]
 [ 1.52302986 -0.23415337 -0.23413696]
 [ 1.57921282  0.76743473 -0.46947439]]
W =  [[ 0.54256004 -0.46341769]
 [-0.46572975  0.24196227]
 [-1.91328024 -1.72491783]]
L =  3.067006496510426
dL_dX =  [[ 0.03685663 -0.05665209 -0.6691846 ]
 [-0.02524588 -0.01606117 -0.72432314]
 [-0.027645   -0.0145585  -0.72786339]]


In [12]:
# verification:
# dL_dX's first component is 0.03685, which means that if we increase X[0, 0] by 0.001,
# we should see L change by approximately 0.001 * 0.03685.
eps = 0.001  # single step size, used for both the perturbation and the divisor
X1 = X.copy()  # perturb a copy so X itself stays untouched
X1[0,0] += eps
dL_dX_empirical = (matrix_forward_sum(X1,W, sigmoid) - matrix_forward_sum(X, W, sigmoid))/eps
print("dl_dx = ", dL_dX_empirical)




dl_dx =  0.0368797434000534


In [14]:
# Gap between the chain-rule gradient entry [0, 0] and the finite-difference estimate above.
print("diff = ", matrix_backward_sum(X, W, sigmoid)[0, 0] - dL_dX_empirical)

diff =  -2.3113531009644195e-05


Note :
to compute dL_dW, the expression is np.dot(dN_dW, dL_dN), where dN_dW is np.transpose(X, (1,0)) and dL_dN = dL_ds * dS_dN (here dL_ds is all ones, so this is the same as np.dot(dN_dW, dS_dN))