# Numerical Gradient Checking

We would highly recommend looking at `neural_networks.grad_check.check_gradients` and making sure you understand how numerical gradient checking is being carried out. This function is used in the notebook to check the gradients of the neural network layers you write. Make sure to check the gradient of a layer after finishing its implementation.

The function returns the relative error of the numerical gradient (approximated using finite differences) with respect to the analytical gradient (computed via backpropagation). Correct implementations should get very small errors, usually less than `1e-8` for 64-bit float matrices (the default).

In [9]:
%load_ext autoreload
%autoreload 2

import numpy as np
from neural_networks.utils.grad_check import check_gradients
from neural_networks.layers import FullyConnected, Elman, Conv2D
from neural_networks.activations import Linear, Sigmoid, TanH, ReLU, SoftMax

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Gradient Checks for Activation Functions

### Linear Activation

In [10]:
X = np.random.randn(2, 3)
dLdY = np.random.randn(2, 3)

# initialize a fully connected layer
# and perform a forward and backward pass
linear_activation = Linear()
_ = linear_activation.forward(X)
grad = linear_activation.backward(X, dLdY)

# check the gradients w.r.t. each parameter
print(
    f"Relative error for linear activation:",
    check_gradients(
        fn=linear_activation.forward,  # the function we are checking
        grad=grad,  # the analytically computed gradient
        x=X,        # the variable w.r.t. which we are taking the gradient
        dLdf=dLdY,  # gradient at previous layer
    )
)

Relative error for linear activation: 1.3516731364859686e-11


### Sigmoid Activation

In [11]:
X = np.random.randn(2, 3)
dLdY = np.random.randn(2, 3)

# initialize a fully connected layer
# and perform a forward and backward pass
sigmoid_activation = Sigmoid()
_ = sigmoid_activation.forward(X)
grad = sigmoid_activation.backward(X, dLdY)

# check the gradients w.r.t. each parameter
print(
    f"Relative error for sigmoid activation:",
    check_gradients(
        fn=sigmoid_activation.forward,  # the function we are checking
        grad=grad,  # the analytically computed gradient
        x=X,        # the variable w.r.t. which we are taking the gradient
        dLdf=dLdY,  # gradient at previous layer
    )
)

Relative error for sigmoid activation: 6.9263946413469e-11


### Tanh Activation

In [12]:
X = np.random.randn(2, 3)
dLdY = np.random.randn(2, 3)

# initialize a fully connected layer
# and perform a forward and backward pass
tanh_activation = TanH()
_ = tanh_activation.forward(X)
grad = tanh_activation.backward(X, dLdY)

# check the gradients w.r.t. each parameter
print(
    f"Relative error for tanh activation:",
    check_gradients(
        fn=tanh_activation.forward,  # the function we are checking
        grad=grad,  # the analytically computed gradient
        x=X,        # the variable w.r.t. which we are taking the gradient
        dLdf=dLdY,  # gradient at previous layer
    )
)

Relative error for tanh activation: 5.729794326792073e-11


### ReLU Activation

In [13]:
X = np.random.randn(2, 3)
dLdY = np.random.randn(2, 3)

# initialize a fully connected layer
# and perform a forward and backward pass
relu_activation = ReLU()
out = relu_activation.forward(X)
grad = relu_activation.backward(X, dLdY)

# check the gradients w.r.t. each parameter
print(
    f"Relative error for relu activation:",
    check_gradients(
        fn=relu_activation.forward,  # the function we are checking
        grad=grad,  # the analytically computed gradient
        x=X,        # the variable w.r.t. which we are taking the gradient
        dLdf=dLdY,  # gradient at previous layer
    )
)

Relative error for relu activation: 3.717264202808011e-11


### Softmax Activation

In [14]:
X = np.random.randn(2, 3)
dLdY = np.random.randn(2, 3)

# initialize a fully connected layer
# and perform a forward and backward pass
softmax_activation = SoftMax()
_ = softmax_activation.forward(X)
grad = softmax_activation.backward(X, dLdY)

# check the gradients w.r.t. each parameter
print(
    f"Relative error for softmax activation:",
    check_gradients(
        fn=softmax_activation.forward,  # the function we are checking
        grad=grad,  # the analytically computed gradient
        x=X,        # the variable w.r.t. which we are taking the gradient
        dLdf=dLdY,  # gradient at previous layer
    )
)

Relative error for softmax activation: 9.804391120217638e-11


## Gradient Checks for Full Layers (Linear Activations)

### Fully Connected Layer

In [15]:
X = np.random.randn(2, 3)
dLdY = np.random.randn(2, 4)

# initialize a fully connected layer
# and perform a forward and backward pass
fc_layer = FullyConnected(n_out=4, activation="linear")
_ = fc_layer.forward(X)
_ = fc_layer.backward(dLdY)

# check the gradients w.r.t. each parameter
for param in fc_layer.parameters:
    print(
        f"Relative error for {param}:",
        check_gradients(
            fn=fc_layer.forward_with_param(param, X),  # the function we are checking
            grad=fc_layer.gradients[param],  # the analytically computed gradient
            x=fc_layer.parameters[param],  # the variable w.r.t. which we are taking the gradient
            dLdf=dLdY,                     # gradient at previous layer
        )
    )

Relative error for W: 2.3604526113962683e-11
Relative error for b: 3.6887862088017395e-11


### Elman Recurrent Layer

In [21]:
X = np.random.randn(2, 3, 4)
dLdY = np.random.randn(2, 5)

# initialize a recurrent layer
# and perform a forward and backward pass
elman_layer = Elman(n_out=5, activation="linear")
_ = elman_layer.forward(X)
elman_layer.backward(dLdY)

array([[[ 0.24865537, -0.45816701, -1.14183762, -2.2445318 ],
        [-0.42464869,  0.44467483,  0.03855444,  2.02761611],
        [ 0.25962227,  0.93118781, -1.41458883,  0.53230335]],

       [[ 0.37649308,  0.40338735,  0.24888922,  0.27321365],
        [-0.65576189, -0.36727976, -1.56520023,  2.19214081],
        [-0.28514742,  2.19725584, -0.61627825, -0.53703033]]])

In [48]:
X = np.random.randn(2, 3, 4)
dLdY = np.random.randn(2, 5)

# initialize a recurrent layer
# and perform a forward and backward pass
elman_layer = Elman(n_out=5, activation="linear")
_ = elman_layer.forward(X)
_ = elman_layer.backward(dLdY)

# check the gradients w.r.t. each parameter
for param in elman_layer.parameters:
    # check the gradient
    print(
        f"Relative error for {param}:",
        check_gradients(
            fn=elman_layer.forward_with_param(param, X),  # the function we are checking
            grad=elman_layer.gradients[param],  # the analytically computed gradient
            x=elman_layer.parameters[param],  # the variable w.r.t. which we are taking the gradient
            dLdf=dLdY,                     # gradient at previous layer
        )
    )

Relative error for W: 1.6467947349248552e-10
Relative error for U: 7.68427106603854e-11
Relative error for b: 4.0958602298273e-11


In [49]:
print(elman_layer.forward(X))
print(elman_layer.backward(dLdY))

[[ 0.13227646  2.72561834 -3.51184015  1.65476409  1.15050501]
 [-0.30524898  1.13184235 -3.82734197  2.24965152  0.52985821]]
[[[-0.59430344  0.35174392 -0.61351545  1.0759192 ]
  [-0.31286648  0.42094285 -0.66930781  0.24103274]
  [ 0.38228753  1.00438452  0.17411514  0.70154179]]

 [[ 0.56939548  0.15878384  0.73698659 -0.69056823]
  [-0.80464029 -1.30623113 -0.46291686 -0.52211564]
  [ 0.71986804 -0.39600199 -0.80365108 -0.66914818]]]


In [50]:
print(elman_layer.forward_sol(X))
print(np.array(elman_layer.backward_sol(dLdY)).shape)
print(np.swapaxes(np.swapaxes(np.array(elman_layer.backward_sol(dLdY)),1,0),1,2).shape)
print(np.swapaxes(np.swapaxes(np.array(elman_layer.backward_sol(dLdY)),1,0),1,2))

[[ 0.13227646  2.72561834 -3.51184015  1.65476409  1.15050501]
 [-0.30524898  1.13184235 -3.82734197  2.24965152  0.52985821]]
(4, 2, 3)
(2, 3, 4)
[[[-0.59430344  0.35174392 -0.61351545  1.0759192 ]
  [-0.31286648  0.42094285 -0.66930781  0.24103274]
  [ 0.38228753  1.00438452  0.17411514  0.70154179]]

 [[ 0.56939548  0.15878384  0.73698659 -0.69056823]
  [-0.80464029 -1.30623113 -0.46291686 -0.52211564]
  [ 0.71986804 -0.39600199 -0.80365108 -0.66914818]]]


In [26]:
%debug

> [1;32mc:\users\desco\documents\pythonscripts\cs189\hw6\hw6\neural_networks\layers.py[0m(458)[0;36mbackward_step_sol[1;34m()[0m
[1;32m    456 [1;33m        [0mself[0m[1;33m.[0m[0mgradients[0m[1;33m[[0m[1;34m"W"[0m[1;33m][0m [1;33m+=[0m [0mXt[0m[1;33m.[0m[0mT[0m [1;33m@[0m [0mdZt[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m    457 [1;33m        [0mself[0m[1;33m.[0m[0mgradients[0m[1;33m[[0m[1;34m"U"[0m[1;33m][0m [1;33m+=[0m [0mYt[0m[1;33m.[0m[0mT[0m [1;33m@[0m [0mdZt[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m--> 458 [1;33m        [0mself[0m[1;33m.[0m[0mgradients[0m[1;33m[[0m[1;34m"b"[0m[1;33m][0m [1;33m+=[0m [0mdZt[0m[1;33m.[0m[0msum[0m[1;33m([0m[0maxis[0m[1;33m=[0m[1;36m0[0m[1;33m,[0m [0mkeepdims[0m[1;33m=[0m[1;32mTrue[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m    459 [1;33m        [1;32mreturn[0m [0mdLdYt_prev[0m[1;33m,[0m [0mdXt[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m    460 [1

### Conv Layer

In [67]:
X = np.random.randn(2, 5, 6, 7)
dLdY = np.random.randn(2, 5, 6, 4)

# initialize a fully connected layer
# and perform a forward and backward pass
conv_layer = Conv2D(
    n_out=4,
    kernel_shape=(3, 3),
    activation="linear",
    weight_init="uniform",
    pad="same",
)
_ = conv_layer.forward(X)
_ = conv_layer.backward(dLdY)

# check the gradients w.r.t. each parameter
for param in conv_layer.parameters:
    print(
        f"Relative error for {param}:",
        check_gradients(
            fn=conv_layer.forward_with_param(param, X),  # the function we are checking
            grad=conv_layer.gradients[param],  # the analytically computed gradient
            x=conv_layer.parameters[param],  # the variable w.r.t. which we are taking the gradient
            dLdf=dLdY,                     # gradient at previous layer
        )
    )

Relative error for W: 1.1562886306666961e-10
Relative error for b: 2.956962288278152e-11


In [68]:
from neural_networks.losses import CrossEntropy

num_pts = 5
num_classes = 6

# one-hot encoded y
y_idxs = np.random.randint(0, num_classes, (num_pts,))
y = np.zeros((num_pts, num_classes))
y[range(num_pts), y_idxs] = 1

# normalized predictions
scores = np.random.uniform(0, 1, size=(num_pts, num_classes))
y_hat = scores / scores.sum(axis=1, keepdims=True)

cross_entropy_loss = CrossEntropy("cross_entropy")

def forward_fn(Y, Y_hat):    
    def inner_forward(Y_hat):
        return cross_entropy_loss.forward(Y, Y_hat)
    return inner_forward

loss = cross_entropy_loss.forward(y, y_hat)
grad = cross_entropy_loss.backward(y, y_hat)

print(
    f"Relative error for cross entropy loss:",
    check_gradients(
        fn=forward_fn(y, y_hat),  # the function we are checking
        grad=grad,  # the analytically computed gradient
        x=y_hat,        # the variable w.r.t. which we are taking the gradient
        dLdf=1,  # gradient at previous layer
    )
)

Relative error for cross entropy loss: 9.604709902453602e-11
