<a href="https://colab.research.google.com/github/CM-MN/UnderstandingDeepLearning/blob/main/Notebooks/Chap07/7_2_Backpropagation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Notebook 7.2: Backpropagation**

This notebook runs the backpropagation algorithm on a deep neural network as described in section 7.4 of the book.

Work through the cells below, running each cell in turn. In various places you will see the words "TODO". Follow the instructions at these places and make predictions about what is going to happen or write code to complete the functions.

Contact me at udlbookmail@gmail.com if you find any mistakes or have any suggestions.

In [None]:
import numpy as np
import matplotlib.pyplot as plt

First let's define a neural network.  We'll just choose the weights and biases randomly for now

In [None]:
# Set seed so we always get the same random numbers
np.random.seed(0)

# Number of hidden layers
K = 5
# Number of neurons per layer
D = 6
# Input layer
D_i = 1
# Output layer
D_o = 1

# Make empty lists
all_weights = [None] * (K+1)
all_biases = [None] * (K+1)

# Create input and output layers
all_weights[0] = np.random.normal(size=(D, D_i))#6*1
all_weights[-1] = np.random.normal(size=(D_o, D))#1*6
all_biases[0] = np.random.normal(size =(D,1))#6*1
all_biases[-1]= np.random.normal(size =(D_o,1))#1*1

# Create intermediate layers
for layer in range(1,K):
  all_weights[layer] = np.random.normal(size=(D,D))
  all_biases[layer] = np.random.normal(size=(D,1))

# print(all_weights[1])
# print(all_biases)

[[-0.85409574 -2.55298982  0.6536186   0.8644362  -0.74216502  2.26975462]
 [-1.45436567  0.04575852 -0.18718385  1.53277921  1.46935877  0.15494743]
 [ 0.37816252 -0.88778575 -1.98079647 -0.34791215  0.15634897  1.23029068]
 [ 1.20237985 -0.38732682 -0.30230275 -1.04855297 -1.42001794 -1.70627019]
 [ 1.9507754  -0.50965218 -0.4380743  -1.25279536  0.77749036 -1.61389785]
 [-0.21274028 -0.89546656  0.3869025  -0.51080514 -1.18063218 -0.02818223]]


In [None]:
# Define the Rectified Linear Unit (ReLU) function
def ReLU(preactivation):
  activation = preactivation.clip(0.0)
  return activation

Now let's run our random network.  The weight matrices $\boldsymbol\Omega_{0\ldots K}$ are the entries of the list "all_weights" and the biases $\boldsymbol\beta_{0\ldots K}$ are the entries of the list "all_biases"

We know that we will need the preactivations $\mathbf{f}_{0\ldots K}$ and the activations $\mathbf{h}_{1\ldots K}$ for the forward pass of backpropagation, so we'll store and return these as well.


In [None]:
def compute_network_output(net_input, all_weights, all_biases):

  # 检索层数
  K = len(all_weights) -1

  # 我们将每个层的预激活值存储在一个名为"all_f"的列表中，
  # 而将激活值存储在另一个名为"all_h"的列表中。
  all_f = [None] * (K+1)
  all_h = [None] * (K+1)

  # 为了方便起见，我们将all_h[0]设置为输入，
  # all_f[K]将是输出。
  all_h[0] = net_input

  # 遍历所有层次，计算 all_f[0... K-1] 和 all_h[1... K]。
  for layer in range(K):
      # 根据公式 7.16 更新这一层的预激活和激活。
      # 记住要使用 np.matmul 进行矩阵乘法运算。
      # TODO -- Replace the lines below
      all_f[layer] = np.matmul(all_weights[layer], all_h[layer]) + all_biases[layer]
      # print(all_f[layer])
      all_h[layer+1] = ReLU(all_f[layer])
      # print(all_h[layer+1])

  # Compute the output from the last hidden layer
  # TODO -- Replace the line below
  all_f[K] = np.matmul(all_weights[K], all_h[K]) + all_biases[K]

  # Retrieve the output
  net_output = all_f[K]

  return net_output, all_f, all_h

In [None]:
# Define input
net_input = np.ones((D_i,1)) * 1.2
# Compute network output
net_output, all_f, all_h = compute_network_output(net_input,all_weights, all_biases)
print(all_f)
print(all_h)
print("True output = %3.3f, Your answer = %3.3f"%(1.907, net_output[0,0]))

[array([[ 2.878],
       [ 0.602],
       [ 1.618],
       [ 3.023],
       [ 3.735],
       [-1.378]]), array([[-2.668],
       [ 5.727],
       [-2.817],
       [-6.37 ],
       [ 3.353],
       [-7.151]]), array([[-4.218],
       [-4.637],
       [ 0.765],
       [-9.941],
       [ 8.939],
       [ 2.291]]), array([[-1.88 ],
       [15.662],
       [ 9.427],
       [-1.153],
       [ 4.222],
       [ 1.946]]), array([[ -7.223],
       [-20.706],
       [-14.327],
       [-14.817],
       [ 11.063],
       [-20.802]]), array([[1.907]])]
[array([[1.2]]), array([[2.878],
       [0.602],
       [1.618],
       [3.023],
       [3.735],
       [0.   ]]), array([[0.   ],
       [5.727],
       [0.   ],
       [0.   ],
       [3.353],
       [0.   ]]), array([[0.   ],
       [0.   ],
       [0.765],
       [0.   ],
       [8.939],
       [2.291]]), array([[ 0.   ],
       [15.662],
       [ 9.427],
       [ 0.   ],
       [ 4.222],
       [ 1.946]]), array([[ 0.   ],
       [ 0.   ],
      

现在让我们定义一个损失函数。我们将使用最小二乘法损失函数。我们还将编写一个函数来计算 dloss_doutput。

In [1]:
def least_squares_loss(net_output, y):
  return np.sum((net_output-y) * (net_output-y))

def d_loss_d_output(net_output, y):
    return 2*(net_output -y);

In [None]:
y = np.ones((D_o,1)) * 20.0
loss = least_squares_loss(net_output, y)
print("y = %3.3f Loss = %3.3f"%(y, loss))

y = 20.000 Loss = 327.371


  print("y = %3.3f Loss = %3.3f"%(y, loss))


现在让我们计算网络的导数。我们已经计算了前向传递。让我们计算后向传递。

In [None]:
# We'll need the indicator function
def indicator_function(x):
  x_in = np.array(x)
  x_in[x_in>0] = 1
  x_in[x_in<=0] = 0
  return x_in

# Main backward pass routine
def backward_pass(all_weights, all_biases, all_f, all_h, y):
  # 我们也将把导数 dl_dweights 和 dl_dbiases 存储在列表中。
  all_dl_dweights = [None] * (K+1)
  all_dl_dbiases = [None] * (K+1)
  # 我们将把损失相对于激活和预激活的导数存储在列表中。
  all_dl_df = [None] * (K+1)
  all_dl_dh = [None] * (K+1)
  # 为了方便起见，我们仍然沿用 convention，即 all_h[0] 是网络的输入，all_f[k] 是网络的输出。
  # 计算损失相对于网络输出的导数。
  all_dl_df[K] = np.array(d_loss_d_output(all_f[K],y))
  # print(all_dl_df)
  # Now work backwards through the network
  for layer in range(K,-1,-1):
    # TODO 使用 all_dl_df[layer] 计算层级偏差相对于损失的导数。（公式 7.21）
    # NOTE!  To take a copy of matrix X, use Z=np.array(X)
    # REPLACE THIS LINE
    all_dl_dbiases[layer] = all_dl_df[layer]


    # TODO 根据 all_dl_df[layer] 和 all_h[layer]（公式 7.22）计算损失函数相对于第 layer 层权重的导数。
    # Don't forget to use np.matmul
    # REPLACE THIS LINE
    all_dl_dweights[layer] = np.matmul(all_dl_df[layer], all_h[layer].T)

    # TODO: 计算损失相对于权重的激活值的导数，以及下一个预激活值的导数（方程7.25最后一行的第二部分）
    # REPLACE THIS LINE
    all_dl_dh[layer] = np.matmul(all_weights[layer].T, all_dl_df[layer])
    print(all_dl_dh)


    if layer > 0:
      # TODO 计算损失相对于预激活函数f的导数（使用ReLU函数的导数，即公式7.25最后一行的第一部分）
      # REPLACE THIS LINE
      all_dl_df[layer-1] = all_dl_dh[layer] * indicator_function(all_f[layer-1])

  return all_dl_dweights, all_dl_dbiases

In [None]:
all_dl_dweights, all_dl_dbiases = backward_pass(all_weights, all_biases, all_f, all_h, y)

[None, None, None, None, None, array([[-34.381],
       [  5.477],
       [  3.735],
       [-14.858],
       [ -5.212],
       [-52.625]])]
[None, None, None, None, array([[-1.652],
       [-4.8  ],
       [-1.661],
       [-4.466],
       [ 3.393],
       [ 5.391]]), array([[-34.381],
       [  5.477],
       [  3.735],
       [-14.858],
       [ -5.212],
       [-52.625]])]
[None, None, None, array([[ -1.993],
       [-11.684],
       [  0.938],
       [  3.609],
       [ -9.993],
       [  0.508]]), array([[-1.652],
       [-4.8  ],
       [-1.661],
       [-4.466],
       [ 3.393],
       [ 5.391]]), array([[-34.381],
       [  5.477],
       [  3.735],
       [-14.858],
       [ -5.212],
       [-52.625]])]
[None, None, array([[-19.484],
       [-11.297],
       [  1.651],
       [ 10.065],
       [-10.722],
       [  3.742]]), array([[ -1.993],
       [-11.684],
       [  0.938],
       [  3.609],
       [ -9.993],
       [  0.508]]), array([[-1.652],
       [-4.8  ],
       [-1

In [None]:
np.set_printoptions(precision=3)
# 为有限差分计算的导数腾出空间。
all_dl_dweights_fd = [None] * (K+1)
all_dl_dbiases_fd = [None] * (K+1)

# 让我们用有限差分法来检验我们的导数是否正确。
delta_fd = 0.000001

# 测试偏置向量的导数。
for layer in range(K+1):
  dl_dbias  = np.zeros_like(all_dl_dbiases[layer])
  # For every element in the bias
  for row in range(all_biases[layer].shape[0]):
    # Take copy of biases  We'll change one element each time
    all_biases_copy = [np.array(x) for x in all_biases]
    all_biases_copy[layer][row] += delta_fd
    network_output_1, *_ = compute_network_output(net_input, all_weights, all_biases_copy)
    network_output_2, *_ = compute_network_output(net_input, all_weights, all_biases)
    dl_dbias[row] = (least_squares_loss(network_output_1, y) - least_squares_loss(network_output_2,y))/delta_fd
  all_dl_dbiases_fd[layer] = np.array(dl_dbias)
  print("-----------------------------------------------")
  print("Bias %d, derivatives from backprop:"%(layer))
  print(all_dl_dbiases[layer])
  print("Bias %d, derivatives from finite differences"%(layer))
  print(all_dl_dbiases_fd[layer])
  if np.allclose(all_dl_dbiases_fd[layer],all_dl_dbiases[layer],rtol=1e-05, atol=1e-08, equal_nan=False):
    print("Success!  Derivatives match.")
  else:
    print("Failure!  Derivatives different.")



# Test the derivatives of the weights matrices
for layer in range(K+1):
  dl_dweight  = np.zeros_like(all_dl_dweights[layer])
  # For every element in the bias
  for row in range(all_weights[layer].shape[0]):
    for col in range(all_weights[layer].shape[1]):
      # Take copy of biases  We'll change one element each time
      all_weights_copy = [np.array(x) for x in all_weights]
      all_weights_copy[layer][row][col] += delta_fd
      network_output_1, *_ = compute_network_output(net_input, all_weights_copy, all_biases)
      network_output_2, *_ = compute_network_output(net_input, all_weights, all_biases)
      dl_dweight[row][col] = (least_squares_loss(network_output_1, y) - least_squares_loss(network_output_2,y))/delta_fd
  all_dl_dweights_fd[layer] = np.array(dl_dweight)
  print("-----------------------------------------------")
  print("Weight %d, derivatives from backprop:"%(layer))
  print(all_dl_dweights[layer])
  print("Weight %d, derivatives from finite differences"%(layer))
  print(all_dl_dweights_fd[layer])
  if np.allclose(all_dl_dweights_fd[layer],all_dl_dweights[layer],rtol=1e-05, atol=1e-08, equal_nan=False):
    print("Success!  Derivatives match.")
  else:
    print("Failure!  Derivatives different.")

-----------------------------------------------
Bias 0, derivatives from backprop:
[[ -4.486]
 [  4.947]
 [  6.812]
 [ -3.883]
 [-24.935]
 [  0.   ]]
Bias 0, derivatives from finite differences
[[ -4.486]
 [  4.947]
 [  6.812]
 [ -3.883]
 [-24.935]
 [  0.   ]]
Success!  Derivatives match.
-----------------------------------------------
Bias 1, derivatives from backprop:
[[ -0.   ]
 [-11.297]
 [  0.   ]
 [  0.   ]
 [-10.722]
 [  0.   ]]
Bias 1, derivatives from finite differences
[[  0.   ]
 [-11.297]
 [  0.   ]
 [  0.   ]
 [-10.722]
 [  0.   ]]
Success!  Derivatives match.
-----------------------------------------------
Bias 2, derivatives from backprop:
[[-0.   ]
 [-0.   ]
 [ 0.938]
 [ 0.   ]
 [-9.993]
 [ 0.508]]
Bias 2, derivatives from finite differences
[[ 0.   ]
 [ 0.   ]
 [ 0.938]
 [ 0.   ]
 [-9.993]
 [ 0.508]]
Success!  Derivatives match.
-----------------------------------------------
Bias 3, derivatives from backprop:
[[-0.   ]
 [-4.8  ]
 [-1.661]
 [-0.   ]
 [ 3.393]
 [ 5.391]