# 完全基于Numpy的神经网络

源代码：https://github.com/SkalskiP/ILearnDeepLearning.py/blob/master/01_mysteries_of_neural_networks/03_numpy_neural_net/Numpy%20deep%20neural%20network.ipynb

## 准备阶段

在这阶段中定义了一个五层的神经网络，并且定义了每一层网络中神经元的计算方式。同时，本节中同样对两种激活函数进行定义。

In [43]:
import numpy as np

定义五层神经网络。每一层神经网络由三个部分组成，分别是输入神经元的数量、输出神经元的数量以及在这一层所使用的激活函数。

In [44]:
# Layer-by-layer description of the network. Each entry gives the number of
# inputs, the number of outputs and the activation used by that layer.
# Every layer's "input_dim" must equal the previous layer's "output_dim";
# otherwise the W·A product in the forward pass fails with a shape mismatch.
nn_architecture = [
    {"input_dim": 2, "output_dim": 25, "activation": "relu"},
    {"input_dim": 25, "output_dim": 50, "activation": "relu"},
    {"input_dim": 50, "output_dim": 50, "activation": "relu"},
    {"input_dim": 50, "output_dim": 25, "activation": "relu"},
    {"input_dim": 25, "output_dim": 1, "activation": "sigmoid"},
]

    @fn init_layers
    @brief 初始化神经网络层
    @param nn_architecture 神经网络结构
    @param seed 随机种子
    @return params_values 各层的参数（权重 W 与偏置 b）

In [45]:
def init_layers(nn_architecture, seed = 99):
    """Create randomly initialised weights and biases for every layer.

    Args:
        nn_architecture: Iterable of layer dicts providing "input_dim" and
            "output_dim".
        seed: Value passed to ``np.random.seed`` so the initialisation is
            reproducible. Note that this reseeds NumPy's global generator.

    Returns:
        Dict mapping "W<i>" to an (output_dim, input_dim) array and "b<i>"
        to an (output_dim, 1) array, with layers numbered from 1.
    """
    np.random.seed(seed)
    params_values = {}

    # Layers are numbered from 1 in the parameter keys.
    for layer_idx, layer in enumerate(nn_architecture, start=1):
        layer_input_size = layer["input_dim"]
        layer_output_size = layer["output_dim"]

        # Scale the standard-normal samples down by 0.1.
        params_values['W' + str(layer_idx)] = np.random.randn(
            layer_output_size, layer_input_size) * 0.1
        params_values['b' + str(layer_idx)] = np.random.randn(
            layer_output_size, 1) * 0.1

    return params_values

定义了两种激活函数。同时定义了这两种激活函数的导数以便于反向传播的计算。

In [46]:
def sigmoid(Z):
    """Apply the logistic function 1 / (1 + exp(-Z)) element-wise to Z."""
    denominator = 1 + np.exp(-Z)
    return 1 / denominator

def relu(Z):
    """Return the element-wise maximum of 0 and Z (rectified linear unit)."""
    rectified = np.maximum(0, Z)
    return rectified

def sigmoid_backward(dA, Z):
    """Multiply dA by the sigmoid derivative evaluated at Z.

    The derivative of the sigmoid is s * (1 - s), where s = sigmoid(Z).
    """
    activated = sigmoid(Z)
    scaled = dA * activated
    return scaled * (1 - activated)

def relu_backward(dA, Z):
    """Propagate the gradient dA back through a ReLU activation.

    Args:
        dA: Gradient with respect to the ReLU output.
        Z: Pre-activation values that were fed into the ReLU.

    Returns:
        A new array equal to dA where Z > 0 and 0 where Z <= 0.
    """
    # Copy the incoming gradient so the caller's array is not modified.
    dZ = np.array(dA, copy = True)
    # ReLU has zero slope for non-positive inputs.
    dZ[Z <= 0] = 0
    return dZ

## 前向传播阶段

    @fn single_layer_forward_propagation
    @brief 对一层神经网络的前向传播计算
    @param A_prev 神经元的值
    @param W_curr 神经元的权重
    @param b_curr 神经元的偏置（bias）
    @param activation 所使用激活函数的类型
    @return 该层经过激活函数后的值 A，以及激活前的线性输出 Z

In [47]:
def single_layer_forward_propagation(A_prev, W_curr, b_curr, activation = "relu"):
    """Run the forward pass of a single layer.

    Args:
        A_prev: Activations coming from the previous layer.
        W_curr: Weight matrix of this layer.
        b_curr: Bias of this layer.
        activation: Name of the activation, "relu" or "sigmoid".

    Returns:
        Tuple ``(A_curr, Z_curr)``: the activated output and the
        pre-activation value ``W_curr · A_prev + b_curr``.

    Raises:
        Exception: If ``activation`` is neither "relu" nor "sigmoid".
    """
    Z_curr = np.dot(W_curr, A_prev) + b_curr

    # Reject unknown names explicitly, matching the backward pass, instead of
    # silently treating them as sigmoid.
    if activation == "relu":
        activation_func = relu
    elif activation == "sigmoid":
        activation_func = sigmoid
    else:
        raise Exception('Non-supported activation function')

    return activation_func(Z_curr), Z_curr


# Backward-compatible alias for the original, misspelled function name.
signle_layer_forward_propagation = single_layer_forward_propagation

    @fn full_forward_propagation
    @brief 对所有神经网络的前向传播计算
    @param X 输入层的激活值
    @param params_values 层的参数
    @param nn_architecture 神经网络
    @return 最后一层的激活值（预测结果），以及保存反向传播所需中间值（各层的 A 与 Z）的字典

In [48]:
def full_forward_propagation(X, params_values, nn_architecture):
    """Push X through every layer and record the intermediate values.

    Args:
        X: Input to the first layer, used as the activation of layer 0.
        params_values: Dict holding "W<i>" and "b<i>" for each layer i,
            numbered from 1.
        nn_architecture: Iterable of layer dicts providing "activation".

    Returns:
        Tuple ``(A_last, memory)``: the output of the final layer, and a dict
        storing each layer's input under "A<i-1>" and its pre-activation
        value under "Z<i>" for use by the backward pass.
    """
    cache = {}
    activation = X

    for layer_number, layer_spec in enumerate(nn_architecture, start=1):
        suffix = str(layer_number)
        # The previous layer's output is this layer's input.
        layer_input = activation

        activation_name = layer_spec["activation"]
        weights = params_values["W" + suffix]
        bias = params_values["b" + suffix]

        activation, pre_activation = single_layer_forward_propagation(
            layer_input, weights, bias, activation_name)

        # Inputs are keyed by the index of the layer that produced them.
        cache["A" + str(layer_number - 1)] = layer_input
        cache["Z" + suffix] = pre_activation

    return activation, cache

## 损失函数计算阶段

In [49]:
def get_cost_value(Y_hat, Y):
    """Compute the binary cross-entropy cost between Y_hat and Y.

    Args:
        Y_hat: Predicted probabilities; its second dimension is taken as the
            number of examples.
        Y: Target labels, laid out compatibly with Y_hat.

    Returns:
        The cost with all length-1 dimensions squeezed away.
    """
    # The number of examples is taken from the second axis of the predictions.
    scale = -1 / Y_hat.shape[1]
    positive_term = np.dot(Y, np.log(Y_hat).T)
    negative_term = np.dot(1 - Y, np.log(1 - Y_hat).T)
    return np.squeeze(scale * (positive_term + negative_term))

In [50]:
def convert_prob_into_class(probs):
    """Threshold probabilities at 0.5 and return class labels.

    Works on a copy of ``probs``: entries greater than 0.5 become 1 and
    entries less than or equal to 0.5 become 0. The input is left untouched.
    """
    classes = np.copy(probs)
    # Both masks are taken before any assignment; they never overlap.
    above = classes > 0.5
    at_or_below = classes <= 0.5
    classes[at_or_below] = 0
    classes[above] = 1
    return classes

In [51]:
def get_accuracy_value(Y_hat, Y):
    """Return the fraction of columns whose thresholded prediction equals Y.

    A column counts as correct only if every entry along axis 0 matches.
    """
    predicted = convert_prob_into_class(Y_hat)
    matches = predicted == Y
    column_correct = matches.all(axis=0)
    return column_correct.mean()

## 反向传播阶段

    @fn single_layer_backward_propagation
    @brief 对一层神经网络的反向传播计算
    @param dA_curr 损失对当前层激活值的梯度
    @param W_curr 神经元的权重
    @param b_curr 神经元的偏置（bias）
    @param Z_curr 当前层未经激活的线性输出
    @param A_prev 神经元的值
    @param activation 激活函数类型
    @return 该层参数的梯度

In [52]:
def single_layer_backward_propagation(dA_curr, W_curr, b_curr, Z_curr, A_prev, activation="relu"):
    """Run the backward pass of a single layer.

    Args:
        dA_curr: Gradient with respect to this layer's activation output.
        W_curr: Weight matrix of this layer.
        b_curr: Bias of this layer (accepted but not used in the computation).
        Z_curr: Pre-activation values saved during the forward pass.
        A_prev: Input this layer received during the forward pass; its second
            dimension is taken as the number of examples.
        activation: Name of the activation, "relu" or "sigmoid".

    Returns:
        Tuple ``(dA_prev, dW_curr, db_curr)``: gradients with respect to the
        layer input, the weights and the bias.

    Raises:
        Exception: If ``activation`` is neither "relu" nor "sigmoid".
    """
    example_count = A_prev.shape[1]

    # Guard clause: only the two known activations have a derivative here.
    if activation not in ("relu", "sigmoid"):
        raise Exception('Non-supported activation function')
    activation_grad = relu_backward if activation == "relu" else sigmoid_backward

    # Gradient with respect to the pre-activation value.
    dZ = activation_grad(dA_curr, Z_curr)

    # Weight and bias gradients are averaged over the examples.
    weight_grad = np.dot(dZ, A_prev.T) / example_count
    bias_grad = np.sum(dZ, axis=1, keepdims=True) / example_count
    # Gradient handed on to the preceding layer.
    input_grad = np.dot(W_curr.T, dZ)

    return input_grad, weight_grad, bias_grad

    @fn full_backward_propagation
    @brief 对所有神经网络的反向传播计算
    @param Y_hat 最终得到的前向传播结果
    @param Y 预期的前向传播结果
    @param memory 存储空间
    @param params_values 神经元参数
    @param nn_architecture 神经网络
    @return 梯度值

In [53]:
def full_backward_propagation(Y_hat, Y, memory, params_values, nn_architecture):
    """Back-propagate through every layer, from the last to the first.

    Args:
        Y_hat: Network output produced by the forward pass.
        Y: Target labels; reshaped to the shape of Y_hat, so a 1-D label
            vector is accepted as long as the element counts agree.
        memory: Dict from the forward pass holding "A<i-1>" and "Z<i>" for
            each layer i.
        params_values: Dict holding "W<i>" and "b<i>" for each layer i.
        nn_architecture: Sequence of layer dicts providing "activation".

    Returns:
        Dict mapping "dW<i>" and "db<i>" to the gradients of each layer.
    """
    grads_values = {}

    # Give the labels the same shape as the predictions before any
    # element-wise arithmetic.
    Y = Y.reshape(Y_hat.shape)

    # Derivative of the binary cross-entropy loss w.r.t. the network output.
    dA_prev = - (np.divide(Y, Y_hat) - np.divide(1 - Y, 1 - Y_hat))

    for layer_idx_prev, layer in reversed(list(enumerate(nn_architecture))):
        # Layers are numbered from 1; the enumerate index is the previous layer.
        layer_idx_curr = layer_idx_prev + 1
        activ_function_curr = layer["activation"]

        # The gradient produced for the layer above feeds this layer.
        dA_curr = dA_prev

        A_prev = memory["A" + str(layer_idx_prev)]
        Z_curr = memory["Z" + str(layer_idx_curr)]

        W_curr = params_values["W" + str(layer_idx_curr)]
        b_curr = params_values["b" + str(layer_idx_curr)]

        dA_prev, dW_curr, db_curr = single_layer_backward_propagation(
            dA_curr, W_curr, b_curr, Z_curr, A_prev, activ_function_curr)

        grads_values["dW" + str(layer_idx_curr)] = dW_curr
        grads_values["db" + str(layer_idx_curr)] = db_curr

    return grads_values

## 参数更新阶段

    @fn update
    @brief 更新神经网络参数
    @param params_values 神经元参数
    @param grads_values 梯度值
    @param nn_architecture 神经网络框架
    @param learning_rate 学习率即每一次更新的步长
    @return 更新后的神经网络参数

In [54]:
def update(params_values, grads_values, nn_architecture, learning_rate):
    """Apply one gradient-descent step to every layer's parameters.

    Args:
        params_values: Dict holding "W<i>" and "b<i>"; updated in place.
        grads_values: Dict holding the matching "dW<i>" and "db<i>".
        nn_architecture: Iterable of layers; only its length determines which
            layer indices are updated.
        learning_rate: Step size multiplied with each gradient.

    Returns:
        The same ``params_values`` dict that was passed in.
    """
    for layer_number, _ in enumerate(nn_architecture, start=1):
        # Weights first, then bias, for each layer.
        for prefix in ("W", "b"):
            key = prefix + str(layer_number)
            params_values[key] -= learning_rate * grads_values["d" + key]

    return params_values

## 训练神经网络

In [55]:
def train(X, Y, nn_architecture, epochs, learning_rate, verbose=False, callback=None):
    """Train the network with full-batch gradient descent.

    Args:
        X: Training inputs passed to the forward pass.
        Y: Training labels.
        nn_architecture: Layer descriptions of the network.
        epochs: Number of training iterations.
        learning_rate: Step size used for every parameter update.
        verbose: When true, print cost and accuracy every 50 iterations.
        callback: Optional callable invoked every 50 iterations as
            ``callback(iteration, params_values)``.

    Returns:
        The parameter dict after training. Cost and accuracy are recorded for
        each iteration but are not returned.
    """
    # Parameters are always initialised with seed 2.
    params_values = init_layers(nn_architecture, 2)
    cost_history, accuracy_history = [], []

    for epoch in range(epochs):
        # Forward pass.
        Y_hat, cache = full_forward_propagation(X, params_values, nn_architecture)

        # Metrics for this iteration, computed before the update.
        cost = get_cost_value(Y_hat, Y)
        cost_history.append(cost)
        accuracy = get_accuracy_value(Y_hat, Y)
        accuracy_history.append(accuracy)

        # Backward pass followed by the parameter update.
        grads_values = full_backward_propagation(Y_hat, Y, cache, params_values, nn_architecture)
        params_values = update(params_values, grads_values, nn_architecture, learning_rate)

        # Reporting happens only on every 50th iteration.
        if epoch % 50 != 0:
            continue
        if verbose:
            print("Iteration: {:05} - cost: {:.5f} - accuracy: {:.5f}".format(epoch, cost, accuracy))
        if callback is not None:
            callback(epoch, params_values)

    return params_values