# 神经网络练习(Neural Network Practice)

## 项目背景

#### 项目背景：
* 1.项目基于吴恩达教授的《机器学习》课程。
* 2.数据均来源于课程配套资料。


#### 项目目的：
* 1、使用神经网络辨别手写数字（0-9）。
* 2、使用反向传播优化神经网络。


#### 数据说明：
* 1、分析目的：通过训练集中5000张手写数字（像素20\*20）及正确数字得到辨别手写数字的模型。
* 2、数据说明：ex4data1：X、y数据集
    * X：5000行 * 400列，即5000个手写数字，每列代表数字一个像素点的灰度值
    * y：5000行 * 1列，即对应的5000个数字。
* 3、数据说明：ex4weights：神经网络参数（也叫权重）文件
    * Theta1：Layer1参数
    * Theta2：Layer2参数

# 导入相关的库

In [2]:
import numpy as np
import pandas as pd

import plotly.express as px
import plotly as py
import plotly.graph_objs as go
import cufflinks as cf
from plotly.offline import iplot,init_notebook_mode
cf.go_offline(connected=True)
init_notebook_mode(connected=True)

import scipy.io as sio #因为是mat文件，需要用scipy导入
import keras
from keras.models import Sequential #用于初始化神经网络
from keras.layers import Dense 

Using TensorFlow backend.


# 1.导入数据集

In [19]:
# Load the course-provided MATLAB data file into a dict-like object.
# NOTE: hard-coded absolute Windows path — adjust it to wherever ex4data1.mat lives locally.
data = sio.loadmat(r"E:\Learning\python\ML\code\ex4-NN back propagation\ex4data1.mat")

In [20]:
# List the entries stored in the loaded file.
data.keys()

dict_keys(['__header__', '__version__', '__globals__', 'X', 'y'])

In [102]:
# Pull the feature matrix and the label column out of the loaded data.
X=data["X"]
y=data['y']

In [103]:
# Check the dimensions of the features and the labels.
print(X.shape)
print(y.shape)

(5000, 400)
(5000, 1)


In [112]:
# There are 10 classes, so expand y into 10 indicator columns
# (column k-1 is 1 where the label equals k, for k = 1..10).
y_ravel = data['y'].reshape(data['y'].shape[0])
y_matrix = np.array([(y_ravel == label).astype(int) for label in range(1, 11)]).T
y_matrix.shape

(5000, 10)

# 2.数据可视化

In [113]:
# Take a look at one sample from the data.
pick_one = np.random.randint(0, 5000) # no seed is set, so every run shows a different sample
# Reshape the 400 pixel values into a 20x20 image.
# NOTE(review): the transpose presumably compensates for column-major pixel order in the .mat file — confirm.
img = np.array(X[pick_one,:].reshape((20,20)).T)
fig = px.imshow(img,color_continuous_scale='gray')
fig.update_layout(width=100,height=100,coloraxis_showscale=False,
                  margin=dict(l=10, r=10, b=10, t=10),
                  xaxis=dict(showticklabels=False),yaxis=dict(showticklabels=False))
fig.show()
# Print the label that belongs to the displayed sample.
print('这是数字 {}'.format(y[pick_one]))

这是数字 [9]


# 3.定义各个函数

## 3.1sigmoid 函数
g 代表一个常用的逻辑函数（logistic function），也称S形函数（Sigmoid function），公式为： \\[g\left( z \right)=\frac{1}{1+{{e}^{-z}}}\\] 
合起来，我们得到逻辑回归模型的假设函数： 
	\\[{{h}_{\theta }}\left( x \right)=\frac{1}{1+{{e}^{-{{\theta }^{T}}X}}}\\] 

In [114]:
def sigmoid(z):
    """Apply the logistic function 1 / (1 + e^(-z)) element-wise to ``z``."""
    decay = np.exp(-z)
    return 1 / (1 + decay)

## 3.2前向传播函数
> a1：(400 + 1) -> a2：(25 + 1) -> $h_{\theta}(x)$：(10)

<img style="float: left;" src="code/img/nn_model.png">

In [115]:
def forward_propagate(X, theta1, theta2):
    """Run one forward pass through the input -> hidden -> output network.

    A bias column of ones is prepended to the input and to the hidden
    activations before the corresponding weight matrix is applied.

    Args:
        X: (m, n) samples, one per row.
        theta1: (hidden, n + 1) weights from the input to the hidden layer.
        theta2: (labels, hidden + 1) weights from the hidden to the output layer.

    Returns:
        Tuple ``(a1, z2, a2, z3, h)``: the biased input, the hidden
        pre-activations, the biased hidden activations, the output
        pre-activations and the network output.
    """
    m = X.shape[0]

    a1 = np.insert(X, 0, values=np.ones(m), axis=1)
    # ``@`` is a matrix product for np.matrix and plain ndarray operands
    # alike, whereas ``*`` is only a matrix product for np.matrix and would
    # silently switch to element-wise multiplication for ndarrays.
    z2 = a1 @ theta1.T
    a2 = np.insert(sigmoid(z2), 0, values=np.ones(m), axis=1)
    z3 = a2 @ theta2.T
    h = sigmoid(z3)

    return a1, z2, a2, z3, h

## 3.3正则化代价函数
<img style="float: left;" src="code/img/nn_regcost.png">

In [116]:
def reg_cost(params, input_size, hidden_size, num_labels, X, y, reg_rate):
    """Return the regularized cross-entropy cost of the network.

    Args:
        params: flat vector holding theta1 followed by theta2.
        input_size: number of input features (without the bias unit).
        hidden_size: number of hidden units (without the bias unit).
        num_labels: number of output classes.
        X: (m, input_size) samples, one per row.
        y: (m, num_labels) one-hot encoded labels.
        reg_rate: regularization strength (lambda).

    Returns:
        The scalar cost J.
    """
    m = X.shape[0]
    X = np.matrix(X)
    y = np.matrix(y)

    # Unroll the flat parameter vector into the two layer weight matrices.
    split = hidden_size * (input_size + 1)
    theta1 = np.matrix(np.reshape(params[:split], (hidden_size, input_size + 1)))
    theta2 = np.matrix(np.reshape(params[split:], (num_labels, hidden_size + 1)))

    # Only the network output (last element) is needed for the cost.
    h = forward_propagate(X, theta1, theta2)[-1]

    # Cross-entropy over all samples and classes at once, instead of a
    # Python-level loop over the m rows.
    cross_entropy = np.multiply(-y, np.log(h)) - np.multiply(1 - y, np.log(1 - h))
    J = np.sum(cross_entropy) / m

    # Regularization term; the bias columns (column 0) are not penalized.
    J += (float(reg_rate) / (2 * m)) * (
        np.sum(np.power(theta1[:, 1:], 2)) + np.sum(np.power(theta2[:, 1:], 2))
    )

    return J

In [117]:
# Number of training samples (rows of X).
X.shape[0]

5000

In [118]:
# Network dimensions and regularization strength used to initialize theta
input_size = 400
hidden_size = 25
num_labels = 10
reg_rate = 1

# Randomly initialize one flat parameter array sized for the whole network.
# np.random.random draws from [0, 1), so subtracting 0.5 yields values in [-0.5, 0.5).
params = np.random.random(size=hidden_size * (input_size + 1) + num_labels * (hidden_size + 1))-0.5

m = X.shape[0]
X = np.matrix(X)
y = np.matrix(y)

# Unroll the parameter array into one weight matrix per layer
theta1 = np.matrix(np.reshape(params[:hidden_size * (input_size + 1)], (hidden_size, (input_size + 1))))
theta2 = np.matrix(np.reshape(params[hidden_size * (input_size + 1):], (num_labels, (hidden_size + 1))))

theta1.shape, theta2.shape

((25, 401), (10, 26))

In [119]:
# Compute the cost for the initial (random) parameters
reg_cost(params, input_size, hidden_size, num_labels, X, y_matrix, reg_rate)

7.196695700408089

## 3.4反向传播函数
${\delta}^{3}=a3-y$\
${\delta}^{2}={({{\theta}^{2}}{\delta}^{3})}^{T}* g'(z^{(2)})= {({{\theta}^{2}}{\delta}^{3})}^{T}* {(a2*(1-a2))}$\
输入层是原始数据，不存在误差项，因此不需要计算${\delta}^{1}$。\
假设${\lambda}=0$，即我们不做任何正则化处理时有： $\frac{\partial}{{\partial}{\theta}_{ij}^{(l)}}{J({\theta})}={a_j^{(l)}}{{\delta}_i^{(l+1)}}$

In [120]:
def sigmoid_gradient(z):
    """Return the element-wise derivative of the sigmoid at ``z``: g(z) * (1 - g(z))."""
    activation = sigmoid(z)
    return np.multiply(activation, 1 - activation)

In [121]:
# 利用反向传播来计算梯度
# Use backpropagation to compute the cost and its gradient
def backprop(params, input_size, hidden_size, num_labels, X, y, reg_rate):
    """Return the regularized cost and its gradient for the network.

    Args:
        params: flat vector holding theta1 followed by theta2.
        input_size: number of input features (without the bias unit).
        hidden_size: number of hidden units (without the bias unit).
        num_labels: number of output classes.
        X: (m, input_size) samples, one per row.
        y: (m, num_labels) one-hot encoded labels.
        reg_rate: regularization strength (lambda).

    Returns:
        Tuple ``(J, grad)``: the scalar cost and a flat gradient vector laid
        out like ``params`` (theta1 gradient followed by theta2 gradient).
    """
    m = X.shape[0]
    X = np.matrix(X)
    y = np.matrix(y)

    # Unroll the flat parameter vector into the two layer weight matrices.
    split = hidden_size * (input_size + 1)
    theta1 = np.matrix(np.reshape(params[:split], (hidden_size, input_size + 1)))
    theta2 = np.matrix(np.reshape(params[split:], (num_labels, hidden_size + 1)))

    # Forward pass; z3 is not needed afterwards.
    a1, z2, a2, _, h = forward_propagate(X, theta1, theta2)

    # Cost: cross-entropy over all samples and classes, plus the
    # regularization term that skips the bias columns (column 0).
    cross_entropy = np.multiply(-y, np.log(h)) - np.multiply(1 - y, np.log(1 - h))
    J = np.sum(cross_entropy) / m
    J += (float(reg_rate) / (2 * m)) * (
        np.sum(np.power(theta1[:, 1:], 2)) + np.sum(np.power(theta2[:, 1:], 2))
    )

    # Backward pass for all samples at once instead of one sample per
    # Python-loop iteration.
    d3 = h - y  # (m, num_labels) output-layer error
    # Hidden-layer error. The bias unit has no incoming weights, so its
    # column of theta2 is left out rather than computed and discarded.
    d2 = np.multiply(d3 @ theta2[:, 1:], sigmoid_gradient(z2))  # (m, hidden_size)

    # Each product sums the per-sample outer products, then averages over m.
    delta1 = (d2.T @ a1) / m  # (hidden_size, input_size + 1)
    delta2 = (d3.T @ a2) / m  # (num_labels, hidden_size + 1)

    # Regularize every weight except the bias column.
    delta1[:, 1:] = delta1[:, 1:] + (theta1[:, 1:] * reg_rate) / m
    delta2[:, 1:] = delta2[:, 1:] + (theta2[:, 1:] * reg_rate) / m

    grad = np.concatenate((np.ravel(delta1), np.ravel(delta2)))

    return J, grad

In [122]:
# Sanity check: evaluate cost and gradient once for the initial parameters
# and inspect the cost together with the length of the flat gradient vector.
J, grad = backprop(params, input_size, hidden_size, num_labels, X, y_matrix, reg_rate)
J, grad.shape

(7.196695700408089, (10285,))

# 4.训练神经网络

In [123]:
from scipy.optimize import minimize

# minimize the objective function
# jac=True: backprop returns the pair (cost, gradient), so no separate gradient function is passed.
# NOTE(review): the recorded run stopped on the function-evaluation limit (nfev: 300, nit: 21),
# so 'maxiter' appears to cap function evaluations for TNC here — confirm against the installed SciPy version.
fmin = minimize(fun=backprop, x0=params, args=(input_size, hidden_size, num_labels, X, y_matrix, reg_rate), 
                method='TNC', jac=True, options={'maxiter': 300})
fmin

     fun: 0.35407440910486704
     jac: array([-9.82637846e-04, -4.41413933e-07,  5.78654964e-07, ...,
       -5.63457116e-04, -5.29861524e-04, -2.58676153e-04])
 message: 'Max. number of function evaluations reached'
    nfev: 300
     nit: 21
  status: 3
 success: False
       x: array([ 0.18176269, -0.00220707,  0.00289327, ..., -0.27110709,
       -0.10022853, -1.6654796 ])

* success=False，说明优化在达到函数求值次数上限（nfev=300）时停止（此时仅完成21次迭代），函数还未收敛到局部最低点。

# 5.使用模型进行预测

In [124]:
# Rebuild the per-layer weight matrices from the optimized parameter vector.
theta1 = np.matrix(np.reshape(fmin.x[:hidden_size * (input_size + 1)], (hidden_size, (input_size + 1))))
theta2 = np.matrix(np.reshape(fmin.x[hidden_size * (input_size + 1):], (num_labels, (hidden_size + 1))))

a1, z2, a2, z3, h = forward_propagate(X, theta1, theta2)
# argmax yields a 0-based column index while the one-hot columns stand for labels 1..10, hence the +1.
y_pred = np.array(np.argmax(h, axis=1) + 1)
y_pred

array([[10],
       [10],
       [10],
       ...,
       [ 9],
       [ 9],
       [ 9]], dtype=int64)

In [125]:
# Mark every prediction as correct (1) or wrong (0) and report the fraction of correct ones.
# Note: the predictions are made on the same X the network was trained on, so this is training accuracy.
correct = [1 if a == b else 0 for (a, b) in zip(y_pred, y)]
accuracy = (sum(map(int, correct)) / float(len(correct)))
print ('accuracy = {0}%'.format(accuracy * 100))

accuracy = 99.18%


# 鸣谢：
感谢黄海广博士提供的读书笔记及各项资料，我会在机器学习路上继续加油！