# 1：反向传播神经网络
### **by:MLZZY**
**实验描述：处理手写数字数据集，使用反向传播的前馈神经网络，自动学习神经网络的参数。**

### 1：神经网络

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
from scipy.io import loadmat
#预处理工具
from sklearn.preprocessing import OneHotEncoder

In [3]:
# Load the exercise data set from a MATLAB .mat file; loadmat returns a dict
# whose 'X' and 'y' entries hold the features and the labels.
data=loadmat('/home/mw/input/andrew_ml_ex45345/ex4data1.mat')
# Display the loaded dictionary.
data

{'__header__': b'MATLAB 5.0 MAT-file, Platform: GLNXA64, Created on: Sun Oct 16 13:09:09 2011',
 '__version__': '1.0',
 '__globals__': [],
 'X': array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]),
 'y': array([[10],
        [10],
        [10],
        ...,
        [ 9],
        [ 9],
        [ 9]], dtype=uint8)}

In [4]:
# Pull the feature matrix and the label column out of the loaded dictionary.
X=data['X']
y=data['y']
# Display both shapes.
X.shape,y.shape

((5000, 400), (5000, 1))

In [5]:
# Load the Theta1/Theta2 weight matrices stored in ex4weights.mat
# (presumably pre-trained weights shipped with the exercise -- confirm).
weight=loadmat('/home/mw/input/andrew_ml_ex45345/ex4weights.mat')
theta1,theta2=weight['Theta1'],weight['Theta2']
# Display both shapes.
theta1.shape,theta2.shape

((25, 401), (10, 26))

In [6]:
# Pick 100 random rows (np.random.choice samples with replacement by default)
# and show each one as an image on a 10x10 grid of subplots.
sample_idx=np.random.choice(np.arange(data['X'].shape[0]),100)
sample_images=data['X'][sample_idx,:]
fig,ax_array=plt.subplots(nrows=10,ncols=10,sharex=True,sharey=True,figsize=(12,12))
# ax_array.flat walks the grid row by row, i.e. in the same order as index 10*r+c.
for ax,image in zip(ax_array.flat,sample_images):
    # Each row is reshaped to 20x20 and transposed before display.
    ax.matshow(np.array(image.reshape((20,20))).T,cmap=matplotlib.cm.binary)
# plt.xticks/plt.yticks act on the current axes, so calling them once after the
# loop leaves the same final state as calling them on every iteration.
plt.xticks(np.array([]))
plt.yticks(np.array([]))

In [7]:
def sigmoid(z):
    """Return the logistic function 1 / (1 + exp(-z)), applied element-wise."""
    exp_neg=np.exp(-z)
    return 1/(1+exp_neg)

In [16]:
def forward_propagate(X,theta1,theta2):
    """Run one forward pass through the input -> hidden -> output network.

    Args:
        X: (m, n) inputs, one example per row, without a bias column.
        theta1: (hidden, n + 1) weights from the input layer (plus bias).
        theta2: (labels, hidden + 1) weights from the hidden layer (plus bias).

    Returns:
        Tuple ``(a1, z2, a2, z3, h)``: the input with a bias column, the
        hidden pre-activations, the hidden activations with a bias column,
        the output pre-activations, and the output activations.
    """
    m=X.shape[0]
    # Prepend the all-ones bias column to the input.
    a1=np.insert(X,0,values=np.ones(m),axis=1)
    # ``@`` is a matrix product for both np.matrix and plain ndarray operands,
    # whereas ``*`` is only a matrix product when an operand is np.matrix.
    z2=a1@theta1.T
    # Prepend the bias column to the hidden activations as well.
    a2=np.insert(sigmoid(z2),0,values=np.ones(m),axis=1)
    z3=a2@theta2.T
    h=sigmoid(z3)
    return a1,z2,a2,z3,h

![Image Name](https://cdn.kesci.com/upload/image/pzgj2aojyc.png?imageView2/0/w/960/h/960)

In [25]:
# The unregularised cost does not use lamda; the parameter is kept so that
# every cost/gradient function shares the same signature.
def cost(theta1,theta2,input_size,hidden_size,num_labels,X,y,lamda):
    """Return the unregularised cross-entropy cost averaged over the examples.

    Args:
        theta1, theta2: weight matrices passed on to ``forward_propagate``.
        input_size, hidden_size, num_labels, lamda: unused here; present only
            for signature uniformity with the other cost/gradient functions.
        X: (m, n) inputs, one example per row.
        y: (m, labels) one-hot encoded targets.

    Returns:
        The scalar cost J.
    """
    m=X.shape[0]
    X=np.matrix(X)
    y=np.matrix(y)
    # Only the output activations are needed for the cost.
    h=forward_propagate(X,theta1,theta2)[-1]
    # Element-wise cross-entropy for every (example, label) pair, summed in a
    # single vectorised expression instead of one Python iteration per row.
    first_term=np.multiply(-y,np.log(h))
    second_term=np.multiply((1-y),np.log(1-h))
    return np.sum(first_term-second_term)/m

In [18]:
# y starts as a 5000x1 column vector; encode it as a 5000x10 one-hot matrix.
# Note: in this data set the handwritten digit '0' is stored as label 10, so:
# y=2  encodes to [0,1,0,0,0,0,0,0,0,0]
# y=10 (digit 0) encodes to [0,0,0,0,0,0,0,0,0,1]
# Use scikit-learn's built-in encoder.
# By default OneHotEncoder returns a sparse matrix; toarray() converts it to a
# dense ndarray. The keyword that requests dense output directly was renamed
# between scikit-learn releases (sparse -> sparse_output), so relying on the
# default plus toarray() works on both old and new versions.
encoder=OneHotEncoder()
y_onehot=encoder.fit_transform(y).toarray()
y_onehot.shape

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


(5000, 10)

In [19]:
# Compare the first raw label with its one-hot encoded row.
y[0],y_onehot[0,:]

(matrix([[10]], dtype=uint8), array([0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]))

In [21]:
# Network dimensions and regularisation strength used by the functions below.
# Number of input features (X has 400 columns).
input_size=400
# Number of hidden units (theta1 has 25 rows).
hidden_size=25
# Number of output classes (y_onehot has 10 columns).
num_labels=10
# Regularisation coefficient (spelled "lamda" because lambda is a keyword).
lamda=1

In [26]:
# Evaluate the unregularised cost with the weights loaded from ex4weights.mat.
cost(theta1,theta2,input_size,hidden_size,num_labels,X,y_onehot,lamda)

0.2876291651613188

![Image Name](https://cdn.kesci.com/upload/image/pzgju3am60.png?imageView2/0/w/960/h/960)

In [28]:
# The function above is the cost without regularisation; this one adds the
# regularisation term.
def costReg(theta1,theta2,input_size,hidden_size,num_labels,X,y,lamda):
    """Return the L2-regularised cross-entropy cost.

    Args:
        theta1, theta2: weight matrices; column 0 holds the bias weights.
        input_size, hidden_size, num_labels: forwarded unchanged to ``cost``.
        X: (m, n) inputs, one example per row.
        y: (m, labels) one-hot encoded targets.
        lamda: regularisation coefficient.

    Returns:
        The scalar cost J including the regularisation penalty.
    """
    m=X.shape[0]
    # Reuse the unregularised cost instead of duplicating its body.
    J=cost(theta1,theta2,input_size,hidden_size,num_labels,X,y,lamda)
    # Regularisation term: squared weights, excluding the bias column.
    J+=(float(lamda)/(2*m))*(np.sum(np.power(theta1[:,1:],2))+np.sum(np.power(theta2[:,1:],2)))
    return J

In [29]:
# Evaluate the regularised cost with the same loaded weights.
costReg(theta1,theta2,input_size,hidden_size,num_labels,X,y_onehot,lamda)

0.38376985909092354

### **2：反向传播**
**实现反向传播的算法，来计算神经网络代价函数的梯度。获得了梯度后，就可以使用工具库函数来计算代价函数的最小值。**

In [30]:
# Gradient of the sigmoid function.
def sigmoid_gradient(z):
    """Return sigmoid(z) * (1 - sigmoid(z)), computed element-wise."""
    # Evaluate the sigmoid once and reuse it for both factors.
    s=sigmoid(z)
    return np.multiply(s,(1-s))

In [31]:
# Sanity check: sigmoid(0) is 0.5, so the gradient at 0 is 0.5 * 0.5 = 0.25.
sigmoid_gradient(0)

0.25

In [32]:
# Random initialisation.
# Draw every weight from [-0.12, 0.12): random() is in [0, 1), so subtracting
# 0.5 and scaling by 0.24 gives that range. The author's note is that keeping
# theta this small makes learning more efficient.
# The vector length is the element count of theta1 plus that of theta2.
params=(np.random.random(size=hidden_size*(input_size+1)+num_labels*(hidden_size+1))-0.5)*0.24

In [37]:
# Backpropagation
def backprop(params,input_size,hidden_size,num_labels,X,y,lamda):
    """Return the unregularised cost and the averaged weight gradients.

    Args:
        params: flat 1-D vector holding theta1 followed by theta2.
        input_size: number of input features (without bias).
        hidden_size: number of hidden units (without bias).
        num_labels: number of output classes.
        X: (m, input_size) inputs, one example per row.
        y: (m, num_labels) one-hot encoded targets.
        lamda: unused here; kept for signature uniformity.

    Returns:
        Tuple ``(J, delta1, delta2)``: the scalar cost, and the gradients
        with the shapes of theta1 and theta2, both as ``np.matrix``.
    """
    m=X.shape[0]
    X=np.matrix(X)
    y=np.matrix(y)
    # Reshape the flat parameter vector back into the two weight matrices.
    split=hidden_size*(input_size+1)
    theta1=np.matrix(np.reshape(params[:split],(hidden_size,(input_size+1))))
    theta2=np.matrix(np.reshape(params[split:],(num_labels,(hidden_size+1))))
    # Forward pass.
    a1,z2,a2,z3,h=forward_propagate(X,theta1,theta2)
    # Cross-entropy cost over all examples at once.
    first_term=np.multiply(-y,np.log(h))
    second_term=np.multiply((1-y),np.log(1-h))
    J=np.sum(first_term-second_term)/m
    # Backward pass for all examples at once; each matrix product below sums
    # the per-example outer products that a sample-by-sample loop would add up.
    d3=h-y #(m,num_labels)
    # The bias column of theta2 is dropped because the hidden bias unit has no
    # incoming weights and therefore no error term to propagate.
    d2=np.multiply(d3@theta2[:,1:],sigmoid_gradient(z2)) #(m,hidden_size)
    delta1=(d2.T@a1)/m #(hidden_size,input_size+1)
    delta2=(d3.T@a2)/m #(num_labels,hidden_size+1)
    return J,delta1,delta2

In [34]:
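# Gradient checking (梯度校验): compare each backprop gradient component with the
# numerical estimate (J(theta + eps) - J(theta - eps)) / (2 * eps); the two should agree closely.
# The worked procedure is in the author's iPad notes (做法见iPad笔记), which are not part of this notebook.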

In [38]:
# Backpropagation with regularisation
def backpropReg(params,input_size,hidden_size,num_labels,X,y,lamda):
    """Return the L2-regularised cost and its flattened gradient.

    The return shape ``(J, grad)`` is what ``scipy.optimize.minimize``
    expects when it is called with ``jac=True``.

    Args:
        params: flat 1-D vector holding theta1 followed by theta2.
        input_size: number of input features (without bias).
        hidden_size: number of hidden units (without bias).
        num_labels: number of output classes.
        X: (m, input_size) inputs, one example per row.
        y: (m, num_labels) one-hot encoded targets.
        lamda: regularisation coefficient.

    Returns:
        Tuple ``(J, grad)``: the scalar regularised cost and a 1-D array with
        the gradient of theta1 followed by the gradient of theta2.
    """
    m=X.shape[0]
    # Unregularised cost and gradients come from backprop; only the
    # regularisation terms are added here.
    J,delta1,delta2=backprop(params,input_size,hidden_size,num_labels,X,y,lamda)
    # Reshape the flat parameter vector to read the weights being penalised.
    split=hidden_size*(input_size+1)
    theta1=np.reshape(params[:split],(hidden_size,(input_size+1)))
    theta2=np.reshape(params[split:],(num_labels,(hidden_size+1)))
    # Regularise the cost; the bias column (column 0) is excluded.
    J+=(float(lamda)/(2*m))*(np.sum(np.power(theta1[:,1:],2))+np.sum(np.power(theta2[:,1:],2)))
    # Regularise the gradients, again leaving the bias column untouched.
    delta1[:,1:]=delta1[:,1:]+(theta1[:,1:]*lamda)/m
    delta2[:,1:]=delta2[:,1:]+(theta2[:,1:]*lamda)/m
    # Flatten both gradient matrices into a single 1-D array.
    grad=np.concatenate((np.ravel(delta1),np.ravel(delta2)))
    return J,grad

In [43]:
# Use a library optimiser to search for the best parameters.
from scipy.optimize import minimize
# jac=True tells minimize that backpropReg returns (cost, gradient) together.
# The captured output below stops with 'Max. number of function evaluations
# reached' (nfev: 250), so success is False even though the result is used.
fmin = minimize(fun=backpropReg, x0=(params), args=(input_size, hidden_size, num_labels, X, y_onehot, lamda), 
                method='TNC', jac=True, options={'maxiter': 250})
fmin

     fun: 0.3271794925036686
     jac: array([ 1.12063780e-04,  7.05355019e-08, -8.21662881e-09, ...,
       -9.87205862e-05, -4.05536051e-05,  4.23440014e-05])
 message: 'Max. number of function evaluations reached'
    nfev: 250
     nit: 21
  status: 3
 success: False
       x: array([-1.67722945e+00,  3.52677509e-04, -4.10831441e-05, ...,
        1.50155377e+00, -4.71339881e-02, -1.99907691e-01])

In [44]:
# forward_propagate multiplies with `*`, which is a matrix product only for
# np.matrix operands, so X is converted before calling it directly.
X = np.matrix(X)
# Split the optimised flat vector back into the two weight matrices.
thetafinal1 = np.matrix(np.reshape(fmin.x[:hidden_size * (input_size + 1)], (hidden_size, (input_size + 1))))
thetafinal2 = np.matrix(np.reshape(fmin.x[hidden_size * (input_size + 1):], (num_labels, (hidden_size + 1))))

In [45]:
# Predict with the optimised theta.
a1, z2, a2, z3, h = forward_propagate(X, thetafinal1, thetafinal2 )
# argmax gives a 0-based column index; the labels run from 1 to 10, hence + 1.
y_pred = np.array(np.argmax(h, axis=1) + 1)
y_pred

array([[10],
       [10],
       [10],
       ...,
       [ 9],
       [ 9],
       [ 9]])

In [46]:
# Compare the predictions with the true labels.
# NOTE: X and y here are the same data passed to minimize above, so these
# figures are training-set metrics, not held-out performance.
from sklearn.metrics import classification_report
print(classification_report(y, y_pred))

              precision    recall  f1-score   support

           1       0.99      0.99      0.99       500
           2       0.99      0.99      0.99       500
           3       1.00      0.99      0.99       500
           4       0.99      0.99      0.99       500
           5       1.00      1.00      1.00       500
           6       1.00      1.00      1.00       500
           7       0.99      0.99      0.99       500
           8       1.00      1.00      1.00       500
           9       0.98      0.99      0.99       500
          10       1.00      1.00      1.00       500

    accuracy                           0.99      5000
   macro avg       0.99      0.99      0.99      5000
weighted avg       0.99      0.99      0.99      5000



In [47]:
# Visualise the hidden layer.
# Drop the bias column so that each row holds the 400 input weights of one hidden unit.
hidden_layer = thetafinal1[:, 1:] 
hidden_layer.shape

(25, 400)

In [48]:
# Show the input weights of each of the 25 hidden units as a 20x20 image.
fig, ax_array = plt.subplots(nrows=5, ncols=5, sharey=True, sharex=True, figsize=(12, 12))
# ax_array.flat walks the grid row by row, i.e. in the same order as index 5 * r + c.
for ax, unit_weights in zip(ax_array.flat, hidden_layer):
    ax.matshow(np.array(unit_weights.reshape((20, 20))), cmap=matplotlib.cm.binary)
# plt.xticks/plt.yticks act on the current axes, so calling them once after the
# loop leaves the same final state as calling them on every iteration.
plt.xticks(np.array([]))
plt.yticks(np.array([]))