载入数据集

In [88]:
import numpy as np
import scipy.io as sio
import matplotlib.pyplot as plt
from scipy.optimize import minimize

In [90]:
data = sio.loadmat('C:\\Users\\Administrator\\Desktop\\ex4data1.mat')
raw_X = data['X']
raw_y = data['y']

In [92]:
X = np.insert(raw_X,0,values=1,axis=1)
X.shape

(5000, 401)

1.将标签转化为onehot编码格式

In [95]:
def one_hot_encoder(y):
    result = []
    for i in y:
        y_temp = np.zeros(10)
        y_temp[i-1] = 1
        result.append(y_temp)
    return np.array(result)

In [97]:
y = one_hot_encoder(raw_y)
y.shape

(5000, 10)

2.序列化权重参数

In [100]:
# 序列化是为了传入调用scipy函数库方便，解序列化是为了后续矩阵运算维度保持一致
# 因为minimize优化器的x0初始化参数输入要求是序列化后（只有一列）
# 序列化和反序列化的目的其实是为了方便地把数据保存到本地（即序列化），以及把保存的数据再次读取出来（即反序列化）
theta = sio.loadmat('C:\\Users\\Administrator\\Desktop\\ex4weights.mat')
theta1,theta2 = theta['Theta1'],theta['Theta2']
theta1.shape,theta2.shape

((25, 401), (10, 26))

In [102]:
def serialize(a,b):
    return np.append(a.flatten(),b.flatten())

In [104]:
theta_serialize = serialize(theta1,theta2)

In [106]:
theta_serialize.shape

(10285,)

3.解序列化

In [109]:
def de_serialize(c):
    theta1 = c[:25*401].reshape(25,401)
    theta2 = c[25*401:].reshape(10,26)
    return theta1,theta2

In [111]:
theta1,theta2 = de_serialize(theta_serialize)
theta1.shape,theta2.shape

((25, 401), (10, 26))

4.前向传播

In [114]:
def sigmoid(z):
    return 1/(1+np.exp(-z))

In [116]:
def feed_forward(theta_serialize,X):
    a1 = X
    theta1,theta2 = de_serialize(theta_serialize)
    z1 = a1 @ theta1.T
    a2 = sigmoid(z1)
    a2 = np.insert(a2,0,values=1,axis=1)
    z2 = a2 @ theta2.T
    h =sigmoid(z2)
    return a1,z1,a2,z2,h

5.损失函数(不带正则化项)

In [119]:
def costFunction(theta,X,y):
    a1,z1,a2,z2,h = feed_forward(theta,X)
    cost = -np.sum(y*np.log(h) + (1-y)*np.log(1-h)) / len(X)
    return cost 

5.损失函数（带正则化项）

In [122]:
def costFunction2(theta,X,y,lamda):
    sum1 = np.sum(np.power(theta1[:,1:],2))
    sum2 = np.sum(np.power(theta2[:,1:],2))
    reg = (sum1+sum2) * lamda / (2*len(X))
    return reg + costFunction(theta,X,y)

In [124]:
cost = costFunction(theta_serialize,X,y)
lamda = 1
cost2 = costFunction2(theta_serialize,X,y,lamda)
cost,cost2

(0.2876291651613189, 0.38376985909092365)

6.梯度（无正则化）

In [127]:
def sigmoid_gradient(z):
    return sigmoid(z) * (1-sigmoid(z))

In [135]:
def gradient(theta,X,y):
    theta1,theat2 = de_serialize(theta)
    a1,z1,a2,z2,h = feed_forward(theta,X)
    d2 = h - y
    d1 = d2 @ theta2[:,1:] * sigmoid_gradient(z1)
    D2 = (d2.T @ a2) / len(X)
    D1 = (d1.T @ a1) / len(X)
    return serialize(D1,D2)

6.梯度（正则化）

In [138]:
def gradient2(theta,X,y,lamda):
    D = gradient(theta,X,y)
    D1,D2 = de_serialize(D)
    theta1,theta2 = de_serialize(theta)
    D1[:,1:] = D1[:,1:] + theta1[:,1:] * lamda / len(X) 
    D2[:,1:] = D2[:,1:] + theta2[:,1:] * lamda / len(X) 
    return serialize(D1,D2)

7.优化

In [149]:
def nn_training(X,y):
    # 随机初始化的权重参数 `init_theta` 作为神经网络的初始值
    init_theta = np.random.uniform(-0.5, 0.5, 10285)
    # 调用 Scipy 的 `minimize()` 函数，利用 TNC 方法来进行神经网络的训练
    res = minimize(fun=costFunction2,  # `fun` 参数传递的是损失函数
                   x0=init_theta,  # `x0` 参数传递的是神经网络的初始权重参数
                   # `args` 参数传递的是需要传递给损失函数的其他参数 此处为数据集 `X` 和标签集 `y`，同时还有一个正则化参数 `lamda`
                   args=(X, y, lamda),
                   method='TNC',
                   jac=gradient2,  # `jac` 参数传递的是损失函数的梯度函数，用于计算权重参数的调整值
                   options={'maxiter': 300}  # `options` 参数中设置了最大迭代次数为 300
                   )
    return res

# 测试
lamda = 10
res = nn_training(X, y)
raw_y = data['y'].reshape(5000, )
_, _, _, _, h = feed_forward(res.x, X)
y_pred = np.argmax(h,axis=1)+1
acc = np.mean(y_pred==raw_y)
print(acc)

  res = minimize(fun=costFunction2,  # `fun` 参数传递的是损失函数


0.9474
