## 写一个手写数字识别的两层网络
输入层使用 tanh 激活,输出层使用 softmax 激活,无隐藏层。

In [None]:
### 获取数据集
import math


def fetch_mnist():
    """Download (or load from the local cache) the four MNIST files.

    Each file is cached under ``/tmp`` using the MD5 hex digest of its URL as
    the file name, so later calls read the cached copy instead of the network.

    :return: tuple ``(X_train, Y_train, X_test, Y_test)`` of ``uint8`` arrays;
        the image arrays are reshaped to ``(-1, 28, 28)`` and the label arrays
        are one-dimensional.
    :raises requests.HTTPError: if a download returns an HTTP error status.
    """
    def fetch(url):
        """Return the gunzipped bytes behind ``url`` as a writable uint8 array."""
        import requests, gzip, os, hashlib, numpy
        fp = os.path.join("/tmp", hashlib.md5(url.encode('utf-8')).hexdigest())
        if os.path.isfile(fp):
            with open(fp, "rb") as f:
                dat = f.read()
        else:
            # Download first and only then touch the cache, so a failed request
            # cannot leave an empty or partial cache file behind.
            response = requests.get(url)
            response.raise_for_status()
            dat = response.content
            tmp_fp = fp + ".part"
            with open(tmp_fp, "wb") as f:
                f.write(dat)
            # Publish the cache entry in one step once it is fully written.
            os.replace(tmp_fp, fp)
        # frombuffer yields a read-only view; copy() makes the array writable.
        return numpy.frombuffer(gzip.decompress(dat), dtype=numpy.uint8).copy()

    # Image files: skip the first 0x10 bytes (file header) before reshaping.
    # Label files: skip the first 8 bytes (file header).
    X_train = fetch("http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz")[0x10:].reshape((-1, 28, 28))
    Y_train = fetch("http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz")[8:]
    X_test = fetch("http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz")[0x10:].reshape((-1, 28, 28))
    Y_test = fetch("http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz")[8:]
    return X_train, Y_train, X_test, Y_test

In [None]:
# Load the MNIST train/test images and labels (downloaded on first use, then cached).
X_train, Y_train, X_test, Y_test = fetch_mnist()

In [None]:
import numpy as np
from matplotlib import pyplot as plt
import matplotlib as mpl

# Display the first ten training images one by one.
for i in range(10):
    s = X_train[i]
    # fetch_mnist already returns 28x28 images, so this reshape leaves the shape unchanged.
    s_dgit = s.reshape(28, 28)
    plt.imshow(s_dgit, cmap=mpl.cm.binary)
    plt.show()

In [None]:
# Flatten every 28x28 image into a single row of 28*28 = 784 values.
X_train = X_train.reshape(-1, 28 * 28)
X_test = X_test.reshape(-1, 28 * 28)

In [None]:
X_train.shape  # expected (60000, 784) for the standard MNIST training set: one flattened 28*28 row per sample

In [None]:
Y_train.shape  # expected (60000,) for the standard MNIST training set: one label per sample

In [None]:
# tanh activation; typically used for the input layer or for hidden layers.
def tanh(x):
    """Return the element-wise hyperbolic tangent of ``x``."""
    activated = np.tanh(x)
    return activated


def d_tanh(x):
    """Return the element-wise derivative of tanh at ``x`` (``1 / cosh(x)**2``).

    Evaluated as ``4 * t / (1 + t)**2`` with ``t = exp(-2 * |x|)``. This is
    algebraically the same value, but ``t`` lies in ``(0, 1]``, so nothing
    overflows for large ``|x|`` (where ``cosh(x)**2`` would overflow to inf).

    :param x: scalar or array of pre-activation values
    :return: derivative with the same shape as ``x``
    """
    t = np.exp(-2.0 * np.abs(x))
    return 4.0 * t / (1.0 + t) ** 2

In [None]:
def relu(x):
    """Return ``x`` with every negative entry replaced by zero (element-wise)."""
    floor = 0
    return np.maximum(x, floor)


def d_relu(x):
    """Return the element-wise derivative of ReLU at ``x``.

    The derivative is 1 where ``x > 0`` and 0 elsewhere; at ``x == 0`` the
    subgradient 0 is used.

    :param x: scalar or array of pre-activation values
    :return: float array of zeros and ones with the same shape as ``x``
    """
    return np.where(np.asarray(x) > 0, 1.0, 0.0)

In [None]:
# Typically used at the output layer for multi-class classification: the outputs
# sum to 1, so they can be read as probabilities, and the largest entry marks
# the most likely class.
def softmax(x):
    """Return the softmax of array ``x``, normalised over all of its entries."""
    shifted = x - x.max()  # subtracting the maximum keeps exp() from overflowing
    weights = np.exp(shifted)
    return weights / weights.sum()


def d_softmax(x):
    """Return the Jacobian of softmax at the vector ``x``.

    With ``p = softmax(x)`` the result is the square matrix
    ``diag(p) - outer(p, p)``.
    """
    probs = softmax(x)
    jacobian = np.diag(probs)
    jacobian = jacobian - np.outer(probs, probs)
    return jacobian


假设我们构造一个两层的神经网络:
设 $A$ 为 tanh,则输入层的计算公式为
$layer_{input} = A(X + b_0)$
设 $B$ 为 softmax,则输出层的计算公式为
$layer_{output} = B(layer_{input} \cdot W_1 + b_1)$

总的来说,我们希望输入经过多层神经网络的线性或非线性变换后得到一个 output,其中每一层的变换都有自己的权重(weight)和偏置(bias)。最后将这个 output 与真实标签比较得到损失,再通过反向传播计算梯度,并用梯度下降逐步更新权重和偏置,直到得到合适的参数值。

In [None]:
input_output_dimensions = [X_train[0].shape, (10,)]  # shapes of the network input and output
each_layer_activation_function = [tanh, softmax]  # only two layers: input layer and output layer
activation_differential_function = {tanh: d_tanh, softmax: d_softmax, relu: d_relu}  # maps each activation to its derivative function

# Parameter initialisation ranges: each entry is a [low, high] pair that the
# init_parameter_* functions sample from uniformly. Both biases use [0, 0],
# i.e. they start at zero. The weight range is +/- sqrt(6 / (fan_in + fan_out)),
# which matches the Glorot/Xavier uniform bound.
parameter_placeholder = [
    {'b': [0, 0]},
    {'b': [0, 0], 'w': [-math.sqrt(6 / (sum(input_output_dimensions[0]) + sum(input_output_dimensions[1]))),
                        math.sqrt(6 / (sum(input_output_dimensions[0]) + sum(input_output_dimensions[1])))]
     }
]


# Layer parameter initialiser.
def layer_param_uniform_init(*x, min=-1, max=1):
    """Sample a float32 array of shape ``x`` uniformly from ``[min, max)``.

    The samples are divided by ``sqrt(prod(x))`` so that larger layers start
    with proportionally smaller values.
    """
    scale = np.sqrt(np.prod(x))
    samples = np.random.uniform(min, max, size=x)
    return (samples / scale).astype(np.float32)

In [None]:
## Initialise the parameters of one layer
def init_parameter_b(layer):
    """Return the initial bias vector for ``layer``.

    The vector has ``prod(input_output_dimensions[layer])`` entries drawn
    uniformly from the ``[low, high)`` range in ``parameter_placeholder[layer]['b']``.
    """
    bounds = parameter_placeholder[layer]['b']
    span = bounds[1] - bounds[0]
    n_units = np.prod(input_output_dimensions[layer])
    return np.random.rand(n_units) * span + bounds[0]


def init_parameter_w(layer):
    """Return the initial weight matrix feeding ``layer``.

    The matrix has one row per unit of the previous layer and one column per
    unit of ``layer``; its entries are drawn uniformly from the ``[low, high)``
    range in ``parameter_placeholder[layer]['w']``.
    """
    bounds = parameter_placeholder[layer]['w']
    span = bounds[1] - bounds[0]
    fan_in = np.prod(input_output_dimensions[layer - 1])
    fan_out = np.prod(input_output_dimensions[layer])
    return np.random.rand(fan_in, fan_out) * span + bounds[0]


def init_all_params() -> list:
    """Build the initial parameters for every layer in ``parameter_placeholder``.

    :return: one dict per layer holding a ``'b'`` and/or ``'w'`` array, matching
        the keys declared for that layer; any other key is ignored.
    """
    initializers = {'b': init_parameter_b, 'w': init_parameter_w}
    layers = []
    for layer, spec in enumerate(parameter_placeholder):
        layers.append({
            name: initializers[name](layer)
            for name in spec
            if name in initializers
        })
    return layers

In [None]:
def forward(img, params):
    """
    Forward pass through both layers.

    :param img: input image as a flat vector
    :param params: per-layer parameter dicts; reads params[0]['b'],
        params[1]['w'] and params[1]['b']
    :return: output of the second layer's activation
    """
    # Layer 0: add the bias to the image, then apply the first activation.
    hidden = each_layer_activation_function[0](img + params[0]['b'])
    # Layer 1: affine map of the layer-0 output, then the second activation.
    scores = np.dot(hidden, params[1]['w']) + params[1]['b']
    return each_layer_activation_function[1](scores)


# Draw the initial parameters for every layer declared in parameter_placeholder.
params = init_all_params()

In [None]:
# One-hot encoding table: row k of the 10x10 identity matrix is the target
# vector for label k. Used by the loss function.
one_hat = np.identity(10)

In [None]:
# Loss function: squared error between the one-hot target and the prediction.
def square_loss(img, label, params):
    """Return the sum of squared differences between ``one_hat[label]`` and
    the network output for ``img``."""
    prediction = forward(img, params)
    residual = one_hat[label] - prediction
    return np.dot(residual, residual)

In [None]:
def grad_params(img, label, params):
    """
    Compute the gradient of the squared-error loss for a single sample.

    The loss is ``||one_hat[label] - output||^2``, where ``output`` is the
    result of the same two-layer forward pass that ``forward`` performs.

    :param img: input image as a flat vector
    :param label: class index used to select the one-hot target row
    :param params: per-layer parameter dicts; reads params[0]['b'],
        params[1]['w'] and params[1]['b']
    :return: dict with the gradients for params[1]['w'] ('w1'),
        params[1]['b'] ('b1') and params[0]['b'] ('b0')
    """
    # Forward pass, keeping the intermediate values the derivatives need.
    l_0_in = img + params[0]['b']
    l_0_out = each_layer_activation_function[0](l_0_in)
    l_1_in = np.dot(l_0_out, params[1]['w']) + params[1]['b']
    l_1_out = each_layer_activation_function[1](l_1_in)

    diff = one_hat[label] - l_1_out
    act_1 = activation_differential_function[each_layer_activation_function[0]](l_0_in)
    act_2 = activation_differential_function[each_layer_activation_function[1]](l_1_in)
    # Output-activation Jacobian applied to the residual; -2 times this is
    # dLoss/dl_1_in. Assumes the output derivative is a full Jacobian matrix,
    # as d_softmax returns.
    act_2 = np.dot(act_2, diff)

    grad_b1 = -2 * act_2
    # l_1_in = l_0_out . w + b, so dl_1_in/dw is the layer-0 *output*
    # (post-activation), not the pre-activation l_0_in.
    grad_w1 = -2 * np.outer(l_0_out, act_2)
    # Chain rule back through w and the layer-0 activation.
    grad_b0 = -2 * act_1 * np.dot(params[1]['w'], act_2)
    return {'w1': grad_w1, 'b1': grad_b1, 'b0': grad_b0}

In [None]:
def train_batch(current_batch, params):
    """Return the gradients averaged over one mini-batch.

    The batch covers the ``batch_size`` consecutive training samples starting
    at index ``current_batch * batch_size``; ``batch_size``, ``X_train`` and
    ``Y_train`` are read from module scope.

    :param current_batch: zero-based index of the batch
    :param params: per-layer parameter dicts passed through to ``grad_params``
    :return: dict of averaged gradients with the keys ``grad_params`` returns
    """
    start = current_batch * batch_size
    # Seed the accumulator with the first sample's gradients.
    grad_accu = grad_params(X_train[start], Y_train[start], params)
    for offset in range(1, batch_size):
        sample_grad = grad_params(X_train[start + offset], Y_train[start + offset], params)
        for key in grad_accu:
            grad_accu[key] += sample_grad[key]
    for key in grad_accu:
        grad_accu[key] = grad_accu[key] / batch_size
    return grad_accu

In [None]:
import copy


def combine(params, grad, learn_rate=1):
    """Return a copy of ``params`` after one gradient-descent step.

    ``params`` itself is left untouched; the update is applied to a deep copy.

    :param params: per-layer parameter dicts
    :param grad: dict with the gradients 'b0', 'b1' and 'w1'
    :param learn_rate: step size multiplied into every gradient
    :return: the updated deep copy of ``params``
    """
    updated = copy.deepcopy(params)
    # (layer index, parameter key, gradient key) for every trainable array.
    targets = ((0, 'b', 'b0'), (1, 'b', 'b1'), (1, 'w', 'w1'))
    for layer, key, grad_key in targets:
        updated[layer][key] -= grad[grad_key] * learn_rate
    return updated

In [None]:
def valid_loss(params):
    """Return the mean ``square_loss`` over every sample of the test set."""
    n_samples = X_test.shape[0]
    total = sum(
        square_loss(X_test[idx], label=Y_test[idx], params=params)
        for idx in range(n_samples)
    )
    return total / n_samples


def valid_accu(params):
    """Return the fraction of test samples whose highest-scoring output index
    equals the label."""
    n_samples = X_test.shape[0]
    hits = 0
    for idx in range(n_samples):
        if forward(X_test[idx], params).argmax() == Y_test[idx]:
            hits += 1
    return hits / n_samples

In [None]:
## Training
batch_size = 100  # samples per gradient step; also read as a global by train_batch
epoch = 5  # number of passes over the training set
train_datas = X_train.shape[0]  # number of training samples
for a in range(epoch):
    # Samples beyond the last full batch are dropped by the truncating int().
    for i in range(int(train_datas / batch_size)):
        grad_temp = train_batch(i, params)
        # combine's default learn_rate is used for the update.
        params = combine(params, grad_temp)
    # Report test-set accuracy and loss after each pass.
    print(f"epoch:{a},valid_accu:{valid_accu(params)},valid_loss:{valid_loss(params)}")