## 加载数据集

In [2]:
import numpy as np
from urllib import request
import gzip
import pickle

# Each entry is [key used in the pickled dict, name of the gzipped archive].
# Order matters: save_mnist() treats the first two entries as image archives
# and the last two as label archives.
filename = [
    ["training_images", "train-images-idx3-ubyte.gz"],
    ["test_images", "t10k-images-idx3-ubyte.gz"],
    ["training_labels", "train-labels-idx1-ubyte.gz"],
    ["test_labels", "t10k-labels-idx1-ubyte.gz"],
]

def download_mnist():
    """Download every archive listed in ``filename`` into ``../datasets/``.

    Each archive is fetched from ``base_url`` and written under its original
    file name. The target directory is created first if it is missing.
    """
    import os  # local import: only this function needs it

    base_url = "http://yann.lecun.com/exdb/mnist/"
    target_dir = "../datasets/"
    # urlretrieve() does not create missing directories; without this the
    # very first download fails with FileNotFoundError on a fresh checkout.
    os.makedirs(target_dir, exist_ok=True)
    for _, archive in filename:
        print(f"Downloading {archive}...")
        request.urlretrieve(base_url + archive, target_dir + archive)
    print("Download complete.")

def save_mnist():
    """Convert the downloaded archives into one pickle, ``../datasets/mnist.pkl``.

    The first two entries of ``filename`` are read as image archives: the
    first 16 bytes are skipped and the rest is reshaped to rows of 28*28
    ``uint8`` values. The last two entries are read as label archives: the
    first 8 bytes are skipped and the rest is kept as a flat ``uint8`` array.
    The resulting dict is keyed by the first element of each entry.
    """
    def _read_payload(archive, header_bytes):
        # Decompress the whole archive and view everything after the header
        # as unsigned bytes.
        with gzip.open("../datasets/" + archive, 'rb') as gz:
            return np.frombuffer(gz.read(), np.uint8, offset=header_bytes)

    mnist = {}
    for key, archive in filename[:2]:
        mnist[key] = _read_payload(archive, 16).reshape(-1, 28 * 28)
    for key, archive in filename[-2:]:
        mnist[key] = _read_payload(archive, 8)

    with open("../datasets/" + "mnist.pkl", 'wb') as out:
        pickle.dump(mnist, out)
    print("Save complete.")

def init_mnist():
    """Download the MNIST archives, then convert them into the pickle file."""
    # Order matters: save_mnist() reads the files download_mnist() writes.
    for step in (download_mnist, save_mnist):
        step()


init_mnist()

Downloading train-images-idx3-ubyte.gz...
Downloading t10k-images-idx3-ubyte.gz...
Downloading train-labels-idx1-ubyte.gz...
Downloading t10k-labels-idx1-ubyte.gz...
Download complete.
Save complete.


## 划分训练测试集

In [6]:
# Load the MNIST dataset from its pickle file
data_path = r'../datasets/mnist.pkl'
def load_data(data_path):
    """Unpickle and return the object stored in the file at ``data_path``.

    NOTE: unpickling can execute arbitrary code, so only point this at
    files you created yourself.
    """
    with open(data_path, "rb") as pkl_file:
        return pickle.load(pkl_file)
# Unpickle the dataset; later cells index it by split name (e.g. 'training_images').
data = load_data(data_path)

In [9]:
# Pull the four splits out of the loaded dict.
X_train = data['training_images']
X_test = data['test_images']
y_train = data['training_labels']
y_test = data['test_labels']

In [11]:
X_train.shape, X_test.shape

((60000, 784), (10000, 784))

## 对数据进行标准化、归一化

In [13]:
# Rescale the raw pixel values by 1/255.
X_train = X_train / 255.0
X_test = X_test / 255.0

# Standardize both splits with statistics taken from the training split only.
# Using the test split's own mean/std would apply a different transform to the
# test data than the one the model is trained on.
train_mean = X_train.mean()
train_std = X_train.std()
X_train = (X_train - train_mean) / train_std
X_test = (X_test - train_mean) / train_std

In [16]:
y_train

array([5, 0, 4, ..., 5, 6, 8], dtype=uint8)

## 标签独热编码

In [17]:
def one_hot_encoding(labels, num_classes):
    """Return one one-hot row (a list of ints) per label.

    Args:
        labels: iterable of integer class indices.
        num_classes: length of every row.

    Returns:
        A list of lists; row ``k`` is all zeros except for a 1 at index
        ``labels[k]``.

    Raises:
        IndexError: if a label is not a valid index into a row of length
            ``num_classes``.
    """
    def _encode(label):
        row = [0] * num_classes
        row[label] = 1
        return row

    return [_encode(label) for label in labels]

num_classes = 10

# Encode the integer labels as one-hot rows and store them as NumPy arrays.
y_train = np.array(one_hot_encoding(y_train, num_classes))
y_test = np.array(one_hot_encoding(y_test, num_classes))

## 定义网络结构

In [24]:
# Width of every layer, input first and output last.
hidden_size = [
    X_train.shape[1],  # input: number of features per sample
    128,               # first hidden layer
    64,                # second hidden layer
    num_classes,       # output: one unit per class
]

In [30]:
num_layers = 3

# Index 0 holds a placeholder so that W[k] / b[k] belong to layer k (1-based).
W = [0]
b = [0]
for layer in range(num_layers):
    fan_in, fan_out = hidden_size[layer], hidden_size[layer + 1]
    # Standard-normal weights scaled by 1/sqrt(fan_in); biases start at zero.
    W.append(np.random.randn(fan_in, fan_out) / np.sqrt(fan_in))
    b.append(np.zeros(fan_out))

In [29]:
class Func:
    """Namespace for the activation and loss functions used by the network."""

    @staticmethod
    def relu(x):
        """Return the element-wise maximum of ``x`` and zero."""
        return np.maximum(0, x)

    @staticmethod
    def softmax(x):
        """Return softmax probabilities computed along the last axis of ``x``.

        The maximum along that axis is subtracted before exponentiating; this
        does not change the result but keeps ``np.exp`` from overflowing.
        """
        shifted = x - np.max(x, axis=-1, keepdims=True)
        exps = np.exp(shifted)
        return exps / np.sum(exps, axis=-1, keepdims=True)

    @staticmethod
    def cross_entropy_loss(y_pred, y_true):
        """Return the cross-entropy between ``y_true`` and ``y_pred``.

        The element-wise products ``y_true * log(y_pred)`` are summed and the
        negated total is divided by ``y_pred.shape[0]``. A small constant
        (1e-12) is added inside the log so that a predicted probability of
        exactly zero does not produce ``-inf``.
        """
        n_samples = y_pred.shape[0]
        weighted_log_probs = y_true * np.log(y_pred + 1e-12)
        return -np.sum(weighted_log_probs) / n_samples

In [None]:
def train(X, y_true, learning_rate):
    """Run one gradient-descent step on a batch and return the batch loss.

    Works on the module-level parameter lists ``W`` and ``b`` (1-based: entry
    ``k`` belongs to layer ``k``, entry 0 is a placeholder), on ``num_layers``
    and on the ``Func`` helpers. Every layer but the last uses ReLU; the last
    layer uses softmax and is trained with cross-entropy loss.

    Args:
        X: input batch, one sample per row.
        y_true: one-hot targets, one row per sample.
        learning_rate: step size of the parameter update.

    Returns:
        The cross-entropy loss of the batch, computed before the update.
    """
    global W, b
    # Forward pass
    Z, A = [0], [X]  # 1-based as well; A[0] is simply the input X
    for i in range(1, num_layers + 1):
        z = np.dot(A[i - 1], W[i]) + b[i]
        a = Func.softmax(z) if i == num_layers else Func.relu(z)
        Z.append(z)
        A.append(a)

    y_pred = A[num_layers]
    m = y_true.shape[0]
    loss = Func.cross_entropy_loss(y_pred, y_true)

    # Backward pass. For softmax combined with cross-entropy the gradient
    # with respect to the last pre-activation is y_pred - y_true.
    grad_z = y_pred - y_true
    for i in range(num_layers, 0, -1):
        grad_W = np.dot(A[i - 1].T, grad_z) / m
        grad_b = np.sum(grad_z, axis=0) / m
        if i > 1:
            # Propagate through W[i] before it is updated, then apply the
            # ReLU derivative of the layer below.
            grad_z = np.dot(grad_z, W[i].T)
            grad_z[Z[i - 1] < 0] = 0
        W[i] -= learning_rate * grad_W
        b[i] -= learning_rate * grad_b

    return loss
    

In [None]:
def train(X, y_true, learning_rate):
    """Run one gradient-descent step on a batch and return the batch loss.

    Unrolled version for the three-layer network: two ReLU layers followed by
    a softmax layer trained with cross-entropy loss. Parameters are read from
    and updated in the module-level lists ``W`` and ``b`` (1-based: entry
    ``k`` belongs to layer ``k``); activations and the loss come from ``Func``.

    Args:
        X: input batch, one sample per row.
        y_true: one-hot targets, one row per sample.
        learning_rate: step size of the parameter update.

    Returns:
        The cross-entropy loss of the batch, computed before the update.
    """
    global W, b
    # Forward pass
    z1 = np.dot(X, W[1]) + b[1]
    a1 = Func.relu(z1)
    z2 = np.dot(a1, W[2]) + b[2]
    a2 = Func.relu(z2)
    z3 = np.dot(a2, W[3]) + b[3]
    y_pred = Func.softmax(z3)
    m = y_true.shape[0]
    # Loss value and gradients
    loss = Func.cross_entropy_loss(y_pred, y_true)

    # For softmax combined with cross-entropy the gradient with respect to
    # z3 is y_pred - y_true; the 1/m averaging is applied per parameter below.
    grad_y_pred = y_pred - y_true

    grad_W3 = 1./m*np.dot(a2.T, grad_y_pred)
    grad_b3 = 1./m*np.sum(grad_y_pred, axis=0)
    grad_a2 = np.dot(grad_y_pred, W[3].T)

    # ReLU derivative: block the gradient where the pre-activation was negative.
    grad_z2 = grad_a2.copy()
    grad_z2[z2 < 0] = 0
    grad_W2 = 1./m*np.dot(a1.T, grad_z2)
    grad_b2 = 1./m*np.sum(grad_z2, axis=0)
    grad_a1 = np.dot(grad_z2, W[2].T)

    grad_z1 = grad_a1.copy()
    grad_z1[z1 < 0] = 0
    grad_W1 = 1./m*np.dot(X.T, grad_z1)
    grad_b1 = 1./m*np.sum(grad_z1, axis=0)

    # Update weights and biases (all gradients were computed above, so the
    # update order does not matter).
    W[3] -= learning_rate * grad_W3
    b[3] -= learning_rate * grad_b3
    W[2] -= learning_rate * grad_W2
    b[2] -= learning_rate * grad_b2
    W[1] -= learning_rate * grad_W1
    b[1] -= learning_rate * grad_b1

    return loss