## 平时作业（1）：多层神经网络的训练

李锦韬 2201213292

---
请根据自己的计算环境情况和兴趣，选择以下两个数据集之一，完成如下的实验：

http://yann.lecun.com/exdb/mnist/

https://www.cs.toronto.edu/~kriz/cifar.html

1、构造一个多层的神经网络（注意，不要使用卷积神经网络，本题目要求使用多层神经网络），并在上述数据集任务上进行训练，并汇报一个“使用了你认为最优的超参数配置的神经网络”的学习曲线；要求如下：

（1）自己手动完成反向传播算法部分的编写；

（2）该网络应为一个“纯净”的多层神经网络，不使用正则化方法、率优化算法等；



2、在上述“你认为最优配置的神经网络”的基础上，

（1）分别汇报“增加一个隐藏层”和“减小一个隐藏层”情况下的学习曲线；

（2）分别汇报使用BGD和SGD进行训练的学习曲线；

（3）分别汇报使用两种以上参数初始化方法下的学习曲线；

（4）分别汇报使用两种以上学习率优化算法下的学习曲线；

（5）分别汇报使用两种以上正则化方法下的学习曲线；

最终提交：包含6个子文件夹的一个zip文件，其中的子文件夹应包含：

（1）对应上述6种情况之一的一份源代码；

（2）对应上述源代码的学习曲线的一个.png文件；

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split

# 将标签转换成独热编码
def one_hot_encoding(Y, n_classes):
    m = Y.shape[0]
    Y = np.array(Y).reshape(-1)    
    Y_encoded = np.zeros((m, n_classes))
    Y_encoded[np.arange(m), Y] = 1
    return Y_encoded

# 初始化参数
def init_params(layer_dims):
    params = {}
    L = len(layer_dims)
    for l in range(1, L):
        params["W" + str(l)] = np.random.randn(layer_dims[l], layer_dims[l-1]) * np.sqrt(2/layer_dims[l-1])
        params["b" + str(l)] = np.zeros((layer_dims[l], 1))
    return params

# 定义ReLU激活函数
def relu(Z):
    A = np.maximum(0, Z)
    return A

# 前向传播
def forward_propagation(X, params):
    caches = []
    A = X
    L = len(params) // 2
    for l in range(1, L):
        Z = np.dot(params["W" + str(l)], A) + params["b" + str(l)]
        A = relu(Z)
        caches.append((Z, A))
    Z = np.dot(params["W" + str(L)], A) + params["b" + str(L)]
    A = np.exp(Z) / np.sum(np.exp(Z), axis=0, keepdims=True)
    caches.append((Z, A))
    return A, caches

# 计算交叉熵损失
def cross_entropy_loss(Y, Y_hat):
    m = Y.shape[1]
    loss = -1/m * np.sum(np.multiply(Y, np.log(Y_hat)))
    return loss

# 反向传播
# def backward_propagation(Y, Y_hat, caches, params):
#     grads = {}
#     m = Y.shape[1]
#     L = len(caches)
#     dZ = Y_hat - Y
#     dW = 1/m * np.dot(dZ, caches[-2][1].T)
#     db = 1/m * np.sum(dZ, axis=1, keepdims=True)
#     grads["dZ" + str(L)] = dZ
#     grads["dW" + str(L)] = dW
#     grads["db" + str(L)] = db
#     for l in range(L-1, 0, -1):
#         dA = np.dot(params["W" + str(l+1)].T, dZ)
#         dZ = np.int64(caches[l][0] > 0) * dA
#         dW = 1/m * np.dot(dZ, caches[l-1][1].T)
#         db = 1/m * np.sum(dZ, axis=1, keepdims=True)
#         grads["dZ" + str(l)] = dZ
#         grads["dW" + str(l)] = dW
#         grads["db" + str(l)] = db
#     return grads

def backward_propagation(Y, Y_hat, caches, params):
    grads = {}
    m = Y.shape[1]
    L = len(caches)
    dZ = Y_hat - Y
    grads["dW" + str(L)] = 1/m * np.dot(dZ, caches[-2][1].T)
    grads["db" + str(L)] = 1/m * np.sum(dZ, axis=1, keepdims=True)
    grads["dZ" + str(L)] = dZ
    for l in range(L-1, 0, -1):
        dZ = np.dot(params["W" + str(l+1)].T, dZ) * (caches[l][0] > 0)
        grads["dW" + str(l)] = 1/m * np.dot(dZ, caches[l-1][1].T)
        grads["db" + str(l)] = 1/m * np.sum(dZ, axis=1, keepdims=True)
        grads["dZ" + str(l)] = dZ
    return grads

def update_params(params, grads, learning_rate):
    L = len(params) // 2
    for l in range(1, L+1):
        params["W" + str(l)] = params["W" + str(l)] - learning_rate * grads["dW" + str(l)]
        params["b" + str(l)] = params["b" + str(l)] - learning_rate * grads["db" + str(l)]
    return params

def predict(X, params):
    Y_hat, _ = forward_propagation(X.T, params)
    predictions = np.argmax(Y_hat, axis=0)
    return predictions

def accuracy(Y, Y_hat):
    m = Y.shape[1]
    predictions = np.argmax(Y_hat, axis=0)
    labels = np.argmax(Y, axis=0)
    acc = np.sum(predictions == labels) / m
    return acc

def mlp(X_train, y_train, X_test, y_test, layer_dims, num_epochs, learning_rate):
    # 初始化参数
    params = init_params(layer_dims)
    # 记录训练集和测试集的损失和精度
    train_losses = []
    test_losses = []
    train_accs = []
    test_accs = []
    # 将标签转换成独热编码
    n_classes = len(np.unique(y_train))
    Y_train = one_hot_encoding(y_train, n_classes).T
    Y_test = one_hot_encoding(y_test, n_classes).T
    # 开始训练模型
    for epoch in range(num_epochs):
        # 前向传播
        Y_hat_train, caches_train = forward_propagation(X_train.T, params)
        Y_hat_test, caches_test = forward_propagation(X_test.T, params)
        # 计算损失
        train_loss = cross_entropy_loss(Y_train, Y_hat_train)
        test_loss = cross_entropy_loss(Y_test, Y_hat_test)
        train_losses.append(train_loss)
        test_losses.append(test_loss)
        # 反向传播
        grads = backward_propagation(Y_train, Y_hat_train, caches_train, params)
        # 更新参数
        params = update_params(params, grads, learning_rate)
        # 计算精度
        train_acc = accuracy(Y_train, Y_hat_train)
        test_acc = accuracy(Y_test, Y_hat_test)
        train_accs.append(train_acc)
        test_accs.append(test_acc)
        # 输出训练结果
        print("Epoch:", epoch+1)
        print("Train loss:", train_loss)
        print("Train accuracy:", train_acc)
        print("Test loss:", test_loss)
        print("Test accuracy:", test_acc)
        print("="*50)
    # 画出学习曲线
    plt.plot(range(num_epochs), train_losses, label="Train loss")
    plt.plot(range(num_epochs), test_losses, label="Test loss")
    plt.title("Learning Curve")
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.legend()
    plt.show()
    plt.plot(range(num_epochs), train_accs, label="Train accuracy")
    plt.plot(range(num_epochs), test_accs, label="Test accuracy")
    plt.title("Accuracy Curve")
    plt.xlabel("Epoch")
    plt.ylabel("Accuracy")
    plt.legend()
    plt.show()
    return params

# 加载数据集
# 加载MNIST数据集
mnist = fetch_openml('mnist_784', parser='auto', version=1, cache=True)
X, y = mnist["data"], mnist["target"]

# 将标签转换成整数类型
y = y.astype(np.uint8)

# 将数据标准化到0到1之间
X = X / 255.

# 将数据分为训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# 定义网络结构
layer_dims = [X_train.shape[1], 64, 32, 10]
# 训练模型
params = mlp(X_train, y_train, X_test, y_test, layer_dims, num_epochs=100, learning_rate=0.01)
# 预测
predictions = predict(X_test, params)
print("Test accuracy:", accuracy(y_test, predictions))

In [None]:
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def sigmoid_derivative(x):
    return x * (1 - x)

mnist = fetch_openml('mnist_784')
X, y = mnist['data'], mnist['target']
X = X.astype(np.float32) / 255.0
enc = OneHotEncoder(categories='auto')
y = np.array(y)
y = enc.fit_transform(y.reshape(-1,1)).toarray()

split = 60000
X_train, y_train = X[:split], y[:split]
X_test, y_test = X[split:], y[split:]

input_size = X_train.shape[1]
hidden_size = 256
output_size = y_train.shape[1]
lr = 0.1
epochs = 100
batch_size = 128

W1 = np.random.randn(input_size, hidden_size)
b1 = np.zeros((1, hidden_size))
W2 = np.random.randn(hidden_size, output_size)
b2 = np.zeros((1, output_size))

train_acc_list = []
test_acc_list = []

for i in range(epochs):
    for j in range(0, X_train.shape[0], batch_size):
        X_batch = X_train[j:j+batch_size]
        y_batch = y_train[j:j+batch_size]

        # 前向传播
        hidden = sigmoid(np.dot(X_batch, W1) + b1)
        output = sigmoid(np.dot(hidden, W2) + b2)

        # 反向传播
        output_error = y_batch - output
        output_delta = output_error * sigmoid_derivative(output)

        hidden_error = output_delta.dot(W2.T)
        hidden_delta = hidden_error * sigmoid_derivative(hidden)

        # 更新参数
        W2 += lr * hidden.T.dot(output_delta)
        b2 += lr * np.sum(output_delta, axis=0, keepdims=True)
        W1 += lr * X_batch.T.dot(hidden_delta)
        b1 += lr * np.sum(hidden_delta, axis=0, keepdims=True)

    # 计算训练集和测试集的准确率
    hidden = sigmoid(np.dot(X_train, W1) + b1)
    train_output = sigmoid(np.dot(hidden, W2) + b2)
    train_predictions = np.argmax(train_output, axis=1)
    train_labels = np.argmax(y_train, axis=1)
    train_acc = np.mean(train_predictions == train_labels)
    train_acc_list.append(train_acc)

    hidden = sigmoid(np.dot(X_test, W1) + b1)
    test_output = sigmoid(np.dot(hidden, W2) + b2)
    test_predictions = np.argmax(test_output, axis=1)
    test_labels = np.argmax(y_test, axis=1)
    test_acc = np.mean(test_predictions == test_labels)
    test_acc_list.append(test_acc)

    # 输出每一轮训练的准确率
    print(f'Epoch {i+1}/{epochs}: train_acc={train_acc:.4f}, test_acc={test_acc:.4f}')

import matplotlib.pyplot as plt

plt.plot(range(epochs), train_acc_list, label='train')
plt.plot(range(epochs), test_acc_list, label='test')
plt.title('Accuracy vs Epochs')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()


In [13]:
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt

def relu(x):
    return np.maximum(0, x)

def relu_derivative(x):
    return (x > 0).astype(int)

def softmax(x):
    return np.exp(x) / np.sum(np.exp(x), axis=1, keepdims=True)

def softmax_derivative(x):
    return x * (1 - x)

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def sigmoid_derivative(x):
    return x * (1 - x)

def cross_entropy(y, y_hat):
    return -np.sum(y * np.log(y_hat), axis=1, keepdims=True)

def cross_entropy_derivative(y, y_hat):
    return y_hat - y

def accuracy(y, y_hat):
    return np.mean(np.argmax(y, axis=1) == np.argmax(y_hat, axis=1))

def plot_loss_acc(losses, accs, title):
    plt.plot(range(len(losses)), losses, label='loss')
    plt.plot(range(len(accs)), accs, label='acc')
    plt.title(title)
    plt.xlabel('Epoch')
    plt.ylabel('Loss/Accuracy')
    plt.legend()
    plt.show()

def plot_loss(losses, title):
    plt.plot(range(len(losses)), losses, label='loss')
    plt.title(title)
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()


mnist = fetch_openml('mnist_784', parser='auto', cache=True)
X, y = mnist['data'], mnist['target']
X = X.astype(np.float32) / 255.0
enc = OneHotEncoder(categories='auto')
y = np.array(y)
y = enc.fit_transform(y.reshape(-1,1)).toarray()


In [None]:

split = 60000
X_train, y_train = X[:split], y[:split]
X_test, y_test = X[split:], y[split:]

input_size = X_train.shape[1]
hidden_size = 256
output_size = y_train.shape[1]
lr = 0.1
epochs = 100
batch_size = 128

W1 = np.random.randn(input_size, hidden_size)
b1 = np.zeros((1, hidden_size))
W2 = np.random.randn(hidden_size, output_size)
b2 = np.zeros((1, output_size))

train_acc_list = []
test_acc_list = []

for i in range(epochs):
    for j in range(0, X_train.shape[0], batch_size):
        X_batch = X_train[j:j+batch_size]
        y_batch = y_train[j:j+batch_size]

        # 前向传播
        hidden = relu(np.dot(X_batch, W1) + b1)
        output = softmax(np.dot(hidden, W2) + b2)

        # 反向传播
        output_error = y_batch - output
        output_delta = output_error * relu_derivative(output)

        hidden_error = output_delta.dot(W2.T)
        hidden_delta = hidden_error * softmax_derivative(hidden)

        # 更新参数
        W2 += lr * hidden.T.dot(output_delta)
        b2 += lr * np.sum(output_delta, axis=0, keepdims=True)
        W1 += lr * X_batch.T.dot(hidden_delta)
        b1 += lr * np.sum(hidden_delta, axis=0, keepdims=True)

    # 计算训练集和测试集的准确率
    hidden = relu(np.dot(X_train, W1) + b1)
    train_output = softmax(np.dot(hidden, W2) + b2)
    train_predictions = np.argmax(train_output, axis=1)
    train_labels = np.argmax(y_train, axis=1)
    train_acc = np.mean(train_predictions == train_labels)
    train_acc_list.append(train_acc)

    hidden = relu(np.dot(X_test, W1) + b1)
    test_output = softmax(np.dot(hidden, W2) + b2)
    test_predictions = np.argmax(test_output, axis=1)
    test_labels = np.argmax(y_test, axis=1)
    test_acc = np.mean(test_predictions == test_labels)
    test_acc_list.append(test_acc)

    # 输出每一轮训练的准确率
    print(f'Epoch {i+1}/{epochs}: train_acc={train_acc:.4f}, test_acc={test_acc:.4f}')

import matplotlib.pyplot as plt

plt.plot(range(epochs), train_acc_list, label='train')
plt.plot(range(epochs), test_acc_list, label='test')
plt.title('Accuracy vs Epochs')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()


  return np.exp(x) / np.sum(np.exp(x), axis=1, keepdims=True)
  return np.exp(x) / np.sum(np.exp(x), axis=1, keepdims=True)


Epoch 1/100: train_acc=0.0987, test_acc=0.0980
Epoch 2/100: train_acc=0.0987, test_acc=0.0980
Epoch 3/100: train_acc=0.0987, test_acc=0.0980
Epoch 4/100: train_acc=0.0987, test_acc=0.0980
Epoch 5/100: train_acc=0.0987, test_acc=0.0980
Epoch 6/100: train_acc=0.0987, test_acc=0.0980
Epoch 7/100: train_acc=0.0987, test_acc=0.0980
Epoch 8/100: train_acc=0.0987, test_acc=0.0980
Epoch 9/100: train_acc=0.0987, test_acc=0.0980
Epoch 10/100: train_acc=0.0987, test_acc=0.0980
Epoch 11/100: train_acc=0.0987, test_acc=0.0980
Epoch 12/100: train_acc=0.0987, test_acc=0.0980
Epoch 13/100: train_acc=0.0987, test_acc=0.0980
Epoch 14/100: train_acc=0.0987, test_acc=0.0980
Epoch 15/100: train_acc=0.0987, test_acc=0.0980
Epoch 16/100: train_acc=0.0987, test_acc=0.0980
Epoch 17/100: train_acc=0.0987, test_acc=0.0980
Epoch 18/100: train_acc=0.0987, test_acc=0.0980
