# 準備資料集

In [1]:
import numpy as np

# Inputs of the XOR problem: every combination of two binary features,
# one sample per row, in the order (0,0), (0,1), (1,0), (1,1).
x_data = np.array([[first, second] for first in (0, 1) for second in (0, 1)])

# XOR labels: 1 when exactly one of the two features is 1, otherwise 0.
y_data = x_data[:, 0] ^ x_data[:, 1]

# 建立模型

In [2]:
class MLP:
    """Two-layer perceptron: ReLU hidden layer, softmax output, cross-entropy loss.

    Parameters are plain NumPy arrays (``w1``, ``b1``, ``w2``, ``b2``) and are
    updated in place by :meth:`backward` using plain gradient descent with the
    step size ``lr``.
    """

    def __init__(self, input_size, hidden_size, output_size, learning_rate=0.1, seed=None):
        """Create the network with standard-normal initial weights and biases.

        Args:
            input_size: Number of input features.
            hidden_size: Number of hidden units.
            output_size: Number of output classes.
            learning_rate: Step size used by ``backward``.
            seed: Optional integer. When given, the initial parameters are drawn
                from a private ``RandomState(seed)`` so construction is
                reproducible; when ``None`` (default) they come from NumPy's
                global random stream, exactly as before.
        """
        rng = np.random if seed is None else np.random.RandomState(seed)

        # First layer (input -> hidden)
        self.w1 = rng.randn(input_size, hidden_size)
        self.b1 = rng.randn(hidden_size)
        # Second layer (hidden -> output)
        self.w2 = rng.randn(hidden_size, output_size)
        self.b2 = rng.randn(output_size)

        # Gradient-descent step size
        self.lr = learning_rate

    def softmax(self, z):
        """Row-wise softmax of a 2-D score array (Eq. 2.4)."""
        # Subtracting the row maximum leaves the result unchanged but keeps exp() from overflowing.
        exp_z = np.exp(z - np.max(z, axis=1, keepdims=True))
        return exp_z / np.sum(exp_z, axis=1, keepdims=True)

    def forward(self, x, y=None):
        """Run the forward pass and, when labels are given, compute the loss.

        Args:
            x: 2-D array of inputs, one sample per row.
            y: Optional 1-D sequence of integer class labels, one per row of ``x``.

        Returns:
            Tuple ``(loss, y_hat, z1, a1, y_onehot)``. ``y_hat`` holds the class
            probabilities, ``z1``/``a1`` the hidden pre-activations/activations
            needed by ``backward``. ``loss`` is the mean cross-entropy and
            ``y_onehot`` the one-hot labels; both are ``None`` when ``y`` is
            ``None`` (inference only).
        """
        # Input -> hidden affine map (Eq. 2.1)
        z1 = np.dot(x, self.w1) + self.b1

        # ReLU activation (Eq. 2.2)
        a1 = np.maximum(0, z1)

        # Hidden -> output affine map (Eq. 2.3)
        z2 = np.dot(a1, self.w2) + self.b2

        # Softmax turns the scores into probabilities (Eq. 2.4)
        y_hat = self.softmax(z2)

        # Without labels there is nothing to score: return the activations only.
        if y is None:
            return None, y_hat, z1, a1, None

        # One-hot encode the labels
        y_onehot = np.zeros_like(y_hat)
        y_onehot[np.arange(len(y)), y] = 1

        # Mean cross-entropy (Eq. 2.5); the 1e-9 offset avoids log(0)
        loss = -np.sum(y_onehot * np.log(y_hat + 1e-9)) / len(y)
        return loss, y_hat, z1, a1, y_onehot

    def backward(self, x, y_onehot, y_hat, z1, a1):
        """Back-propagate the cross-entropy loss and apply one gradient-descent step.

        Args:
            x: The inputs that were passed to ``forward``.
            y_onehot, y_hat, z1, a1: The corresponding values returned by ``forward``.

        The parameters ``w1``, ``b1``, ``w2``, ``b2`` are modified in place;
        nothing is returned.
        """
        # Output-layer gradient (Eq. 2.6): dL/dZ2 = (Y_hat - Y) / N
        dz2 = (y_hat - y_onehot) / len(x)

        # dL/dW2 = A1^T . dL/dZ2
        dw2 = np.dot(a1.T, dz2)

        # dL/db2 = sum of dL/dZ2 over the batch
        db2 = np.sum(dz2, axis=0)

        # Hidden-layer gradient (Eq. 2.9): dL/dZ1 = (dL/dZ2 . W2^T) * ReLU'(Z1)
        da1 = np.dot(dz2, self.w2.T)
        relu_grad = (z1 > 0).astype(float)
        dz1 = da1 * relu_grad

        # dL/dW1 = X^T . dL/dZ1
        dw1 = np.dot(x.T, dz1)

        # dL/db1 = sum of dL/dZ1 over the batch
        db1 = np.sum(dz1, axis=0)

        # Gradient-descent update
        self.w2 -= self.lr * dw2
        self.b2 -= self.lr * db2
        self.w1 -= self.lr * dw1
        self.b1 -= self.lr * db1

    def predict(self, x):
        """Return ``(probabilities, predicted_classes)`` for a 2-D batch ``x``.

        ``predicted_classes`` is the argmax of each probability row.
        """
        # Reuse the forward pass (without labels) instead of duplicating it.
        _, y_hat, _, _, _ = self.forward(x)
        return y_hat, np.argmax(y_hat, axis=1)
    
# Build two identically configured MLPs so SGD and GD can be compared.
# NumPy's global generator is re-seeded before each construction: this makes the
# run reproducible and starts both models from the same initial weights, so any
# difference in the loss curves comes from the update rule, not the initialization.
# The seed value itself is arbitrary; choose another one if this particular
# initialization fails to converge.
SEED = 42

np.random.seed(SEED)
model_sgd = MLP(input_size=x_data.shape[1], hidden_size=4, output_size=2, learning_rate=0.1)
np.random.seed(SEED)
model_gd = MLP(input_size=x_data.shape[1], hidden_size=4, output_size=2, learning_rate=0.1)

In [3]:
def train(model_sgd, model_gd, x_data, y_data, epochs=10000):
    """Train two models side by side, one with SGD and one with full-batch GD.

    In every epoch ``model_sgd`` is updated on a single randomly drawn sample
    and ``model_gd`` is updated on the whole data set.

    Args:
        model_sgd: Model updated with one random sample per epoch.
        model_gd: Model updated with the full batch per epoch.
        x_data: 2-D array of inputs, one sample per row.
        y_data: 1-D array of integer labels.
        epochs: Number of update steps applied to each model.

    Returns:
        ``(sgd_losses, gd_losses)``: per-epoch loss lists. The SGD entry is the
        loss of the single sampled example; the GD entry is the full-batch loss.
    """

    def run_step(model, inputs, labels):
        # One forward pass followed by one parameter update; returns the pre-update loss.
        step_loss, probs, hidden_pre, hidden_act, targets = model.forward(inputs, labels)
        model.backward(inputs, targets, probs, hidden_pre, hidden_act)
        return step_loss

    sgd_losses, gd_losses = [], []
    n_samples = len(x_data)

    for epoch in range(epochs):
        # SGD: pick one sample at random and present it as a batch of size 1
        pick = np.random.randint(0, n_samples)
        loss = run_step(
            model_sgd,
            x_data[pick].reshape(1, -1),
            np.array([y_data[pick]]),
        )
        sgd_losses.append(loss)

        # GD: one update computed from the entire data set
        loss_gd = run_step(model_gd, x_data, y_data)
        gd_losses.append(loss_gd)

        # Progress report every 1000 epochs
        if (epoch + 1) % 1000 == 0:
            print(f"Epoch {epoch + 1}, SGD Loss: {loss:.6f}, GD Loss: {loss_gd:.6f}")

    return sgd_losses, gd_losses

# Train both models and collect their per-epoch SGD and GD losses
sgd_losses, gd_losses = train(
    model_sgd=model_sgd,
    model_gd=model_gd,
    x_data=x_data,
    y_data=y_data,
    epochs=10000,
)

Epoch 1000, SGD Loss: 0.011743, GD Loss: 0.007088
Epoch 2000, SGD Loss: 0.017702, GD Loss: 0.002655
Epoch 3000, SGD Loss: 0.011249, GD Loss: 0.001577
Epoch 4000, SGD Loss: 0.001909, GD Loss: 0.001099
Epoch 5000, SGD Loss: 0.000714, GD Loss: 0.000834
Epoch 6000, SGD Loss: 0.001088, GD Loss: 0.000667
Epoch 7000, SGD Loss: 0.004398, GD Loss: 0.000553
Epoch 8000, SGD Loss: 0.000422, GD Loss: 0.000471
Epoch 9000, SGD Loss: 0.000673, GD Loss: 0.000409
Epoch 10000, SGD Loss: 0.000293, GD Loss: 0.000361


# 預測結果

In [4]:
# Print every XOR input with its predicted class probabilities and argmax class,
# first for the SGD-trained model and then for the GD-trained one.
# A single loop over (label, model) pairs replaces two copy-pasted loops.
for label, model in (("SGD", model_sgd), ("GD", model_gd)):
    print(f"\nPrediction Results ({label}):")
    for i in range(len(x_data)):
        # predict works on 2-D batches, so present the sample as one row
        x = x_data[i].reshape(1, -1)
        y_hat, pred = model.predict(x)
        print(f"Input: {x_data[i]}, Output: {y_hat[0]}, Predicted Class: {pred[0]}")


Prediction Results (SGD):
Input: [0 0], Output: [9.99381620e-01 6.18379867e-04], Predicted Class: 0
Input: [0 1], Output: [0.00294014 0.99705986], Predicted Class: 1
Input: [1 0], Output: [2.90835325e-04 9.99709165e-01], Predicted Class: 1
Input: [1 1], Output: [9.99517405e-01 4.82595152e-04], Predicted Class: 0

Prediction Results (GD):
Input: [0 0], Output: [9.99530671e-01 4.69328505e-04], Predicted Class: 0
Input: [0 1], Output: [2.66116355e-04 9.99733884e-01], Predicted Class: 1
Input: [1 0], Output: [3.8168974e-04 9.9961831e-01], Predicted Class: 1
Input: [1 1], Output: [9.99675409e-01 3.24590633e-04], Predicted Class: 0
