In [17]:
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [18]:
# Load MNIST data
mnist = fetch_openml('mnist_784', version=1)
X, y = mnist["data"], mnist["target"]

# Normalize the data
X /= 255.0

  warn(


In [20]:
# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

# One-hot encode the labels
encoder = OneHotEncoder(sparse=False)
y_train_encoded = encoder.fit_transform(y_train.to_numpy().reshape(-1, 1))
y_test_encoded = encoder.transform(y_test.to_numpy().reshape(-1, 1))




In [21]:
def relu(Z):
    return np.maximum(0, Z)

def softmax(Z):
    expZ = np.exp(Z - np.max(Z, axis=0, keepdims=True))
    return expZ / np.sum(expZ, axis=0, keepdims=True)

# Derivative of ReLU
def relu_derivative(Z):
    return Z > 0

In [22]:
def initialize_parameters(input_size, hidden_size, output_size):
    np.random.seed(42)
    W1 = np.random.randn(hidden_size, input_size) * 0.01
    b1 = np.zeros((hidden_size, 1))
    W2 = np.random.randn(output_size, hidden_size) * 0.01
    b2 = np.zeros((output_size, 1))
    return {"W1": W1, "b1": b1, "W2": W2, "b2": b2}

In [23]:
def forward_propagation(X, parameters):
    W1, b1, W2, b2 = parameters["W1"], parameters["b1"], parameters["W2"], parameters["b2"]
    Z1 = np.dot(W1, X.T) + b1
    A1 = relu(Z1)
    Z2 = np.dot(W2, A1) + b2
    A2 = softmax(Z2)
    return A2, {"Z1": Z1, "A1": A1, "Z2": Z2, "A2": A2}

In [24]:
def backward_propagation(X, Y, cache, parameters):
    m = X.shape[0]
    Z1, A1, Z2, A2 = cache["Z1"], cache["A1"], cache["Z2"], cache["A2"]
    dZ2 = A2 - Y.T
    dW2 = (1/m) * np.dot(dZ2, A1.T)
    db2 = (1/m) * np.sum(dZ2, axis=1, keepdims=True)
    dA1 = np.dot(parameters["W2"].T, dZ2)
    dZ1 = dA1 * relu_derivative(Z1)
    dW1 = (1/m) * np.dot(dZ1, X)
    db1 = (1/m) * np.sum(dZ1, axis=1, keepdims=True)
    return {"dW1": dW1, "db1": db1, "dW2": dW2, "db2": db2}

In [25]:

# Update parameters
def update_parameters(parameters, grads, learning_rate):
    parameters["W1"] -= learning_rate * grads["dW1"]
    parameters["b1"] -= learning_rate * grads["db1"]
    parameters["W2"] -= learning_rate * grads["dW2"]
    parameters["b2"] -= learning_rate * grads["db2"]
    return parameters

In [26]:
# Compute the loss
def compute_loss(A2, Y):
    m = Y.shape[0]
    log_probs = np.multiply(np.log(A2.T), Y) + np.multiply((1 - Y), np.log(1 - A2.T))
    loss = - np.sum(log_probs) / m
    return loss

In [27]:
# Predict function
def predict(X, parameters):
    A2, cache = forward_propagation(X, parameters)
    predictions = np.argmax(A2, axis=0)
    return predictions


In [38]:
def model(X_train, Y_train, X_test, Y_test, hidden_size, epochs, learning_rate):
    input_size = X_train.shape[1]
    output_size = Y_train.shape[1]

    # Initialize parameters
    parameters = initialize_parameters(input_size, hidden_size, output_size)

    # Training loop
    for i in range(epochs):
        # Forward propagation
        A2, cache = forward_propagation(X_train, parameters)

        # Compute loss
        loss = compute_loss(A2, Y_train)

        # Backward propagation
        grads = backward_propagation(X_train, Y_train, cache, parameters)

        # Update parameters
        parameters = update_parameters(parameters, grads, learning_rate)

        # Print the loss every 100 iterations
        if i % 100 == 0:
            print("Loss after iteration %i: %f" % (i, loss))

    # Predictions on training set
    predictions_train = predict(X_train, parameters)
    accuracy_train = np.mean(predictions_train == np.argmax(Y_train, axis=1)) * 100

    # Predictions on test set
    predictions_test = predict(X_test, parameters)
    accuracy_test = np.mean(predictions_test == np.argmax(Y_test, axis=1)) * 100

    print("Train accuracy: {:.2f}%".format(accuracy_train))
    print("Test accuracy: {:.2f}%".format(accuracy_test))

    return parameters

In [39]:
# Set hyperparameters
hidden_size = 64
epochs = 1000
learning_rate = 0.1

In [40]:
# Train the model
parameters = model(X_train, y_train_encoded, X_test, y_test_encoded, hidden_size, epochs, learning_rate)

Loss after iteration 0: 3.251248
Loss after iteration 100: 1.738528
Loss after iteration 200: 0.903747
Loss after iteration 300: 0.705195
Loss after iteration 400: 0.620553
Loss after iteration 500: 0.572867
Loss after iteration 600: 0.540771
Loss after iteration 700: 0.516571
Loss after iteration 800: 0.496885
Loss after iteration 900: 0.479992
Train accuracy: 92.15%
Test accuracy: 91.91%
