In [2]:
import numpy as np
from torchvision.datasets import MNIST

def download_mnist(is_train: bool):
    dataset = MNIST(root='./data', transform=lambda x: np.array(x).flatten(), download=True, train=is_train)

    mnist_data = []
    mnist_labels = []
    for image, label in dataset:
        mnist_data.append(image)
        mnist_labels.append(label)

    return mnist_data, mnist_labels

train_X, train_Y = download_mnist(True)
test_X, test_Y = download_mnist(False)

In [3]:
def normalize(data):
    return np.array(data) / 255

def one_hot_encoding(labels, num_classes):
    return np.eye(num_classes)[labels]

train_X = normalize(train_X)
test_X = normalize(test_X)
train_Y = one_hot_encoding(train_Y, 10)
test_Y = one_hot_encoding(test_Y, 10)

In [4]:
def split_data(X, y, ratio = 0.2):
    split_index = int(X.shape[0] * (1 - ratio))
    X_train, X_val = X[:split_index], X[split_index:]
    y_train, y_val = y[:split_index], y[split_index:]
    return X_train, y_train, X_val, y_val

In [5]:
def softmax(x):
    return np.exp(x) / np.sum(np.exp(x), axis=1, keepdims=True)

In [6]:
def entropyLoss(yM, y):
    m = y.shape[0]
    result = -np.sum(y * np.log(yM)) / m
    return result

In [7]:
def initialize_params(input_size = 784, hidden_size = 100, output_size = 10):
    np.random.seed(42)
    w1 = np.random.randn(input_size, hidden_size) * np.sqrt(2.0 / input_size)
    b1 = np.zeros((1, hidden_size))
    w2 = np.random.randn(hidden_size, output_size) * np.sqrt(2.0 / hidden_size)
    b2 = np.zeros((1, output_size))
    return w1, b1, w2, b2

In [8]:
def relu(x):
    return np.maximum(0, x)

def relu_derivative(x):
    return (x > 0).astype(float)

In [9]:
def forward_propagation(X, w1, b1, w2, b2):
    z1 = X.dot(w1) + b1
    a1 = relu(z1)
    z2 = a1.dot(w2) + b2
    a2 = softmax(z2)
    return z1, a1, z2, a2

In [10]:
def back_propagation(X, y, z1, a1, a2, w2):
    m = y.shape[0]
    delta2 = a2 - y
    dw2 = (a1.T).dot(delta2) / m
    db2 = np.sum(delta2, axis=0) / m
    delta1 = delta2.dot(w2.T) * relu_derivative(a1)
    dw1 = np.dot(X.T, delta1) / m
    db1 = np.sum(delta1, axis=0) / m
    return dw1, db1, dw2, db2

In [11]:
def update_params(w1, b1, w2, b2, dw1, db1, dw2, db2, learning_rate = 0.01, lambda_reg = 0.001):
    w1 -= learning_rate * (dw1 + lambda_reg * w1)
    b1 -= learning_rate * db1
    w2 -= learning_rate * (dw2 + lambda_reg * w2)
    b2 -= learning_rate * db2
    return w1, b1, w2, b2

In [12]:
def compute_accuracy(X, y, w1, b1, w2, b2):
    _, _, _, a2 = forward_propagation(X, w1, b1, w2, b2)
    predictions = np.argmax(a2, axis=1)
    labels = np.argmax(y, axis=1)
    accuracy = np.mean(predictions == labels)
    return accuracy

In [13]:
def train(X, y, val_X, val_y, w1, w2, b1, b2, epochs = 10, batch_size = 64, learning_rate = 0.01, lambda_reg = 0.001):
    for epoch in range(epochs):
        ind = np.arange(X.shape[0])
        np.random.shuffle(ind)
        X = X[ind]
        y = y[ind]

        for i in range(0, X.shape[0], batch_size):
            X_batch = X[i:i + batch_size]
            y_batch = y[i:i + batch_size]

            z1, a1, z2, a2 = forward_propagation(X_batch, w1, b1, w2, b2)
            dw1, db1, dw2, db2 = back_propagation(X_batch, y_batch, z1, a1, a2, w2)
            w1, b1, w2, b2 = update_params(w1, b1, w2, b2, dw1, db1, dw2, db2, learning_rate, lambda_reg)

    train_accuracy = compute_accuracy(X, y, w1, b1, w2, b2)
    val_accuracy = compute_accuracy(val_X, val_y, w1, b1, w2, b2)
    print(f"Epoch {epoch + 1}: Train accuracy = {train_accuracy * 100:.2f}%, Validation accuracy = {val_accuracy * 100:.2f}%")

    return w1, b1, w2, b2

In [14]:
X_train, y_train, val_X, val_y = split_data(train_X, train_Y)
w1, b1, w2, b2 = initialize_params()

w1, b1, w2, b2 = train(train_X, train_Y, val_X, val_y, w1, w2, b1, b2, epochs = 50, batch_size = 64, learning_rate = 0.01, lambda_reg = 0.001)

print("Test accuracy:", compute_accuracy(test_X, test_Y, w1, b1, w2, b2) * 100)

Epoch 50: Train accuracy = 97.11%, Validation accuracy = 97.33%
Test accuracy: 96.52
