In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

In [2]:
# 785 cols bc of labels row - only 28*28 = 784 pixels
data = pd.read_csv('mnist_train.csv')
data_test = pd.read_csv('mnist_test.csv')

data = np.array(data)
data_test = np.array(data_test)

data = data.T
data_test = data_test.T

def preprocess_data(X):
    """Preprocess the input data"""
    # Scale pixels to [0, 1] range
    X = X / 255.0
    return X

In [3]:
x_train = data[1:data[0].size].T # 2D Array: each row (array) is a picture
x_test = data_test[1:data[0].size].T # 2D Array: each row (array) is a picture
x_train = preprocess_data(x_train)
x_test = preprocess_data(x_test)
x_labels = data[0] # Array: each number is the correct label for the corresponding index in x_train
x_test_labels = data_test[0]
print(x_train.shape)
print(x_test_labels)
print(x_test)

(59999, 784)
[7 2 1 ... 4 5 6]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [4]:
# Initialize parameters for every layer
# Weights & biases
def init_params():
    W_1 = np.random.randn(16, 784) * np.sqrt(2.0/784) # Edges from layer 1 to layer 2
    B_1 = np.zeros(16) # Biases for layer 2
    W_2 = np.random.randn(10, 16) * np.sqrt(2.0/32) # Edges from layer 2 to layer 3
    B_2 = np.zeros(10) # Biases for layer 3
    return W_1, B_1, W_2, B_2

In [5]:
def ReLU(Z):
    return np.maximum(0, Z)

def softmax(Z):
    # Ensure input is at least 1D
    Z = np.atleast_1d(Z)
    
    # Shift values by max for numerical stability
    Z_shifted = Z - np.max(Z, axis=-1, keepdims=True)
    
    # Avoid overflow
    exp_Z = np.exp(np.clip(Z_shifted, -709, 709))  # np.log(np.finfo(np.float64).max) ≈ 709
    
    # Avoid division by zero
    sum_exp_Z = np.sum(exp_Z, axis=-1, keepdims=True)
    sum_exp_Z = np.maximum(sum_exp_Z, np.finfo(float).tiny)
    
    softmax_output = exp_Z / sum_exp_Z
    
    # Ensure probabilities sum to 1
    softmax_output = softmax_output / np.sum(softmax_output, axis=-1, keepdims=True)
    
    return softmax_output

# Cross-entropy cost function
def cross_entropy_loss(y_true, y_pred):
    # y_true is a one-hot encoded vector (e.g., [0, 1, 0, ...])
    # y_pred is the softmax output
    epsilon = 1e-10  # To avoid log(0)
    return -np.sum(y_true * np.log(y_pred + epsilon))  # Sum over classes

# Cost function
def get_cost(Z_2, i):
    error_arr = np.zeros(10)
    error_arr[x_labels[i]] = 1 # Corresponding to one_picture
    cost = cross_entropy_loss(error_arr, Z_2)
    return cost, error_arr


In [6]:
# Forward Propagation
# Z_1: neurons in hidden layer
# Z_2: neurons in output layer
def forward_prop(i, W_1, B_1, W_2, B_2):
    one_picture = x_train[i] # A single sample picture
    
    Z_1 = W_1.dot(one_picture) + B_1
    Z_1 = ReLU(Z_1)
    Z_2 = W_2.dot(Z_1) + B_2
    Z_2 = softmax(Z_2)
    return Z_1, Z_2

In [7]:
# Backpropagation
def back_prop(Z_1, Z_2, W_2, i):
    one_picture = x_train[i]
    error_arr = get_cost(Z_2, i)[1]
    
    dZ_2 = Z_2 - error_arr
    dW_2 = np.outer(dZ_2, Z_1)
    dB_2 = dZ_2
    
    dZ_1 = W_2.T.dot(dZ_2)
    dZ_1 *= (Z_1 > 0)
    dW_1 = np.outer(dZ_1, one_picture)
    dB_1 = dZ_1
    
    return dW_1, dB_1, dZ_1, dW_2, dB_2, dZ_2

In [8]:
# Update parameters
def update_params(W_2, B_2, W_1, B_1, dW_2, dB_2, dW_1, dB_1, learning_rate):
    W_2 -= dW_2 * learning_rate
    B_2 -= dB_2 * learning_rate
    W_1 -= dW_1 * learning_rate
    B_1 -= dB_1 * learning_rate
    return W_1, B_1, W_2, B_2

In [9]:
def train(epochs, learning_rate):
    W_1, B_1, W_2, B_2 = init_params()
    leng = x_train.shape[0]
    
    for i in range(epochs):
        #np.random.shuffle(x_train)

        #learning_rate *= 0.9 # Varying learning rate    
        
        iteration = 0
        for j in range(leng):
            iteration = j
            Z_1, Z_2 = forward_prop(j, W_1, B_1, W_2, B_2)
            dW_1, dB_1, dZ_1, dW_2, dB_2, dZ_2 = back_prop(Z_1, Z_2, W_2, j)
            W_1, B_1, W_2, B_2 = update_params(W_2, B_2, W_1, B_1, dW_2, dB_2, dW_1, dB_1, learning_rate)    
        if i % 1 == 0:
            print(f"Epoch {i}: cost = {get_cost(Z_2, iteration)[0]}, LR = {learning_rate}")
    return W_1, B_1, W_2, B_2

In [10]:
def predict(X, W_1, B_1, W_2, B_2):
    """
    Make predictions for input images
    
    Parameters:
    X: numpy array of shape (n_samples, 784) - input images
    Returns: tuple (predictions, probabilities)
    - predictions: predicted digit for each image
    - probabilities: softmax probabilities for each digit
    """
    # Ensure input is properly scaled
    X = X / 255.0 if X.max() > 1 else X
    
    # Forward pass
    Z_1 = X.dot(W_1.T) + B_1
    A_1 = np.maximum(0, Z_1)  # ReLU
    Z_2 = A_1.dot(W_2.T) + B_2
    probabilities = softmax(Z_2)

    """
    one_picture = x_train[i] # A single sample picture
    
    Z_1 = W_1.dot(one_picture) + B_1
    Z_1 = ReLU(Z_1)
    Z_2 = W_2.dot(Z_1) + B_2
    Z_2 = softmax(Z_2)
    return Z_1, Z_2
    """
    
    
    # Get predicted digit (class with highest probability)
    predictions = np.argmax(probabilities, axis=1)
    
    return predictions, probabilities

        

In [13]:
W_1, B_1, W_2, B_2 = train(500, 0.0001)

Epoch 0: cost = 0.6962699358761829, LR = 0.0001
Epoch 1: cost = 0.35793597831221247, LR = 0.0001
Epoch 2: cost = 0.2715497263225102, LR = 0.0001
Epoch 3: cost = 0.23961916963551874, LR = 0.0001
Epoch 4: cost = 0.22322153759416474, LR = 0.0001
Epoch 5: cost = 0.21222355425967618, LR = 0.0001
Epoch 6: cost = 0.20288310933189135, LR = 0.0001
Epoch 7: cost = 0.19498841053640523, LR = 0.0001
Epoch 8: cost = 0.18834326267499577, LR = 0.0001
Epoch 9: cost = 0.18766957834494585, LR = 0.0001
Epoch 10: cost = 0.18645251461354773, LR = 0.0001
Epoch 11: cost = 0.18366244296723616, LR = 0.0001
Epoch 12: cost = 0.1812964138592144, LR = 0.0001
Epoch 13: cost = 0.1784986440622924, LR = 0.0001
Epoch 14: cost = 0.1759904243894083, LR = 0.0001
Epoch 15: cost = 0.17386795129564697, LR = 0.0001
Epoch 16: cost = 0.1721453497788862, LR = 0.0001
Epoch 17: cost = 0.17142955942434404, LR = 0.0001
Epoch 18: cost = 0.17060594364354195, LR = 0.0001
Epoch 19: cost = 0.16883235456903195, LR = 0.0001
Epoch 20: cost =

In [15]:
sample_size = 10000
X_sample = x_test[:sample_size]
X_sample = x_test[:sample_size]
predictions, probabilities = predict(X_sample, W_1, B_1, W_2, B_2)

correct = 0
for i in range(sample_size):
    correct += (predictions[i] == x_test_labels[i])

print(f"{correct/sample_size * 100}% correct" )
print(f"{correct}/{sample_size} correct" )

95.34% correct
9534/10000 correct
