## Handwritten digit classifier
Implementing a neural network from scratch to classify images of handwritten numbers as their corresponding digits.

### Installs and Imports
Remember to manually upload `kaggle.json` before running these cells so that the Kaggle API can be used.

In [1]:
# Install the Kaggle CLI and copy the uploaded kaggle.json API token into ~/.kaggle
!pip install kaggle -q
# -p: do not fail if the directory already exists (e.g. when this cell is re-run)
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
# the file holds API credentials, so restrict it to owner read/write
!chmod 600 ~/.kaggle/kaggle.json

In [2]:
# Download the archive of the "digit-recognizer" competition with the Kaggle CLI
!kaggle competitions download -c digit-recognizer

Downloading digit-recognizer.zip to /content
100% 15.3M/15.3M [00:00<00:00, 51.0MB/s]
100% 15.3M/15.3M [00:00<00:00, 49.1MB/s]


In [None]:
# open zip file without downloading manually
from zipfile import ZipFile
file_name = "digit-recognizer.zip"
# The archive handle is deliberately NOT called `zip`: at notebook top level
# that would rebind the built-in zip() for every cell executed afterwards.
with ZipFile(file_name, 'r') as archive:
    archive.extractall()  # extracts into the current working directory
    print('Done')

In [9]:
# Peek at the first three lines of train.csv (header included):
# the first column is the digit label, the remaining columns are pixels
with open('train.csv', 'r') as f:
    for row_number in range(3):
        digit, *pixels = f.readline().split(',')
        print(f'Digit: {digit}')
        print(pixels)

Digit: label
['pixel0', 'pixel1', 'pixel2', 'pixel3', 'pixel4', 'pixel5', 'pixel6', 'pixel7', 'pixel8', 'pixel9', 'pixel10', 'pixel11', 'pixel12', 'pixel13', 'pixel14', 'pixel15', 'pixel16', 'pixel17', 'pixel18', 'pixel19', 'pixel20', 'pixel21', 'pixel22', 'pixel23', 'pixel24', 'pixel25', 'pixel26', 'pixel27', 'pixel28', 'pixel29', 'pixel30', 'pixel31', 'pixel32', 'pixel33', 'pixel34', 'pixel35', 'pixel36', 'pixel37', 'pixel38', 'pixel39', 'pixel40', 'pixel41', 'pixel42', 'pixel43', 'pixel44', 'pixel45', 'pixel46', 'pixel47', 'pixel48', 'pixel49', 'pixel50', 'pixel51', 'pixel52', 'pixel53', 'pixel54', 'pixel55', 'pixel56', 'pixel57', 'pixel58', 'pixel59', 'pixel60', 'pixel61', 'pixel62', 'pixel63', 'pixel64', 'pixel65', 'pixel66', 'pixel67', 'pixel68', 'pixel69', 'pixel70', 'pixel71', 'pixel72', 'pixel73', 'pixel74', 'pixel75', 'pixel76', 'pixel77', 'pixel78', 'pixel79', 'pixel80', 'pixel81', 'pixel82', 'pixel83', 'pixel84', 'pixel85', 'pixel86', 'pixel87', 'pixel88', 'pixel89', 'pixel

In [10]:
import numpy as np
import random

## Create Neural Network
First, we need to initialize the parameters of each neuron in its respective layer: the weights are drawn randomly and the biases start at zero. Based on these parameters, each neuron can be written as a function that accepts the output of the previous layer, computes weights * input + bias, and passes the result through an activation function. For the activation function, we'll use a sigmoid as the non-linearity applied to the output of each neuron.

In [18]:
# initialize w&b for every layer, where layer_dims lists the number of neurons per layer
def init_params(layer_dims, seed=3, scale=0.01):
    """
    Create Gaussian-random weights and zero biases for a fully connected network.

    layer_dims = list of layer sizes, input layer first (e.g. [784, 128, 64, 10]);
                 any number of layers is supported.
    seed = value handed to np.random.seed so that runs are reproducible;
           pass None to leave NumPy's global random state untouched.
    scale = factor applied to the standard-normal weights. Pass None to use
            1/sqrt(fan_in) for each layer (Xavier-style scaling) instead of
            one fixed factor.

    Returns:
    params = dictionary with 'W<l>' of shape (layer_dims[l], layer_dims[l-1])
             and 'b<l>' of shape (layer_dims[l], 1) for l = 1 .. len(layer_dims)-1.
    """
    if seed is not None:
        np.random.seed(seed)

    params = {}
    for l in range(1, len(layer_dims)):
        fan_in, fan_out = layer_dims[l - 1], layer_dims[l]
        factor = scale if scale is not None else 1.0 / np.sqrt(fan_in)
        params['W' + str(l)] = np.random.randn(fan_out, fan_in) * factor
        params['b' + str(l)] = np.zeros((fan_out, 1))

    return params

# non-linear activation function that determines a neuron's output
def sigmoid(Z):
    """
    Element-wise logistic sigmoid, 1 / (1 + exp(-Z)).

    Evaluated as exp(-log(1 + exp(-Z))) through np.logaddexp, so large
    negative inputs no longer overflow inside exp (the direct formula emits
    a RuntimeWarning there and only works because 1/inf == 0).

    Z = scalar or array of pre-activation values.

    Returns:
    values strictly between 0 and 1 mathematically (saturating to 0.0 / 1.0
    in floating point for extreme inputs), with the same shape as Z.
    """
    return np.exp(-np.logaddexp(0, np.negative(Z)))

# Rectified linear unit: element-wise maximum of the input and zero
def relu(x):
    """Return x with every negative entry replaced by 0."""
    floor = 0
    return np.maximum(floor, x)

def softmax(x):
    """
    Softmax along axis 0: exponentiate and normalise so that the entries
    along the first axis sum to 1.
    """
    # subtracting the per-column maximum does not change the result
    # but keeps exp() from overflowing
    shifted = x - np.max(x, axis=0, keepdims=True)
    numerators = np.exp(shifted)
    denominators = np.sum(numerators, axis=0, keepdims=True)
    return numerators / denominators

Next, we need to define the forward pass function, which should accept input data (the pixels of an image) and propagate it through the network. For each layer, we compute weight matrix * activation of the previous layer + bias, and store each layer's intermediate values for later use in backpropagation.

In [56]:
def forward(X, params):
    """
    Run the forward pass through every layer of the network.

    X = input data
    params = dictionary of weights ('W1', 'W2', ...) and biases ('b1', 'b2', ...)

    Returns:
    A = activations of the last layer
    caches = dictionary with the input activations of every layer
             ('A0', 'A1', ...) and their linear outputs ('Z1', 'Z2', ...),
             kept for the backward pass
    """
    num_layers = len(params) // 2  # each layer contributes one W and one b
    caches = {'A0': X}  # the raw input plays the role of "layer 0" activations
    activation = X

    # propagate forward, one layer at a time
    for idx in range(1, num_layers + 1):
        weights = params[f'W{idx}']
        bias = params[f'b{idx}']

        # remember what went into this layer, then its linear transformation
        caches[f'A{idx - 1}'] = activation
        pre_activation = np.dot(weights, activation) + bias
        caches[f'Z{idx}'] = pre_activation

        # hidden layers use sigmoid; softmax is applied at the final layer
        # only, since the network performs classification
        if idx == num_layers:
            activation = softmax(pre_activation)
        else:
            activation = sigmoid(pre_activation)

    return activation, caches

## Backpropagation
Now that we've defined the network and the forward pass, we need to enable learning. Based on the cached linear transformations and activation values, we compute gradients to determine how to adjust the weights and biases so that the cost function decreases. For the cost, training uses cross-entropy on the softmax output (a mean squared error helper is also defined below), and we optimize using gradient descent.

In [33]:
# utils
def mse_cost(A, Y):
    """
    Mean squared error cost: the squared differences between A and Y summed
    over all entries, divided by the number of columns of Y.
    """
    num_samples = Y.shape[1]
    squared_error = np.square(A - Y)
    return 1 / num_samples * np.sum(squared_error)

def cross_entropy_cost(A, Y):
    """
    Cross-entropy cost for classification, averaged over the samples.

    A: Predictions (softmax probabilities).
    Y: True labels (one-hot encoded), one sample per column.
    """
    epsilon = 1e-8  # keeps log() finite when a predicted probability is 0
    num_samples = Y.shape[1]
    log_likelihood = np.sum(Y * np.log(A + epsilon))
    return -1 / num_samples * log_likelihood

def derivative_mse_cost(A, Y):
    """
    Derivative of the MSE cost with respect to the output-layer activations,
    used to nudge the weights and biases in the right direction.

    A = activation of output layer
    Y = labels
    """
    num_samples = Y.shape[1]
    error = A - Y
    return (2 / num_samples) * error

def sigmoid_derivative(Z):
    """
    Slope of the sigmoid activation at Z, computed as s * (1 - s)
    with s = sigmoid(Z).

    Z = pre-activation values (linear combination of weights and inputs).
    """
    s = sigmoid(Z)
    complement = 1 - s
    return s * complement

In [36]:
def backprop(AL, Y, caches, params):
    """
    Backward pass: compute the gradient of the cost for every weight and bias.

    AL = Output of the forward pass (final layer activations).
    Y = True labels.
    caches = Dictionary with forward pass intermediate values (Z, A).
    params = Dictionary of weights and biases.

    Returns:
    grads = Dictionary with 'dW<l>' and 'db<l>' for every layer l.
    """
    num_layers = len(params) // 2
    num_samples = Y.shape[1]
    grads = {}

    def store_gradients(layer, delta):
        # average over the samples: dW = delta . A_prev^T / m, db = row sums of delta / m
        layer_input = caches[f'A{layer - 1}']
        grads[f'dW{layer}'] = np.dot(delta, layer_input.T) / num_samples
        grads[f'db{layer}'] = np.sum(delta, axis=1, keepdims=True) / num_samples

    # output layer: with softmax + cross-entropy the gradient w.r.t. Z is AL - Y
    delta = AL - Y
    store_gradients(num_layers, delta)

    # hidden layers, walking from the last hidden layer back to the first
    for layer in range(num_layers - 1, 0, -1):
        upstream = np.dot(params[f'W{layer + 1}'].T, delta)  # gradient w.r.t. activations
        delta = upstream * sigmoid_derivative(caches[f'Z{layer}'])  # through the sigmoid
        store_gradients(layer, delta)

    return grads

def update_params(params, grads, learning_rate=0.2):
    """
    Apply one plain gradient-descent step to all weights and biases.

    params = Dictionary of weights and biases (modified in place).
    grads = Gradients for weights and biases from backpropagation.
    learning_rate = Step size of the update.

    Returns:
    params = The same dictionary, holding the updated values.
    """
    num_layers = len(params) // 2

    for idx in range(1, num_layers + 1):
        for prefix in ('W', 'b'):
            key = f'{prefix}{idx}'
            params[key] -= learning_rate * grads['d' + key]

    return params

## Cleaning and Training


1.   Init random params for network
2.   Split data into batches
3.   Apply forward pass for an input in a given batch
4.   Compute the cost.
5.   Backprop, computing gradients and updating parameters.



In [26]:
# Parse train.csv into a pixel matrix X and a label vector Y
data = []
labels = []

with open('train.csv', 'r') as f:
    next(f)  # skip the header row
    for line in f:
        label, *pixels = line.strip().split(',')
        labels.append(int(label))              # leading column: the digit
        data.append([int(p) for p in pixels])  # remaining columns: pixel values

# samples become columns -> shape (features x samples); scale 0-255 down to 0-1
X = np.array(data).T / 255.0
Y = np.array(labels)

# One-hot encode labels: one row per class (digits 0-9), one column per sample
Y_one_hot = np.zeros((10, Y.size))
Y_one_hot[Y, np.arange(Y.size)] = 1


In [58]:
def _iter_batches(X, Y, batch_size):
    """Yield (X_batch, Y_batch) pairs; the untouched full data set when batch_size is None."""
    num_samples = Y.shape[1]
    if batch_size is None or batch_size >= num_samples:
        yield X, Y
        return

    # reshuffle on every call so each epoch sees differently composed batches
    order = np.random.permutation(num_samples)
    for start in range(0, num_samples, batch_size):
        cols = order[start:start + batch_size]
        yield X[:, cols], Y[:, cols]


def train(X, Y, params, epochs, learning_rate, batch_size=None, beta=0.9):
    """
    Train the neural network using gradient descent with momentum.

    X = Input data, one sample per column.
    Y = True labels (one-hot encoded), one sample per column.
    params = Dictionary of initial weights and biases (updated in place).
    epochs = Number of passes over the training data.
    learning_rate = Learning rate for gradient descent.
    batch_size = Number of samples per parameter update. None (default) uses
                 the whole data set for every update, i.e. one update per
                 epoch. A smaller value shuffles the samples each epoch and
                 updates once per mini-batch.
    beta = Momentum factor: weight of the previous velocity in the
           exponential moving average of the gradients.

    Returns:
    params = Trained weights and biases.

    Raises:
    ValueError if batch_size is given but smaller than 1.
    """
    if batch_size is not None and batch_size < 1:
        raise ValueError("batch_size must be a positive integer or None")

    # one velocity (moving average of the gradient) per parameter
    velocities = {k: np.zeros_like(v) for k, v in params.items()}

    for epoch in range(epochs):
        cost = 0.0  # sample-weighted running mean of the batch costs
        seen = 0

        for X_batch, Y_batch in _iter_batches(X, Y, batch_size):
            # Forward pass
            AL, caches = forward(X_batch, params)

            # Fold this batch's cost into the running mean; with a single
            # (full) batch this is exactly that batch's cost
            batch_cost = cross_entropy_cost(AL, Y_batch)
            batch_samples = Y_batch.shape[1]
            seen += batch_samples
            cost += (batch_cost - cost) * (batch_samples / seen)

            # Backpropagation
            grads = backprop(AL, Y_batch, caches, params)

            # Update parameters with momentum
            for k in params:
                velocities[k] = beta * velocities[k] + (1 - beta) * grads['d' + k]
                params[k] -= learning_rate * velocities[k]

        # Print cost every 100 epochs
        if epoch % 100 == 0:
            print(f"Epoch {epoch}, Cost: {cost:.4f}")

    return params

In [59]:
# train
layer_dims = [784, 128, 64, 10]  # Input(784 pixels) -> Hidden Layer 1 -> Hidden Layer 2 -> Output

# build the initial weights and biases for the layer sizes above
params = init_params(layer_dims)

# NOTE(review): the cost recorded for this run stays at ~2.30 (about ln 10,
# i.e. the level of guessing among 10 classes) for all 1000 epochs, so the
# network is not learning with these settings -- revisit the weight
# initialisation, learning rate and batching before trusting the predictions.
params = train(X, Y_one_hot, params, epochs=1000, learning_rate=0.1)

Epoch 0, Cost: 2.3031
Epoch 100, Cost: 2.3012
Epoch 200, Cost: 2.3011
Epoch 300, Cost: 2.3011
Epoch 400, Cost: 2.3011
Epoch 500, Cost: 2.3011
Epoch 600, Cost: 2.3011
Epoch 700, Cost: 2.3011
Epoch 800, Cost: 2.3011
Epoch 900, Cost: 2.3010


## Testing

In [60]:
import pandas as pd

def load_test_data(filename='test.csv'):
    """
    Load a CSV of test images into a normalised pixel matrix.

    filename = path of a CSV file with one header row followed by one row of
               comma-separated integer pixel values per image.

    Returns:
    X_test = float array with one column per image (pixels x samples),
             values divided by 255.

    Raises:
    ValueError if a non-blank row contains a value that is not an integer.
    """
    test_data = []
    with open(filename, 'r') as f:
        next(f)  # skip header
        for line in f:
            row = line.strip()
            if not row:
                # blank lines (e.g. a trailing empty line) hold no image and
                # would otherwise crash on int('')
                continue
            test_data.append([int(p) for p in row.split(',')])

    X_test = np.array(test_data).T  # shape: (784, samples)
    return X_test / 255.0  # normalize

def predict(X, params):
    """Return, for every column of X, the index of the class with the highest output."""
    # only the final activations matter here; the caches are for training
    probabilities, _caches = forward(X, params)
    return np.argmax(probabilities, axis=0)

def generate_submission(predictions, filename='submission.csv'):
    """
    Save the predictions as a CSV with columns ImageId and Label, then print
    a preview and the complete CSV content.

    predictions = sequence of predicted labels, in test-set order
    filename = path of the CSV file to write
    """
    image_ids = range(1, len(predictions) + 1)  # ImageId counts from 1
    submission = pd.DataFrame({'ImageId': image_ids, 'Label': predictions})

    # write the file first, then report
    submission.to_csv(filename, index=False)
    print(f"saved to {filename}")

    print("\nfirst few:")
    print(submission.head())

    # echo every row in the same "ImageId,Label" layout as the file
    print("\nfull csv:")
    print("ImageId,Label")
    for image_id, label in enumerate(predictions, start=1):
        print(f"{image_id},{label}")

def test_model(params, test_filename='test.csv', submission_filename='submission.csv'):
    """
    Run the whole test pipeline: load the test images, predict a label for
    each one and write the submission file.

    params = trained weights and biases
    test_filename = CSV file holding the test images
    submission_filename = CSV file the predictions are written to

    Returns:
    the predicted labels, one per test image
    """
    # step 1: data
    print("loading test data...")
    test_inputs = load_test_data(test_filename)
    print(f"test shape: {test_inputs.shape}")

    # step 2: inference
    print("\nmaking predictions...")
    predicted_labels = predict(test_inputs, params)
    print(f"num predictions: {len(predicted_labels)}")

    # step 3: output file
    print("\ngenerating submission...")
    generate_submission(predicted_labels, submission_filename)

    return predicted_labels

# usage: predictions = test_model(params)

In [61]:
# Run the full test pipeline with the trained parameters.
# NOTE(review): every row visible in the recorded output has label 1, which
# matches the flat training cost above -- the model appears to predict one
# class for all inputs.
predictions = test_model(params)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
23001,1
23002,1
23003,1
23004,1
23005,1
23006,1
23007,1
23008,1
23009,1
23010,1
23011,1
23012,1
23013,1
23014,1
23015,1
23016,1
23017,1
23018,1
23019,1
23020,1
23021,1
23022,1
23023,1
23024,1
23025,1
23026,1
23027,1
23028,1
23029,1
23030,1
23031,1
23032,1
23033,1
23034,1
23035,1
23036,1
23037,1
23038,1
23039,1
23040,1
23041,1
23042,1
23043,1
23044,1
23045,1
23046,1
23047,1
23048,1
23049,1
23050,1
23051,1
23052,1
23053,1
23054,1
23055,1
23056,1
23057,1
23058,1
23059,1
23060,1
23061,1
23062,1
23063,1
23064,1
23065,1
23066,1
23067,1
23068,1
23069,1
23070,1
23071,1
23072,1
23073,1
23074,1
23075,1
23076,1
23077,1
23078,1
23079,1
23080,1
23081,1
23082,1
23083,1
23084,1
23085,1
23086,1
23087,1
23088,1
23089,1
23090,1
23091,1
23092,1
23093,1
23094,1
23095,1
23096,1
23097,1
23098,1
23099,1
23100,1
23101,1
23102,1
23103,1
23104,1
23105,1
23106,1
23107,1
23108,1
23109,1
23110,1
23111,1
23112,1
23113,1
23114,1
23115,1
23116,1
23117,1