In [1]:
import pandas as pd
import numpy as np

mnist_train = pd.read_csv("/kaggle/input/digit-recognizer/train.csv")
mnist_test = pd.read_csv("/kaggle/input/digit-recognizer/test.csv")
mnist_train

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
41996,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
41997,7,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
41998,6,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [2]:
y_train = mnist_train['label']
X_train = mnist_train.iloc[: ,1:].T
X_test = mnist_test.T

# X_train = X_train / 255.
# X_test = X_test / 255.

print(f"X_train => {X_train.shape}")
print(f"y_train => {y_train.shape}")
print(f"X_test => {X_test.shape}")

X_train => (784, 42000)
y_train => (42000,)
X_test => (784, 28000)


### Preprocessing the labels

In [3]:
def one_hot_encode(y):
    m = y.shape[0]
    one_hot_y = np.zeros((10, m))
    one_hot_y[y, np.arange(m)] = 1
    return one_hot_y

y_train = one_hot_encode(y_train)
print(f"y_train => {y_train.shape}")

y_train => (10, 42000)


### The general methodology to build a Neural Network is to:

1. Define the neural network structure ( # of input units,  # of hidden units, etc). 
2. Initialize the model's parameters
3. Loop:
    - Implement forward propagation
    - Compute loss
    - Implement backward propagation to get the gradients
    - Update parameters (gradient descent)

## 1. Define the architecture

In [4]:
# Define the architecture
def layer_sizes(X, y):
    n_x = X.shape[0]
    n_h = 256
    n_y = y.shape[0]
    
    return (n_x, n_h, n_y)

(n_x, n_h, n_y) = layer_sizes(X_train, y_train)
print("The size of the input layer is: n_x = " + str(n_x))
print("The size of the hidden layer is: n_h = " + str(n_h))
print("The size of the output layer is: n_y = " + str(n_y))

The size of the input layer is: n_x = 784
The size of the hidden layer is: n_h = 256
The size of the output layer is: n_y = 10


## 2. Initialize the parameters

In [5]:
def initialize_parameters(n_x, n_h, n_y):
    W1 = np.random.randn(n_h, n_x) * 0.01
    b1 = np.zeros((n_h, 1))
    W2 = np.random.randn(n_y, n_h) * 0.01
    b2 = np.zeros((n_y, 1))

    parameters = {"W1": W1,
                  "b1": b1,
                  "W2": W2,
                  "b2": b2}
    return parameters

parameters = initialize_parameters(n_x, n_h, n_y)

print(f"W1 => {parameters['W1'].shape}")
print(f"b1 => {parameters['b1'].shape}")
print(f"W2 => {parameters['W2'].shape}")
print(f"b2 => {parameters['b2'].shape}")

W1 => (256, 784)
b1 => (256, 1)
W2 => (10, 256)
b2 => (10, 1)


## 3.1 Implementing Forward Propagation

### Let's make some activation functions

In [6]:
def softmax(z):
    z -= np.max(z, axis=0, keepdims=True)
    return np.exp(z) / np.sum(np.exp(z), axis=0, keepdims=True)

In [7]:
def ReLU(z):
    return np.maximum(0, z)

def ReLU_derivative(z):
    return z>0

In [8]:
def forward_propagation(X, parameters):
    Z1 = np.dot(parameters['W1'], X) + parameters["b1"]
    A1 = ReLU(Z1)
    Z2 = np.dot(parameters['W2'], A1) + parameters["b2"]
    A2 = softmax(Z2)

    forward_cache = {
        "Z1": Z1,
        "A1": A1,
        "Z2": Z2,
        "A2": A2,
    }
    return A2, forward_cache

A2, forward_cache = forward_propagation(X_train, parameters)
print(f"A2 => {A2.shape}")
print("A2 = "+ str(A2))
#print(f"Forward cache => {forward_cache}")

A2 => (10, 42000)
A2 = [[2.22968114e-01 8.34034175e-05 6.73912585e-01 ... 4.77843514e-04
  1.91380985e-01 1.15331344e-02]
 [8.75785964e-02 8.40423378e-05 5.63727031e-02 ... 3.57395044e-03
  1.20392431e-03 5.42323815e-04]
 [6.27506670e-03 8.59737677e-05 6.37749460e-03 ... 2.91630582e-04
  9.85413839e-04 1.74608277e-04]
 ...
 [3.31848258e-04 8.02627934e-05 3.56347748e-03 ... 1.03289885e-01
  1.78582233e-04 7.52145225e-03]
 [1.12180869e-01 4.15955782e-02 7.03717499e-03 ... 1.65111307e-01
  7.10436831e-01 7.85061878e-03]
 [1.91194366e-01 5.90898131e-05 1.56367147e-01 ... 9.50711718e-04
  1.42265499e-02 9.78919013e-02]]


## Make Predictions

In [9]:
def predict(X, parameters):
    A2, forward_cache = forward_propagation(X, parameters)
    y_pred = np.argmax(A2, axis=0)
    return y_pred

y_pred = predict(X_train, parameters)
y_pred

array([3, 6, 0, ..., 6, 8, 6])

## 3.2 Computing Cost and Accuracy

In [10]:
def compute_cost(A2, y):
    m = y.shape[1]
    epsilon = 1e-8
    cost = -np.sum(y * np.log(A2 + epsilon)) / m
    return np.squeeze(cost)

def compute_accuracy(y_pred, y_true):
    labels = np.argmax(y_true, axis=0)
    return np.mean(y_pred == labels) * 100

compute_accuracy(y_pred, y_train)

11.571428571428571

## 3.3 Implementing Backward Propagation

In [11]:
def backward_propagation(parameters, forward_cache, X, Y):
    W1 = parameters["W1"]
    W2 = parameters["W2"]

    Z1 = forward_cache["Z1"]
    A1 = forward_cache["A1"]
    A2 = forward_cache["A2"]

    m = X.shape[1]

    dZ2 = A2 - Y
    dW2 = np.dot(dZ2, A1.T) / m
    db2 = np.sum(dZ2, axis=1, keepdims=True) / m
    dZ1 = np.dot(W2.T, dZ2) * ReLU_derivative(Z1)
    dW1 = np.dot(dZ1, X.T) / m
    db1 = np.sum(dZ1, axis=1, keepdims=True)

    grads = {
        "dW2": dW2,
        "db2": db2,
        "dW1": dW1,
        "db1": db1
    }
    return grads

grads = backward_propagation(parameters, forward_cache, X_train, y_train)
print ("dW2 => ", grads["dW2"].shape)
print ("db2 => ", grads["db2"].shape)
print ("dW1 => ", grads["dW1"].shape)
print ("db1 => ", grads["db1"].shape)

dW2 =>  (10, 256)
db2 =>  (10, 1)
dW1 =>  (256, 784)
db1 =>  (256, 1)


## 3.4 Updating Parameters

In [12]:
def update_parameters(parameters, grads, learning_rate=0.1):
    parameters["W1"] -= learning_rate * grads["dW1"]
    parameters["b1"] -= learning_rate * grads["db1"]
    parameters["W2"] -= learning_rate * grads["dW2"]
    parameters["b2"] -= learning_rate * grads["db2"]
    return parameters

parameters = update_parameters(parameters, grads)

### Putting all pieces together

In [13]:
def nn_model(X, y, n_h, iterations=30, learning_rate=0.1):
    np.random.seed(42)
    
    n_x = layer_sizes(X, y)[0]
    n_y = layer_sizes(X, y)[2]

    parameters = initialize_parameters(n_x, n_h, n_y)
    for i in range(iterations):
        # Forward Propagation
        A2, forward_cache = forward_propagation(X, parameters)
        # Calculate Predictions
        y_pred = predict(X_train, parameters)
        # Calculate Accuracy
        accuracy = compute_accuracy(y_pred, y_train)
        # Calculate cost
        cost = compute_cost(A2, y)
        # Backward Propagation
        grads = backward_propagation(parameters, forward_cache, X, y)
        # Gradient Descent
        parameters = update_parameters(parameters, grads, learning_rate=learning_rate)
        if (i%100 == 0):
            print(f"Epoch {i}/{iterations}")
            print(f" categorical cross entropy: {cost} - accuracy = {accuracy}")
    print(f"Final Cost: {cost} - Training Accuracy: {accuracy}")
    return parameters

final_parameters = nn_model(X_train, y_train, 256, iterations=1200, learning_rate=0.0024)

Epoch 0/1200
 categorical cross entropy: 5.99908469474703 - accuracy = 7.869047619047619
Epoch 100/1200
 categorical cross entropy: 0.23490757250121164 - accuracy = 93.0904761904762
Epoch 200/1200
 categorical cross entropy: 0.1692931394593458 - accuracy = 95.03809523809524
Epoch 300/1200
 categorical cross entropy: 0.1343557450724123 - accuracy = 96.1
Epoch 400/1200
 categorical cross entropy: 0.11171615020557729 - accuracy = 96.76428571428572
Epoch 500/1200
 categorical cross entropy: 0.09552632392320437 - accuracy = 97.26190476190476
Epoch 600/1200
 categorical cross entropy: 0.08313926402394349 - accuracy = 97.69285714285715
Epoch 700/1200
 categorical cross entropy: 0.07321103082773882 - accuracy = 98.03333333333333
Epoch 800/1200
 categorical cross entropy: 0.06508368982907115 - accuracy = 98.28095238095239
Epoch 900/1200
 categorical cross entropy: 0.05826778054300678 - accuracy = 98.52142857142857
Epoch 1000/1200
 categorical cross entropy: 0.05245900573476295 - accuracy = 98.7

## Final Submission

In [14]:
y_pred_test = predict(X_test, final_parameters)
print("X_test shape:", X_test.shape)
print("y_pred_test shape:", y_pred_test.shape)

X_test shape: (784, 28000)
y_pred_test shape: (28000,)


In [15]:
submission = pd.DataFrame({"ImageId": range(1, (X_test.shape[1])+1), "Label": y_pred_test})
submission.to_csv("aak_mnist_from_scratch", index=False)