In [304]:
import numpy as np
import matplotlib as plt
import pandas as pd

In [305]:
data = pd.read_csv("dataset/train.csv")

data.head()

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [306]:
data = np.array(data)
m, n = data.shape
np.random.shuffle(data)
test_size = 6000

data_dev = data[0:test_size].T
Y_test = data_dev[0]
X_test = data_dev[1:].T

data_train = data[test_size:].T
Y_train = data_train[0]
X_train = data_train[1:].T

In [315]:
# len(X_train), len(X_train[0])
X_train.shape

(36000, 784)

In [None]:
import math

learning_rate = 0.01


def init_params():
    # Input to hidden layer
    W1 = np.random.rand(10, 784) * 0.01
    b1 = np.zeros((10, 1))

    # Hidden to output layer
    W2 = np.random.rand(10, 10) * 0.01
    b2 = np.zeros((10, 1))

    return W1, b1, W2, b2


def ReLU(z):
    return np.maximum(0, z)


def ReLU_derivative(Z1):
    return (Z1 > 0).astype(float)


def softmax(z):
    return np.exp(z) / np.sum(np.exp(z))


def forward_prop(W1, b1, W2, b2, X):
    # Input to hidden layer
    Z1 = W1.dot(X).reshape(10, 1) + b1
    # Apply activation function (ReLU)
    A1 = ReLU(Z1)

    Z2 = W2.dot(A1) + b2
    A2 = softmax(Z2)

    return Z1, A1, A2

In [309]:
vec = np.random.rand(10, 1)

np.log(vec)

array([[-2.07261937],
       [-0.3866211 ],
       [-1.00663289],
       [-0.35389358],
       [-0.10850241],
       [-0.09430202],
       [-1.75701781],
       [-1.88116792],
       [-0.36423385],
       [-1.11765724]])

In [310]:
def cross_entropy_loss(A2, Y):
    m = Y.shape[0]
    loss = -np.sum(Y * np.log(A2 + 1e-15)) / m
    return loss


def gradient_W2(A1, A2, Y):
    dZ2 = A2 - Y
    W2_grad = dZ2.dot(A1.T) / m

    return W2_grad, dZ2


def gradient_W1(dZ2, W2, X, Z1, m):
    dZ1 = (W2.T.dot(dZ2)) * ReLU_derivative(Z1)
    W1_grad = dZ1.dot(X.reshape(1, 784)) / m

    return W1_grad, dZ1


def b_grad(dZ, m):
    return np.sum(dZ, axis=1, keepdims=True) / m


def one_hot_encode(label, num_classes=10):
    one_hot = np.zeros((num_classes, 1))
    one_hot[label] = 1
    return one_hot


def back_prop(W1, b1, W2, b2, X, Y):
    Z1, A1, A2 = forward_prop(W1, b1, W2, b2, X)

    m = 1

    W2_grad, dZ2 = gradient_W2(A1, A2, Y)
    b2_grad = b_grad(dZ2, m)

    W1_grad, dZ1 = gradient_W1(dZ2, W2, X, Z1, m)
    b1_grad = b_grad(dZ1, m)

    return W1_grad, b1_grad, W2_grad, b2_grad, A2


def update_params(W1, b1, W2, b2, W1_grad, b1_grad, W2_grad, b2_grad):
    W1 -= learning_rate * W1_grad
    b1 -= learning_rate * b1_grad
    W2 -= learning_rate * W2_grad
    b2 -= learning_rate * b2_grad
    return W1, b1, W2, b2

In [311]:
W1, b1, W2, b2 = init_params()

batch_size = 100
num_epochs = 10
num_examples = X_train.shape[0]

for epoch in range(num_epochs):
    # indices = np.random.perm

    # for i in range(0, nutation(num_examples)
    # X_train = X_train[indices]
    # Y_train = Y_train[indices]um_examples, batch_size):
    #     batch_X = X_train[i : i + batch_size].T
    #     batch_Y = Y_train[i : i + batch_size].T
    for i in range(0, num_examples, 1):
        batch_X = X_train[i].T
        batch_Y = one_hot_encode(Y_train[i])

        # if batch_X.shape[1] == 0:
        #     continue

        W1_grad, b1_grad, W2_grad, b2_grad, A2 = back_prop(
            W1, b1, W2, b2, batch_X, batch_Y
        )

        W1, b1, W2, b2 = update_params(
            W1, b1, W2, b2, W1_grad, b1_grad, W2_grad, b2_grad
        )

        avg_loss = cross_entropy_loss(A2, batch_Y)
        if i % 1000 == 0:
            print(f"Epoch {epoch}, Batch {i//batch_size}, Loss: {avg_loss}")

print("Training complete!")

Epoch 0, Batch 0, Loss: 0.26778852901576433
Epoch 0, Batch 10, Loss: 0.27057670969288705
Epoch 0, Batch 20, Loss: 0.21787045909277825
Epoch 0, Batch 30, Loss: 0.040941055828393975
Epoch 0, Batch 40, Loss: 0.0046705245773683915
Epoch 0, Batch 50, Loss: 0.2663506365090585
Epoch 0, Batch 60, Loss: 0.005440126020430303
Epoch 0, Batch 70, Loss: 0.001192268836448814
Epoch 0, Batch 80, Loss: 0.00663909198264076
Epoch 0, Batch 90, Loss: 0.05928823497707819
Epoch 0, Batch 100, Loss: 0.00323267484384699
Epoch 0, Batch 110, Loss: 0.004616663393806768
Epoch 0, Batch 120, Loss: 0.0034478520527875442
Epoch 0, Batch 130, Loss: 0.0028460112281162974
Epoch 0, Batch 140, Loss: 0.004940359078754132
Epoch 0, Batch 150, Loss: 0.041659597458349976
Epoch 0, Batch 160, Loss: 0.0049942558900445124
Epoch 0, Batch 170, Loss: 0.0019049994130515863
Epoch 0, Batch 180, Loss: 0.00036924917636252185
Epoch 0, Batch 190, Loss: 0.000877037826046948
Epoch 0, Batch 200, Loss: 5.0117889259164394e-05
Epoch 0, Batch 210, Los

In [312]:
def test_model(W1, b1, W2, b2, X_test, Y_test):
    num_test_examples = X_test.shape[0]
    correct_predictions = 0

    for i in range(0, num_test_examples):
        _, _, A2 = forward_prop(W1, b1, W2, b2, X_test[i])

        predictions = np.argmax(A2, axis=0)

        true_labels = Y_test[i]

        if predictions[0] == true_labels:
            correct_predictions += 1

    accuracy = correct_predictions / num_test_examples * 100
    return accuracy


accuracy = test_model(W1, b1, W2, b2, X_test, Y_test)
print(f"Test Accuracy: {accuracy:.2f}%")

Test Accuracy: 90.93%


In [313]:
def test_single_example(W1, b1, W2, b2, X, Y):
    _, _, A2 = forward_prop(W1, b1, W2, b2, X)
    print(A2)
    prediction = np.argmax(A2)
    true_label = Y
    print(
        f"Predicted: {prediction}, True: {true_label}, Correct: {prediction == true_label}"
    )


test_single_example(W1, b1, W2, b2, X_test[0], Y_test[0])

[[6.56609130e-08]
 [1.71308731e-11]
 [5.45939317e-08]
 [7.64174480e-05]
 [5.39772118e-12]
 [9.99921383e-01]
 [9.71568401e-07]
 [4.15109106e-11]
 [4.03632156e-08]
 [1.06705555e-06]]
Predicted: 5, True: 5, Correct: True


In [314]:
data_test = pd.read_csv("dataset/test.csv")

data_test.head()

Unnamed: 0,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
data_test = np.array(data_test)

(28000, 784)

In [320]:
def get_predictions(W1, b1, W2, b2, X_test):
    Y_test = []
    num_test_examples = X_test.shape[0]

    for i in range(0, num_test_examples):
        _, _, A2 = forward_prop(W1, b1, W2, b2, X_test[i])

        predictions = np.argmax(A2, axis=0)

        Y_test.append(predictions[0])

    return Y_test


predictions = get_predictions(W1, b1, W2, b2, data_test)
ids = [i for i in range(1, data_test.shape[0] + 1)]

print("predictions: ", len(predictions))
print("ids: ", len(ids))

df = pd.DataFrame({"ImageId": ids, "Label": predictions})

df.to_csv("submission.csv", index=False)

predictions:  28000
ids:  28000
