In [2]:
import pandas as pd
import numpy as np

In [4]:
#https://www.kaggle.com/competitions/digit-recognizer/data?select=train.csv
data = pd.read_csv("/content/train.csv")

In [5]:
data = np.array(data)
m, n = data.shape
np.random.shuffle(data)

data_dev = data[0:1000].T
Y_dev = data_dev[0]
X_dev = data_dev[1:n]

data_train = data[1000:m].T
Y_train = data_train[0]
X_train = data_train[1:n]

In [6]:
def init_params():

    W1 = np.random.rand(10, 784)
    b1 = np.random.rand(10, 1)
    W2 = np.random.rand(10, 10)
    b2 = np.random.rand(10, 1)

    return W1, b1, W2, b2

In [7]:
def ReLU(Z):

    return np.maximum(0, Z)

def ReLU_deriv(Z):

    return Z > 0

In [8]:
def softmax(Z):
    expZ = np.exp(Z - np.max(Z, axis=0, keepdims=True))
    return expZ / np.sum(expZ, axis=0, keepdims=True)

In [9]:
def forward_propagation(W1, b1, W2, b2, X):
    Z1 = W1.dot(X) + b1
    A1 = ReLU(Z1)
    Z2 = W2.dot(A1) + b2
    A2 = softmax(Z2)
    return Z1, A1, Z2, A2

In [10]:
def one_hot_encoding(Y):
    one_hot_Y = np.zeros((Y.max() + 1, Y.size))
    one_hot_Y[Y, np.arange(Y.size)] = 1
    return one_hot_Y

In [11]:
def backpropagation(Z1, A1, Z2, A2, W1, W2, X, Y):
    m = Y.size
    one_hot_Y = one_hot_encoding(Y)
    dZ2 = A2 - one_hot_Y
    dW2 = 1 / m * dZ2.dot(A1.T)
    db2 = 1 / m * np.sum(dZ2, axis=1, keepdims=True)
    dZ1 = W2.T.dot(dZ2) * ReLU_deriv(Z1)
    dW1 = 1 / m * dZ1.dot(X.T)
    db1 = 1 / m * np.sum(dZ1, axis=1, keepdims=True)
    return dW1, db1, dW2, db2

In [12]:
def update_params(W1, b1, W2, b2, dW1, db1, dW2, db2, LR):
    W1 = W1 - LR * dW1
    b1 = b1 - LR * db1
    W2 = W2 - LR * dW2
    b2 = b2 - LR * db2
    return W1, b1, W2, b2

In [13]:
def predictions(A2):
    return np.argmax(A2, axis=0)

In [14]:
def accuracy(preds, Y):
    return np.sum(preds == Y) / Y.size

In [15]:
def gradient_descent(X, Y, LR, iterations):
    W1, b1, W2, b2 = init_params()

    for i in range(iterations):
        Z1, A1, Z2, A2 = forward_propagation(W1, b1, W2, b2, X)
        dW1, db1, dW2, db2 = backpropagation(Z1, A1, Z2, A2, W1, W2, X, Y)
        W1, b1, W2, b2 = update_params(W1, b1, W2, b2, dW1, db1, dW2, db2, LR)

        if i % 10 == 0:
            print("Iter:", i)
            preds = predictions(A2)
            print("Accuracy:", accuracy(preds, Y))

    return W1, b1, W2, b2

In [None]:
W1, b1, W2, b2 = gradient_descent(X_train, Y_train, 0.001, 300)

Iter: 0
Accuracy: 0.0984390243902439
Iter: 10
Accuracy: 0.09804878048780488
Iter: 20
Accuracy: 0.09734146341463415
Iter: 30
Accuracy: 0.09709756097560976
Iter: 40
Accuracy: 0.09702439024390244
Iter: 50
Accuracy: 0.09687804878048781
Iter: 60
Accuracy: 0.09673170731707317
Iter: 70
Accuracy: 0.09663414634146342
Iter: 80
Accuracy: 0.09660975609756098
Iter: 90
Accuracy: 0.09663414634146342
Iter: 100
Accuracy: 0.09663414634146342
Iter: 110
Accuracy: 0.09670731707317073
Iter: 120
Accuracy: 0.09673170731707317
Iter: 130
Accuracy: 0.09675609756097561
Iter: 140
Accuracy: 0.09675609756097561
Iter: 150
Accuracy: 0.09675609756097561
Iter: 160
Accuracy: 0.09675609756097561
Iter: 170
Accuracy: 0.09678048780487805
Iter: 180
Accuracy: 0.09673170731707317
Iter: 190
Accuracy: 0.09670731707317073
Iter: 200
Accuracy: 0.09675609756097561
Iter: 210
Accuracy: 0.09678048780487805
Iter: 220
Accuracy: 0.09682926829268293
Iter: 230
Accuracy: 0.09687804878048781
Iter: 240
Accuracy: 0.09690243902439025
Iter: 250
Ac