In [None]:
import numpy as np
import pandas as pd

# Load and preprocess data
data = pd.read_csv("train.csv")
data = np.array(data)

# shuffle data
np.random.shuffle(data)

# Split data into training and development sets
data_dev = data[1:1000].T
y_dev = data_dev[0]
x_dev = data_dev[1:] / 255

data_train = data[1000:].T
x_train = data_train[1:] / 255
y_train = data_train[0]

def init_params():
    """Function to initialize weights and biases"""
    w1 = np.random.randn(16, 784) - 0.5
    b1 = np.random.randn(16, 1) - 0.5
    w2 = np.random.randn(10, 16) - 0.5
    b2 = np.random.randn(10, 1) - 0.5
    return w1, b1, w2, b2

def sigmoid(z):
    """Sigmoid activation function"""
    return 1 / (1 + np.exp(-1* z))

def softmax(Z):
    """Softmax activation function"""
    expZ = np.exp(Z - np.max(Z, axis=0, keepdims=True))  # stability
    return expZ / np.sum(expZ, axis=0, keepdims=True)

def forward_prop(x, w1, b1, w2, b2):
    """function for forward propagation with 2 layers
    Parameters:
    x -- input data of shape (784, number of examples)
    w1 -- weights for layer 1 of shape (16, 784)
    b1 -- biases for layer 1 of shape (16, 1)
    w2 -- weights for layer 2 of shape (10, 16)
    b2 -- biases for layer 2 of shape (10, 1)
    
    Returns:
    a1 -- activations from layer 1 of shape (16, number of examples)
    a2 -- activations from layer 2 of shape (10, number of examples)"""
    z1 = (w1 @ x) + b1
    a1 = sigmoid(z1)
    z2 = (w2 @ a1) + b2
    a2 = softmax(z2)
    return a1, a2

def back_prop(a1, a2, y, w2, x):
    """Function for backpropagation with 2 layers
    Parameters:
    a1 -- activations from layer 1 of shape (16, number of examples)
    a2 -- activations from layer 2 of shape (10, number of examples)
    y -- true "labels" vector of shape (10, number of examples)
    w2 -- weights for layer 2 of shape (10, 16)
    x -- input data of shape (784, number of examples)"""
    dz2 = a2 - y

    dw2  = dz2 @ a1.T
    db2 = dz2

    dz1 = (w2.T @ dz2) * a1 * (1 - a1)

    dw1 = dz1 @ x.T
    db1 = dz1

    return dw2, db2, dw1, db1

def update_params(dw1, db1, dw2, db2, w1, b1, w2, b2, alpha):
    """Function to update weights and biases using gradient descent
    Parameters:
    dw1 -- gradient of weights for layer 1
    db1 -- gradient of biases for layer 1
    dw2 -- gradient of weights for layer 2
    db2 -- gradient of biases for layer 2
    w1 -- weights for layer 1
    b1 -- biases for layer 1
    w2 -- weights for layer 2
    b2 -- biases for layer 2
    alpha -- learning rate"""
    w1 -= alpha * dw1
    b1 -= alpha * db1
    w2 -= alpha * dw2
    b2 -= alpha * db2

    return w1, b1, w2, b2

def one_hot_y(y):
    """Function to convert labels to one-hot encoding
    Parameters:
    y -- true "labels" vector of shape (number of examples,)"""
    y = y.astype(int)
    one_hot = np.zeros((10, y.size))
    one_hot[y, np.arange(y.size)] = 1
    return one_hot



In [None]:
def get_accuracy(preds, labels):
    """Function to calculate accuracy
    Parameters:
    preds -- predicted labels
    labels -- true labels"""
    return np.mean(np.array(preds) == np.array(labels)) * 100

def SGD(x_train, y_train, epochs):
    """Function to perform Stochastic Gradient Descent
    Parameters:
    x_train -- training data of shape (784, number of examples)
    y_train -- training labels of shape (number of examples,)
    epochs -- number of epochs to train (number of iterations over the training data)"""
    w1, b1, w2, b2 = init_params()
    y_train_encoded = one_hot_y(y_train)

    for epoch in range(epochs):
        X, Y = [], []

        for i in range(x_train.shape[1]):
            x = x_train[:, i].reshape(784, 1)
            y = y_train_encoded[:, i].reshape(10, 1)

            a1, a2 = forward_prop(x, w1, b1, w2, b2)
            dw2, db2, dw1, db1 = back_prop(a1, a2, y, w1, w2, x)
            w1, b1, w2, b2 = update_params(dw1, db1, dw2, db2, w1, b1, w2, b2, 0.9)

            X.append(y_train[i])
            Y.append(np.argmax(a2))

        acc = get_accuracy(X, Y)
        print(f"Epoch {epoch + 1}: Training Accuracy = {acc:.2f}%")

    return w1, b1, w2, b2

def evaluate(w1, b1, w2, b2, x_dev, y_dev):
    """Function to evaluate the model on development set
    Parameters:
    w1 -- weights for layer 1
    b1 -- biases for layer 1
    w2 -- weights for layer 2
    b2 -- biases for layer 2
    x_dev -- development data of shape (784, number of examples)
    y_dev -- development labels of shape (number of examples,)"""
    preds = []
    for i in range(x_dev.shape[1]):
        x = x_dev[:, i].reshape(784, 1)
        _, a2 = forward_prop(x, w1, b1, w2, b2)
        preds.append(np.argmax(a2))

    acc = get_accuracy(preds, y_dev)
    print(f"Dev Accuracy: {acc:.2f}%")

            

In [18]:
w1, b1, w2, b2 = SGD(x_train, y_train, epochs=50)
evaluate(w1, b1, w2, b2, x_train, y_train)  

Epoch 1: Training Accuracy = 9.85%
Epoch 2: Training Accuracy = 12.65%
Epoch 3: Training Accuracy = 26.66%


  return 1 / (1 + np.exp(-1* z))


Epoch 4: Training Accuracy = 29.06%
Epoch 5: Training Accuracy = 29.03%
Epoch 6: Training Accuracy = 29.29%
Epoch 7: Training Accuracy = 29.45%
Epoch 8: Training Accuracy = 31.16%
Epoch 9: Training Accuracy = 48.73%
Epoch 10: Training Accuracy = 55.95%
Epoch 11: Training Accuracy = 62.50%
Epoch 12: Training Accuracy = 64.04%
Epoch 13: Training Accuracy = 65.78%
Epoch 14: Training Accuracy = 65.89%
Epoch 15: Training Accuracy = 66.34%
Epoch 16: Training Accuracy = 67.08%
Epoch 17: Training Accuracy = 67.45%
Epoch 18: Training Accuracy = 68.00%
Epoch 19: Training Accuracy = 69.96%
Epoch 20: Training Accuracy = 75.08%
Epoch 21: Training Accuracy = 75.17%
Epoch 22: Training Accuracy = 74.60%
Epoch 23: Training Accuracy = 75.89%
Epoch 24: Training Accuracy = 76.43%
Epoch 25: Training Accuracy = 77.46%
Epoch 26: Training Accuracy = 76.74%
Epoch 27: Training Accuracy = 77.51%
Epoch 28: Training Accuracy = 78.05%
Epoch 29: Training Accuracy = 78.99%
Epoch 30: Training Accuracy = 78.95%
Epoch 3