In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
train_df = pd.read_csv("datas/mnist_train.csv")
test_df = pd.read_csv("datas/mnist_test.csv")

In [3]:
train_df.sample(1)

Unnamed: 0,label,1x1,1x2,1x3,1x4,1x5,1x6,1x7,1x8,1x9,...,28x19,28x20,28x21,28x22,28x23,28x24,28x25,28x26,28x27,28x28
13808,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
X_train = train_df.iloc[:, 1:]
y_train = train_df.iloc[:, 0]
X_test = test_df.iloc[:, 1:]
y_test = test_df.iloc[:, 0]

In [5]:
y_train.head()

0    5
1    0
2    4
3    1
4    9
Name: label, dtype: int64

In [6]:
X_train.head()

Unnamed: 0,1x1,1x2,1x3,1x4,1x5,1x6,1x7,1x8,1x9,1x10,...,28x19,28x20,28x21,28x22,28x23,28x24,28x25,28x26,28x27,28x28
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
X_train.shape

(60000, 784)

In [8]:
# ReLU activation function
def ReLU(z):
    return np.maximum(0, z)

In [9]:
# Softmax function
def sft_max(z):
    return np.exp(z)/np.sum(np.exp(z), axis = 1,  keepdims=True) #If z is a matrix (batch of inputs), softmax should be applied row-wise

In [10]:
# one_hot encoding
def one_hot(val): 
    val = np.array(val)
    one_hot = np.zeros(shape= (val.shape[0], 10))
    one_hot[np.arange(val.size), val] = 1
    return one_hot

In [11]:
# Initialization of weights
#layer 0
W0 = np.random.randn(784, 4) * 0.01
B0 = np.zeros(shape=(1, 4))

#Layer 1
W1 = np.random.randn(4, 8)  * 0.01
B1 = np.zeros(shape=(1, 8))

#OUTPUT layer
W2 = np.random.randn(8, 10)  * 0.01
B2 = np.zeros(shape=(1, 10))

# We don't initillize w to 0 because this will make each neuron to learn same thing
# We multiply the random w by a small number (0.01) to make it small if not this can make vanishing gradinet problem

In [12]:
# Forward Pass
def forward_pass(X):
    global  W0, B0, W1, B1, W2, B2
    Z0 = np.dot(X, W0) + B0
    A0 = ReLU(Z0)
    
    Z1 = np.dot(A0, W1) + B1
    A1 = ReLU(Z1)
    
    Z2 = np.dot(A1, W2) + B2
    A2 = sft_max(Z2)
    return Z0, A0, Z1, A1, Z2, A2

In [13]:
# Prediction
def predict(A2):
    """
    We Use argmax because the softmax function gives us probablity distrubution of all classes so use the index of the higest prediction
    """
    prediction =  np.argmax(A2, axis = 1) 
    return prediction

In [14]:
# Cost function
def cost(y, a):
    """ a is raw soft max output A3 """
    return - (np.sum(y * np.log(a + 0.0000000001))/a.shape[0])

In [15]:
#Derevatives
def d_ReLU(z):
    return (z > 0).astype(float)  # Derivative of ReLU: 1 if z > 0, else 0

def d_sft_max(A2, Y):
    """Gradient of softmax loss (cross-entropy)."""
    return A2 - Y  # Since we use softmax + cross-entropy, this is simplified

In [16]:
# Back Propagation
def back_prop(W0, B0, W1, B1, W2, B2, Z0, A0, Z1, A1, Z2, A2,X, Y):
    """ Y is ground truth"""
    n = Y.shape[0] # numnber of the exmap
    
    # Output layer
    dZ2 = d_sft_max(A2, Y)
    dW2 = np.dot(A1.T, dZ2) / n
    dB2 = np.sum(dZ2, axis=0, keepdims=True) / n
    
    # Layer 1
    dZ1 = np.dot(dZ2, W2.T) * d_ReLU(Z1)  
    dW1 = np.dot(A0.T, dZ1) / n
    dB1 = np.sum(dZ1, axis=0, keepdims=True) / n
    
    # Layer 0
    dZ0 = np.dot(dZ1, W1.T) * d_ReLU(Z0)  
    dW0 = np.dot(X.T, dZ0) / n
    dB0 = np.sum(dZ0, axis=0, keepdims=True) / n

    return dW0, dB0, dW1, dB1, dW2, dB2

In [17]:
# Update
def update(alpha, dW0, dB0, dW1, dB1, dW2, dB2):
    global W0, B0, W1, B1, W2, B2
    
    W2 -= alpha * dW2
    B2 -= alpha * dB2
    
    W1 -= alpha * dW1
    B1 -= alpha * dB1
    
    W0 -= alpha * dW0
    B0 -= alpha * dB0

In [18]:
# Accuracy
def accuracy(x, y):
    params = forward_pass(x)
    predictions = predict(params[-1])
    accuracy = np.mean(predictions == y) * 100 
    return accuracy

In [19]:
def train(alpha, epochs):
    global W0, B0, W1, B1, W2, B2  # Use global weights and biases

    Y_train_oh = one_hot(y_train)  # Convert labels to one-hot encoding
    losses = []  # To track loss over epochs

    for epoch in range(epochs):
        # Forward Pass
        Z0, A0, Z1, A1, Z2, A2 = forward_pass(X_train)

        # Compute Cost
        loss = cost(Y_train_oh, A2)
        losses.append(loss)

        # Backpropagation
        dW0, dB0, dW1, dB1, dW2, dB2 = back_prop(W0, B0, W1, B1, W2, B2, Z0, A0, Z1, A1, Z2, A2, X_train, Y_train_oh)

        # Update Weights
        update(alpha, dW0, dB0, dW1, dB1, dW2, dB2)

        # Print loss every 100 epochs
        if epoch % 100 == 0:
            print(f"Epoch {epoch}, Loss: {loss:.4f}, Training Accuracy: {accuracy(X_train, y_train):.4f}, Test Accuracy: {accuracy(X_test, y_test):.4f}")

    # Plot the loss curve
    plt.plot(losses)
    plt.xlabel("Epochs")
    plt.ylabel("Loss")
    plt.title("Training Loss Curve")
    plt.show()


In [None]:
train(alpha=0.01, epochs=1000)

Epoch 0, Loss: 2.3027, Training Accuracy: 6.7267, Test Accuracy: 6.7100
Epoch 100, Loss: 2.0412, Training Accuracy: 13.5400, Test Accuracy: 13.6900
