In [3]:
import numpy as np
import pandas as pd
import matplotlib
from matplotlib import pyplot as plt

# Load and preprocess data
data = pd.read_csv("train.csv")
data = np.array(data)
m, n = data.shape
np.random.shuffle(data)

# Splitting data into development and training sets
data_dev = data[0:1000].T
y_dev = data_dev[0]
x_dev = data_dev[1:n]

data_train = data[10000:m].T
y_train = data_train[0]
x_train = data_train[1:n]

def init_params(layer_sizes):
    params = {}
    for i in range(1, len(layer_sizes)):
        params[f'w{i}'] = np.random.randn(layer_sizes[i], layer_sizes[i-1]) * 0.01
        params[f'b{i}'] = np.zeros((layer_sizes[i], 1))
    return params

def ReLU(z):
    return np.maximum(0, z)

def softmax(z):
    exp_z = np.exp(z - np.max(z, axis=0, keepdims=True))  # for numerical stability
    return exp_z / np.sum(exp_z, axis=0, keepdims=True)

def forward_prop(params, x):
    a = x
    cache = {'a0': a}
    
    L = len(params) // 2  # Number of layers
    for l in range(1, L + 1):
        z = params[f'w{l}'].dot(a) + params[f'b{l}']
        a = ReLU(z) if l < L else softmax(z)  # Use softmax only in the last layer
        cache[f'z{l}'] = z
        cache[f'a{l}'] = a
    
    return cache

def one_hot(y):
    one_hot_y = np.zeros((y.size, y.max() + 1))
    one_hot_y[np.arange(y.size), y] = 1
    return one_hot_y.T  # Transpose for shape consistency

def back_prop(params, cache, y):
    m = y.size
    one_hot_y = one_hot(y)
    L = len(params) // 2  # Number of layers
    
    dz = cache[f'a{L}'] - one_hot_y  # Output layer
    grads = {}
    
    for l in range(L, 0, -1):
        a_prev = cache[f'a{l-1}'] if l > 1 else cache['a0']
        grads[f'dw{l}'] = dz.dot(a_prev.T) / m
        grads[f'db{l}'] = np.sum(dz, axis=1, keepdims=True) / m
        if l > 1:
            dz = params[f'w{l}'].T.dot(dz) * (a_prev > 0)  # ReLU derivative (use a_prev for proper backpropagation)
    
    return grads

def update_params(params, grads, learning_rate):
    for l in range(1, (len(params) // 2) + 1):
        params[f'w{l}'] -= learning_rate * grads[f'dw{l}']
        params[f'b{l}'] -= learning_rate * grads[f'db{l}']
    return params

def get_predictions(a):
    return np.argmax(a, axis=0)

def get_accuracy(predictions, y):
    return np.mean(predictions == y)

def gradient_descent(x, y, layer_sizes, iterations, learning_rate):
    params = init_params(layer_sizes)
    for i in range(iterations):
        cache = forward_prop(params, x)
        grads = back_prop(params, cache, y)
        params = update_params(params, grads, learning_rate)
        
        if i % 50 == 0:
            predictions = get_predictions(cache[f'a{len(layer_sizes)-1}'])
            accuracy = get_accuracy(predictions, y)
            print(f"Iteration: {i}, Accuracy: {accuracy:.4f}")
    
    return params

# Define layer sizes: input layer (784), two hidden layers (128, 64), output layer (10)
layer_sizes = [784, 128, 64, 10]
params = gradient_descent(x_train, y_train, layer_sizes, 500, 0.1)



Iteration: 0, Accuracy: 0.0972
Iteration: 50, Accuracy: 0.1054
Iteration: 100, Accuracy: 0.1108
Iteration: 150, Accuracy: 0.1108
Iteration: 200, Accuracy: 0.1108
Iteration: 250, Accuracy: 0.1108
Iteration: 300, Accuracy: 0.1108
Iteration: 350, Accuracy: 0.1108
Iteration: 400, Accuracy: 0.1108
Iteration: 450, Accuracy: 0.1108
