In [None]:

from sklearn.datasets import fetch_openml
from tensorflow.keras.utils import to_categorical
import numpy as np

In [None]:
from sklearn.exceptions import DataDimensionalityWarning



def relu(z):
    """Apply the rectified linear unit element-wise.

    Every negative entry of ``z`` is replaced by 0; non-negative entries
    are passed through unchanged.
    """
    floor = 0
    activated = np.maximum(floor, z)
    return activated

def relu_derivative(z):
    """Return the element-wise ReLU derivative as a float array.

    The result is 1.0 where ``z`` is strictly positive and 0.0 elsewhere
    (including at exactly zero).
    """
    is_active = z > 0
    return is_active.astype(float)

def softmax(z):
    """Compute the softmax of ``z`` along axis 0.

    ``z`` may be a single column vector of shape (classes, 1) or a batch of
    shape (classes, m); every column is normalised independently so that it
    sums to 1.

    The maximum is subtracted *per column* before exponentiating. Softmax is
    invariant to a constant shift of each column, so the result is unchanged
    mathematically, but a per-column shift guarantees the largest exponent in
    every column is exp(0) = 1. Using one global maximum instead would let a
    column whose logits are far below that maximum underflow to all zeros and
    produce 0/0 = NaN.
    """
    shifted = z - np.max(z, axis=0, keepdims=True)
    exp_z = np.exp(shifted)
    return exp_z / np.sum(exp_z, axis=0, keepdims=True)

def _forward(image_column_vector, w_h, b_h, w_o, b_o):
    """Run one sample through the network and return (h_pre, h, o).

    h_pre is the hidden-layer pre-activation, h the ReLU-activated hidden
    layer and o the softmax output, all as column vectors.
    """
    h_pre = w_h @ image_column_vector + b_h
    h = relu(h_pre)  # ReLU replaced the earlier sigmoid hidden activation
    o_pre = w_o @ h + b_o
    o = softmax(o_pre)  # softmax replaced the earlier sigmoid output activation
    return h_pre, h, o


print("Loading MNIST data...")
mnist = fetch_openml('mnist_784', version=1, as_frame=False, parser='pandas')
images, target = mnist['data'], mnist['target']
target = target.astype(int)

# One-hot encode the digit labels (10 classes).
encoded_target = to_categorical(target, 10)

# Hidden layer size was increased from 20 to 128 neurons to increase accuracy.
w_h = np.random.uniform(-0.5, 0.5, (128, 784))
w_o = np.random.uniform(-0.5, 0.5, (10, 128))

b_h = np.zeros((128, 1))
b_o = np.zeros((10, 1))

epochs = 5

alpha = 0.01  # learning rate, applied to weights AND biases

# Scale pixel intensities from [0, 255] to [0, 1].
images = images / 255

# Hold out the last 10,000 samples (the conventional MNIST test split) so the
# accuracy reported at the end is measured on data the network never trained on.
n_train = 60000
train_images, test_images = images[:n_train], images[n_train:]
train_targets, test_targets = encoded_target[:n_train], encoded_target[n_train:]

print(f"Starting training on {len(train_images)} images...")

for epoch in range(epochs):
    # Reset per epoch so the counter reflects this epoch only.
    count_correct = 0

    for image, label_one_hot in zip(train_images, train_targets):
        image_column_vector = np.reshape(image, (-1, 1))
        label_column_vector = np.reshape(label_one_hot, (-1, 1))

        h_pre, h, o = _forward(image_column_vector, w_h, b_h, w_o, b_o)

        if np.argmax(o) == np.argmax(label_column_vector):
            count_correct += 1

        # Softmax + cross-entropy loss: the derivative w.r.t. the output
        # pre-activation simplifies to (prediction - target).
        delta_o = o - label_column_vector

        # Chain rule: pull the next layer's delta back through its weights,
        # then multiply by the derivative of this layer's activation.
        # Must be computed BEFORE w_o is updated below.
        delta_h = (w_o.T @ delta_o) * relu_derivative(h_pre)

        # delta @ input.T has the same shape as the layer's weight matrix.
        w_o = w_o - alpha * (delta_o @ h.T)
        w_h = w_h - alpha * (delta_h @ image_column_vector.T)

        # The bias gradient is the delta itself; scale it by the learning
        # rate just like the weight updates.
        b_o = b_o - alpha * delta_o
        b_h = b_h - alpha * delta_h

    print(f"Epoch {epoch + 1}/{epochs} training accuracy: "
          f"{count_correct / len(train_images) * 100:.2f}")


## Testing the network (on the held-out samples only)

count_correct = 0

for image, label_one_hot in zip(test_images, test_targets):
    image_column_vector = np.reshape(image, (-1, 1))
    label_column_vector = np.reshape(label_one_hot, (-1, 1))

    _, _, o = _forward(image_column_vector, w_h, b_h, w_o, b_o)

    if np.argmax(o) == np.argmax(label_column_vector):
        count_correct += 1


## Printing accuracy

print("Accuracy is : ")

print(count_correct/len(test_images) * 100)




# =============================================================================
# NEURAL NETWORK MATH & GENERALIZATIONS
# =============================================================================

# --- 1. NOTATION ---
# L       : Current layer index
# A[L]    : Activation (output) of layer L. (Input is A[0])
# Z[L]    : Pre-activation (linear input) of layer L -> Z[L] = W[L] @ A[L-1] + B[L]
# W[L]    : Weight matrix connecting layer L-1 to L
# B[L]    : Bias vector of layer L
# m       : Batch size (number of images processed at once)

# --- 2. FORWARD PROPAGATION ---
# The general formula for any layer L:
# Z[L] = W[L] @ A[L-1] + B[L]
# A[L] = activation_function(Z[L])  <-- e.g., ReLU for hidden, Softmax for output

# --- 3. BACKPROPAGATION (THE TWO GOLDEN RULES) ---

# RULE A: Calculating the "Error Term" (Delta) for any Hidden Layer
# We pull the error from the NEXT layer (L+1) backwards to the current layer (L).
# Formula: Delta[L] = (W[L+1].T @ Delta[L+1]) * derivative_of_activation(Z[L])
# Logic:   "Distribute the future error back to where it came from,
#           then switch off neurons that weren't active (derivative term)."

# RULE B: Calculating the Gradient for Weights (Gradient_W)
# To find how much to change weights, we combine the layer's error with its input.
# Formula: Gradient_W[L] = (1/m) * (Delta[L] @ A[L-1].T)
# Logic:   "Error * Input". If Input was high AND Error was high,
#           this weight needs a big change.

# --- 4. LOSS FUNCTIONS & SPECIAL CASES ---

# Case 1: Output Layer with Softmax + Categorical Cross-Entropy
# The complex derivatives cancel out perfectly to give a simple subtraction.
# Delta_Output = A[Output] - Y_Target
# (Where Y_Target is the one-hot encoded ground truth)


# --- 5. WHY RELU & CROSS-ENTROPY WORK: VANISHING GRADIENTS ---
#
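# Phenomenon: Switching from Sigmoid/MSE to ReLU/Cross-Entropy sped up learning.
# Problem:    "Vanishing Gradient Problem". The sigmoid derivative is at most 0.25
#             (and close to 0 when the unit saturates). Every layer multiplies the
#             backpropagated gradient by this factor, so in deep networks the
#             gradients approach zero.
# Solution:
#   - ReLU: Derivative is exactly 1 for active units (Z > 0) and 0 otherwise.
#     Gradients flow through active units without shrinking, which mitigates
#     (rather than completely eliminates) the vanishing gradient problem.
#   - Cross-Entropy + Softmax: The loss is convex in the output pre-activations and
#     the output delta is simply A - Y, with no activation-derivative factor that
#     can saturate. Gradients therefore stay steep when predictions are confidently
#     wrong, forcing faster correction than Sigmoid + MSE.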


# --- 6. WHY INCREASING SIZE WORKS: MODEL CAPACITY & UNIVERSAL APPROXIMATION ---
#
# Phenomenon: Increasing hidden neurons from 20 -> 128 improved accuracy (90% -> 96.7%).
# Concept:    "Model Capacity" or "Representational Power".
# Theorem:    The "Universal Approximation Theorem".
#
# Explanation:
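#   - A neural network with a single hidden layer can approximate ANY continuous
#     function on a bounded input domain to arbitrary precision, *provided* it has
#     enough neurons. (The theorem guarantees that suitable weights exist, not that
#     training will find them.)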
#   - With 20 neurons, the network was "Underfitting". It lacked the memory/capacity
#     to learn the complex shapes of all 10 digits simultaneously.
#   - With 128 neurons, we increased the "Hypothesis Space", allowing the network
#     to learn distinct features (loops, lines, curves) without interference.

# Case 2 (continues Section 4): Per-Sample Gradient Descent vs. Mini-Batch
# If using Mini-Batch (m > 1):
#   - Z and A become matrices of shape (Neurons, m)
#   - We MUST divide the weight gradients by 'm' (i.e. average over the batch) so
#     the effective step size does not grow with the batch size.
#   - Bias update: Gradient_B[L] = (1/m) * np.sum(Delta[L], axis=1, keepdims=True)

# =============================================================================








Loading MNIST data...
Starting training on 70000 images...
Accuracy is : 
96.41714285714286
