In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.datasets import mnist
from tensorflow.keras.utils import to_categorical

# Fetch MNIST: image arrays and integer class labels, already split into train/test
(X_train, Y_train), (X_test, Y_test) = mnist.load_data()

# Flatten every 28x28 image into one 784-value row, cast to float32, divide by 255
X_train = X_train.reshape(len(X_train), 28 * 28).astype('float32') / 255
X_test = X_test.reshape(len(X_test), 28 * 28).astype('float32') / 255

# Replace the integer labels with one-hot rows over the 10 digit classes
Y_train = to_categorical(Y_train, num_classes=10)
Y_test = to_categorical(Y_test, num_classes=10)


In [2]:
# Sanity check: after flattening, each training image is one 784-value row
X_train.shape

(60000, 784)

In [3]:
def sigmoid(Z):
    """Element-wise logistic sigmoid, 1 / (1 + exp(-Z)), computed stably.

    The direct formula evaluates ``exp(-Z)``, which overflows (and emits a
    RuntimeWarning) for large negative ``Z``. Using the identity
    ``sigmoid(Z) = exp(-log(1 + exp(-Z)))`` with ``np.logaddexp`` keeps
    every intermediate finite.

    Args:
        Z: Scalar or array of pre-activations.

    Returns:
        Values in [0, 1] with the same shape as ``Z``.
    """
    # logaddexp(0, -Z) == log(exp(0) + exp(-Z)) == log(1 + exp(-Z)), without overflow
    return np.exp(-np.logaddexp(0, -Z))

def sigmoid_derivative(Z):
    """Return ``Z * (1 - Z)`` element-wise.

    Note: this expression is the derivative of the logistic sigmoid only
    when ``Z`` already holds sigmoid *outputs* (activations); despite the
    parameter name, it must not be given raw pre-activations.
    """
    complement = 1 - Z
    return Z * complement


In [4]:
def relu(Z):
    """Rectified linear unit: element-wise maximum of 0 and ``Z``."""
    floor = 0
    return np.maximum(floor, Z)

def relu_derivative(Z):
    """Gradient gate of ReLU: 1 where ``Z`` is strictly positive, else 0.

    The value at exactly ``Z == 0`` is 0. The result is integer-valued.
    """
    is_positive = Z > 0
    return np.where(is_positive, 1, 0)


In [None]:
def softmax(Z):
    """Column-wise softmax: normalises along axis 0 so each column sums to 1.

    The per-column maximum is subtracted before exponentiating so that
    large entries do not overflow; this does not change the result.
    """
    shifted = Z - np.max(Z, axis=0, keepdims=True)
    weights = np.exp(shifted)
    totals = np.sum(weights, axis=0, keepdims=True)
    return weights / totals


In [None]:
def apply_dropout(A, keep_prob):
    """Apply inverted dropout to an activation array of any rank.

    Each element is kept with probability ``keep_prob``; survivors are
    divided by ``keep_prob`` so the expected activation is unchanged.

    Args:
        A: Activation array (any number of dimensions).
        keep_prob: Probability of keeping a unit, in the interval (0, 1].

    Returns:
        (A_dropped, D) where ``A_dropped`` is the masked and rescaled
        activations and ``D`` is the boolean keep-mask with ``A``'s shape.

    Raises:
        ValueError: If ``keep_prob`` is not in (0, 1]; a value of 0 would
            otherwise divide by zero and fill the output with NaN/inf.
    """
    if not 0 < keep_prob <= 1:
        raise ValueError(f"keep_prob must be in (0, 1], got {keep_prob}")

    # rand(*shape) draws the same stream as rand(rows, cols) for 2-D inputs,
    # but also supports 1-D and higher-rank activations.
    D = np.random.rand(*A.shape) < keep_prob  # Dropout mask
    A = np.multiply(A, D)  # Zero the dropped units
    A = A / keep_prob  # Rescale survivors to preserve the expected value
    return A, D


In [None]:
def initialize_parameters(input_size, hidden1_size, hidden2_size, output_size, seed=1, scale=0.01):
    """Create weights and biases for a network with two hidden layers.

    Weights are drawn from a standard normal distribution and multiplied by
    ``scale``; biases start at zero. Weight matrices are shaped
    ``(units_out, units_in)`` and biases ``(units_out, 1)``.

    Args:
        input_size: Number of input features.
        hidden1_size: Units in the first hidden layer.
        hidden2_size: Units in the second hidden layer.
        output_size: Number of output units.
        seed: Value passed to ``np.random.seed`` before drawing. This resets
            NumPy's *global* generator, so later ``np.random`` calls are
            affected too. Pass ``None`` to leave the global state untouched.
        scale: Multiplier applied to the normal draws.

    Returns:
        Tuple ``(W1, b1, W2, b2, W3, b3)``.
    """
    if seed is not None:
        np.random.seed(seed)

    layer_sizes = (input_size, hidden1_size, hidden2_size, output_size)
    params = []
    # Draw order is W1, W2, W3 so the default arguments reproduce the same values as before
    for units_in, units_out in zip(layer_sizes[:-1], layer_sizes[1:]):
        params.append(np.random.randn(units_out, units_in) * scale)
        params.append(np.zeros((units_out, 1)))

    W1, b1, W2, b2, W3, b3 = params
    return W1, b1, W2, b2, W3, b3


In [None]:
def forward_propagation(X, W1, b1, W2, b2, W3, b3, keep_prob1=1, keep_prob2=1):
    """Run one forward pass through the three-layer network.

    Layer 1 is affine + sigmoid, layer 2 is affine + ReLU, and the output
    layer is affine + softmax. After each hidden activation the result is
    passed through ``apply_dropout`` with ``keep_prob1`` / ``keep_prob2``.

    Returns:
        ``(A3, cache)`` where ``A3`` is the softmax output and ``cache`` is
        the tuple ``(Z1, A1, D1, Z2, A2, D2, Z3, A3)``. The cached ``A1``
        and ``A2`` are the activations *after* dropout; ``D1`` and ``D2``
        are the masks returned by ``apply_dropout``.
    """
    # Hidden layer 1: affine map -> sigmoid -> dropout
    hidden1_pre = W1.dot(X) + b1
    hidden1_act, mask1 = apply_dropout(sigmoid(hidden1_pre), keep_prob1)

    # Hidden layer 2: affine map -> ReLU -> dropout
    hidden2_pre = W2.dot(hidden1_act) + b2
    hidden2_act, mask2 = apply_dropout(relu(hidden2_pre), keep_prob2)

    # Output layer: affine map -> softmax
    logits = W3.dot(hidden2_act) + b3
    probs = softmax(logits)

    cache = (
        hidden1_pre, hidden1_act, mask1,
        hidden2_pre, hidden2_act, mask2,
        logits, probs,
    )
    return probs, cache


In [None]:
def backward_propagation(X, Y, cache, W1, W2, W3, keep_prob1=1, keep_prob2=1):
    """Back-propagate the softmax cross-entropy loss through the network.

    Args:
        X: Inputs; samples are indexed along axis 1 (``m = X.shape[1]``).
        Y: Targets with the same shape as the softmax output ``A3``.
        cache: Tuple ``(Z1, A1, D1, Z2, A2, D2, Z3, A3)`` from the forward
            pass, where ``A1``/``A2`` are post-dropout activations and
            ``D1``/``D2`` the dropout masks.
        W1: Unused; kept so the signature stays compatible with callers.
        W2, W3: Weights of the second hidden layer and the output layer.
        keep_prob1, keep_prob2: Keep probabilities used in the forward pass.

    Returns:
        Tuple ``(dW1, db1, dW2, db2, dW3, db3)`` averaged over the samples.
    """
    m = X.shape[1]
    (Z1, A1, D1, Z2, A2, D2, Z3, A3) = cache

    # Output layer: softmax + cross-entropy gives dL/dZ3 = A3 - Y
    dZ3 = A3 - Y
    dW3 = (1/m) * np.dot(dZ3, A2.T)
    db3 = (1/m) * np.sum(dZ3, axis=1, keepdims=True)

    # Hidden layer 2: undo dropout (mask + rescale), then gate by ReLU
    dA2 = np.dot(W3.T, dZ3)
    dA2 = dA2 * D2 / keep_prob2
    dZ2 = dA2 * (Z2 > 0)
    dW2 = (1/m) * np.dot(dZ2, A1.T)
    db2 = (1/m) * np.sum(dZ2, axis=1, keepdims=True)

    # Hidden layer 1: undo dropout, then multiply by the sigmoid derivative.
    # The derivative must use the *undropped* sigmoid output. The cached A1
    # has been masked and scaled by 1/keep_prob1, so A1 * (1 - A1) is wrong
    # whenever keep_prob1 != 1. Recompute sigmoid(Z1) from the cached
    # pre-activation (stable form, no overflow for large |Z1|).
    dA1 = np.dot(W2.T, dZ2)
    dA1 = dA1 * D1 / keep_prob1
    S1 = np.exp(-np.logaddexp(0, -Z1))
    dZ1 = dA1 * S1 * (1 - S1)
    dW1 = (1/m) * np.dot(dZ1, X.T)
    db1 = (1/m) * np.sum(dZ1, axis=1, keepdims=True)

    gradients = (dW1, db1, dW2, db2, dW3, db3)
    return gradients


In [None]:
def update_parameters(params, grads, learning_rate):
    """Take one gradient-descent step and return the updated parameters.

    The inputs are not modified: each result is a new array
    ``p - learning_rate * g``. This keeps the caller's arrays intact, so
    re-running the step from the same starting point gives the same result.

    Args:
        params: Iterable of parameter arrays, e.g. ``(W1, b1, W2, b2, W3, b3)``.
        grads: Iterable of gradients in the same order as ``params``.
        learning_rate: Step size.

    Returns:
        Tuple of updated parameters, in the same order as ``params``.

    Raises:
        ValueError: If ``params`` and ``grads`` differ in length.
    """
    params = tuple(params)
    grads = tuple(grads)
    if len(params) != len(grads):
        raise ValueError(
            f"expected one gradient per parameter, got {len(params)} parameters "
            f"and {len(grads)} gradients"
        )
    return tuple(p - learning_rate * g for p, g in zip(params, grads))


In [None]:
def model(X_train, Y_train, X_test, Y_test, learning_rate=0.01, iterations=1000, keep_prob1=0.8, keep_prob2=0.8):
    """Train the three-layer network with full-batch gradient descent.

    Every iteration runs a forward pass on the whole training set, computes
    the cross-entropy cost, back-propagates, and updates the parameters.
    The cost is printed every 100 iterations and the test accuracy is
    printed once training ends. Nothing is returned.

    Args:
        X_train: Training inputs laid out as (features, samples).
        Y_train: One-hot training labels laid out as (classes, samples).
        X_test: Test inputs laid out as (features, samples).
        Y_test: One-hot test labels laid out as (classes, samples).
        learning_rate: Gradient-descent step size.
        iterations: Number of full-batch updates.
        keep_prob1: Dropout keep probability for hidden layer 1 (training only).
        keep_prob2: Dropout keep probability for hidden layer 2 (training only).
    """
    # Read layer widths from the data instead of hard-coding MNIST's 784 / 10
    input_size = X_train.shape[0]
    hidden1_size = 128
    hidden2_size = 64
    output_size = Y_train.shape[0]
    m = X_train.shape[1]  # number of training samples

    # Initialize parameters
    W1, b1, W2, b2, W3, b3 = initialize_parameters(input_size, hidden1_size, hidden2_size, output_size)

    # Training loop
    for i in range(iterations):
        # Forward propagation (dropout active)
        A3, cache = forward_propagation(X_train, W1, b1, W2, b2, W3, b3, keep_prob1, keep_prob2)

        # Cross-entropy: sum over classes, average over samples. This matches
        # the 1/m scaling of the gradients; np.mean would also divide by the
        # number of classes and under-report the loss. 1e-8 guards log(0).
        cost = -np.sum(Y_train * np.log(A3 + 1e-8)) / m

        # Backward propagation
        gradients = backward_propagation(X_train, Y_train, cache, W1, W2, W3, keep_prob1, keep_prob2)

        # Update parameters
        W1, b1, W2, b2, W3, b3 = update_parameters((W1, b1, W2, b2, W3, b3), gradients, learning_rate)

        # Print cost every 100 iterations
        if i % 100 == 0:
            print(f"Iteration {i}: Cost {cost}")

    # Evaluate on the test set with dropout disabled (keep probabilities of 1)
    A3, _ = forward_propagation(X_test, W1, b1, W2, b2, W3, b3, 1, 1)
    predictions = np.argmax(A3, axis=0)
    # Labels are (classes, samples), so the class index lives on axis 0, the
    # same axis used for the predictions. axis=1 would return one value per
    # class rather than one per sample.
    labels = np.argmax(Y_test, axis=0)
    accuracy = np.mean(predictions == labels)

    print(f"Test Accuracy: {accuracy * 100:.2f}%")


In [None]:
# Train and evaluate. The network multiplies weights on the left (W @ X), so
# inputs must be (features, samples) and labels (classes, samples); hence the
# transposes of the row-major arrays prepared earlier.
model(X_train.T, Y_train.T, X_test.T, Y_test.T, learning_rate=0.01, iterations=1000)