# Working with MNIST and dropout of hidden units


In this notebook we turn off a random subset of the nodes in each hidden layer, given a dropout fraction p for that layer.

In [5]:
import numpy as np
from keras.datasets import mnist
from keras.utils import to_categorical
import tensorflow

def softmax(y_hat):
    '''
    Column-wise softmax.

    INPUT
    y_hat: Raw scores produced by the forward pass, one column per sample.

    OUTPUT
    An array of the same shape as y_hat in which every column has been
    exponentiated and normalised so that it sums to 1.
    '''
    # Subtracting the per-column maximum leaves the result unchanged but
    # keeps np.exp from overflowing on large scores.
    column_peak = np.max(y_hat, axis=0, keepdims=True)
    numerators = np.exp(y_hat - column_peak)
    normaliser = np.sum(numerators, axis=0, keepdims=True)
    return numerators / normaliser

# Initialize Weights

def init(dims):
    """
    Create random starting weights for a multi-layer feedforward network.

    INPUT:
        dims : Layer sizes [input, hidden1, hidden2, ..., output]

    OUTPUT:
        W : List with one weight matrix per pair of consecutive layers.
            Matrix l has shape (dims[l]+1, dims[l+1]); the extra row holds
            the bias weights. Entries are standard-normal draws scaled by
            sqrt(2 / dims[l]).
    """
    consecutive_layers = zip(dims[:-1], dims[1:])
    return [
        np.random.randn(fan_in + 1, fan_out) * np.sqrt(2 / fan_in)
        for fan_in, fan_out in consecutive_layers
    ]

# Forward Pass

def forward(X, W, dropout_on=False, dropout_p=None):
    """
    Perform the forward pass through all layers, optionally with dropout.

    Every hidden layer is an affine map (the bias is the last row of W[l])
    followed by ReLU. When dropout is enabled for a layer, each hidden
    activation is set to 0 with probability dropout_p[l] and the surviving
    activations are divided by (1 - dropout_p[l]), so the expected activation
    is the same as without dropout. Dropout is never applied to the input or
    to the output layer.

    INPUT:
        X: Input data (shape: features × samples)
        W: Weight matrices for each layer
        dropout_on: True/False statement about the use of dropout
        dropout_p: dropout fraction for each hidden layer (len(W) - 1 values).
                   None means 0.0 for every hidden layer. Values <= 0.0
                   disable dropout for that layer.

    OUTPUT:
        y : Output of the network with softmax applied
        h : List with the activation of every hidden layer (after ReLU and
            dropout). The output layer is not included.
        masks: The dropout masks that were applied (all ones where no dropout
               was used), so they can be reused in the backward pass.

    RAISES:
        ValueError: if dropout_p does not have one value per hidden layer, or
                    if dropout is on and a fraction is >= 1.0 (the rescaling
                    by 1 / (1 - p) would divide by zero or flip the sign).
    """
    num_hidden = len(W) - 1

    # Default to "no dropout anywhere"; otherwise require one value per hidden layer.
    if dropout_p is None:
        dropout_p = [0.0] * num_hidden
    elif len(dropout_p) != num_hidden:
        raise ValueError(f"dropout_p must have {num_hidden} values")

    h = []
    masks = []
    # X itself is never modified: np.vstack below always builds a new array.
    a = X

    for l in range(num_hidden):
        a = np.vstack([a, np.ones(a.shape[1])])  # append a row of ones for the bias
        z = W[l].T @ a
        a = np.maximum(0, z)  # ReLU

        p = dropout_p[l]
        if dropout_on and p > 0.0:
            if p >= 1.0:
                raise ValueError(
                    f"dropout_p[{l}] must be smaller than 1.0, got {p}"
                )
            # Keep a unit when its random draw exceeds p; scale the survivors
            # by 1 / (1 - p) so the expected activation is preserved.
            mask = (np.random.rand(*a.shape) > p).astype(float) / (1.0 - p)
            a *= mask
        else:
            mask = np.ones_like(a)  # identity mask when this layer has no dropout

        h.append(a)
        masks.append(mask)

    # Output layer: affine map followed by softmax, no dropout.
    a = np.vstack([a, np.ones(a.shape[1])])
    y_hat = W[-1].T @ a
    y = softmax(y_hat)
    return y, h, masks


# Backward Pass

def backward(X, T, W, y, h, masks, eta):
    """
    Perform one backward pass and update the weights with gradient descent.

    All gradients are computed from the weights that produced y and h: the
    error signal is propagated through W[l] before W[l] is updated, so every
    layer receives the gradient of the same loss.

    INPUT:
        X : Input data (features × samples)
        T : Target labels (one-hot encoded, classes × samples)
        W : Current weight matrices; they are updated in place
        y : Softmax output of the forward pass (classes × samples)
        h : Hidden-layer activations from the forward pass (empty list for a
            network without hidden layers)
        masks: The dropout masks from the forward pass, one per hidden layer
        eta : Learning rate

    OUTPUT:
        W : Updated weight matrices (the same list that was passed in)
        loss : Total (summed) cross-entropy loss for this batch, computed
               from the y that was passed in
    """
    m = X.shape[1]
    num_hidden = len(W) - 1
    step = eta / m

    # layer_inputs[l] is the activation that feeds weight matrix W[l].
    layer_inputs = [X] + list(h)

    # Gradient of softmax + cross-entropy with respect to the output scores.
    delta = y - T

    for l in range(num_hidden, -1, -1):
        if l < num_hidden:
            # Hidden layer: pass the error through the ReLU and the dropout
            # mask (the mask also carries the 1 / (1 - p) rescaling).
            relu_grad = (h[l] > 0).astype(float)
            delta = delta * (relu_grad * masks[l])

        source = layer_inputs[l]
        a_prev = np.vstack([source, np.ones(source.shape[1])])  # add bias row
        grad = a_prev @ delta.T

        # Propagate with the pre-update weights; the bias row is dropped
        # because the constant 1 input has no upstream layer.
        if l > 0:
            upstream_delta = W[l][:-1, :] @ delta

        W[l] -= step * grad

        if l > 0:
            delta = upstream_delta

    # Cross-entropy of the predictions; epsilon guards against log(0).
    epsilon = 1e-12
    loss = -np.sum(np.log(np.sum(y * T, axis=0) + epsilon))
    return W, loss


# Training Loop

def train(X, T, W, epochs, eta, batchsize=32, dropout_on=False, dropout_p=None):
    """
    Fit the network with mini-batch gradient descent.

    Every epoch visits the samples in a fresh random order, split into
    consecutive mini-batches; each mini-batch runs one forward and one
    backward pass. The summed loss of the epoch is recorded and printed.

    INPUT:
        X : Input data (features × samples)
        T : Target one-hot labels (classes × samples)
        W : Initialized weight matrices
        epochs :  Number of training epochs
        eta : Learning rate
        batchsize : Number of samples per mini-batch (the last one may be smaller)
        dropout_on: True/False, passed on to forward
        dropout_p: dropout fraction per hidden layer, passed on to forward

    OUTPUT:
        W : Trained weight matrices
        losses : List with the total loss of each epoch
    """
    n_samples = X.shape[1]
    losses = []

    for epoch in range(epochs):
        shuffled = np.random.permutation(n_samples)
        epoch_loss = 0

        for start in range(0, n_samples, batchsize):
            picked = shuffled[start:start + batchsize]
            inputs, targets = X[:, picked], T[:, picked]

            y, h, masks = forward(
                inputs, W, dropout_on=dropout_on, dropout_p=dropout_p
            )
            W, batch_loss = backward(inputs, targets, W, y, h, masks, eta)
            epoch_loss += batch_loss

        losses.append(epoch_loss)
        print(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss:.4f}")

    return W, losses


# Prediction

def predict(X, W):
    '''
    Classify samples with the trained network.

    INPUT:
    X: Data to classify (features × samples)
    W: Weight matrices

    OUTPUT:
    For every column of X, the row index of the largest network output,
    i.e. the predicted label. Dropout is switched off for prediction.
    '''
    class_scores = forward(X, W, dropout_on=False)[0]
    return np.argmax(class_scores, axis=0)



In [6]:
# --------------------------
# Load MNIST
# --------------------------

# Flatten every 28x28 image into a row of 784 features and divide by 255.0
# (assumes 8-bit pixel values, which gives inputs in the 0-1 range).
(X_train, y_train), (X_test, y_test) = mnist.load_data()
X_train = X_train.reshape(-1, 28*28) / 255.0
X_test  = X_test.reshape(-1, 28*28) / 255.0
# One-hot encode the 10 digit classes. T_test is built for symmetry but is
# not used by the evaluation below, which compares against y_test directly.
T_train = to_categorical(y_train, num_classes=10)
T_test  = to_categorical(y_test, num_classes=10)

# --------------------------
# Initialize network
# --------------------------

# 784 inputs, two hidden layers with 32 units each, 10 output classes.
dims = [784, 32, 32, 10]
W = init(dims)

# --------------------------
# Train
# --------------------------

# Dropout fractions, one per hidden layer (none for input or output layer).
dropout_p = [0.3, 0.5]  # hidden1 30%, hidden2 50%

# The data is stored as samples × features, while train expects
# features × samples, hence the transposes.
W, losses = train(
    X_train.T, T_train.T, W,
    epochs=50,
    eta=0.001,
    batchsize=32,
    dropout_on=True,
    dropout_p=dropout_p
)

# --------------------------
# Evaluate
# --------------------------

# predict runs the network with dropout switched off; accuracy is the
# fraction of test images whose predicted digit equals the true label.
y_pred = predict(X_test.T, W)
accuracy = np.mean(y_pred == y_test)
print(f"Test Accuracy: {accuracy*100:.2f}%")

Epoch 1/50, Loss: 128821.7245
Epoch 2/50, Loss: 108043.8058
Epoch 3/50, Loss: 94840.6707
Epoch 4/50, Loss: 85533.9991
Epoch 5/50, Loss: 78925.6997
Epoch 6/50, Loss: 73872.2594
Epoch 7/50, Loss: 69590.5968
Epoch 8/50, Loss: 66333.7258
Epoch 9/50, Loss: 63592.8952
Epoch 10/50, Loss: 60724.1267
Epoch 11/50, Loss: 58777.9915
Epoch 12/50, Loss: 57161.3534
Epoch 13/50, Loss: 55395.6671
Epoch 14/50, Loss: 53796.4436
Epoch 15/50, Loss: 52827.3757
Epoch 16/50, Loss: 51152.9119
Epoch 17/50, Loss: 50386.1601
Epoch 18/50, Loss: 49093.9597
Epoch 19/50, Loss: 48389.3018
Epoch 20/50, Loss: 47471.6146
Epoch 21/50, Loss: 46748.7468
Epoch 22/50, Loss: 45835.6456
Epoch 23/50, Loss: 45455.7021
Epoch 24/50, Loss: 44427.3001
Epoch 25/50, Loss: 43700.3616
Epoch 26/50, Loss: 43451.5236
Epoch 27/50, Loss: 42710.6251
Epoch 28/50, Loss: 42418.4476
Epoch 29/50, Loss: 41850.4140
Epoch 30/50, Loss: 41579.9785
Epoch 31/50, Loss: 41315.8041
Epoch 32/50, Loss: 40723.1266
Epoch 33/50, Loss: 39808.1450
Epoch 34/50, Loss