In [1]:
import torch
import pandas as pd
import math

In [2]:
def relu(x):
    """Element-wise rectified linear unit, ``max(x, 0)``.

    Implemented with ``torch.clamp`` instead of the ``(x + |x|) / 2``
    identity: that identity evaluates ``-inf + inf`` (NaN) for ``-inf``
    inputs and overflows to ``inf`` once ``|x|`` exceeds half of the dtype's
    largest finite value.

    Args:
        x: Input tensor of any shape.

    Returns:
        Tensor of the same shape with negative entries replaced by zero.
        Non-floating inputs are promoted to the default float dtype, which is
        the result type the division-based formulation produced.
    """
    if not torch.is_floating_point(x):
        # Keep the historical result dtype for integer/bool inputs.
        x = x.to(torch.get_default_dtype())
    return torch.clamp(x, min=0)

def relu_derivative(x):
    """Derivative mask of ReLU.

    Returns a float32 tensor shaped like ``x`` holding 1.0 where ``x`` is
    strictly positive and 0.0 everywhere else (including at exactly zero).
    """
    positive_mask = torch.gt(x, 0)
    return positive_mask.to(torch.float32)

def softmax(x, dim=1):
    """Numerically stable softmax along ``dim``.

    The maximum along ``dim`` is subtracted before exponentiating so the
    largest exponent is ``exp(0) = 1``, which avoids overflow for large
    logits without changing the result.

    Args:
        x: Tensor of scores.
        dim: Axis to normalise over. Defaults to 1, i.e. the class axis of a
            ``(batch, classes)`` matrix, which is how this notebook calls it.
            Pass ``dim=0`` (or ``-1``) for a 1-D vector of scores.

    Returns:
        Tensor shaped like ``x`` whose entries along ``dim`` sum to 1.
    """
    shifted = x - torch.max(x, dim=dim, keepdim=True).values
    exp_shifted = torch.exp(shifted)
    return exp_shifted / torch.sum(exp_shifted, dim=dim, keepdim=True)

def cross_entropy_loss(probs, labels):
    """Mean negative log-likelihood of the true class.

    Args:
        probs: ``(N, C)`` floating-point tensor of per-class probabilities.
        labels: ``(N,)`` integer tensor of true class indices, used to pick
            one probability per row.

    Returns:
        Scalar tensor: the mean over rows of ``-log(probs[i, labels[i]])``.

    The selected probabilities are floored at the dtype's smallest positive
    normal value before the logarithm. Without the floor, a probability that
    has underflowed to exactly 0 produces ``-log(0) = inf`` and the reported
    loss becomes infinite; probabilities above the floor are unaffected.
    """
    N = probs.shape[0]
    row_index = torch.arange(N, device=probs.device)
    correct_probs = probs[row_index, labels]
    floor = torch.finfo(probs.dtype).tiny
    loss = -torch.log(correct_probs.clamp_min(floor))
    return loss.mean()

def load_data(csv_path, device):
    """Read a label-plus-pixels CSV and return ``(images, labels)`` tensors.

    The first line of the file is skipped and the remaining rows are read
    without a header. Column 0 is taken as the integer label; every other
    column is treated as a pixel value and divided by 255.

    Args:
        csv_path: Path of the CSV file to read.
        device: Torch device the returned tensors are moved to.

    Returns:
        Tuple ``(images, labels)``: a float32 tensor with one row per CSV row
        and an int64 tensor of labels, both on ``device``.
    """
    table = pd.read_csv(csv_path, skiprows=1, low_memory=False, header=None).values
    label_column = table[:, 0].astype(int)
    # Scale the raw pixel values by 1/255.
    pixel_columns = table[:, 1:].astype('float32') / 255.0
    return (
        torch.tensor(pixel_columns, dtype=torch.float32).to(device),
        torch.tensor(label_column, dtype=torch.long).to(device),
    )

In [3]:
# Select the compute device: CUDA when PyTorch reports it as available,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)

Using device: cuda


In [4]:
# Seed the RNG so the random weight draws below are repeatable.
torch.manual_seed(42)

# Layer widths: input -> three hidden layers -> output.
in_features, hidden1, hidden2, hidden3, out_features = 784, 512, 256, 128, 10


def _init_layer(fan_in, fan_out):
    """Create one layer's parameters on `device`.

    Returns (weights, bias): a (fan_in, fan_out) float32 matrix of standard
    normal draws scaled by sqrt(2 / fan_in) (He initialization), and a
    (1, fan_out) float32 row of zeros.
    """
    weights = torch.randn(fan_in, fan_out, dtype=torch.float32, device=device) * math.sqrt(2.0 / fan_in)
    bias = torch.zeros(1, fan_out, dtype=torch.float32, device=device)
    return weights, bias


# The layers are created in order W1, W2, W3, W4 so that each one consumes the
# same portion of the seeded random stream on every run.
W1, b1 = _init_layer(in_features, hidden1)
W2, b2 = _init_layer(hidden1, hidden2)
W3, b3 = _init_layer(hidden2, hidden3)
W4, b4 = _init_layer(hidden3, out_features)

In [5]:
# Load the data. load_data already moves both returned tensors to `device`,
# so no additional .to(device) transfer is needed afterwards.
train_images, train_labels = load_data('mnist_train.csv', device)
test_images, test_labels = load_data('mnist_test.csv', device)

# Training hyperparameters
lambda_l2 = 0.001                    # coefficient of the sum-of-squared-weights penalty
num_epochs = 10                      # full passes over the training set
batch_size = 32                      # samples per mini-batch
num_train = train_images.shape[0]    # number of training samples

In [6]:
# Adam hyperparameters: step size, decay rates of the two moment averages,
# and the constant added to the denominator of the update.
lr = 0.0001
beta1 = 0.9
beta2 = 0.999
epsilon = 1e-8

# Trainable tensors, addressed by name.
params = {'W1': W1, 'b1': b1, 'W2': W2, 'b2': b2, 'W3': W3, 'b3': b3, 'W4': W4, 'b4': b4}

# Running averages of the gradients (m) and squared gradients (v): one
# zero-filled buffer per parameter, matching its shape, dtype and device.
m = {key: torch.zeros_like(tensor) for key, tensor in params.items()}
v = {key: torch.zeros_like(tensor) for key, tensor in params.items()}

In [7]:
t = 0  # Adam step counter; drives the bias-correction terms in the update.
# NOTE: only `t` is reset here. The weights and the Adam buffers m/v are
# created in earlier cells, so re-running this cell on its own continues from
# the current weights and moments with a restarted bias correction. Re-run the
# initialisation cells first for a clean training run.


def _forward(inputs):
    """Run the 4-layer MLP on `inputs` of shape (N, in_features).

    Returns (z1, a1, z2, a2, z3, a3, probs): the pre-activation and ReLU
    activation of each hidden layer, followed by the softmax class
    probabilities of shape (N, out_features).
    """
    z1 = torch.matmul(inputs, W1) + b1       # (N, hidden1)
    a1 = relu(z1)
    z2 = torch.matmul(a1, W2) + b2           # (N, hidden2)
    a2 = relu(z2)
    z3 = torch.matmul(a2, W3) + b3           # (N, hidden3)
    a3 = relu(z3)
    logits = torch.matmul(a3, W4) + b4       # (N, out_features)
    return z1, a1, z2, a2, z3, a3, softmax(logits)


def _l2_penalty():
    """L2 regularization term: lambda_l2 times the sum of squared weights (biases excluded)."""
    return lambda_l2 * (torch.sum(W1**2) + torch.sum(W2**2) + torch.sum(W3**2) + torch.sum(W4**2))


for epoch in range(num_epochs):
    # Fresh shuffle of the sample indices every epoch.
    permutation = torch.randperm(num_train, device=device)
    running_loss = 0.0
    correct_train = 0

    for i in range(0, num_train, batch_size):
        t += 1
        indices = permutation[i:i+batch_size]
        X = train_images[indices]  # (B, 784)
        y = train_labels[indices]  # (B,)
        B = X.shape[0]             # the last batch may be smaller than batch_size

        # Forward pass
        z1, a1, z2, a2, z3, a3, probs = _forward(X)

        # Cross-entropy plus the L2 penalty
        loss_total = cross_entropy_loss(probs, y) + _l2_penalty()

        # Backward pass (manual gradients)
        one_hot = torch.zeros_like(probs)
        one_hot[torch.arange(B, device=device), y] = 1

        # Gradient of the mean cross-entropy w.r.t. the logits.
        d_logits = (probs - one_hot) / B                 # (B, 10)

        dW4 = torch.matmul(a3.t(), d_logits)             # (hidden3, 10)
        db4 = d_logits.sum(dim=0, keepdim=True)          # (1, 10)

        d_a3 = torch.matmul(d_logits, W4.t())            # (B, hidden3)
        d_z3 = d_a3 * relu_derivative(z3)                # (B, hidden3)

        dW3 = torch.matmul(a2.t(), d_z3)                 # (hidden2, hidden3)
        db3 = d_z3.sum(dim=0, keepdim=True)              # (1, hidden3)

        d_a2 = torch.matmul(d_z3, W3.t())                # (B, hidden2)
        d_z2 = d_a2 * relu_derivative(z2)                # (B, hidden2)

        dW2 = torch.matmul(a1.t(), d_z2)                 # (hidden1, hidden2)
        db2 = d_z2.sum(dim=0, keepdim=True)              # (1, hidden2)

        d_a1 = torch.matmul(d_z2, W2.t())                # (B, hidden1)
        d_z1 = d_a1 * relu_derivative(z1)                # (B, hidden1)

        dW1 = torch.matmul(X.t(), d_z1)                  # (in_features, hidden1)
        db1 = d_z1.sum(dim=0, keepdim=True)              # (1, hidden1)

        # Add regularization gradients: derivative of the L2 term is 2 * lambda_l2 * W
        dW1 += 2 * lambda_l2 * W1
        dW2 += 2 * lambda_l2 * W2
        dW3 += 2 * lambda_l2 * W3
        dW4 += 2 * lambda_l2 * W4

        # Adam optimizer update
        grads = {'W1': dW1, 'b1': db1, 'W2': dW2, 'b2': db2, 'W3': dW3, 'b3': db3, 'W4': dW4, 'b4': db4}

        for key, param in params.items():
            grad = grads[key]
            # Moving averages of the gradient (m) and squared gradient (v).
            m[key] = beta1 * m[key] + (1 - beta1) * grad
            v[key] = beta2 * v[key] + (1 - beta2) * grad**2
            # Bias-corrected moment estimates.
            m_hat = m[key] / (1 - beta1**t)
            v_hat = v[key] / (1 - beta2**t)
            # In-place update: `params` holds the same tensor objects as
            # W1..b4, so this modifies the weights used by _forward.
            param -= lr * m_hat / (torch.sqrt(v_hat) + epsilon)

        # Accumulate total loss and correct predictions for accuracy calculation.
        # Both use the probabilities computed before this step's update.
        running_loss += loss_total.item() * B
        preds = torch.argmax(probs, dim=1)
        correct_train += (preds == y).sum().item()

    # Average training loss and accuracy over the epoch
    train_loss = running_loss / num_train
    train_accuracy = 100.0 * correct_train / num_train

    # Evaluate on the full test set in a single forward pass
    probs_test = _forward(test_images)[-1]

    # Test loss includes the same L2 penalty as the training loss; converted
    # to a Python float so every reported metric has the same type.
    test_loss_total = (cross_entropy_loss(probs_test, test_labels) + _l2_penalty()).item()

    preds_test = torch.argmax(probs_test, dim=1)
    test_accuracy = 100.0 * (preds_test == test_labels).sum().item() / test_labels.shape[0]

    # Report metrics after every epoch
    print(f"Epoch {epoch+1}/{num_epochs}: "
          f"Train Loss: {train_loss:.4f}, Train Acc: {train_accuracy:.2f}%, "
          f"Test Loss: {test_loss_total:.4f}, Test Acc: {test_accuracy:.2f}%")

    # Overwritten each epoch, so the last epoch's metrics remain after the loop
    final_train_loss = train_loss
    final_train_accuracy = train_accuracy
    final_test_loss = test_loss_total
    final_test_accuracy = test_accuracy

Epoch 1/10: Train Loss: 1.5778, Train Acc: 90.44%, Test Loss: 1.1280, Test Acc: 95.36%
Epoch 2/10: Train Loss: 0.9868, Train Acc: 95.71%, Test Loss: 0.8551, Test Acc: 96.35%
Epoch 3/10: Train Loss: 0.7611, Train Acc: 96.63%, Test Loss: 0.6758, Test Acc: 96.92%
Epoch 4/10: Train Loss: 0.6087, Train Acc: 97.18%, Test Loss: 0.5539, Test Acc: 96.97%
Epoch 5/10: Train Loss: 0.4984, Train Acc: 97.54%, Test Loss: 0.4650, Test Acc: 97.03%
Epoch 6/10: Train Loss: 0.4194, Train Acc: 97.70%, Test Loss: 0.3965, Test Acc: 97.25%
Epoch 7/10: Train Loss: 0.3590, Train Acc: 97.97%, Test Loss: 0.3479, Test Acc: 97.53%
Epoch 8/10: Train Loss: 0.3153, Train Acc: 98.11%, Test Loss: 0.3090, Test Acc: 97.70%
Epoch 9/10: Train Loss: 0.2826, Train Acc: 98.22%, Test Loss: 0.2920, Test Acc: 97.51%
Epoch 10/10: Train Loss: 0.2582, Train Acc: 98.33%, Test Loss: 0.2742, Test Acc: 97.40%


In [8]:
# Summarise the metrics recorded for the last completed epoch.
summary_lines = [
    "\nFinal Results:",
    f"Train Loss: {final_train_loss:.4f}, Train Acc: {final_train_accuracy:.2f}%",
    f"Test Loss: {final_test_loss:.4f}, Test Acc: {final_test_accuracy:.2f}%",
]
print("\n".join(summary_lines))


Final Results:
Train Loss: 0.2582, Train Acc: 98.33%
Test Loss: 0.2742, Test Acc: 97.40%


In [9]:
import pandas as pd

# Back-of-the-envelope memory estimate for the network, for several batch sizes.

float_size = 4            # bytes per stored value (float32)
BYTES_PER_MB = 1_000_000  # decimal megabytes

# Layer widths of the network, input to output.
layer_sizes = [784, 512, 256, 128, 10]

# Parameters: one (fan_in x fan_out) weight matrix plus fan_out biases per layer.
total_weights = sum(
    fan_in * fan_out + fan_out
    for fan_in, fan_out in zip(layer_sizes[:-1], layer_sizes[1:])
)  # 567,434

# Values stored per sample in the forward pass: the input, then a
# pre-activation/activation pair (Z, A) for every subsequent layer.
activations_per_sample = layer_sizes[0] + sum(2 * width for width in layer_sizes[1:])  # 2,596

# Intermediate gradients per sample in the backward pass: (dZ, dA) for each
# hidden layer and dZ alone for the output layer.
deltas_per_sample = sum(2 * width for width in layer_sizes[1:-1]) + layer_sizes[-1]  # 1,802

# Adam keeps two extra values (m and v) for every parameter.
adam_states = 2 * total_weights  # 1,134,868

batch_sizes = [1, 32, 1024]

# These byte counts do not depend on the batch size, so compute them once.
params_bytes = total_weights * float_size
adam_bytes = adam_states * float_size

results = []
for batch in batch_sizes:
    # Forward pass: parameters plus the stored activations of the whole batch.
    activations_bytes = activations_per_sample * batch * float_size
    forward_mb = (params_bytes + activations_bytes) / BYTES_PER_MB

    # Backward pass (SGD): one value per parameter (presumably its gradient)
    # plus the per-sample intermediate gradients of the whole batch.
    backward_bytes_sgd = (total_weights + deltas_per_sample * batch) * float_size
    backward_mb_sgd = backward_bytes_sgd / BYTES_PER_MB

    # Backward pass (Adam): the SGD figure plus the optimizer's moment buffers.
    backward_mb_adam = (backward_bytes_sgd + adam_bytes) / BYTES_PER_MB

    results.append({
        "Batch Size": batch,
        "Forward Pass Total (MB)": round(forward_mb, 3),
        "Backward Pass SGD (MB)": round(backward_mb_sgd, 3),
        "Backward Pass Adam (MB)": round(backward_mb_adam, 3),
        "Total (SGD) MB": round(forward_mb + backward_mb_sgd, 3),
        "Total (Adam) MB": round(forward_mb + backward_mb_adam, 3)
    })

memory_summary_df = pd.DataFrame(results)
print(memory_summary_df.to_string(index=False))

 Batch Size  Forward Pass Total (MB)  Backward Pass SGD (MB)  Backward Pass Adam (MB)  Total (SGD) MB  Total (Adam) MB
          1                    2.280                   2.277                    6.816           4.557            9.097
         32                    2.602                   2.500                    7.040           5.102            9.642
       1024                   12.903                   9.651                   14.190          22.554           27.093
