In [2]:
import pickle
import os
import pandas as pd
import numpy as np


In [5]:
# Locations of the pickled train/test splits, relative to the working directory.
train_file = "extended_mnist_train.pkl"
test_file = "extended_mnist_test.pkl"


def _read_pickle(path):
    """Open `path` in binary mode and return the unpickled object."""
    # NOTE(review): pickle.load can execute arbitrary code while loading;
    # only point this at files from a trusted source.
    with open(path, "rb") as fp:
        return pickle.load(fp)


train = _read_pickle(train_file)
test = _read_pickle(test_file)

In [6]:
# Walk `train` once, flattening each image, then split the (pixels, label)
# pairs into two parallel lists.
flattened_pairs = [(image.flatten(), label) for image, label in train]
train_data = [pixels for pixels, _ in flattened_pairs]
train_labels = [label for _, label in flattened_pairs]


In [7]:
# Keep only the flattened images; the second element of each test pair is ignored.
test_data = [image.flatten() for image, _ in test]


In [21]:
# 4 Homework - 15 points
# In this exercise, you are tasked with implementing both the forward and backward propagation processes for a neural network with 784 inputs and 10 outputs
# using NumPy. This network can be thought of as consisting of 10 perceptrons,
# each responsible for predicting one of the 10 output classes.
# 4.1 Problem Statement
# Given an input matrix X of shape (m, 784), where m is the batch size and 784
# is the number of features (input neurons), a weight matrix W of shape (784, 10),
# and a bias vector b of shape (10,), compute the output of the network for each
# example in the batch, calculate the error, and update the weights and biases
# accordingly.


# You must use NumPy to implement from scratch
# Labels as a NumPy array (used as integer class indices by one_hot below).
y_train = np.array(train_labels)

# Stack the flattened images row-wise and divide by 255.0
# (presumably mapping 8-bit pixel intensities into [0, 1] — confirm against the data).
X_train = np.array(train_data) / 255.0

def softmax(z):
    """Return the softmax of `z` taken along axis 1.

    Each slice along axis 1 is shifted by its own maximum before
    exponentiating, so large inputs do not overflow; the shift cancels in the
    ratio and does not change the result.

    Args:
        z: array with at least two dimensions (scores of shape (m, k) here).

    Returns:
        Array of the same shape as `z` whose entries sum to 1 along axis 1.
    """
    shifted = z - np.max(z, axis=1, keepdims=True)
    numerators = np.exp(shifted)
    denominators = np.sum(numerators, axis=1, keepdims=True)
    return numerators / denominators

def one_hot(y, num_classes):
    """Encode integer class labels as one-hot rows.

    Args:
        y: 1-D NumPy array of integer labels; each value is used as a column
            index into the result.
        num_classes: number of columns in the encoding.

    Returns:
        Float array of shape (len(y), num_classes) that is 1 at
        [row, y[row]] and 0 everywhere else.
    """
    n_samples = y.shape[0]
    encoded = np.zeros((n_samples, num_classes))
    row_index = np.arange(n_samples)
    # Paired fancy indexing: sets exactly one entry per row.
    encoded[row_index, y] = 1
    return encoded

def forward(X, W, b):
    """Run the forward pass of the single-layer softmax network.

    Args:
        X: input batch, shape (m, n_inputs).
        W: weight matrix, shape (n_inputs, n_outputs).
        b: bias vector, shape (n_outputs,), broadcast across the batch.

    Returns:
        Tuple (Z, A): the pre-activation scores X·W + b and softmax(Z).
    """
    logits = np.dot(X, W)
    logits = logits + b
    return logits, softmax(logits)

def backward(X, Y, A):
    """Compute batch-averaged gradients for the weights and biases.

    Args:
        X: input batch, shape (m, n_inputs).
        Y: target rows (one-hot in this notebook), same shape as `A`.
        A: network outputs for the batch, shape (m, n_outputs).

    Returns:
        Tuple (dW, db): gradient with respect to W, shape
        (n_inputs, n_outputs), and with respect to b, shape (n_outputs,).
    """
    batch_scale = 1 / X.shape[0]
    # Error signal at the pre-activation: output minus target.
    residual = A - Y
    grad_W = batch_scale * X.T.dot(residual)
    grad_b = batch_scale * residual.sum(axis=0)
    return grad_W, grad_b

def update_params(W, b, dW, db, learning_rate):
    """Take one gradient-descent step.

    Builds new arrays instead of modifying `W` and `b` in place, so the
    caller must rebind the returned values.

    Returns:
        Tuple (W, b) after subtracting learning_rate times each gradient.
    """
    return W - learning_rate * dW, b - learning_rate * db

def calculate_accuracy(A, y):
    """Return the fraction of rows of `A` whose argmax equals the label in `y`.

    Args:
        A: per-class scores, shape (m, n_classes).
        y: integer labels, shape (m,).
    """
    predicted_classes = np.argmax(A, axis=1)
    hits = predicted_classes == y
    return np.mean(hits)

# Layer dimensions: one input per feature column of X_train, one output per class.
n_inputs = X_train.shape[1] # 784
n_outputs = 10

# Targets as one-hot rows, the same layout as the network's softmax output.
Y_train_one_hot = one_hot(y_train, num_classes=n_outputs)

# Seed NumPy's global RNG so the weight initialisation is reproducible.
np.random.seed(1)
b = np.zeros(n_outputs)
# Small Gaussian weights (standard normal draws scaled by 0.01).
W = 0.01 * np.random.randn(n_inputs, n_outputs)

# --- Training ---
epochs = 200
learning_rate = 0.3
batch_size = 216
log_every = 10  # report metrics every `log_every` epochs (and on the final epoch)

for epoch in range(epochs):
    # One pass over the training set in fixed-order mini-batches. The last
    # batch may be shorter than batch_size because slicing clips at the end.
    for i in range(0, X_train.shape[0], batch_size):
        X_batch = X_train[i:i+batch_size]
        Y_batch = Y_train_one_hot[i:i+batch_size]

        # Forward propagation
        Z, A = forward(X_batch, W, b)

        # Backward propagation
        dW, db = backward(X_batch, Y_batch, A)

        # Update parameters
        W, b = update_params(W, b, dW, db, learning_rate)

    # --- Print loss and accuracy on logging epochs only ---
    # The full-dataset forward pass is only needed for the printed metrics,
    # so it is skipped on epochs that print nothing.
    if (epoch % log_every == 0) or (epoch == epochs - 1):
        Z_full, A_full = forward(X_train, W, b)
        # Cross-entropy loss; the 1e-8 keeps log() finite if a probability is 0.
        loss = -np.mean(np.sum(Y_train_one_hot * np.log(A_full + 1e-8), axis=1))
        accuracy = calculate_accuracy(A_full, y_train)
        print(f"Epoch {epoch}: Loss = {loss:.4f}, Accuracy = {accuracy:.4f}")

# Score the whole training set once more with the trained parameters;
# only the softmax output is needed, so the pre-activations are discarded.
_, A_final = forward(
    X_train,
    W,
    b,
)
final_accuracy = calculate_accuracy(A_final, y_train)
print(f"\nFinal Training Accuracy: {final_accuracy * 100:.2f}%")


Epoch 0: Loss = 0.3748, Accuracy = 0.8967
Epoch 10: Loss = 0.2856, Accuracy = 0.9216
Epoch 20: Loss = 0.2728, Accuracy = 0.9255
Epoch 30: Loss = 0.2663, Accuracy = 0.9274
Epoch 40: Loss = 0.2620, Accuracy = 0.9287
Epoch 50: Loss = 0.2589, Accuracy = 0.9295
Epoch 60: Loss = 0.2565, Accuracy = 0.9300
Epoch 70: Loss = 0.2546, Accuracy = 0.9307
Epoch 80: Loss = 0.2529, Accuracy = 0.9310
Epoch 90: Loss = 0.2515, Accuracy = 0.9315
Epoch 100: Loss = 0.2503, Accuracy = 0.9317
Epoch 110: Loss = 0.2493, Accuracy = 0.9320
Epoch 120: Loss = 0.2483, Accuracy = 0.9323
Epoch 130: Loss = 0.2475, Accuracy = 0.9325
Epoch 140: Loss = 0.2467, Accuracy = 0.9328
Epoch 150: Loss = 0.2460, Accuracy = 0.9330
Epoch 160: Loss = 0.2453, Accuracy = 0.9332
Epoch 170: Loss = 0.2447, Accuracy = 0.9333
Epoch 180: Loss = 0.2442, Accuracy = 0.9335
Epoch 190: Loss = 0.2437, Accuracy = 0.9335
Epoch 199: Loss = 0.2432, Accuracy = 0.9337

Final Training Accuracy: 93.38%


In [16]:
# Normalize test data with the same 255.0 divisor used for the training set
X_test = np.array(test_data)
X_test = X_test / 255.0

# Predicted class for each test row = index of the largest softmax output
_, A_test = forward(X_test, W, b)
predictions = A_test.argmax(axis=1)



In [17]:
# This is how you prepare a submission for the competition
# "ID" is the row position of each prediction; "target" is the predicted class.
predictions_csv = {
    "ID": list(range(len(predictions))),
    "target": list(predictions),
}

df = pd.DataFrame(predictions_csv)
df.to_csv("submission.csv", index=False)