In [21]:
import numpy as np
from keras.datasets import mnist
from keras.utils import to_categorical

(train_images, train_labels), (test_images, test_labels) = mnist.load_data()

train_images = train_images.reshape((60000, 28 * 28))
train_images = train_images.astype('float32') / 255

test_images = test_images.reshape((10000, 28 * 28))
test_images = test_images.astype('float32') / 255

train_labels_onehot = to_categorical(train_labels)
test_labels_onehot = to_categorical(test_labels)

class NeuralNetwork:
    def __init__(self, input_size, hidden_size, output_size):
        self.weights1 = np.random.randn(input_size, hidden_size)a
        self.weights2 = np.random.randn(hidden_size, output_size)
        self.bias1 = np.random.randn(hidden_size)
        self.bias2 = np.random.randn(output_size)

    def forward(self, x):
        self.z1 = x.dot(self.weights1) + self.bias1
        self.a1 = self.sigmoid(self.z1)
        self.z2 = self.a1.dot(self.weights2) + self.bias2
        self.a2 = self.sigmoid(self.z2)
        return self.a2

    def sigmoid(self, x):
        return 1 / (1 + np.exp(-x))

    def sigmoid_prime(self, x):
        return self.sigmoid(x) * (1 - self.sigmoid(x))

    def compute_loss(self, y, y_pred):
        return -np.sum(y * np.log(y_pred) + (1 - y) * np.log(1 - y_pred))

    def backward(self, x, y, learning_rate=0.01):
        delta2 = (self.a2 - y) * self.sigmoid_prime(self.z2)
        delta1 = delta2.dot(self.weights2.T) * self.sigmoid_prime(self.z1)

        self.weights2 -= learning_rate * self.a1.T.dot(delta2)
        self.bias2 -= learning_rate * np.sum(delta2, axis=0)

        self.weights1 -= learning_rate * x.T.dot(delta1)
        self.bias1 -= learning_rate * np.sum(delta1, axis=0)

def compute_confusion_matrix(true, pred):
    K = len(np.unique(true))
    result = np.zeros((K, K))
    for i in range(len(true)):
        result[true[i]][pred[i]] += 1
    return result

def cross_validation(images, labels, k=5):
    fold_size = len(images) // k
    for fold in range(k):
        val_start = fold * fold_size
        val_end = (fold + 1) * fold_size

        x_val = images[val_start:val_end]
        y_val = labels[val_start:val_end]

        x_train = np.vstack([images[:val_start], images[val_end:]])
        y_train = np.vstack([labels[:val_start], labels[val_end:]])

        yield x_train, y_train, x_val, y_val

all_accuracies = []
all_sensitivities = []
all_specificities = []
all_f1_scores = []

for x_train, y_train, x_val, y_val in cross_validation(train_images, train_labels_onehot):

    nn = NeuralNetwork(input_size=28*28, hidden_size=64, output_size=10)

    epochs = 20
    batch_size = 8
    learning_rate = 0.01
    num_batches = x_train.shape[0] // batch_size

    for epoch in range(epochs):
        total_loss = 0
        for i in range(num_batches):
            x_batch = x_train[i*batch_size:(i+1)*batch_size]
            y_batch = y_train[i*batch_size:(i+1)*batch_size]

            y_pred = nn.forward(x_batch)
            total_loss += nn.compute_loss(y_batch, y_pred)

            nn.backward(x_batch, y_batch, learning_rate)

        average_loss = total_loss / num_batches
        print(f"Epoch {epoch+1}/{epochs}, Loss: {average_loss:.4f}")



    y_val_pred = nn.forward(x_val)
    predicted_labels = np.argmax(y_val_pred, axis=1)
    true_labels = np.argmax(y_val, axis=1)

    confusion = compute_confusion_matrix(true_labels, predicted_labels)

    FP = confusion.sum(axis=0) - np.diag(confusion)
    FN = confusion.sum(axis=1) - np.diag(confusion)
    TP = np.diag(confusion)
    TN = confusion.sum() - (FP + FN + TP)

    accuracy = np.sum(TP) / np.sum(confusion)
    sensitivity = TP / (TP + FN)
    specificity = TN / (TN + FP)

    precision = TP / (TP + FP + 1e-7)
    recall = TP / (TP + FN + 1e-7)
    f1 = 2 * (precision * recall) / (precision + recall + 1e-7)

    all_accuracies.append(accuracy)
    all_sensitivities.append(np.mean(sensitivity))
    all_specificities.append(np.mean(specificity))
    all_f1_scores.append(np.mean(f1))

print("Average Accuracy:", np.mean(all_accuracies))
print("Average Sensitivity:", np.mean(all_sensitivities))
print("Average Specificity:", np.mean(all_specificities))
print("Average F1 Score:", np.mean(all_f1_scores))



Epoch 1/20, Loss: 42.7375
Epoch 2/20, Loss: 24.4986
Epoch 3/20, Loss: 19.2521
Epoch 4/20, Loss: 17.6841
Epoch 5/20, Loss: 16.9413
Epoch 6/20, Loss: 16.4872
Epoch 7/20, Loss: 16.1611
Epoch 8/20, Loss: 15.9048
Epoch 9/20, Loss: 15.6963
Epoch 10/20, Loss: 15.5237
Epoch 11/20, Loss: 15.3768
Epoch 12/20, Loss: 15.2492
Epoch 13/20, Loss: 15.1389
Epoch 14/20, Loss: 15.0441
Epoch 15/20, Loss: 14.9622
Epoch 16/20, Loss: 14.8904
Epoch 17/20, Loss: 14.8265
Epoch 18/20, Loss: 14.7686
Epoch 19/20, Loss: 14.7154
Epoch 20/20, Loss: 14.6660
Epoch 1/20, Loss: 31.9864
Epoch 2/20, Loss: 18.2768
Epoch 3/20, Loss: 12.6674
Epoch 4/20, Loss: 10.2708
Epoch 5/20, Loss: 9.0076
Epoch 6/20, Loss: 8.2566
Epoch 7/20, Loss: 7.7191
Epoch 8/20, Loss: 7.2915
Epoch 9/20, Loss: 6.9299
Epoch 10/20, Loss: 6.6191
Epoch 11/20, Loss: 6.3485
Epoch 12/20, Loss: 6.1122
Epoch 13/20, Loss: 5.9062
Epoch 14/20, Loss: 5.7238
Epoch 15/20, Loss: 5.5581
Epoch 16/20, Loss: 5.4049
Epoch 17/20, Loss: 5.2625
Epoch 18/20, Loss: 5.1326
Epoch 

1. Issue We Want to Solve
We hope to build a simple feed-forward neural network. The main purpose is to classify handwritten numbers in the MNIST dataset. Our aim is to predict which number from 0 to 9 the given handwritten image represents.

2. Data Information

Data Source: We are using the MNIST dataset from the Keras library. This dataset is a very famous set for hand-written digits.
Data Type and Size: It has grayscale images of hand-written numbers. For training, 60,000 images are provided, and for testing, there are 10,000 images.
Data Quality: All images are centered and bounded to fit in a 28x28 pixel box.
3. Data Preparation Steps

Reshaping Images: Each image from the dataset, originally in a 28x28 matrix form, is flattened into a vector of length 784.
Normalization: Pixel values are originally between 0 to 255. We scale these values between 0 to 1 by dividing with 255. This step can make training faster and more stable.
One-hot Encoding: The original labels, which are integers from 0 to 9, are transformed into one-hot encoded vectors. This means, for example, number 2 is represented as [0, 0, 1, 0, 0, 0, 0, 0, 0, 0].
4. How Neural Network is Built
Our neural network is quite basic but effective:

Input Layer: Takes in a vector of size 784, which comes from our 28x28 pixel images.
Hidden Layer: This is a middle layer of 64 neurons. Sigmoid function is used here for activation. It helps to introduce non-linearity to the model.
Output Layer: 10 neurons are here, matching the 10 possible digits. Each neuron predicts the probability of a particular number.
5. Training the Neural Network

Loss Function: We are using negative log-likelihood. It measures the difference between our predicted probabilities and the actual labels.
Backpropagation: After forwarding data through the network, we use backpropagation to update the weights and biases. This algorithm calculates the gradient of the loss function concerning each weight by the chain rule.
Learning Rate: Set at 0.01, it controls how much we adjust the weights in response to the calculated error.
6. About Cross-validation
We don’t want to overfit, so we use k-fold cross-validation (k=5). It means we split our training data into 5 subsets. We train on 4 and validate on 1, then rotate and repeat. This helps in getting a better understanding of the model's performance.

7. Checking Model's Performance
After training, on the validation set, we measure:

Accuracy: Percentage of correctly predicted instances.
Sensitivity (Recall): The ability of our model to identify positive classes correctly.
Specificity: The ability of the classifier to find out the negative classes.
F1 Score: A balance between precision and recall. Higher is better.
8. Final Words on Results
Our neural network, with the settings and parameters given, has been trained on the MNIST dataset. After i test, the 20,8,0.1 is the best parameter for this project, the batch size and learning rate which if i set up too high, it will reduce the accuacy of this model. Future work can look into improving performance by changing architecture, introducing more advanced techniques, or optimizing parameters.

