In [1]:
# David Darigan (C00263218)

# The following code is taken from Chapter 12 of "Programming Machine Learning" (Paolo Perrotta).
# It is reproduced here as a reference for the underlying mechanics of neural networks.
# This is a functional but imperfect neural network.

import numpy as np

def sigmoid(z):
  """Apply the logistic function 1 / (1 + e^-z) element-wise.

  Args:
    z: A number or NumPy array.

  Returns:
    The logistic of `z`; for an array input the result has the same shape.
  """
  decay = np.exp(-z)
  return 1 / (1 + decay)

def softmax(logits):
  """Turn each row of `logits` into a probability distribution.

  Args:
    logits: 2-D NumPy array; the softmax is taken along axis 1, so every
      row is normalized independently.

  Returns:
    An array of the same shape whose rows are non-negative and sum to 1.
  """
  # Subtracting a row's maximum leaves its softmax mathematically unchanged,
  # but keeps np.exp from overflowing to inf for large logits (which would
  # make the division below evaluate inf / inf = nan).
  shifted = logits - np.max(logits, axis=1, keepdims=True)
  exponentials = np.exp(shifted)
  return exponentials / np.sum(exponentials, axis=1, keepdims=True)

def sigmoid_gradient(sigmoid):
  """Compute s * (1 - s) element-wise.

  Args:
    sigmoid: Value(s) to plug into s * (1 - s). The name suggests this is
      the *output* of the sigmoid rather than its input; `back` passes the
      hidden activations `h` here.

  Returns:
    The element-wise product of `sigmoid` and `1 - sigmoid`.
  """
  complement = 1 - sigmoid
  return np.multiply(sigmoid, complement)

def loss(Y, y_hat):
  """Cross-entropy between `Y` and `y_hat`, averaged over the rows of `Y`.

  Args:
    Y: Array of targets with the same shape as `y_hat` (presumably one-hot
      encoded labels -- confirm against the caller).
    y_hat: Array of predicted probabilities.

  Returns:
    -sum(Y * log(y_hat)) / Y.shape[0]. Terms where both the target and the
    prediction are exactly 0 contribute 0. A zero prediction on a non-zero
    target still yields an infinite loss.
  """
  # log(0) is -inf and 0 * -inf is nan, so a single probability that
  # underflowed to 0 on a non-target class would turn the whole sum into nan.
  # Swap those entries for 1.0 (log(1) = 0) to apply the usual
  # 0 * log(0) = 0 convention; every other entry is left untouched.
  vanishing_terms = (Y == 0) & (y_hat == 0)
  safe_y_hat = np.where(vanishing_terms, 1.0, y_hat)
  return -np.sum(Y * np.log(safe_y_hat)) / Y.shape[0]

def prepend_bias(X):
  """Return a copy of `X` with a column of ones inserted as the first column.

  Args:
    X: 2-D array.

  Returns:
    A new array with one more column than `X`; column 0 is all ones and the
    remaining columns are the original contents of `X`.
  """
  bias_position = 0
  bias_value = 1
  return np.insert(X, bias_position, bias_value, axis=1)

def forward(X, w1, w2):
  """Run the two-layer network forward over a batch of inputs.

  Args:
    X: 2-D input array; a bias column is prepended before multiplying by `w1`.
    w1: Weights of the hidden layer. Its first row multiplies the bias column.
    w2: Weights of the output layer. Its first row multiplies the bias column.

  Returns:
    A tuple `(y_hat, h)`: the softmax outputs and the sigmoid activations of
    the hidden layer (without the bias column).
  """
  hidden_input = np.matmul(prepend_bias(X), w1)
  h = sigmoid(hidden_input)
  output_logits = np.matmul(prepend_bias(h), w2)
  return (softmax(output_logits), h)

def back(X, Y, y_hat, w2, h):
  """Compute the gradients of both weight matrices by backpropagation.

  Args:
    X: 2-D input array used in the forward pass.
    Y: Targets, same shape as `y_hat`.
    y_hat: Network outputs returned by `forward`.
    w2: Output-layer weights used in the forward pass.
    h: Hidden-layer activations returned by `forward`.

  Returns:
    A tuple `(w1_gradient, w2_gradient)`, each divided by the number of rows
    in `X`.
  """
  n_examples = X.shape[0]
  output_error = y_hat - Y
  w2_gradient = np.matmul(prepend_bias(h).T, output_error) / n_examples
  # w2[1:] skips the first row of w2, which multiplies the prepended bias
  # column and therefore has no hidden node to propagate the error back to.
  hidden_error = np.matmul(output_error, w2[1:].T) * sigmoid_gradient(h)
  w1_gradient = np.matmul(prepend_bias(X).T, hidden_error) / n_examples
  return (w1_gradient, w2_gradient)

def classify(X, w1, w2):
  """Predict a class index for every row of `X`.

  Args:
    X: 2-D input array.
    w1: Hidden-layer weights.
    w2: Output-layer weights.

  Returns:
    A column vector (shape `(-1, 1)`) holding, for each row, the index of
    the largest network output.
  """
  probabilities = forward(X, w1, w2)[0]
  return np.argmax(probabilities, axis=1).reshape(-1, 1)

def initialize_weights(n_input_variables, n_hidden_nodes, n_classes):
  """Draw random starting weights for both layers.

  Each matrix gets one extra row for the bias and is filled with standard
  normal samples scaled by sqrt(1 / number_of_rows).

  Args:
    n_input_variables: Number of input columns (bias excluded).
    n_hidden_nodes: Number of hidden nodes (bias excluded).
    n_classes: Number of output columns.

  Returns:
    A tuple `(w1, w2)` with shapes `(n_input_variables + 1, n_hidden_nodes)`
    and `(n_hidden_nodes + 1, n_classes)`.
  """
  def scaled_normal(n_rows, n_columns):
    # Samples come from NumPy's global RNG, so the draw order matters.
    return np.random.randn(n_rows, n_columns) * np.sqrt(1 / n_rows)

  w1 = scaled_normal(n_input_variables + 1, n_hidden_nodes)
  w2 = scaled_normal(n_hidden_nodes + 1, n_classes)
  return (w1, w2)

def report(iteration, X_train, Y_train, X_test, Y_test, w1, w2):
  """Print the training loss and the test accuracy for the given weights.

  Args:
    iteration: Iteration number shown in the printed line.
    X_train: Training inputs, used to compute the loss.
    Y_train: Training targets, used to compute the loss.
    X_test: Test inputs, used to compute the accuracy.
    Y_test: Test labels compared element-wise with the output of `classify`
      (presumably a column of class indices -- confirm against the data
      loader).
    w1: Hidden-layer weights.
    w2: Output-layer weights.
  """
  train_predictions = forward(X_train, w1, w2)[0]
  training_loss = loss(Y_train, train_predictions)
  hits = classify(X_test, w1, w2) == Y_test
  accuracy = np.average(hits) * 100.0
  print("Iteration: %5d, Loss: %.8f, Accuracy: %.2f%%" % (iteration, training_loss, accuracy))

def train(X_train, Y_train, X_test, Y_test, n_hidden_nodes, iterations, lr):
  """Train the network with full-batch gradient descent.

  Every iteration runs a forward pass over all of `X_train`, backpropagates,
  takes one step of size `lr` against each gradient, and prints a report.

  Args:
    X_train: Training inputs; the number of columns sets the input size.
    Y_train: Training targets; the number of columns sets the number of
      classes.
    X_test: Test inputs, used only for reporting.
    Y_test: Test labels, used only for reporting.
    n_hidden_nodes: Size of the hidden layer.
    iterations: Number of gradient-descent steps.
    lr: Learning rate.

  Returns:
    A tuple `(w1, w2)` with the weights after the last step.
  """
  w1, w2 = initialize_weights(X_train.shape[1], n_hidden_nodes, Y_train.shape[1])
  for step in range(iterations):
    y_hat, h = forward(X_train, w1, w2)
    w1_slope, w2_slope = back(X_train, Y_train, y_hat, w2, h)
    w1, w2 = w1 - lr * w1_slope, w2 - lr * w2_slope
    report(step, X_train, Y_train, X_test, Y_test, w1, w2)
  return (w1, w2)

import mnist

# initialize_weights draws from NumPy's global RNG; seed it so that the run
# starts from the same weights on every Restart & Run All.
np.random.seed(1234)

# Train on MNIST with 2200 hidden nodes for 10000 full-batch iterations at a
# learning rate of 0.01. A progress line is printed after every iteration.
w1, w2 = train(mnist.X_train, mnist.Y_train, mnist.X_test, mnist.Y_test, n_hidden_nodes=2200, iterations=10000, lr=0.01)



0 - Loss: 6.93147180559945397249, 9.80%
1 - Loss: 8.43445687508333641347, 68.04%
2 - Loss: 5.51204748892387641490, 68.10%
3 - Loss: 2.95687007359365416903, 68.62%
4 - Loss: 1.89853876570570934135, 73.75%
5 - Loss: 1.75582891552667441637, 81.99%
6 - Loss: 1.67488127292621791220, 81.25%
7 - Loss: 1.62387524342028100044, 82.89%
8 - Loss: 1.56528056897466516517, 82.69%
9 - Loss: 1.52926926510555771799, 83.61%
10 - Loss: 1.48349685001838960119, 83.55%
11 - Loss: 1.45473907235372745816, 84.30%
12 - Loss: 1.41878447814394381687, 84.27%
13 - Loss: 1.39425656696842192872, 84.84%
14 - Loss: 1.36593509106222588123, 84.96%
15 - Loss: 1.34458751883476468336, 85.34%
16 - Loss: 1.32201982320960964579, 85.40%
17 - Loss: 1.30346341841935919881, 85.81%
18 - Loss: 1.28511711376623205538, 85.86%
19 - Loss: 1.26906831515005369404, 86.18%
20 - Loss: 1.25378277537179583234, 86.22%
21 - Loss: 1.23989638166385729434, 86.55%
22 - Loss: 1.22684690399574325426, 86.51%
23 - Loss: 1.21474057573247784525, 86.74%
24 