In [1]:
import numpy as np
from collections import OrderedDict

### Define the activation function, loss function, layers, and network

In [2]:
def softmax(x):
    """Return the softmax of ``x`` computed along its last axis.

    Works for a single score vector (1-D) as well as a batch of score
    vectors (2-D, one row per sample) or any higher-rank array; the result
    has the same shape as ``x`` and sums to 1 along the last axis.

    Parameters
    ----------
    x : numpy.ndarray
        Raw scores (logits) with at least one dimension.

    Returns
    -------
    numpy.ndarray
        Probabilities with the same shape as ``x``.
    """
    # Subtracting the per-row maximum leaves the result unchanged but keeps
    # np.exp from overflowing on large scores.
    shifted = x - np.max(x, axis=-1, keepdims=True)
    exp_scores = np.exp(shifted)
    return exp_scores / np.sum(exp_scores, axis=-1, keepdims=True)

In [3]:
def cross_entropy_error(y, t):
    """Compute the cross-entropy loss averaged over the batch.

    Parameters
    ----------
    y : numpy.ndarray
        Predicted probabilities; a 1-D array is treated as a batch of one.
    t : numpy.ndarray
        Targets that are multiplied elementwise with ``log(y)``
        (e.g. one-hot encoded labels).

    Returns
    -------
    float
        ``-sum(t * log(y + 1e-7))`` divided by the number of rows of ``y``.
    """
    is_single_sample = (y.ndim == 1)
    if is_single_sample:
        # Promote a lone sample to a (1, n) batch so shape[0] is the batch size.
        y, t = y.reshape(1, -1), t.reshape(1, -1)

    n_samples = y.shape[0]
    eps = 1e-7  # keeps log() finite when a predicted probability is 0
    weighted_log_sum = np.sum(t * np.log(y + eps))
    return -weighted_log_sum / n_samples

In [4]:
class Affine:
    """Fully connected layer computing ``X.dot(W) + b``.

    Attributes
    ----------
    W, b : numpy.ndarray
        Weight matrix of shape (in_features, out_features) and bias vector.
        They are stored by reference, so in-place updates made by the caller
        are seen by the layer.
    X : numpy.ndarray or None
        Input cached by the most recent ``forward`` call.
    dW, db : numpy.ndarray or None
        Gradients of the loss w.r.t. ``W`` and ``b`` from the most recent
        ``backward`` call.
    """

    def __init__(self, W, b):
        self.W = W
        self.b = b
        self.X = None
        self.dW = None
        self.db = None

    def forward(self, X):
        """Return ``X.dot(W) + b`` and cache ``X`` for the backward pass.

        ``X`` may be a single sample of shape (in_features,) or a batch whose
        last axis has length in_features.
        """
        self.X = X
        return X.dot(self.W) + self.b

    def backward(self, dout):
        """Back-propagate ``dout`` (same shape as the forward output).

        Stores ``dW`` and ``db`` on the layer and returns the gradient w.r.t.
        the input, shaped like the input given to ``forward``.
        """
        # Flatten every leading axis into one batch axis so that a single 1-D
        # sample (or a higher-rank batch) yields dW shaped like W and db
        # shaped like b; for ordinary 2-D input these reshapes are no-ops.
        x_2d = self.X.reshape(-1, self.W.shape[0])
        dout_2d = dout.reshape(-1, self.W.shape[1])
        self.dW = x_2d.T.dot(dout_2d)
        self.db = dout_2d.sum(axis=0)
        return dout.dot(self.W.T)

In [5]:
class Relu:
    """Rectified linear unit layer: passes positive inputs, zeroes the rest.

    Attributes
    ----------
    mask : numpy.ndarray or None
        Boolean array marking where the last forward input was ``<= 0``.
    """

    def __init__(self):
        self.mask = None

    def forward(self, x):
        """Return a new array equal to ``x`` with non-positive entries set to 0."""
        self.mask = (x <= 0)
        # zeros_like keeps the output dtype identical to the input dtype.
        return np.where(self.mask, np.zeros_like(x), x)

    def backward(self, dout):
        """Return a new array equal to ``dout`` with the gradient zeroed
        wherever the forward input was non-positive."""
        return np.where(self.mask, np.zeros_like(dout), dout)

In [6]:
class SoftmaxWithLoss:
    """Output layer that applies softmax and then the cross-entropy loss.

    Relies on the module-level ``softmax`` and ``cross_entropy_error``
    functions.

    Attributes
    ----------
    loss : float or None
        Loss value from the most recent ``forward`` call.
    y : numpy.ndarray or None
        Softmax output from the most recent ``forward`` call.
    t : numpy.ndarray or None
        Targets passed to the most recent ``forward`` call; ``backward``
        subtracts them from ``y``, so they must have a compatible shape.
    """

    def __init__(self):
        self.loss = None
        self.y = None
        self.t = None

    def forward(self, x, t):
        """Store ``t`` and ``softmax(x)``, then return the cross-entropy loss."""
        self.t = t
        self.y = softmax(x)
        self.loss = cross_entropy_error(self.y, self.t)
        return self.loss

    def backward(self, dout=1):
        """Return the gradient of the loss w.r.t. the ``x`` given to ``forward``.

        ``dout`` is the upstream gradient of the scalar loss (1 when this is
        the final node of the graph).
        """
        # A 1-D prediction is a single sample, so its batch size is 1 -- the
        # same convention cross_entropy_error uses -- not the class count.
        batch_size = self.y.shape[0] if self.y.ndim > 1 else 1
        return (self.y - self.t) * dout / batch_size

In [7]:
class MyNeuralNetwork:
    """Two-layer classifier: Affine -> ReLU -> Affine, trained with a
    softmax cross-entropy loss.

    Attributes
    ----------
    params : dict
        Weight and bias arrays under the keys 'W1', 'b1', 'W2', 'b2'.
    layers : OrderedDict
        The forward-pass layers, in execution order.
    lastlayer : SoftmaxWithLoss
        Loss layer applied to the output of ``layers``.
    """

    def __init__(self, input_size, hidden_size, output_size, weight_init_std=0.01):
        """Create scaled-Gaussian weights and zero biases, then build the layers."""
        self.params = {
            'W1': weight_init_std * np.random.randn(input_size, hidden_size),
            'b1': np.zeros(hidden_size),
            'W2': weight_init_std * np.random.randn(hidden_size, output_size),
            'b2': np.zeros(output_size),
        }

        # The Affine layers receive the very same array objects held in
        # self.params, so in-place updates of the params reach the layers.
        self.layers = OrderedDict([
            ('Affine1', Affine(self.params['W1'], self.params['b1'])),
            ('Relu1', Relu()),
            ('Affine2', Affine(self.params['W2'], self.params['b2'])),
        ])

        self.lastlayer = SoftmaxWithLoss()

    def predict(self, x):
        """Run ``x`` through every layer in order and return the raw scores."""
        out = x
        for layer in self.layers.values():
            out = layer.forward(out)
        return out

    def loss(self, x, t):
        """Return the loss of the network's scores for ``x`` against targets ``t``."""
        return self.lastlayer.forward(self.predict(x), t)

    def accuracy(self, x, t):
        """Return the fraction of samples whose arg-max score matches the label.

        ``t`` may hold class indices (1-D) or per-class rows, in which case
        the arg-max of each row is taken as the label.
        """
        predicted = np.argmax(self.predict(x), axis=1)
        labels = t if t.ndim == 1 else np.argmax(t, axis=1)
        return np.mean(predicted == labels)

    def gradient(self, x, t):
        """Back-propagate the loss for ``(x, t)``.

        Returns a dict with the gradients for 'W1', 'b1', 'W2' and 'b2'.
        """
        # Forward pass first: it fills the caches the backward pass reads.
        self.loss(x, t)

        dout = self.lastlayer.backward(1)
        for layer in reversed(list(self.layers.values())):
            dout = layer.backward(dout)

        first_affine = self.layers['Affine1']
        second_affine = self.layers['Affine2']
        return {
            'W1': first_affine.dW,
            'b1': first_affine.db,
            'W2': second_affine.dW,
            'b2': second_affine.db,
        }

### Prepare the Data

In [8]:
from tensorflow.keras.datasets.mnist import load_data
from keras.utils import to_categorical

Using TensorFlow backend.


In [9]:
# Load the MNIST train and test splits through Keras' `load_data`; each split
# unpacks into an (images, labels) pair.
(X_train, y_train), (X_test, y_test) = load_data()

In [10]:
# Flatten every image into a 1-D feature vector and divide the pixel values by
# 255 (presumably 8-bit intensities, giving values in [0, 1]).
# The sample count is read from the array instead of being hard-coded, and the
# ndim guard keeps this cell idempotent: without it, re-running the cell would
# divide the already-scaled pixels by 255 a second time.
if X_train.ndim > 2:
    X_train = X_train.reshape(X_train.shape[0], -1) / 255
if X_test.ndim > 2:
    X_test = X_test.reshape(X_test.shape[0], -1) / 255

In [11]:
# One-hot encode the integer class labels.
# num_classes is fixed at 10 (matching the network's output_size) so both
# splits get the same width even if one of them were missing the highest
# class. The ndim guard keeps the cell idempotent: re-running it would
# otherwise one-hot encode the already one-hot matrix.
if y_train.ndim == 1:
    y_train = to_categorical(y_train, num_classes=10)
if y_test.ndim == 1:
    y_test = to_categorical(y_test, num_classes=10)

### Train the Model and Evaluate Its Performance

In [12]:
# Training hyperparameters.
iters_num = 20000
batch_size = 200
learning_rate = 0.1
train_size = X_train.shape[0]

network = MyNeuralNetwork(input_size=784, hidden_size=50, output_size=10)

# Run iters_num + 1 steps so the final step (i == iters_num) is reported too.
for i in range(iters_num + 1):
    # Draw a random mini-batch of row indices (np.random.choice samples with
    # replacement by default).
    batch_mask = np.random.choice(train_size, batch_size)
    X_batch, y_batch = X_train[batch_mask], y_train[batch_mask]

    grad = network.gradient(X_batch, y_batch)

    # Plain SGD step. The update is in place on purpose: the Affine layers
    # hold references to these same arrays.
    for key in ('W1', 'b1', 'W2', 'b2'):
        network.params[key] -= learning_rate * grad[key]

    # Report accuracy on the full train and test sets every 1000 steps.
    if i % 1000 != 0:
        continue
    print('iteration: {}, training_accuracy: {:.4f}, test_accuracy: {:.4f}'.
          format(i, network.accuracy(X_train, y_train), network.accuracy(X_test, y_test)))

iteration: 0, training_accuracy: 0.0804, test_accuracy: 0.0850
iteration: 1000, training_accuracy: 0.9213, test_accuracy: 0.9253
iteration: 2000, training_accuracy: 0.9414, test_accuracy: 0.9407
iteration: 3000, training_accuracy: 0.9531, test_accuracy: 0.9505
iteration: 4000, training_accuracy: 0.9602, test_accuracy: 0.9561
iteration: 5000, training_accuracy: 0.9660, test_accuracy: 0.9606
iteration: 6000, training_accuracy: 0.9705, test_accuracy: 0.9641
iteration: 7000, training_accuracy: 0.9729, test_accuracy: 0.9658
iteration: 8000, training_accuracy: 0.9755, test_accuracy: 0.9677
iteration: 9000, training_accuracy: 0.9773, test_accuracy: 0.9672
iteration: 10000, training_accuracy: 0.9800, test_accuracy: 0.9699
iteration: 11000, training_accuracy: 0.9819, test_accuracy: 0.9706
iteration: 12000, training_accuracy: 0.9832, test_accuracy: 0.9713
iteration: 13000, training_accuracy: 0.9852, test_accuracy: 0.9718
iteration: 14000, training_accuracy: 0.9862, test_accuracy: 0.9722
iteratio