# In [8]:
import numpy as np

# In [9]:
def one_hot(y, C):
    """Return a one-hot encoding of integer class labels.

    Args:
        y: Array-like of integer class indices. Any shape is accepted; it is
            flattened first, so an ``(N, 1)`` column is treated the same as a
            length-``N`` vector.
        C: Number of classes, i.e. the number of output columns.

    Returns:
        ``float32`` array of shape ``(y.size, C)`` whose row ``i`` holds a
        single 1 in column ``y[i]`` and 0 elsewhere.

    Raises:
        IndexError: If a label is ``>= C`` or is not of integer type (raised
            by NumPy's fancy indexing).
    """
    # Flatten so that a column vector does not broadcast against the row
    # index below and mark the wrong (row, class) pairs.
    labels = np.asarray(y).reshape(-1)
    Y = np.zeros((labels.size, C), dtype=np.float32)
    Y[np.arange(labels.size), labels] = 1
    return Y

class MLP:
    def __init__(self, d_in, d_h1, d_h2, d_out):
        self.W1 = np.random.randn(d_in, d_h1)*np.sqrt(2/d_in)
        self.b1 = np.zeros((1, d_h1))
        self.W2 = np.random.randn(d_h1, d_h2)*np.sqrt(2/d_h1)
        self.b2 = np.zeros((1, d_h2))
        self.W3 = np.random.randn(d_h2, d_out)*np.sqrt(2/d_h2)
        self.b3 = np.zeros((1, d_out))
        self.v = {k: np.zeros_like(v) for k, v in self.params().items()}

    def params(self):
        return {
            'W1': self.W1, 'b1': self.b1,
            'W2': self.W2, 'b2': self.b2,
            'W3': self.W3, 'b3': self.b3
        }
    
    @staticmethod
    def relu(x):
        return np.maximum(x, 0.0)
    
    def forward(self, X):
        z1 = X@self.W1 + self.b1
        a1 = self.relu(z1)
        z2 = a1@self.W2 + self.b2
        a2 = self.relu(z2)
        z3 = a2@self.W3 + self.b3
        z3m = z3.max(axis=1, keepdims=True)
        exp = np.exp(z3 - z3m)
        probs = exp/(exp.sum(axis=1, keepdims=True)+1e-12)
        cache = (X, z1, a1, z2, a2, z3, probs)
        return probs, cache
    
    def loss_and_grads(self, X, y, l2=0):
        N = X.shape[0]
        probs, cache = self.forward(X)
        yoh = one_hot(y, probs.shape[1])
        ce = -np.sum(yoh*np.log(probs+1e-12))/N
        l2_term = 0.5*l2*(np.sum(self.W1**2) + np.sum(self.W2**2) + np.sum(self.W3**2))
        loss = ce + l2_term

        X, z1, a1, z2, a2, z3, _ = cache
        dZ3 = (probs - yoh) / N
        dW3 = a2.T @ dZ3 + l2 * self.W3
        db3 = dZ3.sum(axis=0, keepdims=True)

        dA2 = dZ3 @ self.W3.T
        dZ2 = dA2 * (z2 > 0)
        dW2 = a1.T @ dZ2 + l2 * self.W2
        db2 = dZ2.sum(axis=0, keepdims=True)

        dA1 = dZ2 @ self.W2.T
        dZ1 = dA1 * (z1 > 0)
        dW1 = X.T @ dZ1 + l2 * self.W1
        db1 = dZ1.sum(axis=0, keepdims=True)

        grads = {"W1": dW1, "b1": db1, "W2": dW2, "b2": db2, "W3": dW3, "b3": db3}
        return loss, grads
    
    def step(self, grads, lr=1e-2, momentum=0.9):
        for k in self.params().keys():
            self.v[k] = momentum * self.v[k] - lr * grads[k]
            setattr(self, k, getattr(self, k) + self.v[k])


# In [10]:
def iterate_minibatches(X, y, batch_size=128, shuffle=True, rng=None):
    """Yield ``(X_batch, y_batch)`` pairs that cover the data exactly once.

    Args:
        X: Array whose first axis indexes samples.
        y: Array of per-sample targets, indexed the same way as ``X``.
        batch_size: Maximum number of samples per batch; the final batch is
            smaller when the sample count is not a multiple of it.
        shuffle: If true, visit samples in a random order.
        rng: Optional random source exposing ``shuffle`` (for example
            ``np.random.default_rng(seed)``). When ``None`` the global
            ``np.random`` state is used.

    Yields:
        Tuples ``(X[idx], y[idx])`` for consecutive slices ``idx`` of the
        (optionally shuffled) sample order.

    Raises:
        ValueError: If ``batch_size`` is not positive or ``X`` and ``y``
            disagree on the number of samples. Because this is a generator,
            the check runs when iteration starts.
    """
    if batch_size <= 0:
        # A negative step would make range() empty and silently yield nothing.
        raise ValueError(
            "batch_size must be positive, got {!r}".format(batch_size)
        )
    N = X.shape[0]
    if len(y) != N:
        raise ValueError(
            "X has {} samples but y has {}".format(N, len(y))
        )

    order = np.arange(N)
    if shuffle:
        shuffler = np.random if rng is None else rng
        shuffler.shuffle(order)
    for start in range(0, N, batch_size):
        batch = order[start:start + batch_size]
        yield X[batch], y[batch]