In [1]:
from d2l import mxnet as d2l
from mxnet import gluon, autograd, np, npx
from mxnet.gluon import nn

npx.set_np()

In [2]:
# Hyperparameters and model parameters for an MLP with two hidden layers
num_inputs, num_hidden1, num_hidden2, num_outputs = 28 * 28, 256, 256, 10
learning_rate = 0.5
batch_size = 256
num_epochs = 10
# Drop probabilities applied after the first and second hidden layer
dropout_p1, dropout_p2 = 0.2, 0.5

# Weights start as small Gaussian noise (std 0.01); biases start at zero.
W1 = np.random.normal(scale = 0.01, size = (num_inputs, num_hidden1))
b1 = np.zeros(shape = (1, num_hidden1))

W2 = np.random.normal(scale = 0.01, size = (num_hidden1, num_hidden2))
b2 = np.zeros(shape = (1, num_hidden2))

W3 = np.random.normal(scale = 0.01, size = (num_hidden2, num_outputs))
b3 = np.zeros(shape = (1, num_outputs))

params = [W1, b1, W2, b2, W3, b3]
weight = [W1, W2, W3]
bias = [b1, b2, b3]

# Allocate gradient buffers; without this `backward()` has nowhere to store
# the gradients that the optimizer later reads from `.grad`.
for param in params:
    param.attach_grad()

In [49]:
# Define the cross-entropy loss
def loss(y_hat, y):
    """Per-example softmax cross-entropy.

    Args:
        y_hat: 2-D array of unnormalized class scores; the softmax is taken
            over axis 1.
        y: integer class labels, one per row of `y_hat`.

    Returns:
        The negative log-probability assigned to the true class of each row.
    """
    # Use the fused log-softmax from `npx` instead of log(softmax(...)):
    # it is numerically stable, so a tiny probability cannot underflow to 0
    # and turn the loss into inf.
    log_prob = npx.log_softmax(y_hat, axis = 1)
    # Pick, for every row i, the entry in column y[i].
    return -log_prob[range(log_prob.shape[0]), y]

def optimizer(weight, bias, learning_rate, batch_size, wd, wd_mult = 1):
    """Apply one in-place SGD step with L2 weight decay.

    Args:
        weight: iterable of weight arrays; each must expose `.grad`.
        bias: iterable of bias arrays; each must expose `.grad`.
        learning_rate: step size.
        batch_size: divisor applied to the gradients.
        wd: weight-decay coefficient.
        wd_mult: multiplier on `wd` used for the biases only (0 disables
            bias decay).
    """
    for W in weight:
        W[:] = W - learning_rate / batch_size * W.grad - learning_rate * wd * W
    for b in bias:
        # Decay each bias towards zero using the bias itself; the previous
        # code reused the last weight matrix `W` left over from the loop above.
        b[:] = b - learning_rate / batch_size * b.grad - learning_rate * wd_mult * wd * b

# Define the per-epoch training routine
def epoch_trainer(net, train_iter, wd = 0):
    """Train for one pass over `train_iter`, updating the global parameters.

    Relies on the module-level `weight`, `bias` and `learning_rate`.

    Args:
        net: callable mapping a batch of inputs to class scores.
        train_iter: iterable yielding `(X, y)` minibatches.
        wd: weight-decay coefficient forwarded to `optimizer`.

    Returns:
        Tuple `(mean training loss, training accuracy)` over all examples seen.
    """
    # Running totals: summed loss, d2l.accuracy result, number of examples
    metric = d2l.Accumulator(3)

    for X, y in train_iter:
        with autograd.record():
            y_hat = net(X)
            l = loss(y_hat, y)
        l.backward()
        # Normalize by the size of this minibatch rather than the global
        # `batch_size`, so a shorter final batch is not under-weighted.
        optimizer(weight, bias, learning_rate, X.shape[0], wd)
        metric.add(l.sum(), d2l.accuracy(y_hat, y), y.size)
    return metric[0] / metric[2], metric[1] / metric[2]

def train(net, train_iter, test_iter):
    """Train `net` for `num_epochs` epochs and plot progress after each one.

    Args:
        net: callable mapping a batch of inputs to class scores.
        train_iter: iterable yielding `(X, y)` training minibatches.
        test_iter: iterable yielding `(X, y)` evaluation minibatches.
    """
    anim = d2l.Animator(
        xlabel="epochs", legend = ["train_loss", "train_accuracy", "test_accuracy"]
    )

    for epoch in range(num_epochs):
        train_loss, train_acc = epoch_trainer(net, train_iter)

        # Score every test batch, not just the first one, so the plotted
        # curve reflects the whole evaluation set.
        test_metric = d2l.Accumulator(2)
        for X, y in test_iter:
            test_metric.add(d2l.accuracy(net(X), y), y.size)
        # An empty iterator yields NaN instead of an unbound `test_acc`.
        test_acc = test_metric[0] / test_metric[1] if test_metric[1] else float("nan")

        anim.add([epoch + 1] * 3, [train_loss, train_acc, test_acc])


In [47]:
# Dropout layer implemented from scratch
def dropout(X, percentage):
    """Zero each element of `X` independently with probability `percentage`.

    Elements that survive are divided by `1 - percentage`. A `percentage` of 0
    returns `X` itself and a `percentage` of 1 returns an all-zero array
    shaped like `X`.
    """
    if percentage == 1:
        return np.zeros_like(X)
    if percentage == 0:
        return X

    # An element is kept when its uniform draw exceeds the drop probability.
    keep_mask = np.random.uniform(0, 1, size = X.shape) > percentage
    kept = keep_mask.astype("float32") * X
    return kept / (1 - percentage)

# Sanity check: show the input, then partial, no, and full dropout
X = np.arange(0, 16).reshape((2, 8))
print(X, dropout(X, 0.5), dropout(X, 0), dropout(X, 1), sep = "\n")

[[ 0.  1.  2.  3.  4.  5.  6.  7.]
 [ 8.  9. 10. 11. 12. 13. 14. 15.]]
[[ 0.  0.  0.  0.  0. 10.  0.  0.]
 [16.  0. 20.  0. 24. 26. 28. 30.]]
[[ 0.  1.  2.  3.  4.  5.  6.  7.]
 [ 8.  9. 10. 11. 12. 13. 14. 15.]]
[[0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0.]]


In [None]:
# Define the network
def net(X):
    """Forward pass of the MLP: two hidden layers followed by a linear output.

    Uses the module-level parameters `W1`..`W3` / `b1`..`b3` and the drop
    probabilities `dropout_p1` / `dropout_p2`.

    Args:
        X: batch of inputs; reshaped to `(-1, num_inputs)` before the first
            layer, so already-flat input passes through unchanged.

    Returns:
        Unnormalized class scores with `num_outputs` columns.
    """
    X = X.reshape((-1, num_inputs))
    # Dropout is a training-time regularizer only; at evaluation time the
    # full network must be used so predictions are deterministic.
    training = autograd.is_training()

    # ReLU between layers: without a nonlinearity the stacked affine maps
    # collapse into a single linear model.
    H1 = npx.relu(np.dot(X, W1) + b1)
    if training:
        H1 = dropout(H1, dropout_p1)

    H2 = npx.relu(np.dot(H1, W2) + b2)
    if training:
        H2 = dropout(H2, dropout_p2)

    return np.dot(H2, W3) + b3