In [1]:
import d2lzh as d2l
from mxnet import autograd, gluon, init, nd
from mxnet.gluon import loss as gloss, nn

def dropout(X, drop_prob):
    """Apply inverted dropout to ``X``.

    Each element of ``X`` is zeroed when its random draw is not below
    ``1 - drop_prob``; the elements that survive are divided by
    ``1 - drop_prob``.

    Args:
        X: Array to mask. It must expose ``shape`` and ``zeros_like()`` and
            support element-wise multiplication and division.
        drop_prob: Probability of dropping an element, within ``[0, 1]``.

    Returns:
        An all-zero array shaped like ``X`` when ``drop_prob`` is 1,
        otherwise the randomly masked and rescaled input.

    Raises:
        AssertionError: If ``drop_prob`` lies outside ``[0, 1]``.
    """
    assert 0 <= drop_prob <= 1
    retain_prob = 1 - drop_prob
    if retain_prob != 0:
        # An element is kept when its uniform draw falls below retain_prob.
        keep_mask = nd.random.uniform(0, 1, X.shape) < retain_prob
        kept = keep_mask * X
        return kept / retain_prob
    # Nothing can be kept: skip the random draw and return zeros directly.
    return X.zeros_like()

In [2]:
# Build a 2x8 test array holding 0..15, then call dropout with drop_prob = 0:
# keep_prob is 1, so surviving elements are divided by 1 (the displayed
# result equals X).
X = nd.arange(16).reshape((2, 8))
dropout(X, 0)


[[ 0.  1.  2.  3.  4.  5.  6.  7.]
 [ 8.  9. 10. 11. 12. 13. 14. 15.]]
<NDArray 2x8 @cpu(0)>

In [3]:
# drop_prob = 0.5: elements are zeroed by a random mask and the survivors are
# divided by keep_prob = 0.5, i.e. doubled.
dropout(X, 0.5)


[[ 0.  2.  4.  6.  0.  0.  0. 14.]
 [ 0. 18.  0.  0. 24. 26. 28.  0.]]
<NDArray 2x8 @cpu(0)>

In [4]:
# drop_prob = 1: keep_prob is 0, so dropout returns an all-zero array shaped like X.
dropout(X, 1)


[[0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0.]]
<NDArray 2x8 @cpu(0)>

In [5]:
# Layer widths: 784 inputs, two hidden layers of 256 units each, 10 outputs.
num_inputs = 784
num_outputs = 10
num_hiddens1 = 256
num_hiddens2 = 256

# Weight matrices, drawn from a normal distribution with scale 0.01.
# They are created in layer order (W1, W2, W3).
W1 = nd.random.normal(shape=(num_inputs, num_hiddens1), scale=0.01)
W2 = nd.random.normal(shape=(num_hiddens1, num_hiddens2), scale=0.01)
W3 = nd.random.normal(shape=(num_hiddens2, num_outputs), scale=0.01)

# Bias vectors start at zero.
b1 = nd.zeros(num_hiddens1)
b2 = nd.zeros(num_hiddens2)
b3 = nd.zeros(num_outputs)

# Collected in layer order; the list is later handed to the training helper.
params = [W1, b1, W2, b2, W3, b3]
# Attach a gradient buffer to every parameter.
for param in params:
    param.attach_grad()

In [9]:
# Dropout probabilities used by the dropout layers in the networks below.
drop_prob1 = 0.5
drop_prob2 = 0.2

def net(X):
    """Forward pass of the from-scratch MLP with dropout on both hidden layers.

    ``X`` is reshaped to rows of ``num_inputs`` features and passed through
    two ReLU layers (``W1``/``b1`` and ``W2``/``b2``) followed by a linear
    output layer (``W3``/``b3``). Dropout is applied to each hidden
    activation only while ``autograd.is_training()`` is true.

    Args:
        X: Input batch; it is reshaped to ``(-1, num_inputs)``.

    Returns:
        The output-layer values ``dot(H2, W3) + b3``; no softmax is applied
        inside this function.
    """
    # Dropout is only used while training the model.
    training = autograd.is_training()
    flat = X.reshape((-1, num_inputs))

    pre_act1 = nd.dot(flat, W1) + b1
    hidden1 = pre_act1.relu()
    if training:
        # Dropout after the first fully connected layer.
        hidden1 = dropout(hidden1, drop_prob1)

    pre_act2 = nd.dot(hidden1, W2) + b2
    hidden2 = pre_act2.relu()
    if training:
        # Dropout after the second fully connected layer.
        hidden2 = dropout(hidden2, drop_prob2)

    return nd.dot(hidden2, W3) + b3

In [10]:
# Training hyperparameters, reused by every training cell below.
num_epochs = 5
lr = 0.5
batch_size = 256

# Softmax cross-entropy loss and the Fashion-MNIST train/test iterators.
loss = gloss.SoftmaxCrossEntropyLoss()
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)

# Train the from-scratch `net` on the manually created `params`.
# NOTE(review): the same `params` list is passed on every run, so re-running
# this cell presumably continues from the current weights instead of a fresh
# initialization -- confirm, and re-run the parameter cell first if needed.
d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size, params, lr)

epoch 1, loss 0.4366, train acc 0.840, test acc 0.859
epoch 2, loss 0.4183, train acc 0.848, test acc 0.860
epoch 3, loss 0.4011, train acc 0.853, test acc 0.872
epoch 4, loss 0.3910, train acc 0.857, test acc 0.874
epoch 5, loss 0.3823, train acc 0.859, test acc 0.874


In [16]:
# Concise implementation: two hidden layers with dropout, built from Gluon layers.
# Note: this rebinds `net`, replacing the from-scratch function defined earlier.
net = nn.Sequential()
net.add(nn.Dense(256, activation="relu"))
net.add(nn.Dropout(drop_prob1))  # dropout after the first fully connected layer
net.add(nn.Dense(256, activation="relu"))
net.add(nn.Dropout(drop_prob2))  # dropout after the second fully connected layer
net.add(nn.Dense(10))
net.initialize(init.Normal(sigma=0.01))

# SGD trainer over the network's parameters; the two `None` arguments fill the
# slots where the from-scratch run passed `params` and `lr`.
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': lr})
d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size, None, None,
              trainer)

epoch 1, loss 1.1832, train acc 0.534, test acc 0.773
epoch 2, loss 0.6075, train acc 0.772, test acc 0.826
epoch 3, loss 0.5131, train acc 0.809, test acc 0.844
epoch 4, loss 0.4745, train acc 0.826, test acc 0.856
epoch 5, loss 0.4468, train acc 0.836, test acc 0.855


In [14]:
# Deeper variant: four hidden layers of 256 ReLU units, each followed by a
# dropout layer whose rate alternates between drop_prob1 and drop_prob2.
net = nn.Sequential()
net.add(nn.Dense(256, activation="relu"),
        nn.Dropout(drop_prob1),  # dropout after the first fully connected layer
        nn.Dense(256, activation="relu"),
        nn.Dropout(drop_prob2),  # dropout after the second fully connected layer
        nn.Dense(256, activation="relu"),
        nn.Dropout(drop_prob1),  # dropout after the third fully connected layer
        nn.Dense(256, activation="relu"),
        nn.Dropout(drop_prob2),  # dropout after the fourth fully connected layer
        nn.Dense(10))
net.initialize(init.Normal(sigma=0.01))
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': lr})
# Use the shared `num_epochs` setting (5) instead of a hard-coded literal so
# this run stays comparable with the other training cells.
d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size, None,
              None, trainer)

epoch 1, loss 2.3031, train acc 0.098, test acc 0.100
epoch 2, loss 2.3014, train acc 0.106, test acc 0.200
epoch 3, loss 1.7850, train acc 0.259, test acc 0.384
epoch 4, loss 1.2700, train acc 0.476, test acc 0.663
epoch 5, loss 0.8095, train acc 0.673, test acc 0.744


In [15]:
# Baseline for the deeper network: four hidden layers of 256 ReLU units with
# no dropout layers.
net = nn.Sequential()
net.add(nn.Dense(256, activation="relu"),
        nn.Dense(256, activation="relu"),
        nn.Dense(256, activation="relu"),
        nn.Dense(256, activation="relu"),
        nn.Dense(10))
net.initialize(init.Normal(sigma=0.01))
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': lr})
# Use the shared `num_epochs` setting (5) instead of a hard-coded literal so
# this run stays comparable with the other training cells.
d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size, None,
              None, trainer)

epoch 1, loss 2.3029, train acc 0.102, test acc 0.100
epoch 2, loss 2.2843, train acc 0.114, test acc 0.202
epoch 3, loss 1.5485, train acc 0.348, test acc 0.496
epoch 4, loss 0.9888, train acc 0.583, test acc 0.707
epoch 5, loss 0.6727, train acc 0.737, test acc 0.813


In [17]:
# Baseline for the two-hidden-layer network: same Dense layers, no dropout.
net = nn.Sequential()
net.add(nn.Dense(256, activation="relu"))
net.add(nn.Dense(256, activation="relu"))
net.add(nn.Dense(10))
net.initialize(init.Normal(sigma=0.01))

# SGD trainer over the freshly initialized parameters.
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': lr})
d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size, None, None,
              trainer)

epoch 1, loss 1.0908, train acc 0.573, test acc 0.786
epoch 2, loss 0.5375, train acc 0.797, test acc 0.838
epoch 3, loss 0.4577, train acc 0.831, test acc 0.851
epoch 4, loss 0.4101, train acc 0.846, test acc 0.861
epoch 5, loss 0.3852, train acc 0.855, test acc 0.862


In [31]:
# Concise implementation (repeat run): the same two-hidden-layer dropout
# architecture as the earlier Gluon cell, rebuilt and retrained from a fresh
# initialization.
net = nn.Sequential()
net.add(nn.Dense(256, activation="relu"))
net.add(nn.Dropout(drop_prob1))  # dropout after the first fully connected layer
net.add(nn.Dense(256, activation="relu"))
net.add(nn.Dropout(drop_prob2))  # dropout after the second fully connected layer
net.add(nn.Dense(10))
net.initialize(init.Normal(sigma=0.01))

# SGD trainer over the network's parameters.
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': lr})
d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size, None, None,
              trainer)

epoch 1, loss 1.1482, train acc 0.554, test acc 0.776
epoch 2, loss 0.6068, train acc 0.769, test acc 0.835
epoch 3, loss 0.5147, train acc 0.808, test acc 0.849
epoch 4, loss 0.4781, train acc 0.824, test acc 0.849
epoch 5, loss 0.4484, train acc 0.835, test acc 0.853
