In [1]:
# 3.13.2 从零开始实现
import d2lzh as d2l
from mxnet import autograd,gluon,init,nd
from mxnet.gluon import loss as gloss,nn
# dropout函数将以drop_prob的概率丢弃NDArray输⼊X中的元素。
def dropout(X,drop_prob):
    """Apply inverted dropout to an NDArray.

    Each element of ``X`` is zeroed with probability ``drop_prob``; the
    elements that survive are divided by ``1 - drop_prob``.

    Args:
        X: input NDArray.
        drop_prob: probability of dropping an element, must lie in [0, 1].

    Returns:
        An NDArray with the same shape as ``X``.

    Raises:
        AssertionError: if ``drop_prob`` is outside [0, 1].
    """
    assert 0<=drop_prob<=1
    keep_prob=1-drop_prob
    # keep_prob == 0 means drop_prob == 1: every element is dropped. This case
    # is handled separately so the rescaling below never divides by zero.
    if keep_prob ==0:
        return X.zeros_like()
    # uniform(0,1) samples from [0, 1); the comparison yields a mask of 0s and
    # 1s, where 0 means "drop" and 1 means "keep and rescale by 1/keep_prob".
    # The mask is sampled on X's own context so the product below also works
    # when X does not live on the default device.
    mask =nd.random.uniform(0,1,X.shape,ctx=X.context)<keep_prob
    return mask*X/keep_prob

In [2]:
# Build a 2x8 test array holding the values 0..15.
X=nd.arange(16).reshape((2,8))
# drop_prob=0 keeps every element (keep_prob is 1), so X comes back unchanged.
dropout(X,0)


[[ 0.  1.  2.  3.  4.  5.  6.  7.]
 [ 8.  9. 10. 11. 12. 13. 14. 15.]]
<NDArray 2x8 @cpu(0)>

In [3]:
# drop_prob=0.5: kept elements are scaled by 1/(1-0.5)=2 and the rest are zeroed.
# The mask is random, so the exact pattern differs from run to run.
dropout(X,0.5)


[[ 0.  2.  4.  6.  0.  0.  0. 14.]
 [ 0. 18.  0.  0. 24. 26. 28.  0.]]
<NDArray 2x8 @cpu(0)>

In [4]:
# drop_prob=1 drops every element, so the result is all zeros.
dropout(X,1)


[[0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0.]]
<NDArray 2x8 @cpu(0)>

In [5]:
# Model parameters: an input layer, two hidden layers and an output layer.
num_inputs, num_outputs, num_hiddens1, num_hiddens2 = 784, 10, 256, 256

# Weights are drawn from a normal distribution with standard deviation 0.01.
# The draw order W1 -> W2 -> W3 is kept so the random stream is consumed the
# same way as before.
W1 = nd.random.normal(scale=0.01, shape=(num_inputs, num_hiddens1))
W2 = nd.random.normal(scale=0.01, shape=(num_hiddens1, num_hiddens2))
W3 = nd.random.normal(scale=0.01, shape=(num_hiddens2, num_outputs))

# Biases start at zero.
b1 = nd.zeros(num_hiddens1)
b2 = nd.zeros(num_hiddens2)
b3 = nd.zeros(num_outputs)

# Attach a gradient buffer to every parameter so autograd can fill it in.
params = [W1, b1, W2, b2, W3, b3]
for param in params:
    param.attach_grad()

In [6]:
# Model definition.
# The usual advice is to use a smaller drop probability for the layer closer
# to the input.
drop_prob1, drop_prob2 = 0.2, 0.5


def net(X):
    """Forward pass of the two-hidden-layer MLP with dropout.

    The input is flattened to ``(-1, num_inputs)``. Each hidden layer is a
    fully connected layer followed by ReLU; dropout is applied after each
    hidden layer only while autograd is in training mode. The output layer is
    a plain fully connected layer.
    """
    training = autograd.is_training()  # dropout is used during training only
    hidden = X.reshape((-1, num_inputs))
    hidden_layers = ((W1, b1, drop_prob1), (W2, b2, drop_prob2))
    for weight, bias, drop_prob in hidden_layers:
        hidden = (nd.dot(hidden, weight) + bias).relu()
        if training:
            # dropout layer after this fully connected layer
            hidden = dropout(hidden, drop_prob)
    return nd.dot(hidden, W3) + b3

In [7]:
# Train and evaluate the from-scratch model.
num_epochs = 5
lr = 0.5
batch_size = 256
loss = gloss.SoftmaxCrossEntropyLoss()
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)
d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size,
              params=params, lr=lr)

epoch 1, loss 1.1734, train acc 0.546, test acc 0.763
epoch 2, loss 0.5888, train acc 0.783, test acc 0.836
epoch 3, loss 0.4973, train acc 0.818, test acc 0.844
epoch 4, loss 0.4519, train acc 0.834, test acc 0.861
epoch 5, loss 0.4224, train acc 0.846, test acc 0.866


In [8]:
# 3.13.3 Concise implementation
# With Gluon we only need to add a Dropout layer after each fully connected
# layer and give it a drop probability.
# During training the Dropout layer randomly drops elements of the previous
# layer's output with that probability; at test time it has no effect.
net = nn.Sequential()
net.add(nn.Dense(256, activation="relu"))
net.add(nn.Dropout(drop_prob1))  # dropout after the first fully connected layer
net.add(nn.Dense(256, activation="relu"))
net.add(nn.Dropout(drop_prob2))  # dropout after the second fully connected layer
net.add(nn.Dense(10))
net.initialize(init.Normal(sigma=0.01))

# Train and evaluate the model.
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': lr})
d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size,
              trainer=trainer)

epoch 1, loss 1.2509, train acc 0.519, test acc 0.779
epoch 2, loss 0.6051, train acc 0.774, test acc 0.825
epoch 3, loss 0.5051, train acc 0.815, test acc 0.845
epoch 4, loss 0.4596, train acc 0.832, test acc 0.858
epoch 5, loss 0.4312, train acc 0.843, test acc 0.860


In [9]:
# Baseline: 256 hidden units per layer, no dropout.
net = nn.Sequential()
net.add(nn.Dense(256, activation="relu"))
net.add(nn.Dense(256, activation="relu"))
net.add(nn.Dense(10))
net.initialize(init.Normal(sigma=0.01))

# Train and evaluate the model.
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': lr})
d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size,
              trainer=trainer)

epoch 1, loss 1.1987, train acc 0.528, test acc 0.752
epoch 2, loss 0.6036, train acc 0.774, test acc 0.830
epoch 3, loss 0.4828, train acc 0.820, test acc 0.849
epoch 4, loss 0.4287, train acc 0.843, test acc 0.854
epoch 5, loss 0.4025, train acc 0.850, test acc 0.861


In [10]:
# 512 hidden units per layer, with dropout.
net = nn.Sequential()
net.add(nn.Dense(512, activation="relu"))
net.add(nn.Dropout(drop_prob1))  # dropout after the first fully connected layer
net.add(nn.Dense(512, activation="relu"))
net.add(nn.Dropout(drop_prob2))  # dropout after the second fully connected layer
net.add(nn.Dense(10))
net.initialize(init.Normal(sigma=0.01))

# Train and evaluate the model.
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': lr})
d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size,
              trainer=trainer)

epoch 1, loss 1.0636, train acc 0.594, test acc 0.755
epoch 2, loss 0.5502, train acc 0.796, test acc 0.842
epoch 3, loss 0.4767, train acc 0.823, test acc 0.858
epoch 4, loss 0.4282, train acc 0.840, test acc 0.861
epoch 5, loss 0.3999, train acc 0.852, test acc 0.871


In [11]:
# Baseline: 512 hidden units per layer, no dropout.
net = nn.Sequential()
net.add(nn.Dense(512, activation="relu"))
net.add(nn.Dense(512, activation="relu"))
net.add(nn.Dense(10))
net.initialize(init.Normal(sigma=0.01))

# Train and evaluate the model.
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': lr})
d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size,
              trainer=trainer)

epoch 1, loss 1.1030, train acc 0.581, test acc 0.753
epoch 2, loss 0.5435, train acc 0.795, test acc 0.835
epoch 3, loss 0.4558, train acc 0.831, test acc 0.850
epoch 4, loss 0.4334, train acc 0.841, test acc 0.863
epoch 5, loss 0.3816, train acc 0.859, test acc 0.863


In [29]:
# Combine weight decay with dropout.
wd = 0.005
net = nn.Sequential()
net.add(nn.Dense(256, activation="relu"))
net.add(nn.Dropout(drop_prob1))  # dropout after the first fully connected layer
net.add(nn.Dense(256, activation="relu"))
net.add(nn.Dropout(drop_prob2))  # dropout after the second fully connected layer
net.add(nn.Dense(10))
net.initialize(init.Normal(sigma=0.01))

# Two trainers: the one for parameters matching '.*weight' gets weight decay,
# the one for parameters matching '.*bias' does not.
trainer_w = gluon.Trainer(net.collect_params('.*weight'), 'sgd',
                          {'learning_rate': lr, 'wd': wd})
trainer_b = gluon.Trainer(net.collect_params('.*bias'), 'sgd',
                          {'learning_rate': lr})

# Hand-written training loop: both trainers are stepped after every backward
# pass, and per-epoch loss / accuracy are accumulated over all batches.
for epoch in range(num_epochs):
    train_l_sum, train_acc_sum, n = 0.0, 0.0, 0
    for X, y in train_iter:
        with autograd.record():
            y_hat = net(X)
            l = loss(y_hat, y).sum()
        l.backward()
        trainer_w.step(batch_size)
        trainer_b.step(batch_size)
        # Cast the labels so they can be compared with the argmax predictions.
        y = y.astype('float32')
        train_l_sum += l.asscalar()
        train_acc_sum += (y_hat.argmax(axis=1) == y).sum().asscalar()
        n += y.size
    test_acc = d2l.evaluate_accuracy(test_iter, net)
    print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f'
          % (epoch + 1, train_l_sum / n, train_acc_sum / n, test_acc))

epoch 1, loss 1.2304, train acc 0.521, test acc 0.758
epoch 2, loss 0.6897, train acc 0.744, test acc 0.752
epoch 3, loss 0.6202, train acc 0.773, test acc 0.810
epoch 4, loss 0.6850, train acc 0.758, test acc 0.812
epoch 5, loss 0.5846, train acc 0.789, test acc 0.826


In [13]:
 # Scratch check of the uniform sampler that dropout() uses to build its mask.
 # NOTE(review): X is whatever the kernel has bound to X when this cell runs;
 # the printed output is 4-D, so it is not the 2x8 X defined earlier - confirm.
 nd.random.uniform(0,1,X.shape)


[[[[0.594398   0.439149   0.00113363 ... 0.273761   0.27252573
    0.34036335]
   [0.99925375 0.9863009  0.60216784 ... 0.2524284  0.8843359
    0.88571775]
   [0.25586182 0.39232728 0.90083915 ... 0.7731324  0.6875209
    0.6698372 ]
   ...
   [0.6274647  0.44942456 0.9586854  ... 0.7581768  0.93534374
    0.25146893]
   [0.84788525 0.651443   0.01042388 ... 0.49793133 0.07201391
    0.7817697 ]
   [0.7108818  0.5428142  0.44883537 ... 0.6423631  0.3888265
    0.13341165]]]


 [[[0.14736313 0.08054358 0.06622369 ... 0.16392337 0.5063136
    0.45165986]
   [0.3956871  0.8919537  0.2511476  ... 0.9411405  0.7232552
    0.8548682 ]
   [0.7233531  0.42067823 0.9748896  ... 0.5872125  0.97198504
    0.578409  ]
   ...
   [0.35847178 0.14997399 0.4684393  ... 0.6692542  0.16101307
    0.7498409 ]
   [0.85284626 0.09155352 0.99380416 ... 0.92838496 0.9244678
    0.26277018]
   [0.8921542  0.99566656 0.7558916  ... 0.6583087  0.79715794
    0.04568885]]]


 [[[0.21520002 0.7271118  0.6110321

In [14]:
# IPython `??` introspection (not plain Python syntax): shows the source of
# the data-loading helper used by the training cells.
d2l.load_data_fashion_mnist??

In [30]:
# IPython `??` introspection (not plain Python syntax): shows the source of
# the training helper and of the initialize method of the current `net`.
d2l.train_ch3??
net.initialize??

In [32]:
# Import the top-level package so the initializer class can be inspected.
import mxnet
# IPython `??` introspection (not plain Python syntax) on the Uniform initializer.
mxnet.initializer.Uniform??