In [1]:
import torch
from torch import nn
from d2l import torch as d2l
from tqdm import trange

In [2]:
"""
训练中 dropout
防止过拟合
h' = dropout(h)
E(h') = p * 0 + (1-p) * h / (1-p) = h
h' = 0 || h / (1-p)
"""

"\n训练中 dropout\n防止过拟合\nh' = dropout(h)\nE(h') = p * 0 + (1-p) * h / (1-p) = h\nh' = 0 || h / (1-p)\n"

In [3]:
# 手动实现drooput
def dropout_layer(X, dropout):
    assert 0 <= dropout <= 1, "dropout need in range(0, 1)"
    if dropout == 0:
        return X
    if dropout == 1:
        return torch.zeros_like(X)
    mask = (torch.rand(X.shape) > dropout).float()
    return X * mask / (1.0 - dropout)

In [4]:
X = torch.arange(15, dtype=torch.float32).reshape(3, 5)
dropout_layer(X, 0), dropout_layer(X, 0.5), dropout_layer(X, 1)

(tensor([[ 0.,  1.,  2.,  3.,  4.],
         [ 5.,  6.,  7.,  8.,  9.],
         [10., 11., 12., 13., 14.]]),
 tensor([[ 0.,  0.,  4.,  0.,  0.],
         [10., 12.,  0., 16.,  0.],
         [20., 22., 24.,  0., 28.]]),
 tensor([[0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.]]))

In [5]:
dropout_layer(X, 1.1)

AssertionError: dropout need in range(0, 1)

In [6]:
dropout1, dropout2 = 0.2, 0.5
# 简洁实现dropout
net = nn.Sequential(nn.Flatten(), nn.Linear(784, 256), nn.Dropout(dropout1),
                    nn.Linear(256, 256), nn.ReLU(), nn.Dropout(dropout2),
                    nn.Linear(256, 10))

# 初始化参数
def init_weights(m):
    if type(m) == nn.Linear:
        nn.init.normal_(m.weight, std=0.01)

net.apply(init_weights)

Sequential(
  (0): Flatten(start_dim=1, end_dim=-1)
  (1): Linear(in_features=784, out_features=256, bias=True)
  (2): Dropout(p=0.2, inplace=False)
  (3): Linear(in_features=256, out_features=256, bias=True)
  (4): ReLU()
  (5): Dropout(p=0.5, inplace=False)
  (6): Linear(in_features=256, out_features=10, bias=True)
)

In [8]:
# 优化器
trainer = torch.optim.SGD(net.parameters(), lr=0.01)

# 损失函数
loss = nn.CrossEntropyLoss(reduction='none')

In [11]:
# 传入数据
num_epochs, batch_size = 10, 256
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)

In [10]:
def accuracy_score(data_iter):
    ac_cnt, sum_cnt = 0, 0
    for X, y in data_iter:
        y_hat = net(X).argmax(axis=1)
        ac_cnt += (y_hat == y).sum()
        sum_cnt += y.numel()
    return ac_cnt / sum_cnt

In [12]:
# 训练
for epoch in trange(num_epochs, desc="Training", unit="epoch"):
    for X, y in train_iter:
        trainer.zero_grad()
        l = loss(net(X), y)
        l.mean().backward()
        trainer.step()

Training: 100%|██████████| 10/10 [00:57<00:00,  5.76s/epoch]


In [13]:
# 评估
ac_train = accuracy_score(train_iter)
ac_test = accuracy_score(test_iter)
print(f'train accuracy: {ac_train:.3f}\ntest accuracy: {ac_test:.3f}')

train accuracy: 0.641
test accuracy: 0.632
