Implementation from scratch 

In [1]:
import torch
from torch import nn

dropout means doing this to the layer:

$$
\begin{aligned}
h' =
\begin{cases}
    0 & \textrm{ with probability } p \\
    \frac{h}{1-p} & \textrm{ otherwise}
\end{cases}
\end{aligned}
$$


In [None]:
def dropout_layer(x, dropout_p):
    assert 0 <= dropout_p <= 1  # Make sure the prop is between 1 and 0
    if dropout_p == 1:
        return torch.zeros_like(x)

    # generate random number form 0 to 1 and only keep what is larger than p
    # then turn it into numbers
    mask = (torch.rand(x.shape) > dropout_p).float()
    return (mask * x) / (1 - dropout_p)

In [None]:
x = torch.ones((3, 3))
dropout_layer(x, 0.5), dropout_layer(x, 0), dropout_layer(x, 1)

(tensor([[0., 0., 2.],
         [2., 0., 0.],
         [2., 0., 2.]]),
 tensor([[1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.]]),
 tensor([[0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.]]))

In [None]:
import torchvision
from torchvision import datasets
from torchvision.transforms import ToTensor

import matplotlib.pylab as plt

train_data = datasets.FashionMNIST(
    root="data",
    train=True,
    transform=ToTensor(),
    download=True,
    target_transform=None
)

test_data = datasets.FashionMNIST(
    root="data",
    train=False,
    transform=ToTensor(),
    download=True,
    target_transform=None
)

100%|██████████| 26.4M/26.4M [00:07<00:00, 3.35MB/s]
100%|██████████| 29.5k/29.5k [00:00<00:00, 486kB/s]
100%|██████████| 4.42M/4.42M [00:01<00:00, 3.22MB/s]
100%|██████████| 5.15k/5.15k [00:00<00:00, 12.0MB/s]


In [15]:
from torch.utils.data import DataLoader

batch_size = 64

train_dataloader = DataLoader(dataset=train_data,
                              batch_size=batch_size,
                              shuffle=True)
test_dataloader = DataLoader(dataset=test_data,
                             batch_size=batch_size,
                             shuffle=False)

In [40]:
class DropoutScratchFashionMNIST(nn.Module):
    def __init__(self, in_shape, out_shape, hidden_1, hidden_2,
                 dropout_1, dropout_2):
        super().__init__()
        self.flatten = nn.Flatten()
        self.lin1 = nn.LazyLinear(hidden_1)
        self.relu = nn.ReLU()
        self.lin2 = nn.LazyLinear(hidden_2)
        self.lin3 = nn.LazyLinear(out_shape)
        self.dropout_1 = dropout_1
        self.dropout_2 = dropout_2

    def forward(self, X):
        H1 = self.relu(self.lin1(self.flatten(X)))

        if (self.training):
            H1 = dropout_layer(H1, self.dropout_1)

        H2 = self.relu(self.lin2(H1))

        if (self.training):
            H2 = dropout_layer(H2, self.dropout_2)

        return self.lin3(H2)


labels = train_data.classes
model_0 = DropoutScratchFashionMNIST(in_shape=784,
                                     out_shape=len(labels),
                                     hidden_1=256,
                                     hidden_2=256,
                                     dropout_1=0.3,
                                     dropout_2=0.3)

In [41]:
from torchmetrics import Accuracy

loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model_0.parameters(), lr=0.001)

accuracy_fn = Accuracy(task="multiclass",
                       num_classes=len(labels))

In [42]:
from tqdm.auto import tqdm
epochs = 10
for epoch in tqdm(range(epochs)):
    model_0.train()
    train_loss = 0
    for X, y in train_dataloader:

        y_pred = model_0(X)

        loss = loss_fn(y_pred, y)
        train_loss += loss

        optimizer.zero_grad()

        loss.backward()

        optimizer.step()

    train_loss /= len(train_dataloader)

    test_loss = test_acc = 0
    model_0.eval()
    with torch.inference_mode():
        for X, y in test_dataloader:
            test_preds = model_0(X)

            test_loss += loss_fn(test_preds, y)

            test_acc += accuracy_fn(y, test_preds.argmax(dim=1))

        test_loss /= len(test_dataloader)
        test_acc /= len(test_dataloader)

    print(
        f"\nTrain loss: {train_loss:.5f} | Test loss: {test_loss:.5f}, Test acc: {test_acc:.2f}%")

  0%|          | 0/10 [00:00<?, ?it/s]


Train loss: 0.56936 | Test loss: 0.43066, Test acc: 0.85%

Train loss: 0.41496 | Test loss: 0.38802, Test acc: 0.86%

Train loss: 0.38007 | Test loss: 0.37503, Test acc: 0.86%

Train loss: 0.36161 | Test loss: 0.35531, Test acc: 0.87%

Train loss: 0.34532 | Test loss: 0.39363, Test acc: 0.86%

Train loss: 0.33350 | Test loss: 0.35220, Test acc: 0.87%

Train loss: 0.32636 | Test loss: 0.33689, Test acc: 0.88%

Train loss: 0.31827 | Test loss: 0.34034, Test acc: 0.88%

Train loss: 0.30987 | Test loss: 0.33871, Test acc: 0.88%

Train loss: 0.30536 | Test loss: 0.32527, Test acc: 0.88%


Concise implementation

In [43]:
class DropoutMLP(nn.Module):
    def __init__(self, out_shape, hidden_1, hidden_2,
                 dropout_1, dropout_2):
        super().__init__()
        self.layer_stack = nn.Sequential(
            nn.Flatten(),
            nn.LazyLinear(hidden_1), nn.ReLU(),
            nn.Dropout(dropout_1),
            nn.LazyLinear(hidden_2), nn.ReLU(),
            nn.Dropout(dropout_2),
            nn.LazyLinear(out_shape)
        )

    def forward(self, x):
        return self.layer_stack(x)


model_1 = DropoutMLP(
    out_shape=len(labels),
    hidden_1=256,
    hidden_2=256,
    dropout_1=0.3,
    dropout_2=0.3)

In [44]:
from torchmetrics import Accuracy

loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model_1.parameters(), lr=0.001)

accuracy_fn = Accuracy(task="multiclass",
                       num_classes=len(labels))

In [45]:
for epoch in tqdm(range(epochs)):
    model_1.train()
    train_loss = 0

    for X, y in train_dataloader:
        y_preds = model_1(X)
        loss = loss_fn(y_preds, y)
        
        train_loss += loss
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    train_loss /= len(train_dataloader)
    
    test_acc = 0
    test_loss = 0

    model_1.eval()
    with torch.inference_mode():
        for X, y in test_dataloader:
            y_preds = model_1(X)
            test_loss += loss_fn(y_preds, y)
            test_acc += accuracy_fn(y_preds, y)
        
        test_loss /= len(test_dataloader)
        test_acc /= len(test_dataloader)
        
    print(f"\nTrain loss: {train_loss:.5f} | Test loss: {test_loss:.5f}, Test acc: {test_acc:.2f}%")


  0%|          | 0/10 [00:00<?, ?it/s]


Train loss: 0.56894 | Test loss: 0.43997, Test acc: 0.84%

Train loss: 0.41454 | Test loss: 0.42851, Test acc: 0.84%

Train loss: 0.38279 | Test loss: 0.39120, Test acc: 0.86%

Train loss: 0.36165 | Test loss: 0.36551, Test acc: 0.87%

Train loss: 0.34809 | Test loss: 0.35509, Test acc: 0.87%

Train loss: 0.33560 | Test loss: 0.35413, Test acc: 0.87%

Train loss: 0.32620 | Test loss: 0.35543, Test acc: 0.87%

Train loss: 0.31856 | Test loss: 0.33854, Test acc: 0.88%

Train loss: 0.30988 | Test loss: 0.33818, Test acc: 0.88%

Train loss: 0.30319 | Test loss: 0.34472, Test acc: 0.87%
