# Experiment 010

In this experiment, we try out some architectural changes to the block landing model to improve its predictions. In particular, we train each architecture multiple times and calculate statistics of the runs so our judgement is not affected by random variations in the training process.

For the different architectures, we'll try increasing the number of convolutional filters and the number of 3x3 conv layers.

In [2]:
import os
from pathlib import Path
import random
import shutil
import datetime

import torch
from torch import nn
from torch.utils.data import DataLoader
import torch.nn.functional as F
from torch.utils.data import Dataset
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.tensorboard import SummaryWriter

In [3]:
class BlockLandingDataset(Dataset):
    def __init__(self, path: str):
        self.path = path
        if not os.path.exists(path):
            raise FileNotFoundError()
        with os.scandir(self.path) as it:
            entry: os.DirEntry = next(iter(it))
            _, self.ext = os.path.splitext(entry.name)
            self.highest_index = max((int(Path(file.path).stem) for file in it))

    def __len__(self):
        return self.highest_index + 1

    def __getitem__(self, idx):
        file = os.path.join(self.path, f"{idx}{self.ext}")
        if not os.path.exists(file):
            raise IndexError()
        boards = np.load(file)
        assert boards.shape[0] == 4
        x = boards[:3]
        b1 = boards[2] # We can identify a block landing by the fact that a block spawns in the next time step
        b2 = boards[3]
        y = (np.all(b1[0] == 0) & np.any(b2[0] == 1)).astype(np.float32)
        x, y = torch.tensor(x), torch.tensor(y)
        return x, y
        

In [4]:
train_dataset = BlockLandingDataset(os.path.join("data", "block_landing", "train"))
test_dataset = BlockLandingDataset(os.path.join("data", "block_landing", "test"))
train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=4, shuffle=True)

x, y = next(iter(train_dataloader))
print(x.shape, x.dtype)
print(y.shape, y.dtype)

torch.Size([4, 3, 22, 10]) torch.int32
torch.Size([4]) torch.float32


In [5]:
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device")

Using cpu device


In [6]:
class BlockLandingModel(nn.Module):
    """Predicts whether a block has landed.

    Inputs:
        x: Tensor of int32 of shape (batch_size, seq_length, height, width). height = 22 and width = 10 are the dimensions of the game
           board. seq_length = 2 is the number of game board inputs. The entries should be 0 for empty cells and 1 for blocks.
    
    Returns: float32 scalar, logit for block landing prediction.
    """

    def __init__(self):
        super().__init__()
        self.conv0 = nn.Conv2d(2, 5, 3, padding=1)
        self.conv1 = nn.Conv2d(5, 10, 3, padding=1)
        self.lin = nn.Linear(30, 1)

    def forward(self, x):
        assert (x.dim() in (3, 4)), f"Expected input to be 3-D or 4-D but received {x.dim()}-D tensor."
        batched = x.dim() == 4
        if not batched:
            x = x.unsqueeze(0)
        x = F.one_hot(x.long(), 2) # One-hot encode the two cell classes, which adds an extra dimension
        x = x.type(torch.float) # Convert to floating-point
        x = x.permute((1, 0, 4, 2, 3)) # Move sequence to dimension 0 and channels/classes to dimension 2
        x = x[-1] # Just take the last board state in the sequence; channels are now dim 1
        x = F.relu(self.conv0(x))          # (22, 10) -> (22, 10)
        x = F.max_pool2d(x, kernel_size=2) # (22, 10) -> (11,  5)
        x = F.relu(self.conv1(x))          # (11,  5) -> (11,  5)
        x = F.max_pool2d(x, kernel_size=3) # (11,  5) -> ( 3,  1)
        x = torch.flatten(x, start_dim=1)
        logits = self.lin(x).squeeze(-1)
        if not batched:
            logits = logits.squeeze(0)
        return logits

In [7]:
class BiggerModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv0 = nn.Conv2d(2, 16, 3, padding=1)
        self.conv1 = nn.Conv2d(16, 16, 3, padding=1)
        self.conv2 = nn.Conv2d(16, 16, 3, padding=1)
        self.lin = nn.Linear(160, 1)

    def forward(self, x):
        assert (x.dim() in (3, 4)), f"Expected input to be 3-D or 4-D but received {x.dim()}-D tensor."
        batched = x.dim() == 4
        if not batched:
            x = x.unsqueeze(0)
        x = F.one_hot(x.long(), 2) # One-hot encode the two cell classes, which adds an extra dimension
        x = x.type(torch.float) # Convert to floating-point
        x = x.permute((1, 0, 4, 2, 3)) # Move sequence to dimension 0 and channels/classes to dimension 2
        x = x[-1] # Just take the last board state in the sequence; channels are now dim 1
        x = F.relu(self.conv0(x))          # (22, 10) -> (22, 10)
        x = F.relu(self.conv1(x))          # (22, 10) -> (22, 10)
        x = F.max_pool2d(x, kernel_size=2) # (22, 10) -> (11,  5)
        x = F.relu(self.conv2(x))          # (11,  5) -> (11,  5)
        x = F.max_pool2d(x, kernel_size=2) # (11,  5) -> ( 5,  2)
        x = torch.flatten(x, start_dim=1)
        logits = self.lin(x).squeeze(-1)
        if not batched:
            logits = logits.squeeze(0)
        return logits

In [10]:
model = BiggerModel().to(device)
print(model)

with torch.no_grad():
    x, y = train_dataset[0]
    logits = model(x)
    preds = F.sigmoid(logits)
    print(f"Predicted probability: {preds:.5f}")

BiggerModel(
  (conv0): Conv2d(2, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv1): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv2): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (lin): Linear(in_features=160, out_features=1, bias=True)
)
Predicted probability: 0.52937


In [11]:
def train_loop(dataloader, model, loss_fn, optimizer):
    model.train()

    num_batches = len(dataloader)
    size = len(dataloader.dataset)
    epoch_loss = 0.0
    accuracy = 0.0

    for batch, (X, y) in enumerate(dataloader):

        # Compute prediction and loss
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Metric calculations
        epoch_loss += loss.item()
        classes = (pred >= 0).type(torch.float)
        accuracy += (classes == y).type(torch.float).mean().item()

        if batch % 20 == 0:
            loss, current = loss.item(), (batch + 1) * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")
    
    epoch_loss /= num_batches
    accuracy /= num_batches
    return {
        "loss": epoch_loss,
        "acc": accuracy,
    }


def test_loop(dataloader, model, loss_fn):
    model.eval()
    
    num_batches = len(dataloader)
    epoch_loss = 0.0
    accuracy = 0.0

    with torch.no_grad():
        for X, y in dataloader:
            pred = model(X)
            epoch_loss += loss_fn(pred, y).item()
            classes = (pred >= 0).type(torch.float)
            accuracy += (classes == y).type(torch.float).mean().item()

    epoch_loss /= num_batches
    accuracy /= num_batches
    print(f"Test Error: \n Accuracy: {(100*accuracy):>0.1f}%, Avg loss: {epoch_loss:>8f} \n")
    return {
        "loss": epoch_loss,
        "acc": accuracy,
    }

In [14]:
configs = {
    "baseline": { "class": BlockLandingModel },
    "bigger": { "class": BiggerModel },
}

results = {}

repeats = 10

learning_rate = 1e-2
batch_size = 4
epochs = 200

for name, config in configs.items():
    results[name] = np.zeros(repeats)
    
    for repeat in range(repeats):
        model_class = config["class"]

        train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
        test_dataloader = DataLoader(test_dataset, shuffle=True, batch_size=batch_size)

        model = model_class().to(device)
        loss_fn = nn.BCEWithLogitsLoss()
        optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

        log_dir = os.path.join("runs", "experiment_010")

        log_subdir = os.path.join(log_dir, f"{model_class.__name__}_repeat_{repeat}_" + datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
        tb = SummaryWriter(log_subdir)

        print("Training...")
        for t in range(epochs):
            print(f"Epoch {t}\n-------------------------------")
            train_metrics = train_loop(train_dataloader, model, loss_fn, optimizer)
            test_metrics = test_loop(test_dataloader, model, loss_fn)
            tb.add_scalar("Loss/train", train_metrics["loss"], t)
            tb.add_scalar("Accuracy/train", train_metrics["acc"], t)
            tb.add_scalar("Loss/test", test_metrics["loss"], t)
            tb.add_scalar("Accuracy/test", test_metrics["acc"], t)
            for param_name, weight in model.named_parameters():
                tb.add_histogram(f"Weights/{param_name}", weight, t)
                tb.add_histogram(f"Gradients/{param_name}", weight.grad, t)
        print("Done!")

        tb.close()

        results[name][repeat] = train_metrics["acc"]

results

Training...
Epoch 0
-------------------------------
loss: 0.675073  [    4/  183]
loss: 0.744227  [   84/  183]
loss: 0.688450  [  164/  183]
Test Error: 
 Accuracy: 62.5%, Avg loss: 0.689498 

Epoch 1
-------------------------------
loss: 0.692347  [    4/  183]
loss: 0.677009  [   84/  183]
loss: 0.702833  [  164/  183]
Test Error: 
 Accuracy: 79.2%, Avg loss: 0.687109 

Epoch 2
-------------------------------
loss: 0.689928  [    4/  183]
loss: 0.729751  [   84/  183]
loss: 0.689026  [  164/  183]
Test Error: 
 Accuracy: 75.0%, Avg loss: 0.684617 

Epoch 3
-------------------------------
loss: 0.684978  [    4/  183]
loss: 0.695347  [   84/  183]
loss: 0.680465  [  164/  183]
Test Error: 
 Accuracy: 81.2%, Avg loss: 0.684074 

Epoch 4
-------------------------------
loss: 0.694287  [    4/  183]
loss: 0.673306  [   84/  183]
loss: 0.687632  [  164/  183]
Test Error: 
 Accuracy: 81.2%, Avg loss: 0.681748 

Epoch 5
-------------------------------
loss: 0.671720  [    4/  183]
loss: 0.

{'baseline': array([0.99456522, 0.99456522, 0.99456522, 1.        , 0.99456522,
        1.        , 0.99456522, 0.98913043, 0.99456522, 1.        ]),
 'bigger': array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])}

In [19]:
print(results["baseline"].shape)
print(results["bigger"].shape)

(10,)
(10,)


In [20]:
from scipy.stats import ttest_ind

ttest_results = ttest_ind(results["baseline"], results["bigger"], equal_var=False)
ttest_results

  ttest_results = ttest_ind(results["baseline"], results["bigger"], equal_var=False)


Ttest_indResult(statistic=-3.9999999999999587, pvalue=0.003110428310386048)

We see that the bigger model performs significantly better, on average, than the baseline. From the Tensorboard graphs, we can confirm this visually. We collected stats only for the training accuracy, but the graphs also show that the test accuracy in the larger model is better.

From the spikiness of the training curves, it seems the larger model could do with some batch normalization.

# Batch normalization

In [41]:
def training_process(model_class, batch_size=4, epochs=200):
    repeats = 10

    learning_rate = 1e-2

    results = {
        "train_accuracy": np.zeros(repeats),
        "test_accuracy": np.zeros(repeats)
    }
    
    for repeat in range(repeats):
        train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
        test_dataloader = DataLoader(test_dataset, shuffle=True, batch_size=batch_size)

        model = model_class().to(device)
        loss_fn = nn.BCEWithLogitsLoss()
        optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

        log_dir = os.path.join("runs", "experiment_010")

        log_subdir = os.path.join(log_dir, f"{model_class.__name__}_batch_size_{batch_size}_repeat_{repeat}_" + datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
        tb = SummaryWriter(log_subdir)

        print("Training...")
        for t in range(epochs):
            print(f"Epoch {t}\n-------------------------------")
            train_metrics = train_loop(train_dataloader, model, loss_fn, optimizer)
            test_metrics = test_loop(test_dataloader, model, loss_fn)
            tb.add_scalar("Loss/train", train_metrics["loss"], t)
            tb.add_scalar("Accuracy/train", train_metrics["acc"], t)
            tb.add_scalar("Loss/test", test_metrics["loss"], t)
            tb.add_scalar("Accuracy/test", test_metrics["acc"], t)
            for param_name, weight in model.named_parameters():
                tb.add_histogram(f"Weights/{param_name}", weight, t)
                tb.add_histogram(f"Gradients/{param_name}", weight.grad, t)
        print("Done!")

        tb.close()

        results["train_accuracy"][repeat] = train_metrics["acc"]
        results["test_accuracy"][repeat] = test_metrics["acc"]

    return results

In [22]:
class BiggerModelWithBatchNorm(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv0 = nn.Conv2d(2, 16, 3, padding=1)
        self.norm0 = nn.BatchNorm2d(16)
        self.conv1 = nn.Conv2d(16, 16, 3, padding=1)
        self.norm1 = nn.BatchNorm2d(16)
        self.conv2 = nn.Conv2d(16, 16, 3, padding=1)
        self.norm2 = nn.BatchNorm1d(160)
        self.lin = nn.Linear(160, 1)

    def forward(self, x):
        assert (x.dim() in (3, 4)), f"Expected input to be 3-D or 4-D but received {x.dim()}-D tensor."
        batched = x.dim() == 4
        if not batched:
            x = x.unsqueeze(0)
        x = F.one_hot(x.long(), 2) # One-hot encode the two cell classes, which adds an extra dimension
        x = x.type(torch.float) # Convert to floating-point
        x = x.permute((1, 0, 4, 2, 3)) # Move sequence to dimension 0 and channels/classes to dimension 2
        x = x[-1] # Just take the last board state in the sequence; channels are now dim 1
        x = F.relu(self.conv0(x))          # (22, 10) -> (22, 10)
        x = self.norm0(x)
        x = F.relu(self.conv1(x))          # (22, 10) -> (22, 10)
        x = F.max_pool2d(x, kernel_size=2) # (22, 10) -> (11,  5)
        x = self.norm1(x)
        x = F.relu(self.conv2(x))          # (11,  5) -> (11,  5)
        x = F.max_pool2d(x, kernel_size=2) # (11,  5) -> ( 5,  2)
        x = torch.flatten(x, start_dim=1)
        x = self.norm2(x)
        logits = self.lin(x).squeeze(-1)
        if not batched:
            logits = logits.squeeze(0)
        return logits

In [24]:
model = BiggerModelWithBatchNorm().to(device)
print(model)

model.eval()
with torch.no_grad():
    x, y = train_dataset[0]
    logits = model(x)
    preds = F.sigmoid(logits)
    print(f"Predicted probability: {preds:.5f}")

BiggerModelWithBatchNorm(
  (conv0): Conv2d(2, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (norm0): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv1): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (norm1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv2): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (norm2): BatchNorm1d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (lin): Linear(in_features=160, out_features=1, bias=True)
)
Predicted probability: 0.50544


In [25]:
results_bigger_model_with_batch_norm = training_process(BiggerModelWithBatchNorm)
results_bigger_model_with_batch_norm

Training...
Epoch 0
-------------------------------
loss: 0.967711  [    4/  183]
loss: 0.444617  [   84/  183]
loss: 0.511074  [  164/  183]
Test Error: 
 Accuracy: 77.1%, Avg loss: 0.522546 

Epoch 1
-------------------------------
loss: 0.391596  [    4/  183]
loss: 0.358153  [   84/  183]
loss: 0.212299  [  164/  183]
Test Error: 
 Accuracy: 91.7%, Avg loss: 0.336961 

Epoch 2
-------------------------------
loss: 0.188371  [    4/  183]
loss: 0.453664  [   84/  183]
loss: 0.322549  [  164/  183]
Test Error: 
 Accuracy: 95.8%, Avg loss: 0.248331 

Epoch 3
-------------------------------
loss: 0.301566  [    4/  183]
loss: 0.062190  [   84/  183]
loss: 0.054717  [  164/  183]
Test Error: 
 Accuracy: 95.8%, Avg loss: 0.232195 

Epoch 4
-------------------------------
loss: 0.294488  [    4/  183]
loss: 0.103281  [   84/  183]
loss: 0.128192  [  164/  183]
Test Error: 
 Accuracy: 81.2%, Avg loss: 0.293673 

Epoch 5
-------------------------------
loss: 0.278554  [    4/  183]
loss: 0.

{'train_accuracy': array([0.95289855, 0.94384058, 0.90217391, 0.93297101, 0.97282609,
        0.93478261, 0.92391304, 0.88405797, 0.94746377, 0.9673913 ]),
 'test_accuracy': array([1.        , 1.        , 1.        , 1.        , 1.        ,
        1.        , 0.97916667, 0.97916667, 0.97916667, 1.        ])}

Strangely the curves become much spikier with batch normalization. Perhaps this is because the batch size is only 4, which is quite small. The training accuracy also suffers a lot, although strangely the test accuracy still gets to 100%.

In [31]:
print(results_bigger_model_with_batch_norm["train_accuracy"].mean())
print(results_bigger_model_with_batch_norm["test_accuracy"].mean())

0.9362318842307381
0.99375


In [33]:
class BiggerModelWithBatchNormIncludingBeforeConv0(nn.Module):
    def __init__(self):
        super().__init__()
        self.norm0 = nn.BatchNorm2d(2)
        self.conv0 = nn.Conv2d(2, 16, 3, padding=1)
        self.norm1 = nn.BatchNorm2d(16)
        self.conv1 = nn.Conv2d(16, 16, 3, padding=1)
        self.norm2 = nn.BatchNorm2d(16)
        self.conv2 = nn.Conv2d(16, 16, 3, padding=1)
        self.norm3 = nn.BatchNorm1d(160)
        self.lin = nn.Linear(160, 1)

    def forward(self, x):
        assert (x.dim() in (3, 4)), f"Expected input to be 3-D or 4-D but received {x.dim()}-D tensor."
        batched = x.dim() == 4
        if not batched:
            x = x.unsqueeze(0)
        x = F.one_hot(x.long(), 2) # One-hot encode the two cell classes, which adds an extra dimension
        x = x.type(torch.float) # Convert to floating-point
        x = x.permute((1, 0, 4, 2, 3)) # Move sequence to dimension 0 and channels/classes to dimension 2
        x = x[-1] # Just take the last board state in the sequence; channels are now dim 1
        x = self.norm0(x)
        x = F.relu(self.conv0(x))          # (22, 10) -> (22, 10)
        x = self.norm1(x)
        x = F.relu(self.conv1(x))          # (22, 10) -> (22, 10)
        x = F.max_pool2d(x, kernel_size=2) # (22, 10) -> (11,  5)
        x = self.norm2(x)
        x = F.relu(self.conv2(x))          # (11,  5) -> (11,  5)
        x = F.max_pool2d(x, kernel_size=2) # (11,  5) -> ( 5,  2)
        x = torch.flatten(x, start_dim=1)
        x = self.norm3(x)
        logits = self.lin(x).squeeze(-1)
        if not batched:
            logits = logits.squeeze(0)
        return logits


model = BiggerModelWithBatchNormIncludingBeforeConv0().to(device)
print(model)

model.eval()
with torch.no_grad():
    x, y = train_dataset[0]
    logits = model(x)
    preds = F.sigmoid(logits)
    print(f"Predicted probability: {preds:.5f}")

BiggerModelWithBatchNormIncludingBeforeConv0(
  (norm0): BatchNorm2d(2, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv0): Conv2d(2, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (norm1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv1): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (norm2): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv2): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (norm3): BatchNorm1d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (lin): Linear(in_features=160, out_features=1, bias=True)
)
Predicted probability: 0.47389


In [34]:
results_bigger_model_with_batch_norm_including_before_conv0 = training_process(BiggerModelWithBatchNormIncludingBeforeConv0)
results_bigger_model_with_batch_norm_including_before_conv0

Training...
Epoch 0
-------------------------------
loss: 0.762211  [    4/  183]
loss: 0.891814  [   84/  183]
loss: 0.297953  [  164/  183]
Test Error: 
 Accuracy: 77.1%, Avg loss: 0.464340 

Epoch 1
-------------------------------
loss: 0.392092  [    4/  183]
loss: 0.428717  [   84/  183]
loss: 0.494768  [  164/  183]
Test Error: 
 Accuracy: 91.7%, Avg loss: 0.341180 

Epoch 2
-------------------------------
loss: 0.280064  [    4/  183]
loss: 0.812551  [   84/  183]
loss: 0.918179  [  164/  183]
Test Error: 
 Accuracy: 93.8%, Avg loss: 0.259437 

Epoch 3
-------------------------------
loss: 0.169180  [    4/  183]
loss: 0.239752  [   84/  183]
loss: 0.217228  [  164/  183]
Test Error: 
 Accuracy: 93.8%, Avg loss: 0.209271 

Epoch 4
-------------------------------
loss: 0.167507  [    4/  183]
loss: 0.265432  [   84/  183]
loss: 0.206158  [  164/  183]
Test Error: 
 Accuracy: 97.9%, Avg loss: 0.157697 

Epoch 5
-------------------------------
loss: 0.787330  [    4/  183]
loss: 0.

{'train_accuracy': array([0.96195652, 0.93478261, 0.93478261, 0.94021739, 0.94021739,
        0.91847826, 0.94021739, 0.90398551, 0.94021739, 0.94565217]),
 'test_accuracy': array([1.        , 0.97916667, 0.97916667, 1.        , 0.97916667,
        1.        , 1.        , 1.        , 1.        , 1.        ])}

The spikiness and poor accuracy problems persist even when we add batch normalization before the first conv layer, so that's not the issue.

Perhaps the problem is that the batch normalization happens after the relu activations, or after the max pooling layers. Let's try putting the batch normalization before both of these to see if there's any improvement.

In [35]:
class BiggerModelWithBatchNormBeforeRelu(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv0 = nn.Conv2d(2, 16, 3, padding=1)
        self.norm0 = nn.BatchNorm2d(16)
        self.conv1 = nn.Conv2d(16, 16, 3, padding=1)
        self.norm1 = nn.BatchNorm2d(16)
        self.conv2 = nn.Conv2d(16, 16, 3, padding=1)
        self.norm2 = nn.BatchNorm2d(16)
        self.lin = nn.Linear(160, 1)

    def forward(self, x):
        assert (x.dim() in (3, 4)), f"Expected input to be 3-D or 4-D but received {x.dim()}-D tensor."
        batched = x.dim() == 4
        if not batched:
            x = x.unsqueeze(0)
        x = F.one_hot(x.long(), 2) # One-hot encode the two cell classes, which adds an extra dimension
        x = x.type(torch.float) # Convert to floating-point
        x = x.permute((1, 0, 4, 2, 3)) # Move sequence to dimension 0 and channels/classes to dimension 2
        x = x[-1] # Just take the last board state in the sequence; channels are now dim 1
        x = self.conv0(x)                  # (22, 10) -> (22, 10)
        x = self.norm0(x)
        x = F.relu(x)
        x = self.conv1(x)                  # (22, 10) -> (22, 10)
        x = self.norm1(x)
        x = F.relu(x)
        x = F.max_pool2d(x, kernel_size=2) # (22, 10) -> (11,  5)
        x = self.conv2(x)                  # (11,  5) -> (11,  5)
        x = self.norm2(x)
        x = F.relu(x)
        x = F.max_pool2d(x, kernel_size=2) # (11,  5) -> ( 5,  2)
        x = torch.flatten(x, start_dim=1)
        logits = self.lin(x).squeeze(-1)
        if not batched:
            logits = logits.squeeze(0)
        return logits


model = BiggerModelWithBatchNormBeforeRelu().to(device)
print(model)

model.eval()
with torch.no_grad():
    x, y = train_dataset[0]
    logits = model(x)
    preds = F.sigmoid(logits)
    print(f"Predicted probability: {preds:.5f}")

BiggerModelWithBatchNormBeforeRelu(
  (conv0): Conv2d(2, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (norm0): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv1): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (norm1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv2): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (norm2): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (lin): Linear(in_features=160, out_features=1, bias=True)
)
Predicted probability: 0.51116


In [36]:
results_bigger_model_with_batch_norm_before_relu = training_process(BiggerModelWithBatchNormBeforeRelu)
results_bigger_model_with_batch_norm_before_relu

Training...
Epoch 0
-------------------------------
loss: 0.684452  [    4/  183]
loss: 0.366601  [   84/  183]
loss: 0.460547  [  164/  183]
Test Error: 
 Accuracy: 79.2%, Avg loss: 0.516157 

Epoch 1
-------------------------------
loss: 0.524882  [    4/  183]
loss: 0.205511  [   84/  183]
loss: 0.506153  [  164/  183]
Test Error: 
 Accuracy: 89.6%, Avg loss: 0.336008 

Epoch 2
-------------------------------
loss: 0.418567  [    4/  183]
loss: 0.208543  [   84/  183]
loss: 0.183176  [  164/  183]
Test Error: 
 Accuracy: 91.7%, Avg loss: 0.236532 

Epoch 3
-------------------------------
loss: 0.317567  [    4/  183]
loss: 0.064267  [   84/  183]
loss: 0.068327  [  164/  183]
Test Error: 
 Accuracy: 97.9%, Avg loss: 0.132908 

Epoch 4
-------------------------------
loss: 0.162071  [    4/  183]
loss: 0.032740  [   84/  183]
loss: 0.186696  [  164/  183]
Test Error: 
 Accuracy: 95.8%, Avg loss: 0.211872 

Epoch 5
-------------------------------
loss: 0.066858  [    4/  183]
loss: 0.

{'train_accuracy': array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]),
 'test_accuracy': array([0.97916667, 0.97916667, 1.        , 0.97916667, 0.97916667,
        0.97916667, 1.        , 0.97916667, 0.97916667, 1.        ])}

In [38]:
print(results_bigger_model_with_batch_norm_before_relu["train_accuracy"].mean())
print(results_bigger_model_with_batch_norm_before_relu["train_accuracy"].std())
print(results_bigger_model_with_batch_norm_before_relu["test_accuracy"].mean())
print(results_bigger_model_with_batch_norm_before_relu["test_accuracy"].std())

1.0
0.0
0.9854166666666666
0.009547032697824684


Having batch normalization before the relu gives a huge improvement! Is this because it's before the relu, or because it's before the max pooling layers? Let's find out.

In [39]:
class BiggerModelWithBatchNormBetweenReluAndMaxPool(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv0 = nn.Conv2d(2, 16, 3, padding=1)
        self.norm0 = nn.BatchNorm2d(16)
        self.conv1 = nn.Conv2d(16, 16, 3, padding=1)
        self.norm1 = nn.BatchNorm2d(16)
        self.conv2 = nn.Conv2d(16, 16, 3, padding=1)
        self.norm2 = nn.BatchNorm2d(16)
        self.lin = nn.Linear(160, 1)

    def forward(self, x):
        assert (x.dim() in (3, 4)), f"Expected input to be 3-D or 4-D but received {x.dim()}-D tensor."
        batched = x.dim() == 4
        if not batched:
            x = x.unsqueeze(0)
        x = F.one_hot(x.long(), 2) # One-hot encode the two cell classes, which adds an extra dimension
        x = x.type(torch.float) # Convert to floating-point
        x = x.permute((1, 0, 4, 2, 3)) # Move sequence to dimension 0 and channels/classes to dimension 2
        x = x[-1] # Just take the last board state in the sequence; channels are now dim 1
        x = self.conv0(x)                  # (22, 10) -> (22, 10)
        x = F.relu(x)
        x = self.norm0(x)
        x = self.conv1(x)                  # (22, 10) -> (22, 10)
        x = F.relu(x)
        x = self.norm1(x)
        x = F.max_pool2d(x, kernel_size=2) # (22, 10) -> (11,  5)
        x = self.conv2(x)                  # (11,  5) -> (11,  5)
        x = F.relu(x)
        x = self.norm2(x)
        x = F.max_pool2d(x, kernel_size=2) # (11,  5) -> ( 5,  2)
        x = torch.flatten(x, start_dim=1)
        logits = self.lin(x).squeeze(-1)
        if not batched:
            logits = logits.squeeze(0)
        return logits


model = BiggerModelWithBatchNormBetweenReluAndMaxPool().to(device)
print(model)

model.eval()
with torch.no_grad():
    x, y = train_dataset[0]
    logits = model(x)
    preds = F.sigmoid(logits)
    print(f"Predicted probability: {preds:.5f}")

BiggerModelWithBatchNormBetweenReluAndMaxPool(
  (conv0): Conv2d(2, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (norm0): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv1): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (norm1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv2): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (norm2): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (lin): Linear(in_features=160, out_features=1, bias=True)
)
Predicted probability: 0.48775


In [40]:
results_bigger_model_with_batch_norm_between_relu_and_max_pool = training_process(BiggerModelWithBatchNormBetweenReluAndMaxPool)
print(results_bigger_model_with_batch_norm_between_relu_and_max_pool)
print(results_bigger_model_with_batch_norm_between_relu_and_max_pool["train_accuracy"].mean())
print(results_bigger_model_with_batch_norm_between_relu_and_max_pool["train_accuracy"].std())
print(results_bigger_model_with_batch_norm_between_relu_and_max_pool["test_accuracy"].mean())
print(results_bigger_model_with_batch_norm_between_relu_and_max_pool["test_accuracy"].std())

Training...
Epoch 0
-------------------------------
loss: 0.960941  [    4/  183]
loss: 0.698880  [   84/  183]
loss: 1.146560  [  164/  183]
Test Error: 
 Accuracy: 70.8%, Avg loss: 0.506839 

Epoch 1
-------------------------------
loss: 0.287441  [    4/  183]
loss: 0.356623  [   84/  183]
loss: 0.112153  [  164/  183]
Test Error: 
 Accuracy: 89.6%, Avg loss: 0.269673 

Epoch 2
-------------------------------
loss: 0.344800  [    4/  183]
loss: 0.026881  [   84/  183]
loss: 0.163868  [  164/  183]
Test Error: 
 Accuracy: 97.9%, Avg loss: 0.125879 

Epoch 3
-------------------------------
loss: 0.054840  [    4/  183]
loss: 0.388696  [   84/  183]
loss: 0.027784  [  164/  183]
Test Error: 
 Accuracy: 97.9%, Avg loss: 0.055574 

Epoch 4
-------------------------------
loss: 0.093774  [    4/  183]
loss: 0.127800  [   84/  183]
loss: 0.109005  [  164/  183]
Test Error: 
 Accuracy: 97.9%, Avg loss: 0.048872 

Epoch 5
-------------------------------
loss: 0.011009  [    4/  183]
loss: 0.

Thus, it seems that the cause of the instability before with batch normalization was because it was placed after the max pooling layers. Whether we put batch normalization before or after the relu activation doesn't seem to make much difference. We note that test loss is quite spiky in early epochs when the batch normalization is after the relu, whereas it is slightly spikier for later epochs when the batch normalization is before the relu. Therefore, because we want to avoid large spikes and want to shorten training time as much as possible, we prefer the model with batch normalization before the relu.

# Batch size

Now let's see if we can eke out some more performance by varying the batch size with the batch-normalized model.

In [42]:
results = {}

for batch_size in [1, 2, 4, 8, 16, 32]:
    results[batch_size] = training_process(BiggerModelWithBatchNormBeforeRelu, epochs=100, batch_size=batch_size)

Training...
Epoch 0
-------------------------------
loss: 0.854068  [    1/  183]
loss: 2.041043  [   21/  183]
loss: 0.791983  [   41/  183]
loss: 0.740999  [   61/  183]
loss: 0.937269  [   81/  183]
loss: 1.080365  [  101/  183]
loss: 0.362292  [  121/  183]
loss: 1.050763  [  141/  183]
loss: 0.039154  [  161/  183]
loss: 0.007088  [  181/  183]
Test Error: 
 Accuracy: 84.4%, Avg loss: 0.322394 

Epoch 1
-------------------------------
loss: 0.158566  [    1/  183]
loss: 0.212968  [   21/  183]
loss: 0.012332  [   41/  183]
loss: 0.197545  [   61/  183]
loss: 0.675596  [   81/  183]
loss: 0.024971  [  101/  183]
loss: 0.008567  [  121/  183]
loss: 0.125346  [  141/  183]
loss: 0.106370  [  161/  183]
loss: 0.095618  [  181/  183]
Test Error: 
 Accuracy: 91.1%, Avg loss: 0.192656 

Epoch 2
-------------------------------
loss: 0.027272  [    1/  183]
loss: 0.042911  [   21/  183]
loss: 0.010323  [   41/  183]
loss: 0.196613  [   61/  183]
loss: 0.071387  [   81/  183]
loss: 0.000652

In [44]:
for batch_size, batch_size_results in results.items():
    print(f"Results for batch size {batch_size}:")
    print(f"Training accuracy: mean {batch_size_results['train_accuracy'].mean():.5f}, std {batch_size_results['train_accuracy'].std():.5f}")
    print(f"Test accuracy: mean {batch_size_results['test_accuracy'].mean():.5f}, std {batch_size_results['test_accuracy'].std():.5f}")
    print()

Results for batch size 1:
Training accuracy: mean 1.00000, std 0.00000
Test accuracy: mean 1.00000, std 0.00000

Results for batch size 2:
Training accuracy: mean 1.00000, std 0.00000
Test accuracy: mean 0.99348, std 0.00996

Results for batch size 4:
Training accuracy: mean 1.00000, std 0.00000
Test accuracy: mean 0.97292, std 0.02954

Results for batch size 8:
Training accuracy: mean 1.00000, std 0.00000
Test accuracy: mean 0.98333, std 0.00833

Results for batch size 16:
Training accuracy: mean 1.00000, std 0.00000
Test accuracy: mean 0.98702, std 0.01069

Results for batch size 32:
Training accuracy: mean 1.00000, std 0.00000
Test accuracy: mean 0.98377, std 0.01593



There is a clear trend here that the larger the batch size, the worse the performance on the test set. Ideally, reduce the batch size to 1 or 2 or keep it the same. This is the same result we saw in a previous experiment.

The tradeoff here is that the smaller the batch size, the longer the training takes. Here are the times taken:
* Batch size 1  - 58.8 s
* Batch size 2  - 51.1 s
* Batch size 4  - 39.0 s
* Batch size 8  - 30.3 s
* Batch size 16 - 24.1 s
* Batch size 32 - 22.9 s

# Conclusion

We should replace our model with a larger one, add batch normalization, and consider reducing the batch size or adding some other kind of regularization.

Once potential concern here is that we are going from a model with 586 parameters to a much larger 5201 parameters - almost a tenfold increase in the number of parameters.