# Experiment 006

In this experiment, we try out an architecture involving a locally-connected RNN layer to see if it improves the block landing model.

In [2]:
import os
from pathlib import Path
import random
import shutil
import datetime

import torch
from torch import nn
from torch.utils.data import DataLoader
import torch.nn.functional as F
from torch.utils.data import Dataset
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.tensorboard import SummaryWriter

from custom_layers import PointwiseRNN2d

# If running this from VS Code, launch a TensorBoard session on the folder runs/experiment_006

In [14]:
class BlockLandingDataset(Dataset):
    def __init__(self, path: str):
        self.path = path
        if not os.path.exists(path):
            raise FileNotFoundError()
        with os.scandir(self.path) as it:
            entry: os.DirEntry = next(iter(it))
            _, self.ext = os.path.splitext(entry.name)
            self.highest_index = max((int(Path(file.path).stem) for file in it))

    def __len__(self):
        return self.highest_index + 1

    def __getitem__(self, idx):
        file = os.path.join(self.path, f"{idx}{self.ext}")
        if not os.path.exists(file):
            raise IndexError()
        boards = np.load(file)
        assert boards.shape[0] == 4
        x = boards[:3] # Just take the first two frames as that's what will be input to the main model
        b1 = boards[2] # We can identify a block landing by the fact that a block spawns in the next time step
        b2 = boards[3]
        y = (np.all(b1[0] == 0) & np.any(b2[0] == 1)).astype(np.float32)
        x, y = torch.tensor(x), torch.tensor(y)
        return x, y
        

In [15]:
train_dataset = BlockLandingDataset(os.path.join("data", "block_landing", "train"))
test_dataset = BlockLandingDataset(os.path.join("data", "block_landing", "test"))
train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=4, shuffle=True)

x, y = next(iter(train_dataloader))
print(x.shape, x.dtype)
print(y.shape, y.dtype)

torch.Size([4, 3, 22, 10]) torch.int32
torch.Size([4]) torch.float32


In [5]:
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device")

Using cpu device


In [6]:
class BlockLandingModel(nn.Module):
    """Predicts whether a block has landed.

    Inputs:
        x: Tensor of int32 of shape (batch_size, seq_length, height, width). height = 22 and width = 10 are the dimensions of the game
           board. seq_length = 2 is the number of game board inputs. The entries should be 0 for empty cells and 1 for blocks.
    
    Returns: float32 scalar, logit for block landing prediction.
    """

    def __init__(self):
        super().__init__()
        self.conv = nn.Conv2d(2, 5, 3)
        self.rnn = nn.RNN(5, 5)
        self.norm = nn.BatchNorm1d(5)
        self.lin = nn.Linear(5, 1)

    def forward(self, x):
        x = F.one_hot(x.long(), 2) # One-hot encode the two cell classes
        x = x.type(torch.float) # Convert to floating-point
        x = x.permute((1, 0, 4, 2, 3)) # Move sequence to dimension 0 and channels/classes to dimension 2

        xs = [] # Split up the timesteps so we can apply convolution to them separately
        for i in range(x.shape[0]):
            x_i = x[i]
            x_i = F.relu(self.conv(F.pad(x_i, (1, 1, 1, 1)))) # Pad height and width (the last 2 dimensions) with zeroes to represent the board boundaries
            x_i = F.avg_pool2d(x_i, kernel_size=x.shape[-2:]).squeeze(-1).squeeze(-1)
            x_i = x_i.unsqueeze(0)
            xs.append(x_i)

        x = torch.concat(xs)
        x, rnn_state = self.rnn(x)
        x = x[-1] # Just take last predicted state
        x = self.norm(x)
        logits = self.lin(x).squeeze(-1)
        return logits

In [7]:
model = BlockLandingModel().to(device)
print(model)

with torch.no_grad():
    X, y = next(iter(train_dataloader))
    logits = model(X)[0]
    preds = F.sigmoid(logits)
    print(f"Predicted states: {preds}")

BlockLandingModel(
  (conv): Conv2d(2, 5, kernel_size=(3, 3), stride=(1, 1))
  (rnn): RNN(5, 5)
  (norm): BatchNorm1d(5, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (lin): Linear(in_features=5, out_features=1, bias=True)
)
Predicted states: 0.5695329308509827


In [16]:
def train_loop(dataloader, model, loss_fn, optimizer):
    model.train()

    avg_loss = 0.0
    correct = 0.0

    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    for batch, (X, y) in enumerate(dataloader):

        # Compute prediction and loss
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Metric calculations
        avg_loss += loss.item()
        classes = (pred >= 0).type(torch.float)
        correct += (classes == y).type(torch.float).mean().item()

        if batch % 20 == 0:
            loss, current = loss.item(), (batch + 1) * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")
    
    avg_loss /= num_batches
    correct /= num_batches
    print(f"Training accuracy: {(100*correct):>0.1f}%")
    return {
        "loss": avg_loss,
        "acc": correct,
    }


def test_loop(dataloader, model, loss_fn):
    model.eval()
    
    num_batches = len(dataloader)
    test_loss, correct = 0, 0

    with torch.no_grad():
        for X, y in dataloader:
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            classes = (pred >= 0).type(torch.float)
            correct += (classes == y).type(torch.float).mean().item()

    test_loss /= num_batches
    correct /= num_batches
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
    return {
        "loss": test_loss,
        "acc": correct,
    }

In [19]:
class ModelWithPointwiseRnn(nn.Module):
    """This model has a locally connected RNN instead of a dense one operating on global average pools."""

    def __init__(self):
        super().__init__()
        self.conv = nn.Conv2d(2, 5, 3)
        self.rnn = PointwiseRNN2d(5, 5)
        #self.norm = nn.BatchNorm1d(5)
        self.lin = nn.Linear(5, 1)

    def forward(self, x):
        x = F.one_hot(x.long(), 2) # One-hot encode the two cell classes
        x = x.type(torch.float) # Convert to floating-point
        x = x.permute((1, 0, 4, 2, 3)) # Move sequence to dimension 0 and channels/classes to dimension 2
        x1, x2, x3 = x # Split up the two timesteps so we can apply convolution to them both separately

        x1 = F.relu(self.conv(F.pad(x1, (1, 1, 1, 1)))) # Pad height and width (the last 2 dimensions) with zeroes to represent the board boundaries
        x2 = F.relu(self.conv(F.pad(x2, (1, 1, 1, 1))))
        x3 = F.relu(self.conv(F.pad(x3, (1, 1, 1, 1))))

        x = torch.concat((x1.unsqueeze(0), x2.unsqueeze(0), x3.unsqueeze(0)))
        x, rnn_state = self.rnn(x)
        x = x[-1] # Just take last predicted state
        #x = self.norm(x)
        x = F.avg_pool2d(x, kernel_size=x.shape[-2:]).squeeze(-1).squeeze(-1)
        logits = self.lin(x).squeeze(-1)
        return logits

In [20]:
learning_rate = 1e-2
batch_size = 4
epochs = 100

models = {
    "baseline": BlockLandingModel().to(device),
    "pointwise_rnn": ModelWithPointwiseRnn().to(device),
}

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

for name, model in models.items():
    print(f"Model '{name}' has {count_parameters(model)} parameters.")

log_dir = os.path.join("runs", "experiment_006")
shutil.rmtree(log_dir, ignore_errors=True)

for name, model in models.items():
    loss_fn = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

    print(f"Training model '{name}'...")
    log_subdir = os.path.join(log_dir, name + "_" + datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
    tb = SummaryWriter(log_subdir)

    for t in range(epochs):
        print(f"Epoch {t}\n-------------------------------")
        train_metrics = train_loop(train_dataloader, model, loss_fn, optimizer)
        test_metrics = test_loop(test_dataloader, model, loss_fn)
        tb.add_scalar("Loss/train", train_metrics["loss"], t)
        tb.add_scalar("Accuracy/train", train_metrics["acc"], t)
        tb.add_scalar("Loss/test", test_metrics["loss"], t)
        tb.add_scalar("Accuracy/test", test_metrics["acc"], t)
        for name, weight in model.named_parameters():
            tb.add_histogram(f"Weights/{name}", weight, t)
            tb.add_histogram(f"Gradients/{name}", weight.grad, t)

    tb.close()
    print("Done!")

Model 'baseline' has 171 parameters.
Model 'pointwise_rnn' has 161 parameters.
Training model 'baseline'...
Epoch 0
-------------------------------
loss: 0.836069  [    4/  183]
loss: 0.662130  [   84/  183]
loss: 0.478500  [  164/  183]
Training accuracy: 65.0%
Test Error: 
 Accuracy: 43.8%, Avg loss: 0.711740 

Epoch 1
-------------------------------
loss: 0.419497  [    4/  183]
loss: 0.678149  [   84/  183]
loss: 0.438411  [  164/  183]
Training accuracy: 69.9%
Test Error: 
 Accuracy: 66.7%, Avg loss: 0.605260 

Epoch 2
-------------------------------
loss: 0.425502  [    4/  183]
loss: 0.610328  [   84/  183]
loss: 0.743086  [  164/  183]
Training accuracy: 69.0%
Test Error: 
 Accuracy: 77.1%, Avg loss: 0.567052 

Epoch 3
-------------------------------
loss: 0.394889  [    4/  183]
loss: 0.347516  [   84/  183]
loss: 0.447395  [  164/  183]
Training accuracy: 71.0%
Test Error: 
 Accuracy: 66.7%, Avg loss: 0.584078 

Epoch 4
-------------------------------
loss: 0.419473  [    4/ 

The pointwise RNN takes a long time to train and takes many epochs to improve loss and accuracy. Ideally we should try varying the learning rate and other properties, but to do this in a reasonable time, we need a faster pointwise RNN implementation.

Meanwhile I have had some other ideas about the possible architecture which I might try first. For example, we could try just using convolutions on the last frame to distil the spatial information, then a linear layer or two at the end. If we need to operate on multiple frames, we can just concatenate the tensors instead of treating them as a sequence. Avoiding RNNs may circumvent the training instability.