# Experiment 012

In this experiment, we build on the findings of Experiment 011. Instead of incorporating a pretrained block landing model into the Tetris emulator, we merge the two architectures into a single network and train it end to end.

In [25]:
import os
from pathlib import Path
import shutil
import datetime

import torch
from torch import nn
from torch.utils.data import DataLoader
import torch.nn.functional as F
from torch.utils.data import Dataset
import numpy as np
from torch.utils.tensorboard import SummaryWriter
import matplotlib.pyplot as plt

In [26]:
class RecordingDataset(Dataset):
    """Dataset of recorded board sequences stored as numbered array files.

    Every file in ``path`` is expected to be named ``<index><ext>`` with an
    integer index, and to be loadable with ``np.load`` as a sequence of
    boards. Item ``idx`` yields the second-to-last board as the input and the
    last board as the target.

    Args:
        path: Directory containing the recordings.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        ValueError: If a file name in ``path`` does not have an integer stem.
    """

    def __init__(self, path: str):
        self.path = path
        if not os.path.exists(path):
            raise FileNotFoundError(path)
        # Materialise the listing first: peeking at the scandir iterator for
        # the extension would consume that entry and hide it from max().
        with os.scandir(self.path) as it:
            names = [entry.name for entry in it]
        # All recordings are assumed to share the extension of whichever file
        # is listed first; an empty directory yields an empty dataset.
        self.ext = os.path.splitext(names[0])[1] if names else ""
        self.highest_index = max((int(Path(name).stem) for name in names), default=-1)

    def __len__(self):
        # Indices are assumed to be contiguous, starting at 0.
        return self.highest_index + 1

    def __getitem__(self, idx):
        file = os.path.join(self.path, f"{idx}{self.ext}")
        if not os.path.exists(file):
            raise IndexError()
        boards = np.load(file)
        x = torch.tensor(boards[-2])  # Ignore all boards except the last two
        y = torch.tensor(boards[-1], dtype=torch.long)
        return x, y
        

In [27]:
# Load the recorded train/test splits and wrap them in mini-batch loaders.
train_dataset = RecordingDataset(os.path.join("data", "tetris_emulator", "train"))
test_dataset = RecordingDataset(os.path.join("data", "tetris_emulator", "test"))
train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)
# Evaluation does not need shuffling; a fixed order keeps the per-batch
# averaged test metrics reproducible from one evaluation to the next.
test_dataloader = DataLoader(test_dataset, batch_size=4, shuffle=False)

# Sanity check: inspect the shape and dtype of one training batch.
x, y = next(iter(train_dataloader))
print(x.shape, x.dtype)
print(y.shape, y.dtype)

torch.Size([4, 22, 10]) torch.int32
torch.Size([4, 22, 10]) torch.int64


In [28]:
# Pick the compute backend in order of preference: CUDA, then MPS, then CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device} device")

Using cpu device


In [29]:
class TetrisModel(nn.Module):
    """Baseline, purely convolutional predictor of the next board state.

    Inputs:
        x: Integer tensor of shape (batch_size, height, width); the game board
           is height = 22 by width = 10. Entries must be 0 for empty cells and
           1 for blocks.

    Returns: Tensor of float32 of shape (batch_size, 2, height, width) holding
             per-cell log-probabilities (log-softmax over dimension 1).
             Channel 0 is the "empty" class and channel 1 is the "block"
             class, so the output can be fed straight into ``nn.NLLLoss``.
    """

    def __init__(self):
        super().__init__()
        # Three 3x3 conv + batch-norm stages that keep the board resolution,
        # followed by a 1x1 conv that maps the features to the two classes.
        self.conv0 = nn.Conv2d(in_channels=2, out_channels=16, kernel_size=3, padding=1)
        self.norm0 = nn.BatchNorm2d(num_features=16)
        self.conv1 = nn.Conv2d(in_channels=16, out_channels=16, kernel_size=3, padding=1)
        self.norm1 = nn.BatchNorm2d(num_features=16)
        self.conv2 = nn.Conv2d(in_channels=16, out_channels=16, kernel_size=3, padding=1)
        self.norm2 = nn.BatchNorm2d(num_features=16)
        self.conv3 = nn.Conv2d(in_channels=16, out_channels=2, kernel_size=1)

    def forward(self, x):
        # One-hot encode the two cell classes as float channels in dimension 1.
        features = F.one_hot(x.long(), 2).float().permute(0, 3, 1, 2)
        stages = (
            (self.conv0, self.norm0),
            (self.conv1, self.norm1),
            (self.conv2, self.norm2),
        )
        for conv, norm in stages:
            features = F.relu(norm(conv(features)))
        return F.log_softmax(self.conv3(features), dim=1)

In [30]:
class ModelWithGlobalAtStart(nn.Module):
    """Next-board predictor that injects global features at the input.

    A global branch reduces the one-hot encoded board to 10 features. These
    are broadcast to every cell and concatenated with the 2 one-hot channels
    (12 channels in total) before the local convolutional stack runs.

    Inputs:
        x: Integer tensor of shape (batch_size, height, width) with entries 0
           (empty) or 1 (block). ``nn.Linear(160, 10)`` fixes the board size
           to 22 x 10: two 2x2 max-pools give 5 x 2 cells of 16 channels.

    Returns: Tensor of float32 of shape (batch_size, 2, height, width) holding
             per-cell log-probabilities over the classes (empty, block).
    """

    def __init__(self):
        super().__init__()
        self.glob = nn.Sequential(
            nn.Conv2d(2, 16, 3, padding=1),
            nn.BatchNorm2d(16),
            nn.ReLU(),
            nn.Conv2d(16, 16, 3, padding=1),
            nn.BatchNorm2d(16),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
            nn.Conv2d(16, 16, 3, padding=1),
            nn.BatchNorm2d(16),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
            nn.Flatten(),
            nn.Linear(160, 10)
        )
        self.loc = nn.Sequential(
            nn.Conv2d(12, 16, 3, padding=1),
            nn.BatchNorm2d(16),
            nn.ReLU(),
            nn.Conv2d(16, 16, 3, padding=1),
            nn.BatchNorm2d(16),
            nn.ReLU(),
            nn.Conv2d(16, 16, 3, padding=1),
            nn.BatchNorm2d(16),
            nn.ReLU(),
            nn.Conv2d(16, 2, 1),
            # Normalise over the class channel explicitly; omitting dim relies
            # on a deprecated implicit choice and warns on every forward pass.
            nn.LogSoftmax(dim=1)
        )

    def forward(self, x):
        _, height, width = x.shape

        x = F.one_hot(x.long(), 2) # One-hot encode the two cell classes
        x = x.type(torch.float) # Convert to floating-point
        x = x.permute((0, 3, 1, 2)) # Move channels/classes to dimension 1

        x_glob = self.glob(x)
        x_glob = x_glob[:, :, None, None] # Expand dims
        x_glob = x_glob.expand(-1, -1, height, width) # Broadcast to image size
        x = torch.cat((x, x_glob), dim=1)

        log_probs = self.loc(x)
        return log_probs

In [38]:
class ModelWithGlobalAtEnd(nn.Module):
    """Next-board predictor that injects global features just before the output.

    A local convolutional stack produces 16 feature channels per cell. A
    global branch reduces those features to 10 values, which are broadcast to
    every cell and concatenated with the local features (26 channels in
    total). A single 1x1 convolution then classifies each cell.

    Inputs:
        x: Integer tensor of shape (batch_size, height, width) with entries 0
           (empty) or 1 (block). ``nn.Linear(160, 10)`` fixes the board size
           to 22 x 10: two 2x2 max-pools give 5 x 2 cells of 16 channels.

    Returns: Tensor of float32 of shape (batch_size, 2, height, width) holding
             per-cell log-probabilities over the classes (empty, block).
    """

    def __init__(self):
        super().__init__()
        self.loc = nn.Sequential(
            nn.Conv2d(2, 16, 3, padding=1),
            nn.BatchNorm2d(16),
            nn.ReLU(),
            nn.Conv2d(16, 16, 3, padding=1),
            nn.BatchNorm2d(16),
            nn.ReLU(),
            nn.Conv2d(16, 16, 3, padding=1),
            nn.BatchNorm2d(16),
            nn.ReLU(),
        )
        self.glob = nn.Sequential(
            nn.Conv2d(16, 16, 3, padding=1),
            nn.BatchNorm2d(16),
            nn.ReLU(),
            nn.Conv2d(16, 16, 3, padding=1),
            nn.BatchNorm2d(16),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
            nn.Conv2d(16, 16, 3, padding=1),
            nn.BatchNorm2d(16),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
            nn.Flatten(),
            nn.Linear(160, 10)
        )
        self.head = nn.Sequential(
            nn.Conv2d(26, 2, 1),
            # Normalise over the class channel explicitly; omitting dim relies
            # on a deprecated implicit choice and warns on every forward pass.
            nn.LogSoftmax(dim=1)
        )

    def forward(self, x):
        _, height, width = x.shape

        x = F.one_hot(x.long(), 2) # One-hot encode the two cell classes
        x = x.type(torch.float) # Convert to floating-point
        x = x.permute((0, 3, 1, 2)) # Move channels/classes to dimension 1

        x = self.loc(x)

        x_glob = self.glob(x)
        x_glob = x_glob[:, :, None, None] # Expand dims
        x_glob = x_glob.expand(-1, -1, height, width) # Broadcast to image size
        x = torch.cat((x, x_glob), dim=1)

        log_probs = self.head(x)
        return log_probs

In [39]:
class ModelWithGlobalInMiddle(nn.Module):
    """Next-board predictor that injects global features mid-network.

    A short local stack produces 16 feature channels per cell. A global
    branch reduces those features to 10 values, which are broadcast to every
    cell and concatenated with the local features (26 channels in total). A
    convolutional head then mixes local and global information before
    classifying each cell.

    Inputs:
        x: Integer tensor of shape (batch_size, height, width) with entries 0
           (empty) or 1 (block). ``nn.Linear(160, 10)`` fixes the board size
           to 22 x 10: two 2x2 max-pools give 5 x 2 cells of 16 channels.

    Returns: Tensor of float32 of shape (batch_size, 2, height, width) holding
             per-cell log-probabilities over the classes (empty, block).
    """

    def __init__(self):
        super().__init__()
        self.loc = nn.Sequential(
            nn.Conv2d(2, 16, 3, padding=1),
            nn.BatchNorm2d(16),
            nn.ReLU(),
            nn.Conv2d(16, 16, 3, padding=1),
            nn.BatchNorm2d(16),
            nn.ReLU(),
        )
        self.glob = nn.Sequential(
            nn.Conv2d(16, 16, 3, padding=1),
            nn.BatchNorm2d(16),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
            nn.Conv2d(16, 16, 3, padding=1),
            nn.BatchNorm2d(16),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
            nn.Flatten(),
            nn.Linear(160, 10)
        )
        self.head = nn.Sequential(
            nn.Conv2d(26, 16, 3, padding=1),
            nn.BatchNorm2d(16),
            nn.ReLU(),
            nn.Conv2d(16, 2, 1),
            # Normalise over the class channel explicitly; omitting dim relies
            # on a deprecated implicit choice and warns on every forward pass.
            nn.LogSoftmax(dim=1)
        )

    def forward(self, x):
        _, height, width = x.shape

        x = F.one_hot(x.long(), 2) # One-hot encode the two cell classes
        x = x.type(torch.float) # Convert to floating-point
        x = x.permute((0, 3, 1, 2)) # Move channels/classes to dimension 1

        x = self.loc(x)

        x_glob = self.glob(x)
        x_glob = x_glob[:, :, None, None] # Expand dims
        x_glob = x_glob.expand(-1, -1, height, width) # Broadcast to image size
        x = torch.cat((x, x_glob), dim=1)

        log_probs = self.head(x)
        return log_probs

In [41]:
# Smoke test: build the model and run one training batch through it.
model = ModelWithGlobalInMiddle().to(device)
print(model)

with torch.no_grad():
    X, y = next(iter(train_dataloader))
    # The batch must live on the same device as the model's parameters.
    logits = model(X.to(device))[0]
    # The model outputs log-probabilities, so exp() recovers probabilities.
    preds = torch.exp(logits)
    print(f"Predicted states: {preds}")

ModelWithGlobalInMiddle(
  (loc): Sequential(
    (0): Conv2d(2, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (4): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ReLU()
  )
  (glob): Sequential(
    (0): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (4): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (5): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): ReLU()
    (7): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (8): Flatten(start_dim=1, end_dim=-1)

  input = module(input)


In [42]:
def train_loop(dataloader, model, loss_fn, optimizer):
    """Train ``model`` for one pass over ``dataloader``.

    Args:
        dataloader: Iterable of ``(X, y)`` batches.
        model: Module being trained; it is switched to training mode.
        loss_fn: Callable mapping ``(prediction, target)`` to a scalar loss.
        optimizer: Optimizer that updates the parameters of ``model``.
    """
    model.train()

    # Batches come off the dataloader on the CPU; send them wherever the
    # model's parameters live so that training also works on cuda/mps.
    first_param = next(model.parameters(), None)
    model_device = None if first_param is None else first_param.device

    for X, y in dataloader:
        if model_device is not None:
            X, y = X.to(model_device), y.to(model_device)

        # Compute prediction and loss
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()


def test_loop(split_name, dataloader, model, loss_fn):
    model.eval()

    epoch_loss = 0.0
    cell_accuracy = 0.0
    board_accuracy = 0.0
    spawn_recall = 0.0
    num_spawns = 0.0

    num_batches = len(dataloader)
    with torch.no_grad():
        for X, y in dataloader:
            pred = model(X)
            epoch_loss += loss_fn(pred, y).item()
            classes = torch.argmax(pred, dim=1)
            cell_accuracy += (classes == y).type(torch.float).mean().item()
            board_accuracy += (classes == y).all(-1).all(-1).type(torch.float).mean().item()

            actual_spawns = (X[:, 0, :] == 0).all(-1) & (y[:, 0, :] == 1).any(-1)
            predicted_spawns = (classes[:, 0, :] == 1).any(-1)
            spawn_recall += (actual_spawns & predicted_spawns).type(torch.float).sum().item()
            num_spawns += actual_spawns.type(torch.float).sum().item()

    epoch_loss /= num_batches
    cell_accuracy /= num_batches
    board_accuracy /= num_batches
    spawn_recall /= num_spawns
    return {
        "loss": epoch_loss,
        "acc": cell_accuracy,
        "acc_board": board_accuracy,
        "Spawn recall": spawn_recall
    }

In [43]:
def train(model, learning_rate=1e-1, batch_size=4, epochs=100):
    """Train ``model`` with SGD and log per-epoch metrics to TensorBoard.

    Uses the module-level ``train_dataloader`` and ``test_dataloader``. After
    every epoch both splits are evaluated with ``test_loop``; the metrics and
    histograms of all parameters and their gradients are written under
    ``runs/experiment_012/<run name>``.

    Args:
        model: Module to train; it is expected to output log-probabilities,
            as required by ``nn.NLLLoss``.
        learning_rate: SGD learning rate.
        batch_size: Training mini-batch size. If it differs from the batch
            size of ``train_dataloader``, a new shuffled loader over the same
            dataset is used for training. Evaluation always uses the
            module-level loaders.
        epochs: Number of passes over the training set; must be at least 1.

    Returns:
        Dict with the final-epoch ``"train_accuracy"``,
        ``"train_spawn_recall"``, ``"test_accuracy"`` and
        ``"test_spawn_recall"``; the accuracies are board accuracies.

    Raises:
        ValueError: If ``epochs`` is less than 1.
    """
    if epochs < 1:
        # Without at least one epoch there are no final metrics to return.
        raise ValueError(f"epochs must be at least 1, got {epochs}")

    loss_fn = nn.NLLLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

    # Honour batch_size: it is recorded in the run name, so the run must
    # actually train with it.
    if batch_size == train_dataloader.batch_size:
        fit_dataloader = train_dataloader
    else:
        fit_dataloader = DataLoader(train_dataloader.dataset, batch_size=batch_size, shuffle=True)

    log_dir = os.path.join("runs", "experiment_012")
    lr_tag = str(learning_rate).replace(".", "_")
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    log_subdir = os.path.join(
        log_dir,
        f"{model.__class__.__name__}_lr_{lr_tag}_bs_{batch_size}_ep_{epochs}_{timestamp}",
    )
    tb = SummaryWriter(log_subdir)

    print(f"Training model {model.__class__.__name__}...")
    try:
        for t in range(epochs):
            train_loop(fit_dataloader, model, loss_fn, optimizer)
            train_metrics = test_loop("Train", train_dataloader, model, loss_fn)
            test_metrics = test_loop("Test", test_dataloader, model, loss_fn)
            tb.add_scalar("Loss/train", train_metrics["loss"], t)
            tb.add_scalar("Cell accuracy/train", train_metrics["acc"], t)
            tb.add_scalar("Board accuracy/train", train_metrics["acc_board"], t)
            tb.add_scalar("Spawn recall/train", train_metrics["Spawn recall"], t)
            tb.add_scalar("Loss/test", test_metrics["loss"], t)
            tb.add_scalar("Cell accuracy/test", test_metrics["acc"], t)
            tb.add_scalar("Board accuracy/test", test_metrics["acc_board"], t)
            tb.add_scalar("Spawn recall/test", test_metrics["Spawn recall"], t)
            for name, weight in model.named_parameters():
                tb.add_histogram(f"Weights/{name}", weight, t)
                # Parameters that took no part in the backward pass have no
                # gradient to plot.
                if weight.grad is not None:
                    tb.add_histogram(f"Gradients/{name}", weight.grad, t)
    finally:
        # Flush and release the event file even if an epoch fails.
        tb.close()
    print("Done!")

    return {
        "train_accuracy": train_metrics["acc_board"],
        "train_spawn_recall": train_metrics["Spawn recall"],
        "test_accuracy": test_metrics["acc_board"],
        "test_spawn_recall": test_metrics["Spawn recall"]
    }

In [44]:
from scipy.stats import ttest_ind

# Number of independent training runs per architecture.
repeats = 6

# The first entry is the baseline that the others are compared against.
architectures = [
    {"class": model_class}
    for model_class in (
        TetrisModel,
        ModelWithGlobalAtStart,
        ModelWithGlobalAtEnd,
        ModelWithGlobalInMiddle,
    )
]

# Keys of the dict returned by train(); one array of per-repeat values each.
metric_names = (
    "train_accuracy",
    "train_spawn_recall",
    "test_accuracy",
    "test_spawn_recall",
)

for a in architectures:
    for metric_name in metric_names:
        a[metric_name] = np.zeros(repeats)

# Interleave the architectures within each repeat rather than running all
# repeats of one architecture back to back.
for repeat in range(repeats):
    for architecture in architectures:
        model = architecture["class"]().to(device)
        repeat_results = train(model)
        for metric_name in metric_names:
            architecture[metric_name][repeat] = repeat_results[metric_name]

# Summary table: mean and standard deviation over the repeats.
report_rows = (
    ("Train accuracy", "train_accuracy"),
    ("Test accuracy", "test_accuracy"),
    ("Train spawn recall", "train_spawn_recall"),
    ("Test spawn recall", "test_spawn_recall"),
)

print("Results:")
for architecture in architectures:
    print(f"Class: {architecture['class'].__name__}")
    for label, metric_name in report_rows:
        values = architecture[metric_name]
        print(f"  {label} mean {values.mean():.5f}, std {values.std():.5f}")
    print()

# Welch's t-test (unequal variances) of each candidate against the baseline.
baseline = architectures[0]
for architecture in architectures[1:]:
    print(f"Performing t-tests of {architecture['class'].__name__} against {baseline['class'].__name__}")
    accuracy_ttest = ttest_ind(
        baseline["test_accuracy"], architecture["test_accuracy"], equal_var=False
    )
    spawn_recall_ttest = ttest_ind(
        baseline["test_spawn_recall"], architecture["test_spawn_recall"], equal_var=False
    )
    print("Accuracy t-test results:")
    print(accuracy_ttest)
    print("Spawn recall t-test results:")
    print(spawn_recall_ttest)
    print()

Training model TetrisModel...
Train Error: 
 Accuracy: 99.4%, Board accuracy: 45.8%, Avg loss: 0.046508 

Test Error: 
 Accuracy: 99.6%, Board accuracy: 55.0%, Avg loss: 0.046407 

Train Error: 
 Accuracy: 98.8%, Board accuracy: 21.2%, Avg loss: 0.036883 

Test Error: 
 Accuracy: 98.9%, Board accuracy: 25.6%, Avg loss: 0.037931 

Train Error: 
 Accuracy: 99.8%, Board accuracy: 85.4%, Avg loss: 0.014346 

Test Error: 
 Accuracy: 99.8%, Board accuracy: 85.6%, Avg loss: 0.016331 

Train Error: 
 Accuracy: 99.9%, Board accuracy: 89.6%, Avg loss: 0.011588 

Test Error: 
 Accuracy: 99.8%, Board accuracy: 89.4%, Avg loss: 0.014397 

Train Error: 
 Accuracy: 99.9%, Board accuracy: 90.8%, Avg loss: 0.009927 

Test Error: 
 Accuracy: 99.8%, Board accuracy: 88.3%, Avg loss: 0.012917 

Train Error: 
 Accuracy: 99.9%, Board accuracy: 91.2%, Avg loss: 0.008414 

Test Error: 
 Accuracy: 99.8%, Board accuracy: 86.7%, Avg loss: 0.011444 

Train Error: 
 Accuracy: 99.9%, Board accuracy: 90.0%, Avg loss:

From these results, we see that in terms of spawn recall, `ModelWithGlobalAtStart` and `ModelWithGlobalInMiddle` did significantly better than the baseline, whereas `ModelWithGlobalAtEnd` did not. `ModelWithGlobalAtEnd` does well in terms of spawn recall on the training set but worse on the test set, which is likely a sign of overfitting.

In terms of board accuracy, no model does significantly better than the baseline, but the baseline already has quite a high accuracy. `ModelWithGlobalInMiddle` has the lowest p-value here by far.

The curves are all quite spiky, but especially those of spawn recall and of `ModelWithGlobalAtStart`. The spawn recall curve is probably spiky because the dataset contains very few block spawns. This is a sign that we need more data, and perhaps that we need to rebalance the data to include more spawns and fewer other events. The `ModelWithGlobalAtStart` curves may be spiky because the 10 global features overwhelm the 2 local features when they are combined, or because the global information has a long way to travel before it influences the prediction and so gets lost inside the network.

The loss and spawn recall are significantly worse on the test set than on the training set, which implies that we are overfitting somewhat and could do with more data.

`ModelWithGlobalInMiddle` seems the most appealing architecture. Having a local module at the start means that in theory, some spatial information can be shared by both the local and global parts of the network. Having a nontrivial head at the end means that the global features can have a more complex effect on the predictions of each cell. The network also gives a significant amount of customisability.

In conclusion, `ModelWithGlobalInMiddle` looks like the best architecture, but to perform well it needs to be trained on more data. If the training instability persists, we can try lowering the learning rate.