# Experiment 008

In this experiment, we see how well the block landing model learns on an imbalanced dataset. We use the same underlying dataset, but the Dataset object enforces a certain proportion split between positive (block landed) and negative (block not landed) examples.

In [4]:
import os
from pathlib import Path
import random
import shutil
import datetime

import torch
from torch import nn
from torch.utils.data import DataLoader
import torch.nn.functional as F
from torch.utils.data import Dataset
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.tensorboard import SummaryWriter

In [5]:
class BlockLandingDataset(Dataset):
    """Dataset of board sequences stored as one NumPy file per example.

    Example files are named ``<index><ext>`` (for instance ``0.npy``) and live
    directly inside ``path``. Each file must hold an array whose first axis has
    length 4: the first three boards form the model input, and the fourth is
    only used to derive the label.
    """

    def __init__(self, path: str):
        """Scan ``path`` for the example file extension and the highest index.

        Entries whose stem is not a decimal integer are ignored.

        Raises:
            FileNotFoundError: if ``path`` does not exist or contains no
                integer-named example files.
        """
        self.path = path
        if not os.path.exists(path):
            raise FileNotFoundError(path)
        with os.scandir(self.path) as it:
            # Inspect every entry in a single pass. Pulling the first entry
            # off the iterator separately (to read its extension) would
            # exclude it from the maximum computed below.
            indexed = [
                (int(stem), ext)
                for stem, ext in (os.path.splitext(entry.name) for entry in it)
                if stem.isdecimal()
            ]
        if not indexed:
            raise FileNotFoundError(f"No indexed example files found in {path!r}")
        # Take the extension from the highest-indexed file so the result does
        # not depend on directory listing order.
        self.highest_index, self.ext = max(indexed)

    def __len__(self):
        # Indices are assumed to run contiguously from 0 to highest_index.
        return self.highest_index + 1

    def __getitem__(self, idx):
        """Load example ``idx`` and return ``(x, y)`` as tensors.

        ``x`` holds the first three boards of the file. ``y`` is a float32
        scalar that is 1.0 when a block landed and 0.0 otherwise.

        Raises:
            IndexError: if there is no file for ``idx``.
        """
        file = os.path.join(self.path, f"{idx}{self.ext}")
        if not os.path.exists(file):
            raise IndexError(idx)
        boards = np.load(file)
        assert boards.shape[0] == 4
        x = boards[:3]
        # We can identify a block landing by the fact that a block spawns in
        # the next time step: the top row is empty in the last input board
        # and occupied in the board that follows it.
        b1 = boards[2]
        b2 = boards[3]
        y = (np.all(b1[0] == 0) & np.any(b2[0] == 1)).astype(np.float32)
        x, y = torch.tensor(x), torch.tensor(y)
        return x, y

In [6]:
class ImbalancedDataset(Dataset):
    """In-memory copy of a dataset with the positive class down-sampled.

    Every negative example is kept. Positive examples are discarded at random
    until they make up at most ``pos_fraction`` of the result.
    """

    def __init__(self, dataset, pos_fraction):
        """Materialise ``dataset`` and drop positives to reach ``pos_fraction``.

        Args:
            dataset: iterable of ``(x, y)`` pairs; a truthy ``y`` marks a
                positive example.
            pos_fraction: upper bound on the share of positive examples,
                between 0 and 0.5 inclusive.

        Raises:
            ValueError: if ``pos_fraction`` is outside ``[0, 0.5]``.
        """
        # Assuming the underlying dataset has 50% positive and negative
        # examples, we take all of the negative examples and a subset of the
        # positive ones, so we need pos_fraction to be at most 50%.
        if not 0 <= pos_fraction <= 0.5:
            raise ValueError(f"pos_fraction must be in [0, 0.5], got {pos_fraction}")
        pos = []
        neg = []
        for (x, y) in dataset:
            if y:
                pos.append((x, y))
            else:
                neg.append((x, y))
        random.shuffle(pos)
        # Remove positive examples until we are at or just below the desired
        # proportion. Checking `pos` first keeps the denominator non-zero when
        # there are no negatives (or no examples at all).
        while pos and len(pos) / (len(pos) + len(neg)) > pos_fraction:
            pos.pop()
        total = len(pos) + len(neg)
        achieved = len(pos) / total if total else float("nan")
        print(f"pos_fraction: requested {pos_fraction:.3f}, got {achieved:.3f}")
        print(f"Total positive examples: {len(pos)}")

        # Build up a list of all the examples.
        self.lst = pos + neg
        random.shuffle(self.lst)

        # Counts of each class in the final list.
        self.num_pos = len(pos)
        self.num_neg = len(neg)

    def __len__(self):
        return len(self.lst)

    def __getitem__(self, idx):
        return self.lst[idx]

In [7]:
# Build balanced (50% positive) train and test splits, then peek at one batch.
data_root = os.path.join("data", "block_landing")
train_dataset = ImbalancedDataset(
    BlockLandingDataset(os.path.join(data_root, "train")),
    pos_fraction=0.5,
)
test_dataset = ImbalancedDataset(
    BlockLandingDataset(os.path.join(data_root, "test")),
    pos_fraction=0.5,
)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=4)
test_dataloader = DataLoader(test_dataset, shuffle=True, batch_size=4)

# Sanity-check the shape and dtype of the inputs and labels in a single batch.
x, y = next(iter(train_dataloader))
for batch_tensor in (x, y):
    print(batch_tensor.shape, batch_tensor.dtype)

pos_fraction: requested 0.500, got 0.492
Total positive examples: 90
pos_fraction: requested 0.500, got 0.500
Total positive examples: 21
torch.Size([4, 3, 22, 10]) torch.int32
torch.Size([4]) torch.float32


In [8]:
# Pick the compute backend: CUDA if present, otherwise Apple MPS, otherwise CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device} device")

Using cpu device


In [9]:
class BlockLandingModel(nn.Module):
    """Predicts whether a block has landed.

    Inputs:
        x: Integer tensor of shape (batch_size, seq_length, height, width).
           Entries should be 0 for empty cells and 1 for blocks. Only the last
           board in the sequence is fed to the network. The 30 input features
           of the final linear layer correspond to a board with height = 22
           and width = 10.

    Returns: float32 tensor of shape (batch_size,) holding one logit per
        example for the block landing prediction.
    """

    def __init__(self):
        super().__init__()
        self.conv0 = nn.Conv2d(in_channels=2, out_channels=5, kernel_size=3, padding=1)
        self.conv1 = nn.Conv2d(in_channels=5, out_channels=10, kernel_size=3, padding=1)
        # 10 channels * 3 * 1 spatial positions remain after the second pool.
        self.lin = nn.Linear(in_features=30, out_features=1)

    def forward(self, x):
        # One-hot encode the two cell classes as floats: (B, S, H, W, 2).
        cells = F.one_hot(x.long(), num_classes=2).float()
        # Keep only the last board and move the class axis into the channel
        # position expected by Conv2d: (B, 2, H, W).
        board = cells[:, -1].permute(0, 3, 1, 2)
        feats = F.max_pool2d(F.relu(self.conv0(board)), 2)  # (22, 10) -> (11, 5)
        feats = F.max_pool2d(F.relu(self.conv1(feats)), 3)  # (11,  5) -> ( 3, 1)
        return self.lin(feats.flatten(start_dim=1)).squeeze(-1)

In [10]:
model = BlockLandingModel().to(device)
print(model)

# Smoke test: push one batch through the untrained model and show the
# predicted landing probability for the first example in the batch.
with torch.no_grad():
    X, y = next(iter(train_dataloader))
    # The model was moved to `device`, so the batch has to be moved there as
    # well; otherwise the forward pass fails on CUDA/MPS.
    X, y = X.to(device), y.to(device)
    logits = model(X)[0]
    preds = F.sigmoid(logits)
    print(f"Predicted states: {preds}")

BlockLandingModel(
  (conv0): Conv2d(2, 5, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv1): Conv2d(5, 10, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (lin): Linear(in_features=30, out_features=1, bias=True)
)
Predicted states: 0.49169811606407166


In [18]:
def train_loop(dataloader, model, loss_fn, optimizer):
    """Train ``model`` for one pass over ``dataloader`` and return metrics.

    Args:
        dataloader: yields ``(X, y)`` batches. ``dataloader.dataset`` must
            expose a ``num_pos`` attribute, which is used as the recall
            denominator.
        model: module mapping ``X`` to one logit per example.
        loss_fn: callable taking ``(logits, targets)`` and returning a scalar
            loss tensor.
        optimizer: optimizer stepped once per batch.

    Returns:
        dict with keys ``"loss"`` (mean of the per-batch losses), ``"acc"``
        (fraction of examples classified correctly) and ``"rec"`` (recall, or
        NaN when the dataset reports no positive examples). Metrics are
        accumulated from the predictions made before each optimizer step.
    """
    model.train()

    # Batches must sit on the same device as the model's parameters.
    first_param = next(model.parameters(), None)
    model_device = None if first_param is None else first_param.device

    avg_loss = 0.0
    correct = 0.0
    true_pos = 0.0

    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    for batch, (X, y) in enumerate(dataloader):
        if model_device is not None:
            X, y = X.to(model_device), y.to(model_device)

        # Compute prediction and loss
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Metric calculations. A logit >= 0 is treated as the positive class.
        loss_value = loss.item()
        avg_loss += loss_value
        classes = (pred >= 0).type(torch.float)
        correct += (classes == y).type(torch.float).sum().item()
        true_pos += ((y == 1.0) & (classes == 1.0)).type(torch.float).sum().item()

        if batch % 20 == 0:
            current = (batch + 1) * len(X)
            print(f"loss: {loss_value:>7f}  [{current:>5d}/{size:>5d}]")

    avg_loss /= num_batches
    accuracy = correct / size
    # Recall is undefined without positives; report NaN, as test_loop does.
    num_pos = dataloader.dataset.num_pos
    recall = true_pos / num_pos if num_pos > 0 else np.nan
    return {
        "loss": avg_loss,
        "acc": accuracy,
        "rec": recall
    }


def test_loop(dataloader, model, loss_fn):
    model.eval()
    
    test_loss = 0.0
    correct = 0.0
    true_pos = 0.0

    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    with torch.no_grad():
        for X, y in dataloader:
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            classes = (pred >= 0).type(torch.float)
            correct += (classes == y).type(torch.float).sum().item()
            true_pos += ((y == 1.0) & (classes == 1.0)).type(torch.float).sum().item()

    test_loss /= num_batches
    accuracy = correct / size
    recall = true_pos / dataloader.dataset.num_pos if dataloader.dataset.num_pos > 0 else np.nan
    print(f"Test Error: \n Recall: {(100*recall):>0.1f}%, Accuracy: {(100*accuracy):>0.1f}%, Avg loss: {test_loss:>8f} \n")
    return {
        "loss": test_loss,
        "acc": accuracy,
        "rec": recall
    }

In [25]:
# Sweep over the share of positive examples, training a fresh model for each
# setting and logging per-epoch metrics to TensorBoard.
pos_fracs = [0.5, 0.4, 0.3, 0.2, 0.1]
learning_rate = 1e-2
batch_size = 4
epochs = 300

# NOTE(review): this clears everything previously logged under
# runs/experiment_008, including logs written by other cells that use the
# same directory.
log_dir = os.path.join("runs", "experiment_008")
shutil.rmtree(log_dir, ignore_errors=True)

for pos_frac in pos_fracs:
    train_dataset = ImbalancedDataset(BlockLandingDataset(os.path.join("data", "block_landing", "train")), pos_frac)
    test_dataset = ImbalancedDataset(BlockLandingDataset(os.path.join("data", "block_landing", "test")), pos_frac)
    # Use the batch_size hyperparameter declared above rather than a literal.
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

    model = BlockLandingModel().to(device)
    loss_fn = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

    log_subdir = os.path.join(log_dir, "pos_frac_" + str(pos_frac).replace(".", "_") + "_" + datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
    tb = SummaryWriter(log_subdir)

    try:
        for t in range(epochs):
            print(f"Epoch {t}\n-------------------------------")
            train_metrics = train_loop(train_dataloader, model, loss_fn, optimizer)
            test_metrics = test_loop(test_dataloader, model, loss_fn)
            tb.add_scalar("Loss/train", train_metrics["loss"], t)
            tb.add_scalar("Accuracy/train", train_metrics["acc"], t)
            tb.add_scalar("Recall/train", train_metrics["rec"], t)
            tb.add_scalar("Loss/test", test_metrics["loss"], t)
            tb.add_scalar("Accuracy/test", test_metrics["acc"], t)
            tb.add_scalar("Recall/test", test_metrics["rec"], t)
            for name, weight in model.named_parameters():
                tb.add_histogram(f"Weights/{name}", weight, t)
                # A parameter that took no part in the backward pass has no
                # gradient to plot.
                if weight.grad is not None:
                    tb.add_histogram(f"Gradients/{name}", weight.grad, t)
    finally:
        # Flush and close the writer even if training is interrupted.
        tb.close()
    print("Done!")

pos_fraction: requested 0.500, got 0.492
Total positive examples: 90
pos_fraction: requested 0.500, got 0.500
Total positive examples: 21
Epoch 0
-------------------------------
loss: 0.708550  [    4/  183]
loss: 0.694710  [   84/  183]
loss: 0.670528  [  164/  183]
Test Error: 
 Recall: 14.3%, Accuracy: 57.1%, Avg loss: 0.689238 

Epoch 1
-------------------------------
loss: 0.695797  [    4/  183]
loss: 0.687308  [   84/  183]
loss: 0.688124  [  164/  183]
Test Error: 
 Recall: 33.3%, Accuracy: 64.3%, Avg loss: 0.688261 

Epoch 2
-------------------------------
loss: 0.696748  [    4/  183]
loss: 0.671701  [   84/  183]
loss: 0.699695  [  164/  183]
Test Error: 
 Recall: 57.1%, Accuracy: 71.4%, Avg loss: 0.687448 

Epoch 3
-------------------------------
loss: 0.688724  [    4/  183]
loss: 0.675721  [   84/  183]
loss: 0.684409  [  164/  183]
Test Error: 
 Recall: 66.7%, Accuracy: 73.8%, Avg loss: 0.685342 

Epoch 4
-------------------------------
loss: 0.687244  [    4/  183]
loss

This looks promising: the model seems to learn the positive cases even when the proportion in the dataset is as low as 10%; it just takes longer. The number of epochs $ e $ until the model reaches nonzero recall seems to be roughly $ O(1/p) $, where $ p $ is the proportion of positive examples in the dataset. This fits with intuition, because the number of positive examples seen by the model (with repeats) is proportional to the product $ e p $, and if $ e = O(1/p) $ then this product is roughly constant.

# Focal loss

We could put different weights on the classes, but let's assume that we can't do this. This is because the block landing model architecture is intended to be part of a larger Tetris emulator model, and I'd rather avoid injecting knowledge about landed/not landed class weights into the middle of the model because that is essentially giving it domain knowledge.

Let's try using focal loss instead to see if it improves the training.

In [26]:
from torchvision.ops import sigmoid_focal_loss

# Candidate loss functions: focal loss at several gamma values, with plain
# binary cross-entropy as the baseline. Per the torchvision documentation,
# alpha=-1 turns off the class-balancing weight.
loss_fns = {
    "focal_0p2": lambda inputs, targets: sigmoid_focal_loss(inputs, targets, reduction="mean", alpha=-1, gamma=0.2),
    "focal_0p4": lambda inputs, targets: sigmoid_focal_loss(inputs, targets, reduction="mean", alpha=-1, gamma=0.4),
    "focal_0p6": lambda inputs, targets: sigmoid_focal_loss(inputs, targets, reduction="mean", alpha=-1, gamma=0.6),
    "focal_0p8": lambda inputs, targets: sigmoid_focal_loss(inputs, targets, reduction="mean", alpha=-1, gamma=0.8),
    "bce": nn.BCEWithLogitsLoss(),
}
pos_frac = 0.1
learning_rate = 1e-2
batch_size = 4
epochs = 300

# NOTE(review): this clears everything previously logged under
# runs/experiment_008, including logs written by other cells that use the
# same directory.
log_dir = os.path.join("runs", "experiment_008")
shutil.rmtree(log_dir, ignore_errors=True)

for name, loss_fn in loss_fns.items():
    # NOTE(review): the datasets are re-sampled for every loss function, so
    # each one trains on a different random subset of the positive examples.
    train_dataset = ImbalancedDataset(BlockLandingDataset(os.path.join("data", "block_landing", "train")), pos_frac)
    test_dataset = ImbalancedDataset(BlockLandingDataset(os.path.join("data", "block_landing", "test")), pos_frac)
    # Use the batch_size hyperparameter declared above rather than a literal.
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

    model = BlockLandingModel().to(device)
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

    log_subdir = os.path.join(log_dir, name + "_" + datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
    tb = SummaryWriter(log_subdir)

    try:
        for t in range(epochs):
            print(f"Epoch {t}\n-------------------------------")
            train_metrics = train_loop(train_dataloader, model, loss_fn, optimizer)
            test_metrics = test_loop(test_dataloader, model, loss_fn)
            tb.add_scalar("Loss/train", train_metrics["loss"], t)
            tb.add_scalar("Accuracy/train", train_metrics["acc"], t)
            tb.add_scalar("Recall/train", train_metrics["rec"], t)
            tb.add_scalar("Loss/test", test_metrics["loss"], t)
            tb.add_scalar("Accuracy/test", test_metrics["acc"], t)
            tb.add_scalar("Recall/test", test_metrics["rec"], t)
            # A separate variable name keeps the loss-function `name` of the
            # outer loop from being overwritten.
            for param_name, weight in model.named_parameters():
                tb.add_histogram(f"Weights/{param_name}", weight, t)
                # A parameter that took no part in the backward pass has no
                # gradient to plot.
                if weight.grad is not None:
                    tb.add_histogram(f"Gradients/{param_name}", weight.grad, t)
    finally:
        # Flush and close the writer even if training is interrupted.
        tb.close()
    print("Done!")

pos_fraction: requested 0.100, got 0.097
Total positive examples: 10
pos_fraction: requested 0.100, got 0.087
Total positive examples: 2
Epoch 0
-------------------------------
loss: 0.694552  [    4/  103]
loss: 0.609268  [   84/  103]
Test Error: 
 Recall: 0.0%, Accuracy: 82.6%, Avg loss: 0.596175 

Epoch 1
-------------------------------
loss: 0.592577  [    4/  103]
loss: 0.569939  [   84/  103]
Test Error: 
 Recall: 0.0%, Accuracy: 91.3%, Avg loss: 0.546273 

Epoch 2
-------------------------------
loss: 0.570934  [    4/  103]
loss: 0.497831  [   84/  103]
Test Error: 
 Recall: 0.0%, Accuracy: 91.3%, Avg loss: 0.509351 

Epoch 3
-------------------------------
loss: 0.543609  [    4/  103]
loss: 0.448894  [   84/  103]
Test Error: 
 Recall: 0.0%, Accuracy: 91.3%, Avg loss: 0.472476 

Epoch 4
-------------------------------
loss: 0.439945  [    4/  103]
loss: 0.415236  [   84/  103]
Test Error: 
 Recall: 0.0%, Accuracy: 91.3%, Avg loss: 0.442535 

Epoch 5
-------------------------

When the positive examples make up 10% of the dataset, focal loss seems to do somewhat better in terms of recall than binary cross-entropy, but only when gamma < 0.5. Over 5 runs, focal loss with gamma = 0.2 beat binary cross-entropy within 300 epochs on all runs except one where they both scored 100%. This was measured in terms of training recall. We didn't use test recall because the test set was too small to be useful (only 2 positive examples).

In [31]:
from torchvision.ops import sigmoid_focal_loss

# Candidate loss functions: focal loss at several gamma values, with plain
# binary cross-entropy as the baseline. Per the torchvision documentation,
# alpha=-1 turns off the class-balancing weight.
loss_fns = {
    "focal_0p2": lambda inputs, targets: sigmoid_focal_loss(inputs, targets, reduction="mean", alpha=-1, gamma=0.2),
    "focal_0p4": lambda inputs, targets: sigmoid_focal_loss(inputs, targets, reduction="mean", alpha=-1, gamma=0.4),
    "focal_0p6": lambda inputs, targets: sigmoid_focal_loss(inputs, targets, reduction="mean", alpha=-1, gamma=0.6),
    "focal_0p8": lambda inputs, targets: sigmoid_focal_loss(inputs, targets, reduction="mean", alpha=-1, gamma=0.8),
    "bce": nn.BCEWithLogitsLoss(),
}
pos_frac = 0.04
learning_rate = 1e-2
batch_size = 4
epochs = 300

# NOTE(review): this clears everything previously logged under
# runs/experiment_008, including logs written by other cells that use the
# same directory.
log_dir = os.path.join("runs", "experiment_008")
shutil.rmtree(log_dir, ignore_errors=True)

for name, loss_fn in loss_fns.items():
    # NOTE(review): the datasets are re-sampled for every loss function, so
    # each one trains on a different random subset of the positive examples.
    train_dataset = ImbalancedDataset(BlockLandingDataset(os.path.join("data", "block_landing", "train")), pos_frac)
    test_dataset = ImbalancedDataset(BlockLandingDataset(os.path.join("data", "block_landing", "test")), pos_frac)
    # Use the batch_size hyperparameter declared above rather than a literal.
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

    model = BlockLandingModel().to(device)
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

    log_subdir = os.path.join(log_dir, name + "_" + datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
    tb = SummaryWriter(log_subdir)

    try:
        for t in range(epochs):
            print(f"Epoch {t}\n-------------------------------")
            train_metrics = train_loop(train_dataloader, model, loss_fn, optimizer)
            test_metrics = test_loop(test_dataloader, model, loss_fn)
            tb.add_scalar("Loss/train", train_metrics["loss"], t)
            tb.add_scalar("Accuracy/train", train_metrics["acc"], t)
            tb.add_scalar("Recall/train", train_metrics["rec"], t)
            tb.add_scalar("Loss/test", test_metrics["loss"], t)
            tb.add_scalar("Accuracy/test", test_metrics["acc"], t)
            tb.add_scalar("Recall/test", test_metrics["rec"], t)
            # A separate variable name keeps the loss-function `name` of the
            # outer loop from being overwritten.
            for param_name, weight in model.named_parameters():
                tb.add_histogram(f"Weights/{param_name}", weight, t)
                # A parameter that took no part in the backward pass has no
                # gradient to plot.
                if weight.grad is not None:
                    tb.add_histogram(f"Gradients/{param_name}", weight.grad, t)
    finally:
        # Flush and close the writer even if training is interrupted.
        tb.close()
    print("Done!")

pos_fraction: requested 0.040, got 0.031
Total positive examples: 3
pos_fraction: requested 0.040, got 0.000
Total positive examples: 0
Epoch 0
-------------------------------
loss: 0.513625  [    4/   96]
loss: 0.389289  [   84/   96]
Test Error: 
 Recall: nan%, Accuracy: 100.0%, Avg loss: 0.367554 

Epoch 1
-------------------------------
loss: 0.375347  [    4/   96]
loss: 0.249398  [   84/   96]
Test Error: 
 Recall: nan%, Accuracy: 100.0%, Avg loss: 0.226268 

Epoch 2
-------------------------------
loss: 0.224879  [    4/   96]
loss: 0.136499  [   84/   96]
Test Error: 
 Recall: nan%, Accuracy: 100.0%, Avg loss: 0.124297 

Epoch 3
-------------------------------
loss: 0.538450  [    4/   96]
loss: 0.080805  [   84/   96]
Test Error: 
 Recall: nan%, Accuracy: 100.0%, Avg loss: 0.072123 

Epoch 4
-------------------------------
loss: 0.072382  [    4/   96]
loss: 0.053943  [   84/   96]
Test Error: 
 Recall: nan%, Accuracy: 100.0%, Avg loss: 0.047557 

Epoch 5
---------------------

When the positive examples make up 4% of the dataset, which is roughly the true proportion when we record all game data, it is a bit hit-and-miss as to whether the model progresses much with recall in 300 epochs. The ordering of which loss functions were better or worse was different in all runs. gamma = 0.4 was the best on three of the runs, but failed to make any progress with recall in the other two runs (out of 5 runs). BCE made progress (and beat some of the focal loss functions) on two of the runs, so it is not clear that focal loss makes a huge difference.

# Conclusion

The model struggles with recall when the positive examples make up less than 10% of the dataset, but even when they are as low as 4%, it is possible for the model to learn eventually, even without modifying the loss function. This suggests that we can proceed with integrating the block landing model architecture into the Tetris emulator and expect some improvement in performance. If we fail to see an improvement, we can try gathering a larger dataset so the model has enough examples of uncommon events to learn from.