# Prepare Tetris emulator data - 80/20

The data recorded while playing the game needs to be checked and cleaned before being used to train a model. In addition, we will ensure that the data is about 80% block falls and 20% block spawns.

For expedience, we will merge the `tetris_emulator` and `balanced` datasets and selectively remove examples until we reach the desired ratio. Examples that come from a source training set will stay in the new training set, and examples from a source test set will stay in the new test set, to allow validation between datasets.

In [1]:
import os
from pathlib import Path
import random

import numpy as np
import matplotlib.pyplot as plt

from recording import RecordingDatabase

In [2]:
# Development aid: reload the local `recording` module so that edits made to it
# on disk are picked up without restarting the kernel.
import recording
from importlib import reload
recording = reload(recording)
# Re-import the class so the name refers to the freshly reloaded definition.
from recording import RecordingDatabase

# Copy data from existing datasets

In [3]:
# Open the existing source databases (train and test split of each dataset).

train_unb, test_unb = (
    RecordingDatabase(os.path.join("data", "tetris_emulator", split))
    for split in ("train", "test")
)
train_bal, test_bal = (
    RecordingDatabase(os.path.join("data", "balanced", split))
    for split in ("train", "test")
)

# Report how many examples each source database holds.
for db in (train_unb, test_unb, train_bal, test_bal):
    print(f"{db.path} has {len(db)} examples")

data\tetris_emulator\train has 1778 examples
data\tetris_emulator\test has 444 examples
data\balanced\train has 1778 examples
data\balanced\test has 444 examples


In [4]:
# Create the output folders for the merged 80/20 dataset.
#
# os.makedirs also creates the intermediate "data/80_20" directory when it is
# missing, and exist_ok=True keeps this cell from raising FileExistsError when
# the notebook is re-run.
for split in ("train", "test"):
    os.makedirs(os.path.join("data", "80_20", split), exist_ok=True)

In [5]:
# Open the destination databases and copy every source example into them.

train_db = RecordingDatabase(os.path.join("data", "80_20", "train"))
test_db = RecordingDatabase(os.path.join("data", "80_20", "test"))

# Each destination only receives examples from the matching split of the
# sources, so training examples stay training examples and likewise for test.
copy_plan = (
    (train_db, (train_unb, train_bal)),
    (test_db, (test_unb, test_bal)),
)
for destination, sources in copy_plan:
    for source in sources:
        for example in source:
            destination.insert(example)

# Balance the data

In [6]:
def is_block_spawn(boards):
    """Tell whether the last transition in `boards` is a block spawn.

    A spawn is detected when row 0 of the second-to-last board contains only
    zeros and row 0 of the last board contains at least one cell equal to 1.

    `boards` is indexed as `boards[board, row, column]`; only the last two
    boards are inspected.
    """
    top_row_was_empty = (boards[-2, 0, :] == 0).all()
    if not top_row_was_empty:
        return top_row_was_empty
    return (boards[-1, 0, :] == 1).any()

In [7]:
def count_falls_and_spawns(db):
    """Count the examples in `db` that are block falls and block spawns.

    Every example for which `is_block_spawn` is truthy counts as a spawn; all
    remaining examples count as falls.

    Returns a `(fall_count, spawn_count)` tuple.
    """
    spawn_count = 0
    for boards in db:
        if is_block_spawn(boards):
            spawn_count += 1
    return len(db) - spawn_count, spawn_count

In [8]:
# Measure the fall/spawn split of the merged datasets before pruning.
train_falls, train_spawns = count_falls_and_spawns(train_db)
test_falls, test_spawns = count_falls_and_spawns(test_db)

train_total = len(train_db)
test_total = len(test_db)

print(
    f"Training set has {train_falls} ({train_falls / train_total:.2%}) falls "
    f"and {train_spawns} ({train_spawns / train_total:.2%}) spawns."
)
print(
    f"Test set has {test_falls} ({test_falls / test_total:.2%}) falls "
    f"and {test_spawns} ({test_spawns / test_total:.2%}) spawns."
)

Training set has 2526 (71.03%) falls and 1030 (28.97%) spawns.
Test set has 622 (70.05%) falls and 266 (29.95%) spawns.


In [9]:
def get_target_falls_and_spawns(target_size, target_spawn_split):
    """Split a target dataset size into fall and spawn quotas.

    The spawn quota is `target_size * target_spawn_split` rounded with the
    built-in `round`; the fall quota is whatever remains, so the two always
    add up to `target_size`.

    Returns a `(target_falls, target_spawns)` tuple.
    """
    spawn_quota = round(target_size * target_spawn_split)
    return target_size - spawn_quota, spawn_quota

In [10]:
# The new datasets should be the same size as the original (unbalanced) ones.
target_train, target_test = len(train_unb), len(test_unb)
# Desired fraction of examples that are block spawns.
target_spawn_split = 0.2

for label, value in (
    ("Target train count", target_train),
    ("Target test count", target_test),
    ("Target spawn proportion", target_spawn_split),
):
    print(f"{label}: {value}")

Target train count: 1778
Target test count: 444
Target spawn proportion: 0.2


In [11]:
# Work out how many falls and spawns each pruned dataset should contain.
target_train_falls, target_train_spawns = get_target_falls_and_spawns(
    target_train, target_spawn_split
)
target_test_falls, target_test_spawns = get_target_falls_and_spawns(
    target_test, target_spawn_split
)

for label, value in (
    ("Target train falls", target_train_falls),
    ("Target train spawns", target_train_spawns),
    ("Target test falls", target_test_falls),
    ("Target test spawns", target_test_spawns),
):
    print(f"{label}: {value}")

Target train falls: 1422
Target train spawns: 356
Target test falls: 355
Target test spawns: 89


In [12]:
def prune(db, target_falls, target_spawns):
    """Delete surplus examples from `db` until it meets the given quotas.

    The current numbers of falls and spawns are counted first. The surplus of
    each kind is then removed by walking `db` in iteration order and marking
    the earliest examples of that kind, and finally deleting all marked
    indices in a single `db.delete_batch` call.

    Raises AssertionError if `db` holds fewer falls or spawns than requested.
    """
    falls, spawns = count_falls_and_spawns(db)
    fall_excess = falls - target_falls
    spawn_excess = spawns - target_spawns
    assert fall_excess >= 0, "Not enough falls"
    assert spawn_excess >= 0, "Not enough spawns"

    # Deletions still owed, keyed by "is this example a block spawn?".
    still_to_delete = {True: spawn_excess, False: fall_excess}
    delete_idxs = []

    for idx, example in enumerate(db):
        # Stop scanning as soon as both quotas have been satisfied.
        if still_to_delete[True] == 0 and still_to_delete[False] == 0:
            break
        kind = bool(is_block_spawn(example))
        if still_to_delete[kind] > 0:
            delete_idxs.append(idx)
            still_to_delete[kind] -= 1

    print(f"Removing {fall_excess} falls and {spawn_excess} spawns")
    db.delete_batch(delete_idxs)

In [13]:
# Prune each merged dataset down to its fall/spawn quotas.
pruning_jobs = (
    ("Pruning training dataset", train_db, target_train_falls, target_train_spawns),
    ("Pruning test dataset", test_db, target_test_falls, target_test_spawns),
)
for banner, database, falls_quota, spawns_quota in pruning_jobs:
    print(banner)
    prune(database, falls_quota, spawns_quota)

Pruning training dataset
Removing 1104 falls and 674 spawns
Pruning test dataset
Removing 267 falls and 177 spawns


In [14]:
# Re-measure the fall/spawn split now that the datasets have been pruned.
train_falls, train_spawns = count_falls_and_spawns(train_db)
test_falls, test_spawns = count_falls_and_spawns(test_db)

train_total = len(train_db)
test_total = len(test_db)

print(
    f"Training set has {train_falls} ({train_falls / train_total:.2%}) falls "
    f"and {train_spawns} ({train_spawns / train_total:.2%}) spawns."
)
print(
    f"Test set has {test_falls} ({test_falls / test_total:.2%}) falls "
    f"and {test_spawns} ({test_spawns / test_total:.2%}) spawns."
)

Training set has 1422 (79.98%) falls and 356 (20.02%) spawns.
Test set has 355 (79.95%) falls and 89 (20.05%) spawns.


# Final checks

Let's check how many block spawns of each type are in the training and test sets.

In [15]:
import torch
from torch.utils.data import Dataset

class RecordingDataset(Dataset):
    """PyTorch `Dataset` view over a `RecordingDatabase`.

    Each item is an `(x, y)` pair of tensors built from the last two boards of
    a recorded example: `x` from the second-to-last board and `y` from the
    last board, converted to `torch.long`.
    """

    def __init__(self, db: RecordingDatabase):
        self.db = db

    def __len__(self):
        return len(self.db)

    def __getitem__(self, idx):
        frames = self.db[idx]
        # Only the final transition is used; any earlier boards are ignored.
        before, after = frames[-2], frames[-1]
        return torch.tensor(before), torch.tensor(after, dtype=torch.long)

In [16]:
class BlockType:
    """Single-letter names of the seven block types, plus a list of all of them."""

    I, O, T, Z, S, J, L = "I", "O", "T", "Z", "S", "J", "L"

    # Every block type, in the order used when reporting counts.
    All = [I, O, T, Z, S, J, L]
    
def get_block_spawn_type(example):
    """Classify which block type spawned in an `(x, y)` example.

    Returns None when the example is not a spawn, i.e. when row 0 of `x`
    already contains a 1 or row 0 of `y` is entirely 0. Otherwise the block
    type is decided from specific cells in rows 0 and 1 of `y`:

    * row 1 entirely 0                      -> I
    * y[0, 3] and y[1, 3] both 1            -> J
    * y[0, 3] is 1, y[1, 3] is not          -> T if y[0, 5] is 1, else Z
    * y[1, 3] is 1, y[0, 3] is not          -> S if y[0, 4] is 1, else L
    * neither y[0, 3] nor y[1, 3] is 1      -> O
    """
    before, after = example

    if (before[0, :] == 1).any() | (after[0, :] == 0).all():
        return None
    if (after[1, :] == 0).all():
        return BlockType.I

    top_left_filled = after[0, 3] == 1
    below_left_filled = after[1, 3] == 1

    if top_left_filled and below_left_filled:
        return BlockType.J
    if top_left_filled:
        return BlockType.T if after[0, 5] == 1 else BlockType.Z
    if below_left_filled:
        return BlockType.S if after[0, 4] == 1 else BlockType.L
    return BlockType.O


def count_block_spawns_by_type(dataset):
    """Tally the block spawns in `dataset` per block type.

    Returns a dict with one entry for every type in `BlockType.All` (zero when
    that type never spawns). Examples that `get_block_spawn_type` classifies
    as None are not counted.
    """
    tally = dict.fromkeys(BlockType.All, 0)
    for sample in dataset:
        kind = get_block_spawn_type(sample)
        if kind is None:
            continue
        tally[kind] += 1
    return tally


def describe_block_spawns(dataset):
    """Print the per-type spawn counts of `dataset` and their total.

    Two lines are printed: the dict returned by `count_block_spawns_by_type`,
    then the total number of block spawns across all types.

    The spawn fraction that used to be computed here was never used, and its
    division by `len(dataset)` raised ZeroDivisionError on an empty dataset,
    so it has been removed.
    """
    spawns_by_type = count_block_spawns_by_type(dataset)
    print(spawns_by_type)

    # Keys always come from BlockType.All, so every entry is a real block type.
    num_block_spawns = sum(spawns_by_type.values())
    print(f"Dataset has {num_block_spawns} block spawns.")


# Wrap the pruned databases as datasets and summarise their block spawns.
train_dataset = RecordingDataset(train_db)
test_dataset = RecordingDataset(test_db)

reports = (
    ("Describing training dataset...", train_dataset),
    ("Describing test dataset...", test_dataset),
)
for position, (heading, dataset_to_describe) in enumerate(reports):
    if position:
        # Blank line between consecutive reports.
        print()
    print(heading)
    describe_block_spawns(dataset_to_describe)

Describing training dataset...
{'I': 54, 'O': 51, 'T': 43, 'Z': 40, 'S': 54, 'J': 58, 'L': 56}
Dataset has 356 block spawns.

Describing test dataset...
{'I': 13, 'O': 7, 'T': 18, 'Z': 13, 'S': 11, 'J': 14, 'L': 13}
Dataset has 89 block spawns.
