In [8]:
# Enable IPython's autoreload extension; mode 2 re-imports all modules before
# each cell executes, so edits to local .py files are picked up without a
# kernel restart. Re-running this cell on a live kernel only prints an
# "already loaded" notice.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [9]:
import numpy as np
from pathlib import Path

In [10]:
# Relative path: it is resolved against the kernel's current working directory.
indices_path = Path('tabularbench/data/train_val_test_indices.npy')
# NOTE(security): allow_pickle=True makes np.load unpickle Python objects, which
# can execute arbitrary code -- only load this file from a trusted source.
# .item() unwraps the single Python object stored in the loaded array
# (used as a dict by the cells below).
indices = np.load(indices_path, allow_pickle=True).item()

The keys are the OpenML dataset IDs (`openml_id`):

In [11]:
# Display the top-level keys of the loaded mapping.
indices.keys()

dict_keys([44132, 44133, 44134, 44136, 44137, 44138, 44139, 44140, 44141, 44142, 44143, 44144, 44145, 44146, 44147, 44148, 45032, 45033, 45034, 44089, 44120, 44121, 44122, 44123, 44125, 44126, 44128, 44129, 44130, 45022, 45021, 45020, 45019, 45028, 45026, 44055, 44056, 44059, 44061, 44062, 44063, 44065, 44066, 44068, 44069, 45041, 45042, 45043, 45045, 45046, 45047, 45048, 44156, 44157, 44159, 45035, 45036, 45038, 45039])

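For every OpenML dataset, the indices are grouped by dataset size. The intersection and the union of the size keys are identical, so every dataset provides the same sizes: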

In [12]:
# Gather the size keys of every dataset, then compare them across datasets.
sizes_all = [indices_by_size.keys() for indices_by_size in indices.values()]
size_sets = [set(size_keys) for size_keys in sizes_all]

# Sizes shared by all datasets, followed by sizes present in at least one dataset.
print(set.intersection(*size_sets))
print(set.union(*size_sets))

{10000, 50000}
{10000, 50000}


Under every size key there is a sequence of splits; the number of splits differs per dataset and, for some datasets, per size:

In [13]:
# Map each dataset id to (number of splits at size 10000, number of splits at size 50000).
splits_per_openmlid = {}
for openml_id, indices_by_size in indices.items():
    n_splits_10k = len(indices_by_size[10000])
    n_splits_50k = len(indices_by_size[50000])
    splits_per_openmlid[openml_id] = (n_splits_10k, n_splits_50k)
splits_per_openmlid

{44132: (3, 3),
 44133: (2, 2),
 44134: (2, 2),
 44136: (3, 3),
 44137: (3, 3),
 44138: (1, 2),
 44139: (1, 2),
 44140: (1, 1),
 44141: (3, 3),
 44142: (2, 2),
 44143: (1, 1),
 44144: (1, 2),
 44145: (3, 3),
 44146: (1, 1),
 44147: (3, 3),
 44148: (1, 2),
 45032: (3, 3),
 45033: (5, 5),
 45034: (1, 1),
 44089: (2, 2),
 44120: (1, 1),
 44121: (1, 1),
 44122: (3, 3),
 44123: (3, 3),
 44125: (3, 3),
 44126: (3, 3),
 44128: (1, 1),
 44129: (1, 1),
 44130: (3, 3),
 45022: (1, 1),
 45021: (1, 1),
 45020: (3, 3),
 45019: (5, 5),
 45028: (1, 2),
 45026: (3, 3),
 44055: (5, 5),
 44056: (3, 3),
 44059: (1, 1),
 44061: (5, 5),
 44062: (3, 3),
 44063: (2, 2),
 44065: (1, 1),
 44066: (1, 2),
 44068: (1, 1),
 44069: (1, 1),
 45041: (3, 3),
 45042: (5, 5),
 45043: (1, 1),
 45045: (1, 1),
 45046: (1, 1),
 45047: (1, 1),
 45048: (1, 1),
 44156: (1, 1),
 44157: (3, 3),
 44159: (1, 1),
 45035: (1, 1),
 45036: (3, 3),
 45038: (1, 1),
 45039: (3, 3)}

For every split, we have a dict with the indices for the training (`'train'`), validation (`'val'`) and test (`'test'`) data:

In [14]:
# Inspect the first split of dataset 45033 at size 10000.
idcs = indices[45033][10000][0]
parts = ('train', 'val', 'test')
# Number of indices per part, then the first ten indices of each part.
print(*(len(idcs[part]) for part in parts))
print(*(idcs[part][:10] for part in parts))

2923 376 878
[0, 3, 6, 7, 8, 9, 11, 12, 13, 16] [2, 4, 5, 10, 14, 30, 33, 39, 70, 72] [1, 15, 17, 22, 23, 31, 34, 36, 42, 45]


There should be no overlapping indices:

In [15]:
# Sanity check: within each split, the train / val / test index sets must be
# pairwise disjoint. An AssertionError is raised on the first violation.
for dataset in indices.values():
    for splits in dataset.values():
        for split in splits:
            train_idx = set(split['train'])
            val_idx = set(split['val'])
            test_idx = set(split['test'])

            assert train_idx.isdisjoint(val_idx)
            assert train_idx.isdisjoint(test_idx)
            assert val_idx.isdisjoint(test_idx)

For each dataset, size and split, we count the indices between 0 and the largest index used that do not appear in the training, validation or test set of that split:

In [16]:
# For every dataset -> size -> split, count how many indices in
# range(0, largest_used_index + 1) are absent from train + val + test.
# Rows beyond the largest used index cannot be detected this way.
n_no_split_values = {
    openml_id: {
        size: [
            max(used) + 1 - len(used)
            for used in (s['train'] + s['val'] + s['test'] for s in splits)
        ]
        for size, splits in dataset.items()
    }
    for openml_id, dataset in indices.items()
}


n_no_split_values

{44132: {10000: [0, 0, 0], 50000: [0, 0, 0]},
 44133: {10000: [0, 0], 50000: [0, 0]},
 44134: {10000: [0, 0], 50000: [0, 0]},
 44136: {10000: [0, 0, 0], 50000: [0, 0, 0]},
 44137: {10000: [0, 0, 0], 50000: [0, 0, 0]},
 44138: {10000: [0], 50000: [0, 0]},
 44139: {10000: [0], 50000: [0, 0]},
 44140: {10000: [0], 50000: [0]},
 44141: {10000: [0, 0, 0], 50000: [0, 0, 0]},
 44142: {10000: [0, 0], 50000: [0, 0]},
 44143: {10000: [471834], 50000: [431835]},
 44144: {10000: [0], 50000: [0, 0]},
 44145: {10000: [0, 0, 0], 50000: [0, 0, 0]},
 44146: {10000: [57145], 50000: [29146]},
 44147: {10000: [0, 0, 0], 50000: [0, 0, 0]},
 44148: {10000: [0], 50000: [0, 0]},
 45032: {10000: [0, 0, 0], 50000: [0, 0, 0]},
 45033: {10000: [0, 0, 0, 0, 0], 50000: [0, 0, 0, 0, 0]},
 45034: {10000: [5355531], 50000: [5315569]},
 44089: {10000: [0, 0], 50000: [0, 0]},
 44120: {10000: [0], 50000: [0]},
 44121: {10000: [456601], 50000: [416602]},
 44122: {10000: [0, 0, 0], 50000: [0, 0, 0]},
 44123: {10000: [0, 0,

These are the (dataset, size) combinations for which the splits do not use all the data:

In [17]:
# Collect every (openml_id, size) pair whose splits leave, on average, a
# positive number of indices unused.
dataset_too_large = [
    (openml_id, size)
    for openml_id, dataset in n_no_split_values.items()
    for size, splits in dataset.items()
    if np.mean(splits) > 0
]

sorted(dataset_too_large)

[(44065, 10000),
 (44065, 50000),
 (44068, 10000),
 (44068, 50000),
 (44069, 10000),
 (44069, 50000),
 (44121, 10000),
 (44121, 50000),
 (44129, 10000),
 (44129, 50000),
 (44143, 10000),
 (44143, 50000),
 (44146, 10000),
 (44146, 50000),
 (44159, 10000),
 (44159, 50000),
 (45034, 10000),
 (45034, 50000),
 (45038, 10000),
 (45045, 10000),
 (45045, 50000),
 (45046, 10000),
 (45046, 50000),
 (45047, 10000),
 (45047, 50000),
 (45048, 10000),
 (45048, 50000)]