In [1]:
import xarray as xr

In [2]:
# Open the reformatted TabZilla results file and display the dataset summary.
results_path = "data/reformatted_results_tabzilla.nc"
ds = xr.open_dataset(results_path)
ds

In [3]:
# Show every model name stored in the `model_name` coordinate.
ds['model_name'].values

array(['CATBOOST', 'DECISION_TREE', 'DEEPFM', 'KNN', 'LIGHTGBM',
       'LINEAR_REGRESSION', 'MLP', 'RANDOM_FOREST', 'STG', 'SVM',
       'TABNET', 'TABTRANSFORMER', 'VIME', 'XGBOOST', 'MLP_RTDL',
       'RESNET', 'DANET', 'NAM', 'NODE', 'SAINT', 'FT_TRANSFORMER',
       'TABPFN'], dtype=object)

The benchmark results also include models (TABTRANSFORMER, DEEPFM, NAM) that are not in the paper. They are not well-known models and they do not have a sufficient number of runs, so we remove them. TABPFN is also left out of the selection below.

In [4]:
# Models kept for the analysis, listed in the same order as the
# `model_name` coordinate.
# Removed: TABTRANSFORMER, DEEPFM, NAM, TABPFN
model_names = [
    'CATBOOST',
    'DECISION_TREE',
    'KNN',
    'LIGHTGBM',
    'LINEAR_REGRESSION',
    'MLP',
    'RANDOM_FOREST',
    'STG',
    'SVM',
    'TABNET',
    'VIME',
    'XGBOOST',
    'MLP_RTDL',
    'RESNET',
    'DANET',
    'NODE',
    'SAINT',
    'FT_TRANSFORMER',
]

# Restrict the dataset to the selected models only.
ds = ds.sel({'model_name': model_names})



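We keep only the datasets for which every model has at least one completed default run. The cell below masks `runs_actual` wherever a model has no default run on a dataset and previews the filtered result; the datasets are actually dropped from `ds` in the next step.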

In [5]:
# Sum `runs_actual` over `run_id`, counting only entries whose search type
# is DEFAULT.
is_default_run = ds['search_type'] == 'DEFAULT'
ds_default_runs = ds['runs_actual'].where(is_default_run).sum(dim='run_id')

# Mask (set to NaN) the run counts wherever that default-run total is not
# positive. This overwrites `ds['runs_actual']` in place.
has_default_run = ds_default_runs > 0
ds['runs_actual'] = ds['runs_actual'].where(has_default_run)

# Preview only (the result is not assigned back to `ds`): keep the entries
# for which every model still has a non-missing `runs_actual` value.
n_models = ds.sizes['model_name']
ds.where(ds['runs_actual'].count(dim='model_name') == n_models, drop=True)

We keep only the datasets for which every model has at least one completed run.

In [6]:
# Keep only the datasets on which every selected model has a positive
# `runs_actual`; everything else is dropped from `ds`.
n_models_with_runs = (ds['runs_actual'] > 0).sum(dim='model_name')
covers_all_models = n_models_with_runs == ds.sizes['model_name']
ds = ds.where(covers_all_models, drop=True)

In [7]:
import pandas as pd

# Per dataset: how many models have a positive `runs_actual`.
models_with_runs = (ds['runs_actual'] > 0).sum(dim='model_name')
series = models_with_runs.to_series()

# Lift the pandas display limits so the whole series is printed.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(series)

openml_dataset_id
3         18
4         18
5         18
7         18
9         18
10        18
11        18
12        18
14        18
15        18
16        18
18        18
23        18
25        18
27        18
29        18
30        18
35        18
37        18
39        18
40        18
43        18
45        18
47        18
48        18
49        18
50        18
53        18
59        18
2074      18
2079      18
2867      18
3021      18
3022      18
3485      18
3512      18
3540      18
3543      18
3549      18
3560      18
3561      18
3602      18
3620      18
3647      18
3711      18
3731      18
3739      18
3748      18
3779      18
3797      18
3896      18
3902      18
3903      18
3904      18
3913      18
3917      18
3918      18
3953      18
9946      18
9952      18
9957      18
9960      18
9964      18
9971      18
9978      18
9984      18
10089     18
10093     18
10101     18
14952     18
14954     18
14965     18
14967     18
125920    18
125921    18
145793 

In [8]:
# Per model: how many datasets have a positive `runs_actual`, smallest
# count first.
has_runs = ds['runs_actual'] > 0
series = has_runs.sum(dim='openml_dataset_id').to_series().sort_values(ascending=True)

# Lift the pandas display limits so the whole series is printed.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(series)

model_name
CATBOOST             99
DECISION_TREE        99
KNN                  99
LIGHTGBM             99
LINEAR_REGRESSION    99
MLP                  99
RANDOM_FOREST        99
STG                  99
SVM                  99
TABNET               99
VIME                 99
XGBOOST              99
MLP_RTDL             99
RESNET               99
DANET                99
NODE                 99
SAINT                99
FT_TRANSFORMER       99
Name: runs_actual, dtype: int64


In [9]:
# Sum `runs_actual` over `run_id`, counting only entries whose search type
# is DEFAULT.
default_mask = ds['search_type'] == 'DEFAULT'
ds_default_runs = ds['runs_actual'].where(default_mask).sum(dim='run_id')

# Per model: how many datasets have a positive default-run total.
datasets_with_default = (ds_default_runs > 0).sum(dim='openml_dataset_id')
series = datasets_with_default.to_series()

# Lift the pandas display limits so the whole series is printed.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(series)

model_name
CATBOOST             99
DECISION_TREE        99
KNN                  99
LIGHTGBM             99
LINEAR_REGRESSION    99
MLP                  99
RANDOM_FOREST        99
STG                  99
SVM                  99
TABNET               99
VIME                 99
XGBOOST              99
MLP_RTDL             99
RESNET               99
DANET                99
NODE                 99
SAINT                99
FT_TRANSFORMER       99
Name: runs_actual, dtype: int64


In [10]:
# Per model: how many datasets reach at least `min_runs` in `runs_actual`,
# smallest count first.
min_runs = 30
reaches_min_runs = ds['runs_actual'] >= min_runs
series = reaches_min_runs.sum(dim='openml_dataset_id').to_series().sort_values(ascending=True)

# Lift the pandas display limits so the whole series is printed.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(series)

model_name
SAINT                17
NODE                 40
SVM                  77
DANET                82
TABNET               92
STG                  95
VIME                 96
RESNET               97
FT_TRANSFORMER       97
KNN                  98
RANDOM_FOREST        99
MLP                  99
CATBOOST             99
DECISION_TREE        99
LINEAR_REGRESSION    99
LIGHTGBM             99
MLP_RTDL             99
XGBOOST              99
Name: runs_actual, dtype: int64


In [14]:
# Print the retained OpenML dataset ids as a bracketed, comma-separated list
# (e.g. "[3, 4, 5]") that can be pasted elsewhere.
#
# The ids are joined with str.join instead of appending ", " in a loop and
# trimming the last two characters: the trim produced a malformed "]" when the
# coordinate was empty, whereas join yields "[]". The comprehension variable
# also no longer shadows the builtin `id` in the notebook namespace.
dataset_ids = [f'{dataset_id.item()}' for dataset_id in ds.coords['openml_dataset_id']]
string = '[' + ', '.join(dataset_ids) + ']'
print(string)

[3, 4, 5, 7, 9, 10, 11, 12, 14, 15, 16, 18, 23, 25, 27, 29, 30, 35, 37, 39, 40, 43, 45, 47, 48, 49, 50, 53, 59, 2074, 2079, 2867, 3021, 3022, 3485, 3512, 3540, 3543, 3549, 3560, 3561, 3602, 3620, 3647, 3711, 3731, 3739, 3748, 3779, 3797, 3896, 3902, 3903, 3904, 3913, 3917, 3918, 3953, 9946, 9952, 9957, 9960, 9964, 9971, 9978, 9984, 10089, 10093, 10101, 14952, 14954, 14965, 14967, 125920, 125921, 145793, 145799, 145836, 145847, 145977, 145984, 146024, 146032, 146063, 146065, 146192, 146210, 146607, 146800, 146817, 146818, 146820, 146821, 167140, 167141, 167211, 168911, 190408, 360948]
