In [7]:
import openml
import pandas as pd
from pathlib import Path

In [2]:
# OpenML dataset IDs whose characteristics are collected below.
dataset_ids = [
    39, 61, 182, 1478, 1568, 40685, 40984,
    46773, 46774, 46775, 46776, 46777, 46778,
    46779, 46780, 46781, 46782, 46783,
]

# Output column -> name of the OpenML quality that supplies its value.
quality_columns = {
    "n_instances": "NumberOfInstances",
    "n_features": "NumberOfFeatures",
    "n_classes": "NumberOfClasses",
    "n_categorical": "NumberOfSymbolicFeatures",
}

# One row of characteristics per dataset, keyed by the dataset's name
# (the key becomes the DataFrame index).
characteristics = {}
for dataset_id in dataset_ids:
    dataset = openml.datasets.get_dataset(dataset_id)
    name = dataset.name
    row = {"dataset": name, "openml_id": dataset_id}
    for column, quality in quality_columns.items():
        row[column] = dataset.qualities[quality]
    characteristics[name] = row

# Cast the ID and every quality-derived column to int.
int_columns = ["openml_id", *quality_columns]
df = pd.DataFrame.from_dict(characteristics, orient="index").astype(
    dict.fromkeys(int_columns, int)
)

In [9]:
# Display the dataset characteristics table as the cell output.
df

Unnamed: 0,dataset,openml_id,n_instances,n_features,n_classes,n_categorical
ecoli,ecoli,39,336,8,8,1
iris,iris,61,150,5,3,1
satimage,satimage,182,6430,37,6,1
har,har,1478,10299,562,6,1
nursery,nursery,1568,12958,9,4,9
shuttle,shuttle,40685,58000,10,7,1
segment,segment,40984,2310,20,7,1
alizadeh-2000-v2,alizadeh-2000-v2,46773,62,2094,3,0
alizadeh-2000-v3,alizadeh-2000-v3,46774,62,2092,4,0
armstrong-2002-v1,armstrong-2002-v1,46775,72,1082,2,0


In [8]:
# Save the characteristics table as CSV.
# Keep this a plain statement: a trailing comma after the call would wrap the
# result in a one-element tuple and make the cell echo `(None,)`.
# NOTE(review): the destination is an absolute, machine-specific path —
# consider a configurable results directory.
df.to_csv(Path("/home/belucci/code/cohirf/results/real") / "datasets_characteristics.csv")

(None,)

In [5]:
# Internal column name -> header used in the LaTeX table.
rename_dict = dict(
    [
        ("dataset", "Dataset"),
        ("openml_id", "OpenML ID"),
        ("n_instances", "$n$"),
        ("n_features", "$p$"),
        ("n_classes", "$C$"),
        ("n_categorical", "$p_{cat}$"),
    ]
)

In [6]:
# Table for the paper: the same data as `df`, with the headers from `rename_dict`.
df_latex = df.rename(columns=rename_dict)
# Render to LaTeX with the index hidden and horizontal rules enabled.
latex_table = df_latex.style.hide().to_latex(hrules=True)
print(latex_table)

\begin{tabular}{lrrrrr}
\toprule
Dataset & OpenML ID & $n$ & $p$ & $C$ & $p_{cat}$ \\
\midrule
ecoli & 39 & 336 & 8 & 8 & 1 \\
iris & 61 & 150 & 5 & 3 & 1 \\
satimage & 182 & 6430 & 37 & 6 & 1 \\
har & 1478 & 10299 & 562 & 6 & 1 \\
nursery & 1568 & 12958 & 9 & 4 & 9 \\
shuttle & 40685 & 58000 & 10 & 7 & 1 \\
segment & 40984 & 2310 & 20 & 7 & 1 \\
alizadeh-2000-v2 & 46773 & 62 & 2094 & 3 & 0 \\
alizadeh-2000-v3 & 46774 & 62 & 2092 & 4 & 0 \\
armstrong-2002-v1 & 46775 & 72 & 1082 & 2 & 0 \\
bittner-2000 & 46776 & 38 & 2202 & 2 & 0 \\
bredel-2005 & 46777 & 50 & 1740 & 3 & 0 \\
chowdary-2006 & 46778 & 104 & 183 & 2 & 0 \\
garber-2001 & 46779 & 66 & 4554 & 4 & 0 \\
golub-1999-v2 & 46780 & 72 & 1869 & 3 & 0 \\
khan-2001 & 46781 & 83 & 1070 & 4 & 0 \\
binary_alpha_digits & 46782 & 1404 & 321 & 36 & 0 \\
coil-20 & 46783 & 1440 & 1025 & 0 & 0 \\
\bottomrule
\end{tabular}

