In [None]:
# pymfe 0.4.3 still passes `sparse=` to sklearn's OneHotEncoder; that keyword was
# removed in scikit-learn 1.4, which makes several meta-features fail to extract.
# Pin scikit-learn below 1.4 so extraction is complete.
%pip install pandas "scikit-learn<1.4" openml pymfe tqdm

Collecting openml
  Downloading openml-0.15.0-py3-none-any.whl.metadata (9.9 kB)
Collecting pymfe
  Downloading pymfe-0.4.3-py3-none-any.whl.metadata (14 kB)
Collecting liac-arff>=2.4.0 (from openml)
  Downloading liac-arff-2.5.0.tar.gz (13 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting xmltodict (from openml)
  Downloading xmltodict-0.14.2-py2.py3-none-any.whl.metadata (8.0 kB)
Collecting minio (from openml)
  Downloading minio-7.2.10-py3-none-any.whl.metadata (6.5 kB)
Collecting texttable (from pymfe)
  Downloading texttable-1.7.0-py2.py3-none-any.whl.metadata (9.8 kB)
Collecting igraph>=0.10.1 (from pymfe)
  Downloading igraph-0.11.8-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Collecting gower (from pymfe)
  Downloading gower-0.1.2-py3-none-any.whl.metadata (3.7 kB)
Collecting pycryptodome (from minio->openml)
  Downloading pycryptodome-3.21.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.4 kB)
Downloading op

In [None]:
from pathlib import Path
import openml
import pandas as pd
from pymfe.mfe import MFE
from tqdm import tqdm
import warnings

# Keep the log readable: numerical RuntimeWarnings (precision loss, invalid
# values) raised during extraction are silenced globally.
warnings.filterwarnings(action="ignore", category=RuntimeWarning)


# pymfe configuration shared by every dataset.
# `groups`        -> meta-feature groups requested from MFE
# `summary_funcs` -> summarisation functions applied to multi-valued features
# `scoring`       -> score passed to MFE via its `score` argument
groups = [
    "landmarking",
    "general",
    "statistical",
    "model-based",
    "info-theory",
    "relative",
]
summary_funcs = [
    "mean",
    "sd",
    "count",
    "histogram",
    "iq_range",
    "kurtosis",
    "max",
    "median",
    "min",
    "quantiles",
    "range",
    "skewness",
]
scoring = "balanced-accuracy"

def featurize_dataset(task_id):
    """Extract pymfe meta-features for the dataset behind an OpenML task.

    Parameters
    ----------
    task_id : int
        OpenML task ID; the task's dataset and its default target attribute
        are used.

    Returns
    -------
    list[dict] or None
        A single-element list holding one record with the key
        ``"dataset_name"`` plus one ``"f__pymfe.<group>.<name>"`` entry per
        extracted meta-feature. ``None`` is returned when the dataset does not
        report more than one class (unsupported target type).
    """
    # Load dataset metadata using the OpenML task ID
    task = openml.tasks.get_task(task_id)
    dataset = task.get_dataset()

    # Check that the dataset is a classification problem *before* fetching and
    # converting the data. A missing or NaN "NumberOfClasses" quality is
    # treated as unsupported instead of raising KeyError or slipping through
    # the comparison (NaN <= 1 is False).
    n_classes = (dataset.qualities or {}).get("NumberOfClasses")
    if n_classes is None or not (n_classes > 1):
        print("Unsupported target type. Skipping.")
        return None

    X, y, _, _ = dataset.get_data(target=dataset.default_target_attribute)

    # Convert to numpy arrays
    X = X.to_numpy()
    y = y.to_numpy()

    # Identify categorical columns (the target attribute is excluded)
    categorical_cols = list(dataset.get_features_by_type('nominal', [dataset.default_target_attribute]))

    print(f"Processing dataset: {dataset.name}")

    # Extract metafeatures; the data is passed through without any numeric or
    # categorical transformation.
    mfe = MFE(groups=groups, summary=summary_funcs, random_state=0, score=scoring)
    mfe.fit(X, y, cat_cols=categorical_cols, transform_num=False, transform_cat=None)
    ft = mfe.extract()

    # Consolidate results into one flat record, prefixing each feature name
    # with its group so names stay unique across groups.
    fold_metafeats = {"dataset_name": dataset.name}
    for group in groups:
        ft_group = mfe.parse_by_group(group, ft)
        fold_metafeats.update(
            {f"f__pymfe.{group}.{name}": value for name, value in zip(*ft_group)}
        )

    return [fold_metafeats]

def featurize_all_datasets(task_ids):
    """Featurize every task in ``task_ids`` and cache results in ``metafeatures.csv``.

    Results are written after each dataset, so an aborted run can be resumed:
    datasets whose name is already present in the CSV index are skipped.

    Parameters
    ----------
    task_ids : iterable of int
        OpenML task IDs to process.

    Returns
    -------
    None
        The meta-features are persisted to ``metafeatures.csv`` in the current
        working directory, indexed by ``dataset_name``.
    """
    output_file = Path("metafeatures.csv")
    if output_file.exists():
        computed_features = pd.read_csv(output_file).set_index("dataset_name")
    else:
        computed_features = None

    for task_id in task_ids:
        # Resolve the dataset name first so already-cached datasets are skipped
        # without re-running the extraction.
        dataset_name = openml.tasks.get_task(task_id).get_dataset().name
        if computed_features is not None and dataset_name in computed_features.index:
            continue

        print(f"Featurizing task ID: {task_id}")
        dataset_metafeatures = featurize_dataset(task_id)
        if dataset_metafeatures is None:
            continue

        dataset_metafeatures = pd.DataFrame(dataset_metafeatures).set_index("dataset_name")

        if computed_features is None:
            computed_features = dataset_metafeatures
        else:
            computed_features = pd.concat([dataset_metafeatures, computed_features])

        # Keep a stable, sorted column order on every iteration; concat would
        # otherwise adopt the (unsorted) order of the newest frame.
        computed_features = computed_features[sorted(computed_features.columns)]

        print("Writing. Do not interrupt...")
        # Write to a sibling temp file and swap it in, so an interruption while
        # writing cannot leave a truncated metafeatures.csv behind.
        tmp_file = output_file.with_name(output_file.name + ".tmp")
        computed_features.to_csv(tmp_file)
        tmp_file.replace(output_file)

# Specify OpenML task IDs
# NOTE(review): 146195 was listed twice; the second occurrence was dropped because
# it only triggered a redundant task lookup before being skipped. Confirm it was
# not meant to be a different task ID.
task_ids = [14965, 9977, 34539, 146606, 7592, 146195, 167119, 167120, 168331, 168330, 168335, 146212,
            168868, 31, 10101, 3913, 3917, 9957, 9946, 3918,
            3903, 37, 9971, 9952, 3902, 49, 43, 9978, 10093, 219, 9976, 6, 53, 11, 15, 16, 14, 32, 3549,
            12, 9981, 18, 28, 2074, 29, 45, 125922, 9960, 9964, 22, 2079, 14969, 3560, 14952, 125920, 23,
            3904, 3022, 9985, 9910, 14970, 3021, 3481, 146824, 146820, 146822, 146800, 146817,
            146819, 146821, 14954, 167141, 167140, 167125]
featurize_all_datasets(task_ids)




Featurizing task ID: 14965
Processing dataset: bank-marketing


TypeError("OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'").
  res = hypotest_fun_out(*samples, **kwds)


Writing. Do not interrupt...
Featurizing task ID: 9977
Processing dataset: nomao


TypeError("OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'").


Writing. Do not interrupt...
Featurizing task ID: 34539
Processing dataset: Amazon_employee_access
Writing. Do not interrupt...
Featurizing task ID: 146606
Processing dataset: higgs


TypeError("OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'").


Writing. Do not interrupt...
Featurizing task ID: 7592
Processing dataset: adult


TypeError("OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'").
TypeError("'<' not supported between instances of 'float' and 'str'").


Writing. Do not interrupt...
Featurizing task ID: 146195
Processing dataset: connect-4
Writing. Do not interrupt...
Featurizing task ID: 167119
Processing dataset: jungle_chess_2pcs_raw_endgame_complete


TypeError("OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'").
  res = hypotest_fun_out(*samples, **kwds)


Writing. Do not interrupt...
Featurizing task ID: 167120
Processing dataset: numerai28.6


TypeError("OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'").


Writing. Do not interrupt...
Featurizing task ID: 168331
Processing dataset: volkert


TypeError("OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'").
  res = hypotest_fun_out(*samples, **kwds)


Writing. Do not interrupt...
Featurizing task ID: 168330
Processing dataset: jannis


TypeError("OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'").


Writing. Do not interrupt...
Featurizing task ID: 168335
Processing dataset: MiniBooNE


TypeError("OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'").


Writing. Do not interrupt...
Featurizing task ID: 146212
Processing dataset: shuttle


TypeError("OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'").


Writing. Do not interrupt...
Featurizing task ID: 168868
Processing dataset: APSFailure


TypeError("OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'").


Writing. Do not interrupt...
Featurizing task ID: 31
Processing dataset: credit-g


TypeError("OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'").


Writing. Do not interrupt...
Featurizing task ID: 10101
Processing dataset: blood-transfusion-service-center


TypeError("OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'").


Writing. Do not interrupt...
Featurizing task ID: 3913
Processing dataset: kc2


TypeError("OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'").


Writing. Do not interrupt...
Featurizing task ID: 3917
Processing dataset: kc1


TypeError("OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'").


Writing. Do not interrupt...
Featurizing task ID: 9957
Processing dataset: qsar-biodeg


TypeError("OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'").


Writing. Do not interrupt...
Featurizing task ID: 9946
Processing dataset: wdbc


TypeError("OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'").


Writing. Do not interrupt...
Featurizing task ID: 3918
Processing dataset: pc1


TypeError("OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'").


Writing. Do not interrupt...
Featurizing task ID: 3903
Processing dataset: pc3


TypeError("OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'").


Writing. Do not interrupt...
Featurizing task ID: 37
Processing dataset: diabetes


TypeError("OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'").


Writing. Do not interrupt...
Featurizing task ID: 9971
Processing dataset: ilpd


TypeError("OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'").


Writing. Do not interrupt...
Featurizing task ID: 9952
Processing dataset: phoneme


TypeError("OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'").


Writing. Do not interrupt...
Featurizing task ID: 3902
Processing dataset: pc4


TypeError("OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'").


Writing. Do not interrupt...
Featurizing task ID: 49
Processing dataset: tic-tac-toe
Writing. Do not interrupt...
Featurizing task ID: 43
Processing dataset: spambase


TypeError("OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'").


Writing. Do not interrupt...
Featurizing task ID: 9978
Processing dataset: ozone-level-8hr


TypeError("OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'").


Writing. Do not interrupt...
Featurizing task ID: 10093
Processing dataset: banknote-authentication


TypeError("OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'").


Writing. Do not interrupt...
Featurizing task ID: 219
Processing dataset: electricity


TypeError("OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'").
  res = hypotest_fun_out(*samples, **kwds)


Writing. Do not interrupt...
Featurizing task ID: 9976
Processing dataset: madelon


TypeError("OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'").


Writing. Do not interrupt...
Featurizing task ID: 6
Processing dataset: letter


TypeError("OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'").


Writing. Do not interrupt...
Featurizing task ID: 53
Processing dataset: vehicle


TypeError("OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'").


Writing. Do not interrupt...
Featurizing task ID: 11
Processing dataset: balance-scale


TypeError("OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'").


Writing. Do not interrupt...
Featurizing task ID: 15
Processing dataset: breast-w


TypeError("OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'").


Writing. Do not interrupt...
Featurizing task ID: 16
Processing dataset: mfeat-karhunen


TypeError("OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'").


Writing. Do not interrupt...
Featurizing task ID: 14
Processing dataset: mfeat-fourier


TypeError("OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'").


Writing. Do not interrupt...
Featurizing task ID: 32
Processing dataset: pendigits


TypeError("OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'").


Writing. Do not interrupt...
Featurizing task ID: 3549
Processing dataset: analcatdata_authorship


TypeError("OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'").


Writing. Do not interrupt...
Featurizing task ID: 12
Processing dataset: mfeat-factors


TypeError("OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'").


Writing. Do not interrupt...
Featurizing task ID: 9981
Processing dataset: cnae-9


TypeError("OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'").


Writing. Do not interrupt...
Featurizing task ID: 18
Processing dataset: mfeat-morphological


TypeError("OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'").


Writing. Do not interrupt...
Featurizing task ID: 28
Processing dataset: optdigits


TypeError("OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'").
  res = hypotest_fun_out(*samples, **kwds)


Writing. Do not interrupt...
Featurizing task ID: 2074
Processing dataset: satimage


TypeError("OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'").


Writing. Do not interrupt...
Featurizing task ID: 29
Processing dataset: credit-approval


TypeError("OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'").
TypeError("'<' not supported between instances of 'str' and 'float'").


Writing. Do not interrupt...
Featurizing task ID: 45
Processing dataset: splice
Writing. Do not interrupt...
Featurizing task ID: 125922
Processing dataset: texture


TypeError("OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'").


Writing. Do not interrupt...
Featurizing task ID: 9960
Processing dataset: wall-robot-navigation


TypeError("OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'").


Writing. Do not interrupt...
Featurizing task ID: 9964
Processing dataset: semeion


TypeError("OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'").


Writing. Do not interrupt...
Featurizing task ID: 22
Processing dataset: mfeat-zernike


TypeError("OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'").


Writing. Do not interrupt...
Featurizing task ID: 2079
Processing dataset: eucalyptus


TypeError("OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'").


Writing. Do not interrupt...
Featurizing task ID: 14969
Processing dataset: GesturePhaseSegmentationProcessed


TypeError("OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'").


Writing. Do not interrupt...
Featurizing task ID: 3560
Processing dataset: analcatdata_dmft
Writing. Do not interrupt...
Featurizing task ID: 14952
Processing dataset: PhishingWebsites
Writing. Do not interrupt...
Featurizing task ID: 125920
Processing dataset: dresses-sales


TypeError("OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'").
TypeError("'<' not supported between instances of 'float' and 'str'").


Writing. Do not interrupt...
Featurizing task ID: 23
Processing dataset: cmc


TypeError("OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'").


Writing. Do not interrupt...
Featurizing task ID: 3904
Processing dataset: jm1


TypeError("OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'").


Writing. Do not interrupt...
Featurizing task ID: 3022
Processing dataset: vowel


TypeError("OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'").


Writing. Do not interrupt...
Featurizing task ID: 9985
Processing dataset: first-order-theorem-proving


TypeError("OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'").


Writing. Do not interrupt...
Featurizing task ID: 9910
Processing dataset: Bioresponse


TypeError("OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'").


Writing. Do not interrupt...
Featurizing task ID: 14970
Processing dataset: har


TypeError("OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'").


Writing. Do not interrupt...
Featurizing task ID: 3021
Processing dataset: sick


TypeError("OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'").
TypeError("'<' not supported between instances of 'float' and 'str'").


Writing. Do not interrupt...
Featurizing task ID: 3481
Processing dataset: isolet


TypeError("OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'").


Writing. Do not interrupt...
Featurizing task ID: 146824
Processing dataset: mfeat-pixel


TypeError("OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'").


Writing. Do not interrupt...
Featurizing task ID: 146820
Processing dataset: wilt


TypeError("OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'").


Writing. Do not interrupt...
Featurizing task ID: 146822
Processing dataset: segment


TypeError("OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'").


Writing. Do not interrupt...
Featurizing task ID: 146800
Processing dataset: MiceProtein


TypeError("OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'").


Writing. Do not interrupt...
Featurizing task ID: 146817
Processing dataset: steel-plates-fault


TypeError("OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'").


Writing. Do not interrupt...
Featurizing task ID: 146819
Processing dataset: climate-model-simulation-crashes


TypeError("OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'").


Writing. Do not interrupt...
Featurizing task ID: 146821
Processing dataset: car
Writing. Do not interrupt...
Featurizing task ID: 14954
Processing dataset: cylinder-bands


TypeError("OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'").
TypeError("'<' not supported between instances of 'float' and 'str'").


Writing. Do not interrupt...
Featurizing task ID: 167141
Processing dataset: churn


TypeError("OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'").


Writing. Do not interrupt...
Featurizing task ID: 167140
Processing dataset: dna
Writing. Do not interrupt...
Featurizing task ID: 167125
Processing dataset: Internet-Advertisements


TypeError("OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'").


Writing. Do not interrupt...


In [None]:
# Snapshot the kernel environment's installed package versions.
%pip freeze > requirements.txt