AMLTK (https://github.com/automl/amltk) is a framework for developing AutoML systems. One component of this framework is its API for defining search spaces.

TPOT2 provides a function, tpot2.utils.tpot2_parser, which converts a search space defined with the AMLTK API into the search space class used by TPOT2. This allows users to define a single search space that can be used by both tools, facilitating fairer comparisons. Below are a few example search spaces defined in AMLTK, followed by how to use them in TPOT2.

Note: this feature is still experimental and not all features present in the AMLTK API are fully supported in TPOT2 yet. (For example, automated splitting based on categorical vs numeric with amltk.pipeline.Split is not currently implemented in the parser.)

In [1]:
from sklearn.compose import make_column_selector
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import SVC
from amltk.pipeline import Choice, Component, Sequential, Split
import tpot2
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import make_column_transformer
import tpot2
import numpy as np
import sklearn
import sklearn.datasets
import pandas as pd
# Import the submodule explicitly: `import sklearn` / `import sklearn.datasets`
# do not by themselves guarantee that `sklearn.model_selection` is loaded.
import sklearn.model_selection

# One seed shared by every random step below so the notebook is reproducible.
RANDOM_STATE = 42
rng = np.random.default_rng(RANDOM_STATE)

# Create a dummy pandas dataset with both numerical and categorical columns.
X, y = sklearn.datasets.make_classification(
    n_samples=100,
    n_features=5,
    n_informative=3,
    n_classes=2,
    random_state=RANDOM_STATE,
)
X = pd.DataFrame(X, columns=[f"num_{i}" for i in range(5)])

# Add 5 categorical columns; size follows the frame so it cannot drift from
# the number of samples generated above.
for i in range(5):
    X[f"cat_{i}"] = rng.choice(["A", "B", "C"], size=len(X))

# `make_classification` already returns a 1-D label array, so `y` is used as is.

# Train/test split (half of the rows in each part).
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    X, y, test_size=0.5, random_state=RANDOM_STATE
)

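# TODO: add parser support for column selection through Split's `config` argument.
# The intended AMLTK definition is kept below for reference until then.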
# select_categories = make_column_selector(dtype_include=object)
# select_numerical = make_column_selector(dtype_include=np.number)

# split_imputation = Split(
#     {
#         "categories": [SimpleImputer(strategy="constant", fill_value="missing"), OneHotEncoder(drop="first")],
#         "numerics": Component(SimpleImputer, space={"strategy": ["mean", "median"]}),
#     },
#     config={"categories": select_categories, "numerics": select_numerical}, #not yet supported
#     name="feature_preprocessing",
# )
# split_imputation

# Column selectors: object-dtype columns are treated as categorical,
# numeric-dtype columns as numerical.
select_categories = make_column_selector(dtype_include=object)
select_numerical = make_column_selector(dtype_include=np.number)

# Each selector is wrapped in a "passthrough" column transformer so it can be
# placed as the first step of a branch.
cat_selector = make_column_transformer(("passthrough", select_categories))
num_selector = make_column_transformer(("passthrough", select_numerical))

# Categorical branch: select columns, fill missing values with a constant,
# then one-hot encode into a dense array.
categorical_steps = [
    cat_selector,
    SimpleImputer(strategy="constant", fill_value="missing"),
    OneHotEncoder(drop="first", sparse_output=False),
]

# Numerical branch: select columns, then impute with a searchable strategy.
numeric_steps = [
    num_selector,
    Component(SimpleImputer, space={"strategy": ["mean", "median"]}),
]

split_imputation = Split(
    {"categories": categorical_steps, "numerics": numeric_steps},
    name="split_imputation",
)
# Bare expression so the notebook renders the node.
split_imputation

In [2]:
from tpot2.builtin_modules import Passthrough, ZeroCount
from sklearn.preprocessing import PolynomialFeatures
from sklearn.decomposition import PCA

from sklearn.feature_selection import VarianceThreshold, SelectKBest

# Candidate feature selectors, each with its own hyperparameter range.
variance_filter = Component(VarianceThreshold, space={"threshold": (0.1, 1)})
k_best_filter = Component(SelectKBest, space={"k": (1, 10)})

# The search space chooses between the two selectors.
selectors = Choice(variance_filter, k_best_filter, name="selectors")


# Branches of the transformer split, keyed by branch name.
transformer_branches = {
    "passthrough": Passthrough(),
    "polynomial": Component(PolynomialFeatures, space={"degree": [2, 3]}),
    "zerocount": ZeroCount(),
}

# No `config` is passed here (a column-selector config such as
# {"categories": select_categories, "numerics": select_numerical} is left out).
transformers = Split(transformer_branches, name="transformers")

# Final step: the pipeline can choose between two different estimators.
random_forest = Component(
    RandomForestClassifier,
    space={"n_estimators": (10, 100), "criterion": ["gini", "log_loss"]},
    config={"max_depth": 3},
)
support_vector = Component(SVC, space={"kernel": ["linear", "rbf", "poly"]})
estimator_choice = Choice(random_forest, support_vector, name="estimator")

# Chain the stages in order: imputation -> feature selection ->
# feature transformation -> estimator.
# (A single `Component(SimpleImputer, space={"strategy": ["mean", "median"]})`
# could stand in for `split_imputation` to choose either mean or median.)
pipeline = (
    Sequential(name="my_pipeline")
    >> split_imputation
    >> selectors
    >> transformers
    >> estimator_choice
)

# Display the amltk Pipeline
pipeline

In [3]:
# Convert the AMLTK pipeline definition into a TPOT2 search space.
tpot_search_space = tpot2.utils.tpot2_parser(pipeline)

# Sample one pipeline from the TPOT2 search space and export it for display.
sampled_pipeline = tpot_search_space.generate()
sampled_pipeline.export_pipeline()

In [4]:



est = tpot2.TPOTEstimator(
    # Scoring: a single scorer with weight 1, on a classification task.
    scorers=["roc_auc"],
    scorers_weights=[1],
    classification=True,
    cv=5,
    # The converted search space goes here.
    search_space=tpot_search_space,
    # Small run settings for this example.
    population_size=10,
    generations=2,
    max_eval_time_seconds=60 * 5,  # five minutes, expressed in seconds
    verbose=5,
    n_jobs=10,
)

# Run the search on the training split.
est.fit(X_train, y_train)

Generation:  50%|█████     | 1/2 [00:02<00:02,  2.60s/it]

Generation:  1
Best roc_auc_score score: 0.976


Generation: 100%|██████████| 2/2 [00:03<00:00,  1.57s/it]
2024-09-09 17:25:40,301 - distributed.scheduler - ERROR - Removing worker 'tcp://127.0.0.1:39897' caused the cluster to lose scattered data, which can't be recovered: {'ndarray-3f2f44921e6e9cc40ef07cfcd8ae90fb', 'DataFrame-5551f84174fd651642ff10eb71e30b22'} (stimulus_id='handle-worker-cleanup-1725927940.3010821')


Generation:  2
Best roc_auc_score score: 0.984


In [5]:
# Display the pipeline stored on the fitted estimator.
est.fitted_pipeline_

In [6]:
# Predict labels for the held-out test split.
est.predict(X_test)

array([1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 0])