# Tuning TPOT

In [1]:
import os

import dask.dataframe as dd
import numpy as np
import pandas as pd
from dask.distributed import Client
from dask_cuda import LocalCUDACluster
from sklearn.model_selection import train_test_split
from tpot import TPOTClassifier



In [2]:
# CUDA Cluster Setup
# Start a local Dask-CUDA cluster (NVLink enabled, 4GB RMM pool) and
# attach a distributed client to it.
cluster = LocalCUDACluster(
    enable_nvlink=True,
    rmm_pool_size="4GB",
)
client = Client(cluster)

In [3]:
# Display the client summary (scheduler address, dashboard link, worker resources).
client

0,1
Client  Scheduler: ucx://127.0.0.1:49887  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 2  Cores: 2  Memory: 67.45 GB


In [4]:
# Load Data
# The feather file location can be overridden with the TRAIN_FEATHER_PATH
# environment variable; it defaults to the path this notebook was written with.
TRAIN_FEATHER_PATH = os.environ.get(
    "TRAIN_FEATHER_PATH",
    "/home/jovyan/work/ray_experiments/data/train_feat_1_baseline.feather",
)
train = pd.read_feather(TRAIN_FEATHER_PATH)

# Fail early, with a readable message, if the expected columns are absent.
missing_columns = {"ID_code", "target"} - set(train.columns)
assert not missing_columns, f"missing expected columns: {sorted(missing_columns)}"

# Label column, and the feature frame with the identifier and label removed.
target = train["target"]
subtrain = train.drop(columns=["ID_code", "target"])

In [5]:
# Convert to a Dask DataFrame (currently disabled: the pandas DataFrame is used as-is)
#subtrain = dd.from_pandas(subtrain, npartitions=2)

In [6]:
# Hold out 30% of the rows for testing, stratified on the target,
# with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    subtrain,
    target,
    test_size=0.3,
    random_state=42,
    stratify=target,
)

## Initializing TPOT

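We will use the cuML config: RAPIDS cuML and XGBoost classifiers running on the GPU, alongside the scikit-learn preprocessors from TPOT's default configuration.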

In [7]:
# TPOT search space for GPU runs: RAPIDS cuML and DMLC/XGBoost classifiers
# as estimators, alongside the scikit-learn preprocessors and selectors from
# the TPOT default configuration.
#
# Each key is the import path of an operator; each value maps a
# hyperparameter name to the candidate values TPOT may pick from. An empty
# dict means no hyperparameters are searched for that operator.

classifier_config_cuml = {
    # ---- Classifiers: cuML + DMLC/XGBoost ----

    "cuml.neighbors.KNeighborsClassifier": {
        "n_neighbors": range(1, 101),
        "weights": ["uniform"],
    },

    "cuml.linear_model.LogisticRegression": {
        "penalty": ["l1", "l2", "elasticnet"],
        "C": [1e-4, 1e-3, 1e-2, 1e-1, 0.5, 1., 5., 10., 15., 20., 25.],
    },

    # Disabled: cuML random forest.
    # "cuml.ensemble.RandomForestClassifier": {
    #     "n_estimators": [100],
    #     "max_depth": np.arange(4, 25, 1),
    #     "max_features": range(3, 10),
    #     "n_bins": [8],
    # },

    "xgboost.XGBClassifier": {
        "n_estimators": [300],
        "max_depth": range(3, 10),
        "learning_rate": [1e-2, 1e-1, 0.5, 1.],
        "subsample": np.arange(0.05, 1.01, 0.05),
        "min_child_weight": range(1, 21),
        "alpha": [1, 10],
        "tree_method": ["gpu_hist"],  # GPU histogram tree builder
        "n_jobs": [1],
        "verbosity": [0],
    },

    # Disabled: LightGBM trial. Every value must be a collection of
    # candidates, so the objective is wrapped in a list.
    # "lightgbm.LGBMClassifier": {
    #     "n_estimators": [300],
    #     "learning_rate": [1e-2, 1e-1, 0.5, 1.],
    #     "objective": ["binary"],
    #     "min_child_weight": np.arange(1e-3, 1e-1, 1e-3),
    #     "n_jobs": [1],
    # },

    # ---- Preprocessors ----

    "sklearn.preprocessing.Binarizer": {
        "threshold": np.arange(0.0, 1.01, 0.05),
    },

    # Disabled: FastICA is really slow.
    # "sklearn.decomposition.FastICA": {
    #     "tol": np.arange(0.0, 1.01, 0.05),
    # },

    "sklearn.cluster.FeatureAgglomeration": {
        "linkage": ["ward", "complete", "average"],
        "affinity": ["euclidean", "l1", "l2", "manhattan", "cosine"],
    },

    "sklearn.preprocessing.MaxAbsScaler": {},

    "sklearn.preprocessing.MinMaxScaler": {},

    "sklearn.preprocessing.Normalizer": {
        "norm": ["l1", "l2", "max"],
    },

    "sklearn.kernel_approximation.Nystroem": {
        "kernel": ["rbf", "cosine", "chi2", "laplacian", "polynomial", "poly", "linear", "additive_chi2", "sigmoid"],
        "gamma": np.arange(0.0, 1.01, 0.05),
        "n_components": range(1, 11),
    },

    # Disabled: scikit-learn PCA, replaced by the cuML PCA entry below.
    # "sklearn.decomposition.PCA": {
    #     "svd_solver": ["randomized"],
    #     "iterated_power": range(1, 11),
    # },

    "cuml.PCA": {
        "svd_solver": ["jacobi"],
        "iterated_power": range(1, 11),
    },

    "sklearn.kernel_approximation.RBFSampler": {
        "gamma": np.arange(0.0, 1.01, 0.05),
    },

    "sklearn.preprocessing.RobustScaler": {},

    "sklearn.preprocessing.StandardScaler": {},

    "tpot.builtins.ZeroCount": {},

    "tpot.builtins.OneHotEncoder": {
        "minimum_fraction": [0.05, 0.1, 0.15, 0.2, 0.25],
        "sparse": [False],
        "threshold": [10],
    },

    # ---- Feature selectors ----

    "sklearn.feature_selection.SelectFwe": {
        "alpha": np.arange(0, 0.05, 0.001),
        "score_func": {
            "sklearn.feature_selection.f_classif": None,
        },
    },

    "sklearn.feature_selection.SelectPercentile": {
        "percentile": range(1, 100),
        "score_func": {
            "sklearn.feature_selection.f_classif": None,
        },
    },

    "sklearn.feature_selection.VarianceThreshold": {
        "threshold": [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.2],
    },
}

In [8]:
# n_jobs must stay at 1 when the cuML estimators are in the search space.
pipeline_optimizer = TPOTClassifier(
    config_dict=classifier_config_cuml,
    scoring="roc_auc",
    generations=5,
    population_size=10,
    cv=5,
    n_jobs=1,
    use_dask=True,
    random_state=42,
    verbosity=2,
)

### Fitting

We need to pass `use_dask=True` to `TPOTClassifier` in order to leverage multiple GPUs.

In [9]:
# Run the pipeline search on the training split. The trailing semicolon keeps
# the notebook from echoing the fitted estimator's repr, which would otherwise
# dump the entire config dict into the cell output.
pipeline_optimizer.fit(X_train, y_train);



Optimization Progress:   0%|          | 0/60 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.8851678309945477

Generation 2 - Current best internal CV score: 0.8851678309945477

Generation 3 - Current best internal CV score: 0.8851678309945477

Generation 4 - Current best internal CV score: 0.8851678309945477

Generation 5 - Current best internal CV score: 0.8852792511659686

Best pipeline: XGBClassifier(RobustScaler(input_matrix), alpha=10, learning_rate=0.1, max_depth=8, min_child_weight=11, n_estimators=300, n_jobs=1, subsample=0.55, tree_method=gpu_hist, verbosity=0)


TPOTClassifier(config_dict={'cuml.PCA': {'iterated_power': range(1, 11),
                                         'svd_solver': ['jacobi']},
                            'cuml.linear_model.LogisticRegression': {'C': [0.0001,
                                                                           0.001,
                                                                           0.01,
                                                                           0.1,
                                                                           0.5,
                                                                           1.0,
                                                                           5.0,
                                                                           10.0,
                                                                           15.0,
                                                                           20.0,
                                                  