
Commit

clean up configs; update notebooks
beckernick committed Aug 26, 2020
1 parent eb3909b commit 29ee28f
Showing 5 changed files with 166 additions and 144 deletions.
3 changes: 2 additions & 1 deletion tpot/config/classifier_cuml.py
@@ -30,7 +30,7 @@
# the scikit-learn preprocessors in the TPOT default configuration.

classifier_config_cuml = {
# cuML + XGboost Classifiers
# cuML + DMLC/XGBoost Classifiers

"cuml.neighbors.KNeighborsClassifier": {
"n_neighbors": range(1, 101),
@@ -118,6 +118,7 @@
},

# Selectors

"sklearn.feature_selection.SelectFwe": {
"alpha": np.arange(0, 0.05, 0.001),
"score_func": {
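For readers less familiar with TPOT configuration dictionaries: each top-level key is an estimator's import path, and the nested dictionary lists the hyperparameter values the genetic search is allowed to sample for that estimator. Below is a minimal sketch of what one sampled value for the KNeighborsClassifier entry above amounts to; the concrete value and the toy data are illustrative assumptions, and running it requires an NVIDIA GPU with RAPIDS cuML installed.

```python
# Minimal sketch of how one entry in classifier_config_cuml is interpreted.
# TPOT picks one value per listed hyperparameter when it builds a pipeline step.
import numpy as np
from cuml.neighbors import KNeighborsClassifier  # requires RAPIDS cuML + NVIDIA GPU

# From the config above: "n_neighbors": range(1, 101) -> TPOT samples a single value.
clf = KNeighborsClassifier(n_neighbors=25)  # 25 is an illustrative sample

# Toy float32 data; cuML estimators generally prefer float32 inputs.
X = np.random.rand(1_000, 10).astype(np.float32)
y = (np.random.rand(1_000) > 0.5).astype(np.int32)

clf.fit(X, y)
preds = clf.predict(X)
```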
130 changes: 76 additions & 54 deletions tpot/config/regressor_cuml.py
@@ -26,100 +26,122 @@
import numpy as np

# This configuration provides users with access to a GPU the ability to
# use cuML regressors as estimators alongside the scikit-learn
# preprocessors in the TPOT default configuration.
# use RAPIDS cuML and DMLC/XGBoost regressors as estimators alongside
# the scikit-learn preprocessors in the TPOT default configuration.

regressor_config_cuml = {
# cuML Regressors
# cuML + DMLC/XGBoost Regressors

'cuml.linear_model.ElasticNet': {
'l1_ratio': np.arange(0.0, 1.01, 0.05),
'tol': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1]
"cuml.linear_model.ElasticNet": {
"l1_ratio": np.arange(0.0, 1.01, 0.05),
"tol": [1e-5, 1e-4, 1e-3, 1e-2, 1e-1]
},

'cuml.neighbors.KNeighborsRegressor': {
'n_neighbors': range(1, 101),
'weights': ["uniform"],
"cuml.neighbors.KNeighborsRegressor": {
"n_neighbors": range(1, 101),
"weights": ["uniform"],
},

'cuml.linear_model.Lasso': {
'normalize': [True, False]
"cuml.linear_model.Lasso": {
"normalize": [True, False]
},

'cuml.svm.SVR': {
'tol': [1e-4, 1e-3, 1e-2, 1e-1,],
'C': [1e-4, 1e-3, 1e-2, 1e-1, 0.5, 1., 5., 10., 15., 20., 25.,]
"cuml.ensemble.RandomForestRegressor": {
"n_estimators": [100, 300, 500,],
"split_algo": [0, 1,],
"max_depth": range(8, 20),
"max_features": np.arange(0.05, 1.01, 0.05),
"min_rows_per_node": range(2, 21),
"n_bins": [64,]
},

'cuml.ensemble.RandomForestRegressor': {
'n_estimators': [100, 300, 500,],
'split_algo': [0, 1,],
'max_depth': range(8, 20),
'max_features': np.arange(0.05, 1.01, 0.05),
'min_rows_per_node': range(2, 21),
'n_bins': [8, 64,]
"cuml.linear_model.Ridge": {
},

'cuml.linear_model.Ridge': {
"xgboost.XGBRegressor": {
"n_estimators": [100],
"max_depth": range(3, 10),
"learning_rate": [1e-2, 1e-1, 0.5, 1.],
"subsample": np.arange(0.05, 1.01, 0.05),
"min_child_weight": range(1, 21),
"alpha": [1, 10],
"tree_method": ["gpu_hist"],
"nthread": [1],
"objective": ["reg:squarederror"]
},

# Sklearn + cuML Preprocesssors
'sklearn.preprocessing.Binarizer': {
'threshold': np.arange(0.0, 1.01, 0.05)
# Sklearn Preprocessors

"sklearn.preprocessing.Binarizer": {
"threshold": np.arange(0.0, 1.01, 0.05)
},

'sklearn.decomposition.FastICA': {
'tol': np.arange(0.0, 1.01, 0.05)
"sklearn.decomposition.FastICA": {
"tol": np.arange(0.0, 1.01, 0.05)
},

'sklearn.cluster.FeatureAgglomeration': {
'linkage': ['ward', 'complete', 'average'],
'affinity': ['euclidean', 'l1', 'l2', 'manhattan', 'cosine']
"sklearn.cluster.FeatureAgglomeration": {
"linkage": ["ward", "complete", "average"],
"affinity": ["euclidean", "l1", "l2", "manhattan", "cosine"]
},

'sklearn.preprocessing.MaxAbsScaler': {
"sklearn.preprocessing.MaxAbsScaler": {
},

'sklearn.preprocessing.MinMaxScaler': {
"sklearn.preprocessing.MinMaxScaler": {
},

'sklearn.preprocessing.Normalizer': {
'norm': ['l1', 'l2', 'max']
"sklearn.preprocessing.Normalizer": {
"norm": ["l1", "l2", "max"]
},

'sklearn.kernel_approximation.Nystroem': {
'kernel': ['rbf', 'cosine', 'chi2', 'laplacian', 'polynomial', 'poly', 'linear', 'additive_chi2', 'sigmoid'],
'gamma': np.arange(0.0, 1.01, 0.05),
'n_components': range(1, 11)
"sklearn.kernel_approximation.Nystroem": {
"kernel": ["rbf", "cosine", "chi2", "laplacian", "polynomial", "poly", "linear", "additive_chi2", "sigmoid"],
"gamma": np.arange(0.0, 1.01, 0.05),
"n_components": range(1, 11)
},

'cuml.decomposition.PCA': {
'svd_solver': ['jacobi'],
'iterated_power': range(1, 11),
"sklearn.decomposition.PCA": {
"svd_solver": ["randomized"],
"iterated_power": range(1, 11)
},

'sklearn.preprocessing.PolynomialFeatures': {
'degree': [2],
'include_bias': [False],
'interaction_only': [False]
"sklearn.kernel_approximation.RBFSampler": {
"gamma": np.arange(0.0, 1.01, 0.05)
},

'sklearn.kernel_approximation.RBFSampler': {
'gamma': np.arange(0.0, 1.01, 0.05)
"sklearn.preprocessing.RobustScaler": {
},

'sklearn.preprocessing.RobustScaler': {
"sklearn.preprocessing.StandardScaler": {
},

'sklearn.preprocessing.StandardScaler': {
"tpot.builtins.ZeroCount": {
},

'tpot.builtins.ZeroCount': {
"tpot.builtins.OneHotEncoder": {
"minimum_fraction": [0.05, 0.1, 0.15, 0.2, 0.25],
"sparse": [False],
"threshold": [10]
},

'tpot.builtins.OneHotEncoder': {
'minimum_fraction': [0.05, 0.1, 0.15, 0.2, 0.25],
'sparse': [False],
'threshold': [10]
# Selectors

"sklearn.feature_selection.SelectFwe": {
"alpha": np.arange(0, 0.05, 0.001),
"score_func": {
"sklearn.feature_selection.f_classif": None
}
},

"sklearn.feature_selection.SelectPercentile": {
"percentile": range(1, 100),
"score_func": {
"sklearn.feature_selection.f_classif": None
}
},

"sklearn.feature_selection.VarianceThreshold": {
"threshold": [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.2]
}
}
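The regressor search space above mirrors the classifier one. Here is a minimal sketch of selecting it through TPOTRegressor, assuming the same `"TPOT cuML"` shortcut string shown for the classifier in the notebook below also resolves to this dictionary; the synthetic data and the small search budget are illustrative assumptions, and an NVIDIA GPU with cuML installed is required.

```python
# Minimal sketch (assumption: the "TPOT cuML" shortcut also selects the
# regressor_config_cuml dictionary above when used with TPOTRegressor, mirroring
# the classifier usage in the notebook later in this commit).
# Requires an NVIDIA GPU with RAPIDS cuML installed.
import numpy as np
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from tpot import TPOTRegressor

X, y = make_regression(n_samples=10_000, n_features=20, noise=0.1, random_state=0)
X, y = X.astype(np.float32), y.astype(np.float32)  # cuML generally prefers float32
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

tpot = TPOTRegressor(
    config_dict="TPOT cuML",  # the GPU-accelerated search space defined above
    generations=5,            # small illustrative budget
    population_size=10,
    cv=2,
    random_state=0,
    verbosity=2,
)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))  # TPOT's default regression scorer (neg. MSE)
```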
73 changes: 40 additions & 33 deletions tutorials/Higgs_Boson.ipynb
@@ -4,7 +4,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"This notebook walks through a basic example of using the GPU-accelerated estimators from [RAPIDS](https://rapids.ai/) cuML and [DMLC/XGBoost](https://github.com/dmlc/xgboost) with TPOT for classification tasks. You must have access to an NVIDIA GPU and have cuML installed in your environment. Running this notebook without cuML, will cause TPOT to raise a `ValueError` indicating you should install cuML.\n",
"This notebook walks through a basic example of using the GPU-accelerated estimators from [RAPIDS](https://rapids.ai/) cuML and [DMLC/XGBoost](https://github.com/dmlc/xgboost) with TPOT for classification tasks. You must have access to an NVIDIA GPU and have cuML installed in your environment. Running this notebook without cuML will cause TPOT to raise a `ValueError`, indicating you should install cuML.\n",
"\n",
"It is intended to show how the `TPOT cuML` configuration can provide significant benefits on medium-sized and larger datasets. "
]
@@ -15,15 +15,17 @@
"source": [
"## Downloading Data\n",
"\n",
"This example uses the Higgs Boson [dataset](https://archive.ics.uci.edu/ml/datasets/HIGGS) from the UCI Machine Learning Repositoru."
"This example uses the Higgs Boson [dataset](https://archive.ics.uci.edu/ml/datasets/HIGGS) from the UC Irvine Machine Learning Repository."
]
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"import pandas as pd\n",
"import numpy as np\n",
"from sklearn.model_selection import train_test_split\n",
@@ -34,28 +36,33 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"# This is a 2.7 GB file.\n",
"# Please make sure you have space before uncommenting the code below and downloading this file.\n",
"# Please make sure you have enough space available before\n",
"# uncommenting the code below and downloading this file.\n",
"\n",
"if not os.path.isfile(\"HIGGS.csv.gz\"):\n",
" !wget https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz"
"DATA_DIRECTORY = \"./\"\n",
"DATASET_PATH = os.path.join(DATA_DIRECTORY, \"HIGGS.csv.gz\")\n",
"\n",
"# if not os.path.isfile(DATASET_PATH):\n",
"# !wget https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz"
]
},
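For environments where `wget` is unavailable, the same download can be done with the Python standard library. This is a sketch, not part of the notebook, using the same UCI URL and the `DATASET_PATH` defined in the cell above:

```python
# Alternative download sketch using only the Python standard library
# (not part of the notebook; same UCI URL as the wget command above).
import os
import urllib.request

DATA_DIRECTORY = "./"
DATASET_PATH = os.path.join(DATA_DIRECTORY, "HIGGS.csv.gz")
HIGGS_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz"

if not os.path.isfile(DATASET_PATH):
    # ~2.7 GB download; make sure enough disk space is available first.
    urllib.request.urlretrieve(HIGGS_URL, DATASET_PATH)
```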
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"# This fuction is borrowed from https://github.com/NVIDIA/gbm-bench/blob/master/datasets.py\n",
"# This fuction is borrowed and adapted from\n",
"# https://github.com/NVIDIA/gbm-bench/blob/master/datasets.py\n",
"# Thanks!\n",
"\n",
"def prepare_higgs(dataset_folder, nrows=None):\n",
" higgs = pd.read_csv(\"HIGGS.csv.gz\", nrows=nrows)\n",
"def prepare_higgs(nrows=None):\n",
" higgs = pd.read_csv(DATASET_PATH, nrows=nrows)\n",
" X = higgs.iloc[:, 1:].to_numpy(dtype=np.float32)\n",
" y = higgs.iloc[:, 0].to_numpy(dtype=np.int64)\n",
" return train_test_split(X, y, stratify=y, random_state=77, test_size=0.2)"
@@ -74,19 +81,19 @@
"source": [
"In the interest of time, we'll only use a 500,000 row sample of this file. 500,000 rows is more than enough for this example.\n",
"\n",
"With the example configuration below (10 generations, population size of 10, two-fold cross validation), the `TPOT cuML` configuration provides a significant speedup.\n",
"With the example configuration below (10 generations, population size of 10, two-fold cross validation), the `TPOT cuML` configuration provides a significant speedup while achieving essentially equivalent accuracy.\n",
"\n",
"Such speedups also mean you can create larger evolutionary search strategies while **still** returning faster results."
"Such speedups also mean you can create larger evolutionary search strategies while **still** obtaining faster results."
]
},
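The cell that constructs and fits the classifier is collapsed in this diff view. Reconstructed from the `TPOTClassifier` repr printed in the training output further down, it is presumably close to the following sketch; treat it as an approximation, not the literal notebook cell.

```python
# Sketch of the (collapsed) training cell, reconstructed from the TPOTClassifier
# repr printed in the output below; an approximation, not the literal notebook cell.
from tpot import TPOTClassifier

# X_train / y_train come from the prepare_higgs() call in the next cell.
tpot = TPOTClassifier(
    config_dict="TPOT cuML",   # selects the GPU-accelerated cuML/XGBoost search space
    generations=10,
    population_size=10,
    cv=2,
    random_state=12,
    verbosity=2,
)
tpot.fit(X_train, y_train)     # the notebook times this cell with the %%time magic
```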
{
"cell_type": "code",
"execution_count": 17,
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"NROWS = 500_000\n",
"X_train, X_test, y_train, y_test = prepare_higgs(\"./\", nrows=NROWS)"
"X_train, X_test, y_train, y_test = prepare_higgs(nrows=NROWS)"
]
},
{
@@ -98,7 +105,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 14,
"metadata": {},
"outputs": [
{
@@ -121,29 +128,29 @@
"text": [
"\n",
"Generation 1 - Current best internal CV score: 0.7103025000000001\n",
"Generation 2 - Current best internal CV score: 0.71385\n",
"Generation 2 - Current best internal CV score: 0.7103025000000001\n",
"Generation 3 - Current best internal CV score: 0.725755\n",
"Generation 4 - Current best internal CV score: 0.7299725\n",
"Generation 5 - Current best internal CV score: 0.7299725\n",
"Generation 6 - Current best internal CV score: 0.7299725\n",
"Generation 7 - Current best internal CV score: 0.7309975\n",
"Generation 8 - Current best internal CV score: 0.7309975\n",
"Generation 9 - Current best internal CV score: 0.7309975\n",
"Generation 10 - Current best internal CV score: 0.7309975\n",
"Best pipeline: XGBClassifier(ZeroCount(input_matrix), alpha=1, learning_rate=0.1, max_depth=6, min_child_weight=13, n_estimators=100, nthread=1, subsample=0.8500000000000001, tree_method=gpu_hist)\n",
"CPU times: user 4min 59s, sys: 13min 27s, total: 18min 27s\n",
"Wall time: 18min 29s\n"
"Generation 4 - Current best internal CV score: 0.727995\n",
"Generation 5 - Current best internal CV score: 0.727995\n",
"Generation 6 - Current best internal CV score: 0.730315\n",
"Generation 7 - Current best internal CV score: 0.730315\n",
"Generation 8 - Current best internal CV score: 0.730315\n",
"Generation 9 - Current best internal CV score: 0.7308699999999999\n",
"Generation 10 - Current best internal CV score: 0.7347775\n",
"Best pipeline: XGBClassifier(input_matrix, alpha=1, learning_rate=0.1, max_depth=8, min_child_weight=19, n_estimators=100, nthread=1, subsample=0.8, tree_method=gpu_hist)\n",
"CPU times: user 5min 34s, sys: 1min 16s, total: 6min 50s\n",
"Wall time: 6min 52s\n"
]
},
{
"data": {
"text/plain": [
"TPOTClassifier(config_dict='TPOT cuML', cv=2, generations=10,\n",
" log_file=<ipykernel.iostream.OutStream object at 0x7f282044a7d0>,\n",
" log_file=<ipykernel.iostream.OutStream object at 0x7f87fe597d50>,\n",
" population_size=10, random_state=12, verbosity=2)"
]
},
"execution_count": 9,
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
@@ -172,16 +179,16 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.7308499813079834\n",
"CPU times: user 565 ms, sys: 5.52 ms, total: 570 ms\n",
"Wall time: 569 ms\n"
"0.73669\n",
"CPU times: user 816 ms, sys: 39.9 ms, total: 856 ms\n",
"Wall time: 855 ms\n"
]
}
],
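The source of the cell that produced this held-out score is collapsed in the diff. With the standard TPOT API, scoring the held-out split and exporting the winning pipeline look roughly like the sketch below; it reuses the fitted `tpot` object from the training cell, and the output filename is a hypothetical example.

```python
# Sketch of the evaluation/export step (assumed from the standard TPOT API;
# the literal notebook cell is collapsed in this diff view).
print(tpot.score(X_test, y_test))  # held-out accuracy, ~0.737 in the run shown above

# Export the best pipeline found during the search as a standalone Python script.
tpot.export("tpot_higgs_best_pipeline.py")  # hypothetical output filename
```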
(The diffs for the remaining two changed files are not shown here.)
