
Commit

clean up configs; update notebooks
beckernick committed Aug 26, 2020
1 parent eb3909b commit 29ee28f
Showing 5 changed files with 166 additions and 144 deletions.
3 changes: 2 additions & 1 deletion tpot/config/classifier_cuml.py
@@ -30,7 +30,7 @@
# the scikit-learn preprocessors in the TPOT default configuration.

classifier_config_cuml = {
# cuML + XGboost Classifiers
# cuML + DMLC/XGBoost Classifiers

"cuml.neighbors.KNeighborsClassifier": {
"n_neighbors": range(1, 101),
@@ -118,6 +118,7 @@
},

# Selectors

"sklearn.feature_selection.SelectFwe": {
"alpha": np.arange(0, 0.05, 0.001),
"score_func": {
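For readers less familiar with TPOT configuration dictionaries: each top-level key is an estimator's import path, and the nested dictionary lists the hyperparameter values the genetic search is allowed to sample for that estimator. Below is a minimal sketch of what one sampled value for the KNeighborsClassifier entry above amounts to; the concrete value and the toy data are illustrative assumptions, and running it requires an NVIDIA GPU with RAPIDS cuML installed.

```python
# Minimal sketch of how one entry in classifier_config_cuml is interpreted.
# TPOT picks one value per listed hyperparameter when it builds a pipeline step.
import numpy as np
from cuml.neighbors import KNeighborsClassifier  # requires RAPIDS cuML + NVIDIA GPU

# From the config above: "n_neighbors": range(1, 101) -> TPOT samples a single value.
clf = KNeighborsClassifier(n_neighbors=25)  # 25 is an illustrative sample

# Toy float32 data; cuML estimators generally prefer float32 inputs.
X = np.random.rand(1_000, 10).astype(np.float32)
y = (np.random.rand(1_000) > 0.5).astype(np.int32)

clf.fit(X, y)
preds = clf.predict(X)
```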
130 changes: 76 additions & 54 deletions tpot/config/regressor_cuml.py
@@ -26,100 +26,122 @@
import numpy as np

# This configuration provides users with access to a GPU the ability to
# use cuML regressors as estimators alongside the scikit-learn
# preprocessors in the TPOT default configuration.
# use RAPIDS cuML and DMLC/XGBoost regressors as estimators alongside
# the scikit-learn preprocessors in the TPOT default configuration.

regressor_config_cuml = {
# cuML Regressors
# cuML + DMLC/XGBoost Regressors

'cuml.linear_model.ElasticNet': {
'l1_ratio': np.arange(0.0, 1.01, 0.05),
'tol': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1]
"cuml.linear_model.ElasticNet": {
"l1_ratio": np.arange(0.0, 1.01, 0.05),
"tol": [1e-5, 1e-4, 1e-3, 1e-2, 1e-1]
},

'cuml.neighbors.KNeighborsRegressor': {
'n_neighbors': range(1, 101),
'weights': ["uniform"],
"cuml.neighbors.KNeighborsRegressor": {
"n_neighbors": range(1, 101),
"weights": ["uniform"],
},

'cuml.linear_model.Lasso': {
'normalize': [True, False]
"cuml.linear_model.Lasso": {
"normalize": [True, False]
},

'cuml.svm.SVR': {
'tol': [1e-4, 1e-3, 1e-2, 1e-1,],
'C': [1e-4, 1e-3, 1e-2, 1e-1, 0.5, 1., 5., 10., 15., 20., 25.,]
"cuml.ensemble.RandomForestRegressor": {
"n_estimators": [100, 300, 500,],
"split_algo": [0, 1,],
"max_depth": range(8, 20),
"max_features": np.arange(0.05, 1.01, 0.05),
"min_rows_per_node": range(2, 21),
"n_bins": [64,]
},

'cuml.ensemble.RandomForestRegressor': {
'n_estimators': [100, 300, 500,],
'split_algo': [0, 1,],
'max_depth': range(8, 20),
'max_features': np.arange(0.05, 1.01, 0.05),
'min_rows_per_node': range(2, 21),
'n_bins': [8, 64,]
"cuml.linear_model.Ridge": {
},

'cuml.linear_model.Ridge': {
"xgboost.XGBRegressor": {
"n_estimators": [100],
"max_depth": range(3, 10),
"learning_rate": [1e-2, 1e-1, 0.5, 1.],
"subsample": np.arange(0.05, 1.01, 0.05),
"min_child_weight": range(1, 21),
"alpha": [1, 10],
"tree_method": ["gpu_hist"],
"nthread": [1],
"objective": ["reg:squarederror"]
},

# Sklearn + cuML Preprocesssors
'sklearn.preprocessing.Binarizer': {
'threshold': np.arange(0.0, 1.01, 0.05)
# Sklearn Preprocessors

"sklearn.preprocessing.Binarizer": {
"threshold": np.arange(0.0, 1.01, 0.05)
},

'sklearn.decomposition.FastICA': {
'tol': np.arange(0.0, 1.01, 0.05)
"sklearn.decomposition.FastICA": {
"tol": np.arange(0.0, 1.01, 0.05)
},

'sklearn.cluster.FeatureAgglomeration': {
'linkage': ['ward', 'complete', 'average'],
'affinity': ['euclidean', 'l1', 'l2', 'manhattan', 'cosine']
"sklearn.cluster.FeatureAgglomeration": {
"linkage": ["ward", "complete", "average"],
"affinity": ["euclidean", "l1", "l2", "manhattan", "cosine"]
},

'sklearn.preprocessing.MaxAbsScaler': {
"sklearn.preprocessing.MaxAbsScaler": {
},

'sklearn.preprocessing.MinMaxScaler': {
"sklearn.preprocessing.MinMaxScaler": {
},

'sklearn.preprocessing.Normalizer': {
'norm': ['l1', 'l2', 'max']
"sklearn.preprocessing.Normalizer": {
"norm": ["l1", "l2", "max"]
},

'sklearn.kernel_approximation.Nystroem': {
'kernel': ['rbf', 'cosine', 'chi2', 'laplacian', 'polynomial', 'poly', 'linear', 'additive_chi2', 'sigmoid'],
'gamma': np.arange(0.0, 1.01, 0.05),
'n_components': range(1, 11)
"sklearn.kernel_approximation.Nystroem": {
"kernel": ["rbf", "cosine", "chi2", "laplacian", "polynomial", "poly", "linear", "additive_chi2", "sigmoid"],
"gamma": np.arange(0.0, 1.01, 0.05),
"n_components": range(1, 11)
},

'cuml.decomposition.PCA': {
'svd_solver': ['jacobi'],
'iterated_power': range(1, 11),
"sklearn.decomposition.PCA": {
"svd_solver": ["randomized"],
"iterated_power": range(1, 11)
},

'sklearn.preprocessing.PolynomialFeatures': {
'degree': [2],
'include_bias': [False],
'interaction_only': [False]
"sklearn.kernel_approximation.RBFSampler": {
"gamma": np.arange(0.0, 1.01, 0.05)
},

'sklearn.kernel_approximation.RBFSampler': {
'gamma': np.arange(0.0, 1.01, 0.05)
"sklearn.preprocessing.RobustScaler": {
},

'sklearn.preprocessing.RobustScaler': {
"sklearn.preprocessing.StandardScaler": {
},

'sklearn.preprocessing.StandardScaler': {
"tpot.builtins.ZeroCount": {
},

'tpot.builtins.ZeroCount': {
"tpot.builtins.OneHotEncoder": {
"minimum_fraction": [0.05, 0.1, 0.15, 0.2, 0.25],
"sparse": [False],
"threshold": [10]
},

'tpot.builtins.OneHotEncoder': {
'minimum_fraction': [0.05, 0.1, 0.15, 0.2, 0.25],
'sparse': [False],
'threshold': [10]
# Selectors

"sklearn.feature_selection.SelectFwe": {
"alpha": np.arange(0, 0.05, 0.001),
"score_func": {
"sklearn.feature_selection.f_classif": None
}
},

"sklearn.feature_selection.SelectPercentile": {
"percentile": range(1, 100),
"score_func": {
"sklearn.feature_selection.f_classif": None
}
},

"sklearn.feature_selection.VarianceThreshold": {
"threshold": [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.2]
}
}
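The regressor search space above mirrors the classifier one. Here is a minimal sketch of selecting it through TPOTRegressor, assuming the same `"TPOT cuML"` shortcut string shown for the classifier in the notebook below also resolves to this dictionary; the synthetic data and the small search budget are illustrative assumptions, and an NVIDIA GPU with cuML installed is required.

```python
# Minimal sketch (assumption: the "TPOT cuML" shortcut also selects the
# regressor_config_cuml dictionary above when used with TPOTRegressor, mirroring
# the classifier usage in the notebook later in this commit).
# Requires an NVIDIA GPU with RAPIDS cuML installed.
import numpy as np
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from tpot import TPOTRegressor

X, y = make_regression(n_samples=10_000, n_features=20, noise=0.1, random_state=0)
X, y = X.astype(np.float32), y.astype(np.float32)  # cuML generally prefers float32
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

tpot = TPOTRegressor(
    config_dict="TPOT cuML",  # the GPU-accelerated search space defined above
    generations=5,            # small illustrative budget
    population_size=10,
    cv=2,
    random_state=0,
    verbosity=2,
)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))  # TPOT's default regression scorer (neg. MSE)
```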
73 changes: 40 additions & 33 deletions tutorials/Higgs_Boson.ipynb
@@ -4,7 +4,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"This notebook walks through a basic example of using the GPU-accelerated estimators from [RAPIDS](https://rapids.ai/) cuML and [DMLC/XGBoost](https://github.com/dmlc/xgboost) with TPOT for classification tasks. You must have access to an NVIDIA GPU and have cuML installed in your environment. Running this notebook without cuML, will cause TPOT to raise a `ValueError` indicating you should install cuML.\n",
"This notebook walks through a basic example of using the GPU-accelerated estimators from [RAPIDS](https://rapids.ai/) cuML and [DMLC/XGBoost](https://github.com/dmlc/xgboost) with TPOT for classification tasks. You must have access to an NVIDIA GPU and have cuML installed in your environment. Running this notebook without cuML will cause TPOT to raise a `ValueError`, indicating you should install cuML.\n",
"\n",
"It is intended to show how the `TPOT cuML` configuration can provide significant benefits on medium-sized and larger datasets. "
]
@@ -15,15 +15,17 @@
"source": [
"## Downloading Data\n",
"\n",
"This example uses the Higgs Boson [dataset](https://archive.ics.uci.edu/ml/datasets/HIGGS) from the UCI Machine Learning Repositoru."
"This example uses the Higgs Boson [dataset](https://archive.ics.uci.edu/ml/datasets/HIGGS) from the UC Irvine Machine Learning Repository."
]
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"import pandas as pd\n",
"import numpy as np\n",
"from sklearn.model_selection import train_test_split\n",
@@ -34,28 +36,33 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"# This is a 2.7 GB file.\n",
"# Please make sure you have space before uncommenting the code below and downloading this file.\n",
"# Please make sure you have enough space available before\n",
"# uncommenting the code below and downloading this file.\n",
"\n",
"if not os.path.isfile(\"HIGGS.csv.gz\"):\n",
" !wget https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz"
"DATA_DIRECTORY = \"./\"\n",
"DATASET_PATH = os.path.join(DATA_DIRECTORY, \"HIGGS.csv.gz\")\n",
"\n",
"# if not os.path.isfile(DATASET_PATH):\n",
"# !wget https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz"
]
},
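For environments where `wget` is unavailable, the same download can be done with the Python standard library. This is a sketch, not part of the notebook, using the same UCI URL and the `DATASET_PATH` defined in the cell above:

```python
# Alternative download sketch using only the Python standard library
# (not part of the notebook; same UCI URL as the wget command above).
import os
import urllib.request

DATA_DIRECTORY = "./"
DATASET_PATH = os.path.join(DATA_DIRECTORY, "HIGGS.csv.gz")
HIGGS_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz"

if not os.path.isfile(DATASET_PATH):
    # ~2.7 GB download; make sure enough disk space is available first.
    urllib.request.urlretrieve(HIGGS_URL, DATASET_PATH)
```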
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"# This fuction is borrowed from https://github.com/NVIDIA/gbm-bench/blob/master/datasets.py\n",
"# This fuction is borrowed and adapted from\n",
"# https://github.com/NVIDIA/gbm-bench/blob/master/datasets.py\n",
"# Thanks!\n",
"\n",
"def prepare_higgs(dataset_folder, nrows=None):\n",
" higgs = pd.read_csv(\"HIGGS.csv.gz\", nrows=nrows)\n",
"def prepare_higgs(nrows=None):\n",
" higgs = pd.read_csv(DATASET_PATH, nrows=nrows)\n",
" X = higgs.iloc[:, 1:].to_numpy(dtype=np.float32)\n",
" y = higgs.iloc[:, 0].to_numpy(dtype=np.int64)\n",
" return train_test_split(X, y, stratify=y, random_state=77, test_size=0.2)"
@@ -74,19 +81,19 @@
"source": [
"In the interest of time, we'll only use a 500,000 row sample of this file. 500,000 rows is more than enough for this example.\n",
"\n",
"With the example configuration below (10 generations, population size of 10, two-fold cross validation), the `TPOT cuML` configuration provides a significant speedup.\n",
"With the example configuration below (10 generations, population size of 10, two-fold cross validation), the `TPOT cuML` configuration provides a significant speedup while achieving essentially equivalent accuracy.\n",
"\n",
"Such speedups also mean you can create larger evolutionary search strategies while **still** returning faster results."
"Such speedups also mean you can create larger evolutionary search strategies while **still** obtaining faster results."
]
},
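The cell that constructs and fits the classifier is collapsed in this diff view. Reconstructed from the `TPOTClassifier` repr printed in the training output further down, it is presumably close to the following sketch; treat it as an approximation, not the literal notebook cell.

```python
# Sketch of the (collapsed) training cell, reconstructed from the TPOTClassifier
# repr printed in the output below; an approximation, not the literal notebook cell.
from tpot import TPOTClassifier

# X_train / y_train come from the prepare_higgs() call in the next cell.
tpot = TPOTClassifier(
    config_dict="TPOT cuML",   # selects the GPU-accelerated cuML/XGBoost search space
    generations=10,
    population_size=10,
    cv=2,
    random_state=12,
    verbosity=2,
)
tpot.fit(X_train, y_train)     # the notebook times this cell with the %%time magic
```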
{
"cell_type": "code",
"execution_count": 17,
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"NROWS = 500_000\n",
"X_train, X_test, y_train, y_test = prepare_higgs(\"./\", nrows=NROWS)"
"X_train, X_test, y_train, y_test = prepare_higgs(nrows=NROWS)"
]
},
{
@@ -98,7 +105,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 14,
"metadata": {},
"outputs": [
{
@@ -121,29 +128,29 @@
"text": [
"\n",
"Generation 1 - Current best internal CV score: 0.7103025000000001\n",
"Generation 2 - Current best internal CV score: 0.71385\n",
"Generation 2 - Current best internal CV score: 0.7103025000000001\n",
"Generation 3 - Current best internal CV score: 0.725755\n",
"Generation 4 - Current best internal CV score: 0.7299725\n",
"Generation 5 - Current best internal CV score: 0.7299725\n",
"Generation 6 - Current best internal CV score: 0.7299725\n",
"Generation 7 - Current best internal CV score: 0.7309975\n",
"Generation 8 - Current best internal CV score: 0.7309975\n",
"Generation 9 - Current best internal CV score: 0.7309975\n",
"Generation 10 - Current best internal CV score: 0.7309975\n",
"Best pipeline: XGBClassifier(ZeroCount(input_matrix), alpha=1, learning_rate=0.1, max_depth=6, min_child_weight=13, n_estimators=100, nthread=1, subsample=0.8500000000000001, tree_method=gpu_hist)\n",
"CPU times: user 4min 59s, sys: 13min 27s, total: 18min 27s\n",
"Wall time: 18min 29s\n"
"Generation 4 - Current best internal CV score: 0.727995\n",
"Generation 5 - Current best internal CV score: 0.727995\n",
"Generation 6 - Current best internal CV score: 0.730315\n",
"Generation 7 - Current best internal CV score: 0.730315\n",
"Generation 8 - Current best internal CV score: 0.730315\n",
"Generation 9 - Current best internal CV score: 0.7308699999999999\n",
"Generation 10 - Current best internal CV score: 0.7347775\n",
"Best pipeline: XGBClassifier(input_matrix, alpha=1, learning_rate=0.1, max_depth=8, min_child_weight=19, n_estimators=100, nthread=1, subsample=0.8, tree_method=gpu_hist)\n",
"CPU times: user 5min 34s, sys: 1min 16s, total: 6min 50s\n",
"Wall time: 6min 52s\n"
]
},
{
"data": {
"text/plain": [
"TPOTClassifier(config_dict='TPOT cuML', cv=2, generations=10,\n",
" log_file=<ipykernel.iostream.OutStream object at 0x7f282044a7d0>,\n",
" log_file=<ipykernel.iostream.OutStream object at 0x7f87fe597d50>,\n",
" population_size=10, random_state=12, verbosity=2)"
]
},
"execution_count": 9,
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
@@ -172,16 +179,16 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.7308499813079834\n",
"CPU times: user 565 ms, sys: 5.52 ms, total: 570 ms\n",
"Wall time: 569 ms\n"
"0.73669\n",
"CPU times: user 816 ms, sys: 39.9 ms, total: 856 ms\n",
"Wall time: 855 ms\n"
]
}
],
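The source of the cell that produced this held-out score is collapsed in the diff. With the standard TPOT API, scoring the held-out split and exporting the winning pipeline look roughly like the sketch below; it reuses the fitted `tpot` object from the training cell, and the output filename is a hypothetical example.

```python
# Sketch of the evaluation/export step (assumed from the standard TPOT API;
# the literal notebook cell is collapsed in this diff view).
print(tpot.score(X_test, y_test))  # held-out accuracy, ~0.737 in the run shown above

# Export the best pipeline found during the search as a standalone Python script.
tpot.export("tpot_higgs_best_pipeline.py")  # hypothetical output filename
```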
(The diffs for the remaining two changed files are not shown here.)
