alteryx · angela97lin · Feb 13, 2020 · Dec 3, 2019 · Dec 3, 2019 · Dec 3, 2019
diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst
@@ -5,6 +5,7 @@ Changelog
 **Future Releases**
     * Enhancements
         * Added emacs buffers to .gitignore :pr:`350`
+        * Added CatBoost (gradient-boosted trees) classification and regression components and pipelines :pr:`247`
     * Fixes
         * Fixed ROC and confusion matrix plots not being calculated if user passed own additional_objectives :pr:`276`
     * Changes

diff --git a/evalml/model_types/model_types.py b/evalml/model_types/model_types.py
@@ -6,9 +6,12 @@ class ModelTypes(Enum):
     RANDOM_FOREST = 'random_forest'
     XGBOOST = 'xgboost'
     LINEAR_MODEL = 'linear_model'
+    CATBOOST = 'catboost'
 
     def __str__(self):
+        """Return the human-readable display name for this model type."""
         model_type_dict = {ModelTypes.RANDOM_FOREST.name: "Random Forest",
                            ModelTypes.XGBOOST.name: "XGBoost Classifier",
-                           ModelTypes.LINEAR_MODEL.name: "Linear Model"}
+                           ModelTypes.LINEAR_MODEL.name: "Linear Model",
+                           ModelTypes.CATBOOST.name: "CatBoost"}
         return model_type_dict[self.name]
diff --git a/evalml/pipelines/__init__.py b/evalml/pipelines/__init__.py
@@ -14,16 +14,23 @@
     FeatureSelector,
     CategoricalEncoder,
     RFClassifierSelectFromModel,
-    RFRegressorSelectFromModel
+    RFRegressorSelectFromModel,
+    CatBoostClassifier,
+    CatBoostRegressor
 )
 
 from .pipeline_base import PipelineBase
 from .classification import (
     LogisticRegressionPipeline,
     RFClassificationPipeline,
-    XGBoostPipeline
+    XGBoostPipeline,
+    CatBoostClassificationPipeline,
+)
+from .regression import (
+    LinearRegressionPipeline,
+    RFRegressionPipeline,
+    CatBoostRegressionPipeline
 )
-from .regression import LinearRegressionPipeline, RFRegressionPipeline
 from .utils import (
     get_pipelines,
     list_model_types,

diff --git a/evalml/pipelines/classification/__init__.py b/evalml/pipelines/classification/__init__.py
@@ -2,3 +2,4 @@
 from .logistic_regression import LogisticRegressionPipeline
 from .random_forest import RFClassificationPipeline
 from .xgboost import XGBoostPipeline
+from .catboost import CatBoostClassificationPipeline
diff --git a/evalml/pipelines/classification/catboost.py b/evalml/pipelines/classification/catboost.py
@@ -0,0 +1,56 @@
+from skopt.space import Integer, Real
+
+from evalml.model_types import ModelTypes
+from evalml.pipelines import PipelineBase
+from evalml.pipelines.components import CatBoostClassifier, SimpleImputer
+from evalml.problem_types import ProblemTypes
+
+
+class CatBoostClassificationPipeline(PipelineBase):
+    """
+    CatBoost Pipeline for both binary and multiclass classification.
+    CatBoost is an open-source library and natively supports categorical features.
+
+    The pipeline runs a SimpleImputer followed by a CatBoostClassifier.
+
+    For more information, check out https://catboost.ai/
+    """
+    name = "CatBoost Classifier w/ Simple Imputer"
+    model_type = ModelTypes.CATBOOST
+    problem_types = [ProblemTypes.BINARY, ProblemTypes.MULTICLASS]
+    hyperparameters = {
+        "impute_strategy": ["most_frequent"],
+        "n_estimators": Integer(10, 1000),
+        # lower bound is strictly positive: a learning rate of 0 is not a usable step size
+        "eta": Real(0.000001, 1),
+        "max_depth": Integer(1, 8),
+    }
+
+    def __init__(self, objective, impute_strategy, n_estimators,
+                 eta, max_depth, number_features, bootstrap_type=None,
+                 n_jobs=1, random_state=0):
+        """Assemble the imputer + CatBoost classifier pipeline.
+
+        Arguments:
+            objective: objective passed through to PipelineBase
+            impute_strategy (str): strategy passed to SimpleImputer
+            n_estimators (int): passed to CatBoostClassifier
+            eta (float): passed to CatBoostClassifier
+            max_depth (int): passed to CatBoostClassifier
+            number_features: accepted but not used by this pipeline
+            bootstrap_type: passed to CatBoostClassifier
+            n_jobs (int): accepted but ignored; PipelineBase is always given n_jobs=1
+            random_state (int): passed to CatBoostClassifier and PipelineBase
+        """
+        # note: impute_strategy must support both string and numeric data
+        imputer = SimpleImputer(impute_strategy=impute_strategy)
+        estimator = CatBoostClassifier(n_estimators=n_estimators,
+                                       eta=eta,
+                                       max_depth=max_depth,
+                                       bootstrap_type=bootstrap_type,
+                                       random_state=random_state)
+        # NOTE(review): n_jobs is pinned to 1 here regardless of the argument -- confirm this is intentional
+        super().__init__(objective=objective,
+                         component_list=[imputer, estimator],
+                         n_jobs=1,
+                         random_state=random_state)
diff --git a/evalml/pipelines/components/__init__.py b/evalml/pipelines/components/__init__.py
@@ -7,7 +7,9 @@
     LogisticRegressionClassifier,
     RandomForestClassifier,
     RandomForestRegressor,
-    XGBoostClassifier
+    XGBoostClassifier,
+    CatBoostClassifier,
+    CatBoostRegressor
 )
 from .transformers import (
     Transformer,

diff --git a/evalml/pipelines/components/estimators/__init__.py b/evalml/pipelines/components/estimators/__init__.py
@@ -2,6 +2,8 @@
 from .estimator import Estimator
 from .classifiers import (LogisticRegressionClassifier,
                           RandomForestClassifier,
-                          XGBoostClassifier)
+                          XGBoostClassifier,
+                          CatBoostClassifier)
 from .regressors import (LinearRegressor,
-                         RandomForestRegressor)
+                         RandomForestRegressor,
+                         CatBoostRegressor)
diff --git a/evalml/pipelines/components/estimators/classifiers/__init__.py b/evalml/pipelines/components/estimators/classifiers/__init__.py
@@ -2,3 +2,4 @@
 from .logistic_regression import LogisticRegressionClassifier
 from .rf_classifier import RandomForestClassifier
 from .xgboost_classifier import XGBoostClassifier
+from .catboost_classifier import CatBoostClassifier
diff --git a/evalml/pipelines/components/estimators/classifiers/catboost_classifier.py b/evalml/pipelines/components/estimators/classifiers/catboost_classifier.py
@@ -0,0 +1,100 @@
+import numpy as np
+import pandas as pd
+from sklearn.preprocessing import LabelEncoder
+from skopt.space import Integer, Real
+
+from evalml.model_types import ModelTypes
+from evalml.pipelines.components import ComponentTypes
+from evalml.pipelines.components.estimators import Estimator
+from evalml.problem_types import ProblemTypes
+from evalml.utils import import_or_raise
+
+
+class CatBoostClassifier(Estimator):
+    """
+    CatBoost Classifier, a classifier that uses gradient-boosting on decision trees.
+    CatBoost is an open-source library and natively supports categorical features.
+
+    For more information, check out https://catboost.ai/
+    """
+    name = "CatBoost Classifier"
+    component_type = ComponentTypes.CLASSIFIER
+    _needs_fitting = True
+    hyperparameter_ranges = {
+        "n_estimators": Integer(10, 1000),
+        # lower bound is strictly positive: a learning rate of 0 is not a usable step size
+        "eta": Real(0.000001, 1),
+        "max_depth": Integer(1, 16),
+    }
+    model_type = ModelTypes.CATBOOST
+    problem_types = [ProblemTypes.BINARY, ProblemTypes.MULTICLASS]
+
+    def __init__(self, n_estimators=1000, eta=0.03, max_depth=6, bootstrap_type=None, random_state=0):
+        """Create the wrapped catboost classifier.
+
+        Arguments:
+            n_estimators (int): passed to catboost.CatBoostClassifier
+            eta (float): passed to catboost.CatBoostClassifier
+            max_depth (int): passed to catboost.CatBoostClassifier
+            bootstrap_type: passed to catboost.CatBoostClassifier only when not None
+            random_state (int): seed passed to catboost.CatBoostClassifier and to Estimator
+        """
+        parameters = {"n_estimators": n_estimators,
+                      "eta": eta,
+                      "max_depth": max_depth}
+        if bootstrap_type is not None:
+            parameters['bootstrap_type'] = bootstrap_type
+
+        cb_error_msg = "catboost is not installed. Please install using `pip install catboost.`"
+        catboost = import_or_raise("catboost", error_msg=cb_error_msg)
+        self._label_encoder = None
+        # seed the model so results are reproducible, matching CatBoostRegressor
+        cb_classifier = catboost.CatBoostClassifier(**parameters,
+                                                    random_state=random_state,
+                                                    silent=True,
+                                                    allow_writing_files=False)
+        super().__init__(parameters=parameters,
+                         component_obj=cb_classifier,
+                         random_state=random_state)
+
+    def fit(self, X, y=None):
+        """Fit the wrapped catboost classifier.
+
+        Arguments:
+            X (pd.DataFrame): the input training data of shape [n_samples, n_features]
+            y (pd.Series): the target training labels of length [n_samples]
+
+        Returns:
+            the value returned by the wrapped catboost model's fit call
+        """
+        # pass the categorical column names themselves rather than a DataFrame slice
+        cat_cols = X.select_dtypes(['category', 'object']).columns.tolist()
+
+        # drop any encoder left over from an earlier fit so predict stays in sync
+        self._label_encoder = None
+        # For binary classification, catboost expects numeric values, so encoding before.
+        if y.nunique() <= 2:
+            self._label_encoder = LabelEncoder()
+            y = pd.Series(self._label_encoder.fit_transform(y))
+        model = self._component_obj.fit(X, y, silent=True, cat_features=cat_cols)
+        return model
+
+    def predict(self, X):
+        """Predict labels for X, decoding them if the target was label-encoded in fit.
+
+        Arguments:
+            X: features handed to the wrapped catboost model's predict call
+
+        Returns:
+            predicted labels; mapped back to the original label values when an encoder was fitted
+        """
+        predictions = self._component_obj.predict(X)
+        if self._label_encoder is not None:
+            return self._label_encoder.inverse_transform(predictions.astype(np.int64))
+
+        return predictions
+
+    @property
+    def feature_importances(self):
+        """Feature importances as reported by the wrapped catboost model."""
+        return self._component_obj.get_feature_importance()
diff --git a/evalml/pipelines/components/estimators/classifiers/xgboost_classifier.py b/evalml/pipelines/components/estimators/classifiers/xgboost_classifier.py
@@ -1,10 +1,10 @@
 from skopt.space import Integer, Real
-from xgboost import XGBClassifier
 
 from evalml.model_types import ModelTypes
 from evalml.pipelines.components import ComponentTypes
 from evalml.pipelines.components.estimators import Estimator
 from evalml.problem_types import ProblemTypes
+from evalml.utils import import_or_raise
 
 
 class XGBoostClassifier(Estimator):
@@ -26,11 +26,13 @@ def __init__(self, eta=0.1, max_depth=3, min_child_weight=1, n_estimators=100, r
                       "max_depth": max_depth,
                       "min_child_weight": min_child_weight,
                       "n_estimators": n_estimators}
-        xgb_classifier = XGBClassifier(random_state=random_state,
-                                       eta=eta,
-                                       max_depth=max_depth,
-                                       n_estimators=n_estimators,
-                                       min_child_weight=min_child_weight)
+        xgb_error_msg = "XGBoost is not installed. Please install using `pip install xgboost.`"
+        xgb = import_or_raise("xgboost", error_msg=xgb_error_msg)
+        xgb_classifier = xgb.XGBClassifier(random_state=random_state,
+                                           eta=eta,
+                                           max_depth=max_depth,
+                                           n_estimators=n_estimators,
+                                           min_child_weight=min_child_weight)
         super().__init__(parameters=parameters,
                          component_obj=xgb_classifier,
                          random_state=random_state)

diff --git a/evalml/pipelines/components/estimators/regressors/__init__.py b/evalml/pipelines/components/estimators/regressors/__init__.py
@@ -1,4 +1,4 @@
 # flake8:noqa
 from .linear_regressor import LinearRegressor
 from .rf_regressor import RandomForestRegressor
-
+from .catboost_regressor import CatBoostRegressor
diff --git a/evalml/pipelines/components/estimators/regressors/catboost_regressor.py b/evalml/pipelines/components/estimators/regressors/catboost_regressor.py
@@ -0,0 +1,73 @@
+from skopt.space import Integer, Real
+
+from evalml.model_types import ModelTypes
+from evalml.pipelines.components import ComponentTypes
+from evalml.pipelines.components.estimators import Estimator
+from evalml.problem_types import ProblemTypes
+from evalml.utils import import_or_raise
+
+
+class CatBoostRegressor(Estimator):
+    """
+    CatBoost Regressor, a regressor that uses gradient-boosting on decision trees.
+    CatBoost is an open-source library and natively supports categorical features.
+
+    For more information, check out https://catboost.ai/
+    """
+    name = "CatBoost Regressor"
+    component_type = ComponentTypes.REGRESSOR
+    _needs_fitting = True
+    hyperparameter_ranges = {
+        "n_estimators": Integer(10, 1000),
+        # lower bound is strictly positive: a learning rate of 0 is not a usable step size
+        "eta": Real(0.000001, 1),
+        "max_depth": Integer(1, 16),
+    }
+    model_type = ModelTypes.CATBOOST
+    problem_types = [ProblemTypes.REGRESSION]
+
+    def __init__(self, n_estimators=1000, eta=0.03, max_depth=6, bootstrap_type=None, random_state=0):
+        """Create the wrapped catboost regressor.
+
+        Arguments:
+            n_estimators (int): passed to catboost.CatBoostRegressor
+            eta (float): passed to catboost.CatBoostRegressor
+            max_depth (int): passed to catboost.CatBoostRegressor
+            bootstrap_type: passed to catboost.CatBoostRegressor only when not None
+            random_state (int): seed passed to catboost.CatBoostRegressor and to Estimator
+        """
+        parameters = {"n_estimators": n_estimators,
+                      "eta": eta,
+                      "max_depth": max_depth}
+        if bootstrap_type is not None:
+            parameters['bootstrap_type'] = bootstrap_type
+
+        cb_error_msg = "catboost is not installed. Please install using `pip install catboost.`"
+        catboost = import_or_raise("catboost", error_msg=cb_error_msg)
+        cb_regressor = catboost.CatBoostRegressor(**parameters,
+                                                  random_state=random_state,
+                                                  silent=True,
+                                                  allow_writing_files=False)
+        super().__init__(parameters=parameters,
+                         component_obj=cb_regressor,
+                         random_state=random_state)
+
+    def fit(self, X, y=None):
+        """Build a model
+
+        Arguments:
+            X (pd.DataFrame): the input training data of shape [n_samples, n_features]
+            y (pd.Series): the target training labels of length [n_samples]
+
+        Returns:
+            the value returned by the wrapped catboost model's fit call
+        """
+        # pass the categorical column names themselves rather than a DataFrame slice
+        cat_cols = X.select_dtypes(['object', 'category']).columns.tolist()
+        model = self._component_obj.fit(X, y, silent=True, cat_features=cat_cols)
+        return model
+
+    @property
+    def feature_importances(self):
+        """Feature importances as reported by the wrapped catboost model."""
+        return self._component_obj.get_feature_importance()
diff --git a/evalml/pipelines/components/transformers/feature_selection/rf_classifier_feature_selector.py b/evalml/pipelines/components/transformers/feature_selection/rf_classifier_feature_selector.py
@@ -29,11 +29,9 @@ def __init__(self, number_features=None, n_estimators=10, max_depth=None,
                                              n_estimators=n_estimators,
                                              max_depth=max_depth,
                                              n_jobs=n_jobs)
-        feature_selection = SkSelect(
-            estimator=estimator,
-            max_features=max_features,
-            threshold=threshold
-        )
+        feature_selection = SkSelect(estimator=estimator,
+                                     max_features=max_features,
+                                     threshold=threshold)
 
         super().__init__(parameters=parameters,
                          component_obj=feature_selection,

diff --git a/evalml/pipelines/components/transformers/feature_selection/rf_regressor_feature_selector.py b/evalml/pipelines/components/transformers/feature_selection/rf_regressor_feature_selector.py
@@ -29,11 +29,9 @@ def __init__(self, number_features=None, n_estimators=10, max_depth=None,
                                             n_estimators=n_estimators,
                                             max_depth=max_depth,
                                             n_jobs=n_jobs)
-        feature_selection = SkSelect(
-            estimator=estimator,
-            max_features=max_features,
-            threshold=threshold
-        )
+        feature_selection = SkSelect(estimator=estimator,
+                                     max_features=max_features,
+                                     threshold=threshold)
 
         super().__init__(parameters=parameters,
                          component_obj=feature_selection,

diff --git a/evalml/pipelines/pipeline_base.py b/evalml/pipelines/pipeline_base.py
@@ -263,7 +263,7 @@ def score(self, X, y, other_objectives=None):
 
     @property
     def feature_importances(self):
-        """Return feature importances. Feature dropped by feaure selection are excluded"""
+        """Return feature importances. Features dropped by feature selection are excluded"""
         feature_names = self.input_feature_names[self.estimator.name]
         importances = list(zip(feature_names, self.estimator.feature_importances))  # note: this only works for binary
         importances.sort(key=lambda x: -abs(x[1]))

diff --git a/evalml/pipelines/regression/__init__.py b/evalml/pipelines/regression/__init__.py
@@ -1,3 +1,4 @@
 # flake8:noqa
 from .linear_regression import LinearRegressionPipeline
 from .random_forest import RFRegressionPipeline
+from .catboost import CatBoostRegressionPipeline
diff --git a/evalml/pipelines/regression/catboost.py b/evalml/pipelines/regression/catboost.py
@@ -0,0 +1,56 @@
+from skopt.space import Integer, Real
+
+from evalml.model_types import ModelTypes
+from evalml.pipelines import PipelineBase
+from evalml.pipelines.components import CatBoostRegressor, SimpleImputer
+from evalml.problem_types import ProblemTypes
+
+
+class CatBoostRegressionPipeline(PipelineBase):
+    """
+    CatBoost Pipeline for regression problems.
+    CatBoost is an open-source library and natively supports categorical features.
+
+    The pipeline runs a SimpleImputer followed by a CatBoostRegressor.
+
+    For more information, check out https://catboost.ai/
+    """
+    name = "CatBoost Regressor w/ Simple Imputer"
+    model_type = ModelTypes.CATBOOST
+    problem_types = [ProblemTypes.REGRESSION]
+    hyperparameters = {
+        "impute_strategy": ["most_frequent"],
+        "n_estimators": Integer(10, 1000),
+        # lower bound is strictly positive: a learning rate of 0 is not a usable step size
+        "eta": Real(0.000001, 1),
+        "max_depth": Integer(1, 8),
+    }
+
+    def __init__(self, objective, impute_strategy, n_estimators, eta,
+                 max_depth, number_features, bootstrap_type=None,
+                 n_jobs=-1, random_state=0):
+        """Assemble the imputer + CatBoost regressor pipeline.
+
+        Arguments:
+            objective: objective passed through to PipelineBase
+            impute_strategy (str): strategy passed to SimpleImputer
+            n_estimators (int): passed to CatBoostRegressor
+            eta (float): passed to CatBoostRegressor
+            max_depth (int): passed to CatBoostRegressor
+            number_features: accepted but not used by this pipeline
+            bootstrap_type: passed to CatBoostRegressor
+            n_jobs (int): accepted but ignored; PipelineBase is always given n_jobs=1
+            random_state (int): passed to CatBoostRegressor and PipelineBase
+        """
+        # note: impute_strategy must support both string and numeric data
+        imputer = SimpleImputer(impute_strategy=impute_strategy)
+        estimator = CatBoostRegressor(n_estimators=n_estimators,
+                                      eta=eta,
+                                      max_depth=max_depth,
+                                      bootstrap_type=bootstrap_type,
+                                      random_state=random_state)
+        # NOTE(review): n_jobs is pinned to 1 here regardless of the argument -- confirm this is intentional
+        super().__init__(objective=objective,
+                         component_list=[imputer, estimator],
+                         n_jobs=1,
+                         random_state=random_state)
diff --git a/evalml/pipelines/regression/linear_regression.py b/evalml/pipelines/regression/linear_regression.py
@@ -21,7 +21,8 @@ class LinearRegressionPipeline(PipelineBase):
         'fit_intercept': [False, True]
     }
 
-    def __init__(self, objective, random_state, number_features, impute_strategy, normalize=False, fit_intercept=True, n_jobs=-1):
+    def __init__(self, objective, number_features, impute_strategy,
+                 normalize=False, fit_intercept=True, random_state=0, n_jobs=-1):
 
         imputer = SimpleImputer(impute_strategy=impute_strategy)
         enc = OneHotEncoder()