
Commit

Tabular: Added support for specifying early stopping rounds in GBM, CAT, XGB (open-mmlab#1037)

* Tabular: Added support for specifying early stopping rounds in GBM, CAT, XGB

* addressed comment
Innixma committed Mar 31, 2021
1 parent d738ac2 commit e93e409
Showing 13 changed files with 135 additions and 60 deletions.
14 changes: 14 additions & 0 deletions core/src/autogluon/core/models/_utils.py
@@ -0,0 +1,14 @@

# TODO: Add more strategies
# - Adaptive early stopping: adjust rounds during model training
def get_early_stopping_rounds(num_rows_train, strategy='auto', min_rounds=10, max_rounds=150, min_rows=10000):
    """Gets early stopping rounds"""
    if strategy == 'auto':
        modifier = 1 if num_rows_train <= min_rows else min_rows / num_rows_train
        early_stopping_rounds = max(
            round(modifier * max_rounds),
            min_rounds,
        )
    else:
        raise AssertionError(f'unknown early stopping strategy: {strategy}')
    return early_stopping_rounds
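
For intuition, a small usage sketch (not part of the commit) of how the default 'auto' strategy scales the rounds with training set size, using the defaults min_rounds=10, max_rounds=150, min_rows=10000:

from autogluon.core.models._utils import get_early_stopping_rounds

# Small datasets keep the full 150 rounds; larger datasets get proportionally fewer
# rounds (modifier = min_rows / num_rows_train), floored at min_rounds=10.
for n in [1_000, 10_000, 50_000, 1_000_000]:
    print(n, get_early_stopping_rounds(num_rows_train=n))
# -> 1000: 150, 10000: 150, 50000: 30, 1000000: 10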
43 changes: 41 additions & 2 deletions core/src/autogluon/core/models/abstract/abstract_model.py
@@ -705,7 +705,7 @@ def _hyperparameter_tune(self, X, y, X_val, y_val, scheduler_options, **kwargs):
time_start = time.time()
logger.log(15, "Starting generic AbstractModel hyperparameter tuning for %s model..." % self.name)
self._set_default_searchspace()
params_copy = self.params.copy()
params_copy = self._get_params()
directory = self.path # also create model directory if it doesn't exist
# TODO: This will break on S3. Use tabular/utils/savers for datasets, add new function
scheduler_cls, scheduler_params = scheduler_options # Unpack tuple
@@ -918,7 +918,6 @@ def _get_default_ag_args_ensemble(cls) -> dict:
"""
return {}


def _get_default_stopping_metric(self):
"""
Returns the default stopping metric to use for early stopping.
@@ -932,6 +931,46 @@ def _get_default_stopping_metric(self):
        stopping_metric = metrics.get_metric(stopping_metric, self.problem_type, 'stopping_metric')
        return stopping_metric

    def _get_params(self) -> dict:
        """Gets all params."""
        return self.params.copy()

    def _get_ag_params(self) -> dict:
        """Gets params that are not passed to the inner model, but are used by the wrapper."""
        ag_param_names = self._ag_params()
        if ag_param_names:
            return {key: val for key, val in self.params.items() if key in ag_param_names}
        else:
            return dict()

    def _get_model_params(self) -> dict:
        """Gets params that are passed to the inner model."""
        ag_param_names = self._ag_params()
        if ag_param_names:
            return {key: val for key, val in self.params.items() if key not in ag_param_names}
        else:
            return self._get_params()

    # TODO: Add documentation for valid args for each model. Currently only `ag.es`
    def _ag_params(self) -> set:
        """
        Returns the set of params that are not passed to self.model but are instead used by the wrapper.
        For developers, this is purely optional and is simply a convenience for logically distinguishing model-specific parameters from added AutoGluon functionality.
        The goal is to have common parameter names for useful functionality shared between models,
        even if the functionality is not natively available as a parameter in the model itself, or is exposed under a different name.
        Below are common patterns / options to make available. Their actual usage and options in a particular model should be documented in the model itself, as it has the flexibility to differ.

        Possible params:

        ag.es : int, str, or tuple
            Generic name for early stopping logic. Typically an int (number of rounds) or a str preset/strategy.
            It is also possible to pass a tuple of (class, kwargs) to construct a custom early stopping object.
            Refer to `autogluon.core.utils.early_stopping` for examples.
        """
        return set()

    def _get_tags(self):
        collected_tags = {}
        for base_class in reversed(inspect.getmro(self.__class__)):
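
As a hypothetical usage sketch (not part of this commit): `_get_ag_params` simply filters `self.params` by the names returned from `_ag_params`, so an `ag.es` value would be supplied alongside a model's regular hyperparameters. The predictor-level plumbing shown below is an assumption for illustration only.

import pandas as pd
from autogluon.tabular import TabularPredictor

# Assumed wiring: entries in the per-model hyperparameters dict become self.params of the model,
# so 'ag.es' is consumed by the wrapper via _get_ag_params() while the rest reach the inner model.
train_data = pd.DataFrame({'feature': range(100), 'class': [0, 1] * 50})  # toy data for illustration
predictor = TabularPredictor(label='class').fit(
    train_data,
    hyperparameters={
        'GBM': {'ag.es': 50},      # fixed number of early stopping rounds
        'CAT': {'ag.es': 'auto'},  # str preset resolved via get_early_stopping_rounds
    },
)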
49 changes: 26 additions & 23 deletions tabular/src/autogluon/tabular/models/catboost/catboost_model.py
@@ -7,15 +7,16 @@
import psutil
import numpy as np

from autogluon.core.utils.exceptions import NotEnoughMemoryError, TimeLimitExceeded
from autogluon.core.utils import try_import_catboost, try_import_catboostdev
from autogluon.core.constants import PROBLEM_TYPES_CLASSIFICATION, MULTICLASS, SOFTCLASS
from autogluon.core.features.types import R_OBJECT
from autogluon.core.models import AbstractModel
from autogluon.core.models._utils import get_early_stopping_rounds
from autogluon.core.utils.exceptions import NotEnoughMemoryError, TimeLimitExceeded
from autogluon.core.utils import try_import_catboost, try_import_catboostdev

from .catboost_utils import construct_custom_catboost_metric
from .hyperparameters.parameters import get_param_baseline
from .hyperparameters.searchspaces import get_default_searchspace
from autogluon.core.models import AbstractModel

logger = logging.getLogger(__name__)

@@ -72,7 +73,8 @@ def _fit(self,
**kwargs):
try_import_catboost()
from catboost import CatBoostClassifier, CatBoostRegressor, Pool
params = self.params.copy()
ag_params = self._get_ag_params()
params = self._get_model_params()
if self.problem_type == SOFTCLASS:
try_import_catboostdev() # Need to first import catboost then catboost_dev not vice-versa.
from catboost_dev import CatBoostClassifier, CatBoostRegressor, Pool
@@ -114,31 +116,29 @@ def _fit(self,
cat_features = list(X.select_dtypes(include='category').columns)
X = Pool(data=X, label=y, cat_features=cat_features, weight=sample_weight)

if X_val is not None:
if X_val is None:
eval_set = None
num_sample_iter_max = 50
early_stopping_rounds = None
else:
X_val = self.preprocess(X_val)
X_val = Pool(data=X_val, label=y_val, cat_features=cat_features, weight=sample_weight_val)
eval_set = X_val
if num_rows_train <= 10000:
modifier = 1
else:
modifier = 10000/num_rows_train
early_stopping_rounds = max(round(modifier*150), 10)
num_sample_iter_max = max(round(modifier*50), 2)
else:
eval_set = None
early_stopping_rounds = None
num_sample_iter_max = 50

train_dir = None
if 'allow_writing_files' in self.params and self.params['allow_writing_files']:
if 'train_dir' not in self.params:
modifier = min(1.0, 10000 / num_rows_train)
num_sample_iter_max = max(round(modifier * 50), 2)
early_stopping_rounds = ag_params.get('ag.es', 'auto')
if isinstance(early_stopping_rounds, str):
early_stopping_rounds = self._get_early_stopping_rounds(num_rows_train=num_rows_train, strategy=early_stopping_rounds)

if params.get('allow_writing_files', False):
if 'train_dir' not in params:
try:
# TODO: What if path is in S3?
os.makedirs(os.path.dirname(self.path), exist_ok=True)
except:
pass
else:
train_dir = self.path + 'catboost_info'
params['train_dir'] = self.path + 'catboost_info'

# TODO: Add more control over these params (specifically early_stopping_rounds)
verbosity = kwargs.get('verbosity', 2)
@@ -187,9 +187,6 @@ def _fit(self,

logger.log(15, f'\tCatboost model hyperparameters: {params}')

if train_dir is not None:
params['train_dir'] = train_dir

if time_limit:
time_left_start = time_limit - (time.time() - start_time)
if time_left_start <= time_limit * 0.4: # if 60% of time was spent preprocessing, likely not enough time to train model
@@ -326,3 +323,9 @@ def _get_best_val_score(model, metric_name):
metric_name_sub = metric_name.split(':')[0]
best_score = model_best_scores[metric_name_sub]
return best_score

    def _get_early_stopping_rounds(self, num_rows_train, strategy='auto'):
        return get_early_stopping_rounds(num_rows_train=num_rows_train, strategy=strategy)

    def _ag_params(self) -> set:
        return {'ag.es'}
@@ -170,7 +170,7 @@ def _fit(self,
if sample_weight is not None: # TODO: support
logger.log(15, "sample_weight not yet supported for NNFastAiTabularModel, this model will ignore them in training.")

params = self.params.copy()
params = self._get_model_params()

self.y_scaler = params.get('y_scaler', None)
if self.y_scaler is not None:
@@ -72,7 +72,7 @@ def _fit(self,
try_import_fasttext()
import fasttext

params = self.params.copy()
params = self._get_model_params()
quantize_model = params.pop('quantize_model', True)

verbosity = kwargs.get('verbosity', 2)
4 changes: 2 additions & 2 deletions tabular/src/autogluon/tabular/models/knn/knn_model.py
@@ -84,7 +84,7 @@ def _fit(self,
num_rows_max = len(X)
# FIXME: v0.1 Must store final num rows for refit_full or else will use everything! Worst case refit_full could train far longer than the original model.
if time_limit is None or num_rows_max <= 10000:
self.model = self._get_model_type()(**self.params).fit(X, y)
self.model = self._get_model_type()(**self._get_model_params()).fit(X, y)
else:
self.model = self._fit_with_samples(X=X, y=y, time_limit=time_limit - (time.time() - time_start))

@@ -216,7 +216,7 @@ def sample_func(chunk, frac):
X_samp = X
y_samp = y
idx = None
self.model = model_type(**self.params).fit(X_samp, y_samp)
self.model = model_type(**self._get_model_params()).fit(X_samp, y_samp)
time_limit_left_prior = time_limit_left
time_fit_end_sample = time.time()
time_limit_left = time_limit - (time_fit_end_sample - time_start)
34 changes: 21 additions & 13 deletions tabular/src/autogluon/tabular/models/lgb/lgb_model.py
@@ -14,6 +14,7 @@
from autogluon.core.constants import BINARY, MULTICLASS, REGRESSION, SOFTCLASS
from autogluon.core.features.types import R_OBJECT
from autogluon.core.models import AbstractModel
from autogluon.core.models._utils import get_early_stopping_rounds
from autogluon.core.utils import try_import_lightgbm
from autogluon.core.utils.savers import save_pkl

@@ -73,9 +74,8 @@ def _fit(self,
verbosity=2,
**kwargs):
start_time = time.time()
params = self.params.copy()

# TODO: kwargs can have num_cpu, num_gpu. Currently these are ignored.
ag_params = self._get_ag_params()
params = self._get_model_params()
params = fixedvals_from_searchspaces(params)

if verbosity <= 1:
@@ -88,8 +88,11 @@
verbose_eval = 1

stopping_metric, stopping_metric_name = self._get_stopping_metric_internal()
dataset_train, dataset_val = self.generate_datasets(X=X, y=y, params=params, X_val=X_val, y_val=y_val,
sample_weight=sample_weight, sample_weight_val=sample_weight_val, dataset_train=dataset_train, dataset_val=dataset_val)
dataset_train, dataset_val = self.generate_datasets(
X=X, y=y, params=params, X_val=X_val, y_val=y_val,
sample_weight=sample_weight, sample_weight_val=sample_weight_val,
dataset_train=dataset_train, dataset_val=dataset_val
)
gc.collect()

num_boost_round = params.pop('num_boost_round', 1000)
@@ -110,17 +113,16 @@
if params['min_data_in_leaf'] > num_rows_train: # TODO: may not be necessary
params['min_data_in_leaf'] = max(1, int(num_rows_train / 5.0))

# TODO: Better solution: Track trend to early stop when score is far worse than best score, or score is trending worse over time
if (dataset_val is not None) and (dataset_train is not None):
modifier = 1 if num_rows_train <= 10000 else 10000 / num_rows_train
early_stopping_rounds = max(round(modifier * 150), 10)
else:
early_stopping_rounds = 150

callbacks = []
valid_names = ['train_set']
valid_sets = [dataset_train]
if dataset_val is not None:
# TODO: Better solution: Track trend to early stop when score is far worse than best score, or score is trending worse over time
early_stopping_rounds = ag_params.get('ag.es', 'auto')
if isinstance(early_stopping_rounds, str):
early_stopping_rounds = self._get_early_stopping_rounds(num_rows_train=num_rows_train, strategy=early_stopping_rounds)
if early_stopping_rounds is None:
early_stopping_rounds = 999999
reporter = kwargs.get('reporter', None)
train_loss_name = self._get_train_loss_name() if reporter is not None else None
if train_loss_name is not None:
@@ -308,7 +310,7 @@ def _hyperparameter_tune(self, X, y, X_val, y_val, scheduler_options, **kwargs):
time_start = time.time()
logger.log(15, "Beginning hyperparameter tuning for Gradient Boosting Model...")
self._set_default_searchspace()
params_copy = self.params.copy()
params_copy = self._get_params()
if isinstance(params_copy['min_data_in_leaf'], Int):
upper_minleaf = params_copy['min_data_in_leaf'].upper
if upper_minleaf > X.shape[0]: # TODO: this min_data_in_leaf adjustment based on sample size may not be necessary
@@ -389,6 +391,9 @@ def _get_train_loss_name(self):
raise ValueError(f"unknown problem_type for LGBModel: {self.problem_type}")
return train_loss_name

    def _get_early_stopping_rounds(self, num_rows_train, strategy='auto'):
        return get_early_stopping_rounds(num_rows_train=num_rows_train, strategy=strategy)

def get_model_feature_importance(self, use_original_feature_names=False):
feature_names = self.model.feature_name()
importances = self.model.feature_importance()
@@ -405,3 +410,6 @@ def _get_default_auxiliary_params(self) -> dict:
)
default_auxiliary_params.update(extra_auxiliary_params)
return default_auxiliary_params

    def _ag_params(self) -> set:
        return {'ag.es'}
12 changes: 6 additions & 6 deletions tabular/src/autogluon/tabular/models/rf/rf_model.py
@@ -75,8 +75,8 @@ def _fit(self,
**kwargs):
time_start = time.time()
max_memory_usage_ratio = self.params_aux['max_memory_usage_ratio']
hyperparams = self.params.copy()
n_estimators_final = hyperparams['n_estimators']
params = self._get_model_params()
n_estimators_final = params['n_estimators']

n_estimators_minimum = min(40, n_estimators_final)
n_estimators_test = min(4, max(1, math.floor(n_estimators_minimum/5)))
@@ -103,15 +103,15 @@
if n_estimators_final > n_estimators_test * 2:
if self.problem_type == MULTICLASS:
n_estimator_increments = [n_estimators_test, n_estimators_final]
hyperparams['warm_start'] = True
params['warm_start'] = True
else:
if expected_memory_usage > (0.05 * max_memory_usage_ratio): # Somewhat arbitrary, consider finding a better value, should it scale by cores?
# Causes ~10% training slowdown, so try to avoid if memory is not an issue
n_estimator_increments = [n_estimators_test, n_estimators_final]
hyperparams['warm_start'] = True
params['warm_start'] = True

hyperparams['n_estimators'] = n_estimator_increments[0]
self.model = self._get_model_type()(**hyperparams)
params['n_estimators'] = n_estimator_increments[0]
self.model = self._get_model_type()(**params)

time_train_start = time.time()
for i, n_estimators in enumerate(n_estimator_increments):
2 changes: 1 addition & 1 deletion tabular/src/autogluon/tabular/models/rf/rf_rapids_model.py
@@ -45,6 +45,6 @@ def _fit(self, X, y, **kwargs):
'Consider using CPU instead if model quality is not sufficient.\n'
'\t\tLink to issue: https://github.com/rapidsai/cuml/issues/2518')
X = self.preprocess(X)
self.model = self._get_model_type()(**self.params)
self.model = self._get_model_type()(**self._get_model_params())
self.model = self.model.fit(X, y)
self.params_trained['n_estimators'] = self.model.n_estimators
@@ -180,7 +180,7 @@ def _fit(self,
if sample_weight is not None: # TODO: support
logger.log(15, "sample_weight not yet supported for TabularNeuralNetModel, this model will ignore them in training.")

params = self.params.copy()
params = self._get_model_params()
params = fixedvals_from_searchspaces(params)
if self.feature_metadata is None:
raise ValueError("Trainer class must set feature_metadata for this model")
@@ -723,7 +723,7 @@ def _hyperparameter_tune(self, X, y, X_val, y_val, scheduler_options, **kwargs):
raise ValueError("scheduler_cls and scheduler_params cannot be None for hyperparameter tuning")
num_cpus = scheduler_params['resource']['num_cpus']

params_copy = self.params.copy()
params_copy = self._get_params()

self.num_dataloading_workers = max(1, int(num_cpus/2.0))
self.batch_size = params_copy['batch_size']
3 changes: 3 additions & 0 deletions tabular/src/autogluon/tabular/models/xgboost/callbacks.py
@@ -10,6 +10,9 @@
class EarlyStoppingCustom(EarlyStopping):
    """Augments early stopping in XGBoost to also consider time_limit and memory usage"""
    def __init__(self, rounds, time_limit=None, start_time=None, verbose=False, **kwargs):
        if rounds is None:
            # Disable early stopping via rounds
            rounds = 999999
        super().__init__(rounds=rounds, **kwargs)
        self.time_limit = time_limit
        self.start_time = start_time
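
A minimal illustrative sketch (not part of the commit) of the new behavior: passing rounds=None now effectively disables round-based stopping while the time-limit (and memory) checks in the custom callback still apply. Assumptions: the callback is registered through xgb.train's `callbacks` argument just like xgboost's built-in EarlyStopping, and the dataset below is synthetic.

import time
import numpy as np
import xgboost as xgb
from autogluon.tabular.models.xgboost.callbacks import EarlyStoppingCustom

X = np.random.rand(1000, 10)
y = np.random.randint(0, 2, size=1000)
dtrain = xgb.DMatrix(X[:800], label=y[:800])
dval = xgb.DMatrix(X[800:], label=y[800:])

# rounds=None -> internally becomes 999999, so only time/memory checks can stop training early
es = EarlyStoppingCustom(rounds=None, time_limit=3600, start_time=time.time())
booster = xgb.train(
    {'objective': 'binary:logistic'},
    dtrain,
    num_boost_round=1000,
    evals=[(dval, 'val')],
    callbacks=[es],
)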
@@ -23,7 +23,7 @@ def get_base_searchspace():
'booster': 'gbtree',
'n_jobs': os.cpu_count(),  # TODO: xgboost plans to accept -1 for compatibility with other packages; resolve this once it does.
'learning_rate': Real(lower=5e-3, upper=0.2, default=0.1, log=True),
'max_depth': Int(lower=3, upper=10, default=3),
'max_depth': Int(lower=3, upper=10, default=6),
'min_child_weight': Int(lower=1, upper=5, default=1),
'gamma': Real(lower=0, upper=5, default=0.01),
'subsample': Real(lower=0.5, upper=1.0, default=1.0),