
Commit

Tabular: Added support for specifying early stopping rounds in GBM, CAT, XGB (open-mmlab#1037)

* Tabular: Added support for specifying early stopping rounds in GBM, CAT, XGB

* addressed comment
Innixma committed Mar 31, 2021
1 parent d738ac2 commit e93e409
Showing 13 changed files with 135 additions and 60 deletions.
14 changes: 14 additions & 0 deletions core/src/autogluon/core/models/_utils.py
@@ -0,0 +1,14 @@

# TODO: Add more strategies
# - Adaptive early stopping: adjust rounds during model training
def get_early_stopping_rounds(num_rows_train, strategy='auto', min_rounds=10, max_rounds=150, min_rows=10000):
    """Gets early stopping rounds"""
    if strategy == 'auto':
        modifier = 1 if num_rows_train <= min_rows else min_rows / num_rows_train
        early_stopping_rounds = max(
            round(modifier * max_rounds),
            min_rounds,
        )
    else:
        raise AssertionError(f'unknown early stopping strategy: {strategy}')
    return early_stopping_rounds
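
For intuition, a small usage sketch (not part of the commit) of how the default 'auto' strategy scales the rounds with training set size, using the defaults min_rounds=10, max_rounds=150, min_rows=10000:

from autogluon.core.models._utils import get_early_stopping_rounds

# Small datasets keep the full 150 rounds; larger datasets get proportionally fewer
# rounds (modifier = min_rows / num_rows_train), floored at min_rounds=10.
for n in [1_000, 10_000, 50_000, 1_000_000]:
    print(n, get_early_stopping_rounds(num_rows_train=n))
# -> 1000: 150, 10000: 150, 50000: 30, 1000000: 10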
43 changes: 41 additions & 2 deletions core/src/autogluon/core/models/abstract/abstract_model.py
@@ -705,7 +705,7 @@ def _hyperparameter_tune(self, X, y, X_val, y_val, scheduler_options, **kwargs):
time_start = time.time()
logger.log(15, "Starting generic AbstractModel hyperparameter tuning for %s model..." % self.name)
self._set_default_searchspace()
params_copy = self.params.copy()
params_copy = self._get_params()
directory = self.path # also create model directory if it doesn't exist
# TODO: This will break on S3. Use tabular/utils/savers for datasets, add new function
scheduler_cls, scheduler_params = scheduler_options # Unpack tuple
@@ -918,7 +918,6 @@ def _get_default_ag_args_ensemble(cls) -> dict:
"""
return {}


def _get_default_stopping_metric(self):
"""
Returns the default stopping metric to use for early stopping.
@@ -932,6 +931,46 @@ def _get_default_stopping_metric(self):
        stopping_metric = metrics.get_metric(stopping_metric, self.problem_type, 'stopping_metric')
        return stopping_metric

    def _get_params(self) -> dict:
        """Gets all params."""
        return self.params.copy()

    def _get_ag_params(self) -> dict:
        """Gets params that are not passed to the inner model, but are used by the wrapper."""
        ag_param_names = self._ag_params()
        if ag_param_names:
            return {key: val for key, val in self.params.items() if key in ag_param_names}
        else:
            return dict()

    def _get_model_params(self) -> dict:
        """Gets params that are passed to the inner model."""
        ag_param_names = self._ag_params()
        if ag_param_names:
            return {key: val for key, val in self.params.items() if key not in ag_param_names}
        else:
            return self._get_params()

    # TODO: Add documentation for valid args for each model. Currently only `ag.es`
    def _ag_params(self) -> set:
        """
        Returns the set of params that are not passed to self.model but are instead used by the wrapper.
        For developers, this is purely optional and is simply a convenience for logically distinguishing model-specific parameters from added AutoGluon functionality.
        The goal is to have common parameter names for useful functionality shared between models,
        even if the functionality is not natively available as a parameter in the model itself, or is exposed under a different name.
        Below are common patterns / options to make available. Their actual usage and options in a particular model should be documented in the model itself, as it has the flexibility to differ.

        Possible params:

        ag.es : int, str, or tuple
            Generic name for early stopping logic. Typically an int (number of rounds) or a str preset/strategy.
            It is also possible to pass a tuple of (class, kwargs) to construct a custom early stopping object.
            Refer to `autogluon.core.utils.early_stopping` for examples.
        """
        return set()

    def _get_tags(self):
        collected_tags = {}
        for base_class in reversed(inspect.getmro(self.__class__)):
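
As a hypothetical usage sketch (not part of this commit): `_get_ag_params` simply filters `self.params` by the names returned from `_ag_params`, so an `ag.es` value would be supplied alongside a model's regular hyperparameters. The predictor-level plumbing shown below is an assumption for illustration only.

import pandas as pd
from autogluon.tabular import TabularPredictor

# Assumed wiring: entries in the per-model hyperparameters dict become self.params of the model,
# so 'ag.es' is consumed by the wrapper via _get_ag_params() while the rest reach the inner model.
train_data = pd.DataFrame({'feature': range(100), 'class': [0, 1] * 50})  # toy data for illustration
predictor = TabularPredictor(label='class').fit(
    train_data,
    hyperparameters={
        'GBM': {'ag.es': 50},      # fixed number of early stopping rounds
        'CAT': {'ag.es': 'auto'},  # str preset resolved via get_early_stopping_rounds
    },
)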
49 changes: 26 additions & 23 deletions tabular/src/autogluon/tabular/models/catboost/catboost_model.py
@@ -7,15 +7,16 @@
import psutil
import numpy as np

from autogluon.core.utils.exceptions import NotEnoughMemoryError, TimeLimitExceeded
from autogluon.core.utils import try_import_catboost, try_import_catboostdev
from autogluon.core.constants import PROBLEM_TYPES_CLASSIFICATION, MULTICLASS, SOFTCLASS
from autogluon.core.features.types import R_OBJECT
from autogluon.core.models import AbstractModel
from autogluon.core.models._utils import get_early_stopping_rounds
from autogluon.core.utils.exceptions import NotEnoughMemoryError, TimeLimitExceeded
from autogluon.core.utils import try_import_catboost, try_import_catboostdev

from .catboost_utils import construct_custom_catboost_metric
from .hyperparameters.parameters import get_param_baseline
from .hyperparameters.searchspaces import get_default_searchspace
from autogluon.core.models import AbstractModel

logger = logging.getLogger(__name__)

@@ -72,7 +73,8 @@ def _fit(self,
**kwargs):
try_import_catboost()
from catboost import CatBoostClassifier, CatBoostRegressor, Pool
params = self.params.copy()
ag_params = self._get_ag_params()
params = self._get_model_params()
if self.problem_type == SOFTCLASS:
try_import_catboostdev() # Need to first import catboost then catboost_dev not vice-versa.
from catboost_dev import CatBoostClassifier, CatBoostRegressor, Pool
@@ -114,31 +116,29 @@ def _fit(self,
cat_features = list(X.select_dtypes(include='category').columns)
X = Pool(data=X, label=y, cat_features=cat_features, weight=sample_weight)

if X_val is not None:
if X_val is None:
eval_set = None
num_sample_iter_max = 50
early_stopping_rounds = None
else:
X_val = self.preprocess(X_val)
X_val = Pool(data=X_val, label=y_val, cat_features=cat_features, weight=sample_weight_val)
eval_set = X_val
if num_rows_train <= 10000:
modifier = 1
else:
modifier = 10000/num_rows_train
early_stopping_rounds = max(round(modifier*150), 10)
num_sample_iter_max = max(round(modifier*50), 2)
else:
eval_set = None
early_stopping_rounds = None
num_sample_iter_max = 50

train_dir = None
if 'allow_writing_files' in self.params and self.params['allow_writing_files']:
if 'train_dir' not in self.params:
modifier = min(1.0, 10000 / num_rows_train)
num_sample_iter_max = max(round(modifier * 50), 2)
early_stopping_rounds = ag_params.get('ag.es', 'auto')
if isinstance(early_stopping_rounds, str):
early_stopping_rounds = self._get_early_stopping_rounds(num_rows_train=num_rows_train, strategy=early_stopping_rounds)

if params.get('allow_writing_files', False):
if 'train_dir' not in params:
try:
# TODO: What if path is in S3?
os.makedirs(os.path.dirname(self.path), exist_ok=True)
except:
pass
else:
train_dir = self.path + 'catboost_info'
params['train_dir'] = self.path + 'catboost_info'

# TODO: Add more control over these params (specifically early_stopping_rounds)
verbosity = kwargs.get('verbosity', 2)
@@ -187,9 +187,6 @@ def _fit(self,

logger.log(15, f'\tCatboost model hyperparameters: {params}')

if train_dir is not None:
params['train_dir'] = train_dir

if time_limit:
time_left_start = time_limit - (time.time() - start_time)
if time_left_start <= time_limit * 0.4: # if 60% of time was spent preprocessing, likely not enough time to train model
@@ -326,3 +323,9 @@ def _get_best_val_score(model, metric_name):
metric_name_sub = metric_name.split(':')[0]
best_score = model_best_scores[metric_name_sub]
return best_score

    def _get_early_stopping_rounds(self, num_rows_train, strategy='auto'):
        return get_early_stopping_rounds(num_rows_train=num_rows_train, strategy=strategy)

    def _ag_params(self) -> set:
        return {'ag.es'}
@@ -170,7 +170,7 @@ def _fit(self,
if sample_weight is not None: # TODO: support
logger.log(15, "sample_weight not yet supported for NNFastAiTabularModel, this model will ignore them in training.")

params = self.params.copy()
params = self._get_model_params()

self.y_scaler = params.get('y_scaler', None)
if self.y_scaler is not None:
@@ -72,7 +72,7 @@ def _fit(self,
try_import_fasttext()
import fasttext

params = self.params.copy()
params = self._get_model_params()
quantize_model = params.pop('quantize_model', True)

verbosity = kwargs.get('verbosity', 2)
4 changes: 2 additions & 2 deletions tabular/src/autogluon/tabular/models/knn/knn_model.py
@@ -84,7 +84,7 @@ def _fit(self,
num_rows_max = len(X)
# FIXME: v0.1 Must store final num rows for refit_full or else will use everything! Worst case refit_full could train far longer than the original model.
if time_limit is None or num_rows_max <= 10000:
self.model = self._get_model_type()(**self.params).fit(X, y)
self.model = self._get_model_type()(**self._get_model_params()).fit(X, y)
else:
self.model = self._fit_with_samples(X=X, y=y, time_limit=time_limit - (time.time() - time_start))

@@ -216,7 +216,7 @@ def sample_func(chunk, frac):
X_samp = X
y_samp = y
idx = None
self.model = model_type(**self.params).fit(X_samp, y_samp)
self.model = model_type(**self._get_model_params()).fit(X_samp, y_samp)
time_limit_left_prior = time_limit_left
time_fit_end_sample = time.time()
time_limit_left = time_limit - (time_fit_end_sample - time_start)
34 changes: 21 additions & 13 deletions tabular/src/autogluon/tabular/models/lgb/lgb_model.py
@@ -14,6 +14,7 @@
from autogluon.core.constants import BINARY, MULTICLASS, REGRESSION, SOFTCLASS
from autogluon.core.features.types import R_OBJECT
from autogluon.core.models import AbstractModel
from autogluon.core.models._utils import get_early_stopping_rounds
from autogluon.core.utils import try_import_lightgbm
from autogluon.core.utils.savers import save_pkl

@@ -73,9 +74,8 @@ def _fit(self,
verbosity=2,
**kwargs):
start_time = time.time()
params = self.params.copy()

# TODO: kwargs can have num_cpu, num_gpu. Currently these are ignored.
ag_params = self._get_ag_params()
params = self._get_model_params()
params = fixedvals_from_searchspaces(params)

if verbosity <= 1:
@@ -88,8 +88,11 @@
verbose_eval = 1

stopping_metric, stopping_metric_name = self._get_stopping_metric_internal()
dataset_train, dataset_val = self.generate_datasets(X=X, y=y, params=params, X_val=X_val, y_val=y_val,
sample_weight=sample_weight, sample_weight_val=sample_weight_val, dataset_train=dataset_train, dataset_val=dataset_val)
dataset_train, dataset_val = self.generate_datasets(
X=X, y=y, params=params, X_val=X_val, y_val=y_val,
sample_weight=sample_weight, sample_weight_val=sample_weight_val,
dataset_train=dataset_train, dataset_val=dataset_val
)
gc.collect()

num_boost_round = params.pop('num_boost_round', 1000)
@@ -110,17 +113,16 @@
if params['min_data_in_leaf'] > num_rows_train: # TODO: may not be necessary
params['min_data_in_leaf'] = max(1, int(num_rows_train / 5.0))

# TODO: Better solution: Track trend to early stop when score is far worse than best score, or score is trending worse over time
if (dataset_val is not None) and (dataset_train is not None):
modifier = 1 if num_rows_train <= 10000 else 10000 / num_rows_train
early_stopping_rounds = max(round(modifier * 150), 10)
else:
early_stopping_rounds = 150

callbacks = []
valid_names = ['train_set']
valid_sets = [dataset_train]
if dataset_val is not None:
# TODO: Better solution: Track trend to early stop when score is far worse than best score, or score is trending worse over time
early_stopping_rounds = ag_params.get('ag.es', 'auto')
if isinstance(early_stopping_rounds, str):
early_stopping_rounds = self._get_early_stopping_rounds(num_rows_train=num_rows_train, strategy=early_stopping_rounds)
if early_stopping_rounds is None:
early_stopping_rounds = 999999
reporter = kwargs.get('reporter', None)
train_loss_name = self._get_train_loss_name() if reporter is not None else None
if train_loss_name is not None:
@@ -308,7 +310,7 @@ def _hyperparameter_tune(self, X, y, X_val, y_val, scheduler_options, **kwargs):
time_start = time.time()
logger.log(15, "Beginning hyperparameter tuning for Gradient Boosting Model...")
self._set_default_searchspace()
params_copy = self.params.copy()
params_copy = self._get_params()
if isinstance(params_copy['min_data_in_leaf'], Int):
upper_minleaf = params_copy['min_data_in_leaf'].upper
if upper_minleaf > X.shape[0]: # TODO: this min_data_in_leaf adjustment based on sample size may not be necessary
@@ -389,6 +391,9 @@ def _get_train_loss_name(self):
raise ValueError(f"unknown problem_type for LGBModel: {self.problem_type}")
return train_loss_name

    def _get_early_stopping_rounds(self, num_rows_train, strategy='auto'):
        return get_early_stopping_rounds(num_rows_train=num_rows_train, strategy=strategy)

def get_model_feature_importance(self, use_original_feature_names=False):
feature_names = self.model.feature_name()
importances = self.model.feature_importance()
@@ -405,3 +410,6 @@ def _get_default_auxiliary_params(self) -> dict:
)
default_auxiliary_params.update(extra_auxiliary_params)
return default_auxiliary_params

    def _ag_params(self) -> set:
        return {'ag.es'}
12 changes: 6 additions & 6 deletions tabular/src/autogluon/tabular/models/rf/rf_model.py
@@ -75,8 +75,8 @@ def _fit(self,
**kwargs):
time_start = time.time()
max_memory_usage_ratio = self.params_aux['max_memory_usage_ratio']
hyperparams = self.params.copy()
n_estimators_final = hyperparams['n_estimators']
params = self._get_model_params()
n_estimators_final = params['n_estimators']

n_estimators_minimum = min(40, n_estimators_final)
n_estimators_test = min(4, max(1, math.floor(n_estimators_minimum/5)))
@@ -103,15 +103,15 @@
if n_estimators_final > n_estimators_test * 2:
if self.problem_type == MULTICLASS:
n_estimator_increments = [n_estimators_test, n_estimators_final]
hyperparams['warm_start'] = True
params['warm_start'] = True
else:
if expected_memory_usage > (0.05 * max_memory_usage_ratio): # Somewhat arbitrary, consider finding a better value, should it scale by cores?
# Causes ~10% training slowdown, so try to avoid if memory is not an issue
n_estimator_increments = [n_estimators_test, n_estimators_final]
hyperparams['warm_start'] = True
params['warm_start'] = True

hyperparams['n_estimators'] = n_estimator_increments[0]
self.model = self._get_model_type()(**hyperparams)
params['n_estimators'] = n_estimator_increments[0]
self.model = self._get_model_type()(**params)

time_train_start = time.time()
for i, n_estimators in enumerate(n_estimator_increments):
2 changes: 1 addition & 1 deletion tabular/src/autogluon/tabular/models/rf/rf_rapids_model.py
@@ -45,6 +45,6 @@ def _fit(self, X, y, **kwargs):
'Consider using CPU instead if model quality is not sufficient.\n'
'\t\tLink to issue: https://github.com/rapidsai/cuml/issues/2518')
X = self.preprocess(X)
self.model = self._get_model_type()(**self.params)
self.model = self._get_model_type()(**self._get_model_params())
self.model = self.model.fit(X, y)
self.params_trained['n_estimators'] = self.model.n_estimators
@@ -180,7 +180,7 @@ def _fit(self,
if sample_weight is not None: # TODO: support
logger.log(15, "sample_weight not yet supported for TabularNeuralNetModel, this model will ignore them in training.")

params = self.params.copy()
params = self._get_model_params()
params = fixedvals_from_searchspaces(params)
if self.feature_metadata is None:
raise ValueError("Trainer class must set feature_metadata for this model")
@@ -723,7 +723,7 @@ def _hyperparameter_tune(self, X, y, X_val, y_val, scheduler_options, **kwargs):
raise ValueError("scheduler_cls and scheduler_params cannot be None for hyperparameter tuning")
num_cpus = scheduler_params['resource']['num_cpus']

params_copy = self.params.copy()
params_copy = self._get_params()

self.num_dataloading_workers = max(1, int(num_cpus/2.0))
self.batch_size = params_copy['batch_size']
3 changes: 3 additions & 0 deletions tabular/src/autogluon/tabular/models/xgboost/callbacks.py
@@ -10,6 +10,9 @@
class EarlyStoppingCustom(EarlyStopping):
    """Augments early stopping in XGBoost to also consider time_limit and memory usage"""
    def __init__(self, rounds, time_limit=None, start_time=None, verbose=False, **kwargs):
        if rounds is None:
            # Disable early stopping via rounds
            rounds = 999999
        super().__init__(rounds=rounds, **kwargs)
        self.time_limit = time_limit
        self.start_time = start_time
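
A minimal illustrative sketch (not part of the commit) of the new behavior: passing rounds=None now effectively disables round-based stopping while the time-limit (and memory) checks in the custom callback still apply. Assumptions: the callback is registered through xgb.train's `callbacks` argument just like xgboost's built-in EarlyStopping, and the dataset below is synthetic.

import time
import numpy as np
import xgboost as xgb
from autogluon.tabular.models.xgboost.callbacks import EarlyStoppingCustom

X = np.random.rand(1000, 10)
y = np.random.randint(0, 2, size=1000)
dtrain = xgb.DMatrix(X[:800], label=y[:800])
dval = xgb.DMatrix(X[800:], label=y[800:])

# rounds=None -> internally becomes 999999, so only time/memory checks can stop training early
es = EarlyStoppingCustom(rounds=None, time_limit=3600, start_time=time.time())
booster = xgb.train(
    {'objective': 'binary:logistic'},
    dtrain,
    num_boost_round=1000,
    evals=[(dval, 'val')],
    callbacks=[es],
)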
@@ -23,7 +23,7 @@ def get_base_searchspace():
'booster': 'gbtree',
'n_jobs': os.cpu_count(),  # TODO: xgboost plans to accept -1 for compatibility with other packages; resolve this once it does.
'learning_rate': Real(lower=5e-3, upper=0.2, default=0.1, log=True),
'max_depth': Int(lower=3, upper=10, default=3),
'max_depth': Int(lower=3, upper=10, default=6),
'min_child_weight': Int(lower=1, upper=5, default=1),
'gamma': Real(lower=0, upper=5, default=0.01),
'subsample': Real(lower=0.5, upper=1.0, default=1.0),