Vision: Added exception if invalid kwargs specified (open-mmlab#1251)
* Vision: Error if invalid kwargs specified

* Fixed vision unit tests

* Fixed vision docs

* Fix vision HPO
Innixma committed Jul 27, 2021
1 parent dab2511 commit a9bb0e1
Showing 9 changed files with 46 additions and 64 deletions.
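The net effect of the commit: `ImagePredictor.fit` and `ObjectDetector.fit` now raise a `KeyError` for unrecognized keyword arguments instead of silently ignoring them, and HPO options such as the searcher and trial count move into `hyperparameter_tune_kwargs`. A minimal sketch of the new behavior, inferred from the `_validate_kwargs` implementations in the diff below (the dataset is only illustrative):

```python
from autogluon.vision import ImageDataset, ImagePredictor

# Illustrative dataset; any valid training data behaves the same way.
train_data, _, _ = ImageDataset.from_folders(
    'https://autogluon.s3.amazonaws.com/datasets/shopee-iet.zip')

predictor = ImagePredictor()
try:
    # `search_strategy` is no longer accepted by fit(); _validate_kwargs now
    # rejects it instead of silently ignoring it.
    predictor.fit(train_data, search_strategy='bayesopt')
except KeyError as e:
    print(e)  # Invalid kwargs specified: ['search_strategy']. Valid kwargs names: [...]

# The supported spelling: searcher and trial budget live in hyperparameter_tune_kwargs.
predictor.fit(train_data, hyperparameters={'epochs': 1},
              hyperparameter_tune_kwargs={'searcher': 'bayesopt', 'num_trials': 2})
```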
7 changes: 6 additions & 1 deletion core/src/autogluon/core/scheduler/seq_scheduler.py
@@ -151,7 +151,12 @@ def run(self, **kwargs):
         r = range(self.num_trials)
         for i in (tqdm(r) if self.num_trials < 1000 else r):
             trial_start_time = time.time()
-            is_failed, result = self.run_trial(task_id=i)
+            try:
+                is_failed, result = self.run_trial(task_id=i)
+            except Exception:
+                # TODO: Add special exception type when there are no more new configurations to try (exhausted search space)
+                logger.log(30, f'\tWARNING: Encountered unexpected exception during trial {i}, stopping HPO early.')
+                break
             trial_end_time = time.time()
             trial_run_times.append(np.NaN if is_failed else (trial_end_time - trial_start_time))

6 changes: 3 additions & 3 deletions core/src/autogluon/core/task/base/base_task.py
@@ -53,8 +53,7 @@ def run_fit(cls, train_fn, search_strategy, scheduler_options,
                 plot_results=False):
         start_time = time.time()
         # create scheduler and schedule tasks
-        scheduler = create_scheduler(
-            train_fn, search_strategy, scheduler_options)
+        scheduler = create_scheduler(train_fn, search_strategy, scheduler_options)
         scheduler.run()
         scheduler.join_jobs()
         # gather the best configuration
@@ -64,7 +63,8 @@ def run_fit(cls, train_fn, search_strategy, scheduler_options,
         args.final_fit = True
         if hasattr(args, 'epochs') and hasattr(args, 'final_fit_epochs'):
             args.epochs = args.final_fit_epochs
-        results = scheduler.run_with_config(best_config)
+        scheduler_final = create_scheduler(train_fn, search_strategy, scheduler_options)
+        results = scheduler_final.run_with_config(best_config)
         total_time = time.time() - start_time
         if plot_results or in_ipynb():
             plot_training_curves = scheduler_options['checkpoint'].replace('exp1.ag', 'plot_training_curves.png')
40 changes: 2 additions & 38 deletions docs/tutorials/image_prediction/hpo.md
@@ -75,8 +75,8 @@ parallel evaluations.
 ```{.python .input}
 hyperparameters={'model': model, 'batch_size': batch_size, 'lr': lr, 'epochs': 2}
 predictor = ImagePredictor()
-predictor.fit(train_data, search_strategy='bayesopt', time_limit=60*10, hyperparameters=hyperparameters,
-              hyperparameter_tune_kwargs={'num_trials': 2})
+predictor.fit(train_data, time_limit=60*10, hyperparameters=hyperparameters,
+              hyperparameter_tune_kwargs={'searcher': 'bayesopt', 'num_trials': 2})
 print('Top-1 val acc: %.3f' % predictor.fit_summary()['valid_acc'])
 ```
 
@@ -90,39 +90,3 @@ print('Test acc on hold-out data:', top1)
 
 Note that `num_trials=2` above is only used to speed up the tutorial. In normal
 practice, it is common to only use `time_limit` and drop `num_trials`.
-
-### Hyperband Early Stopping
-
-AutoGluon currently supports scheduling trials in serial order and with early
-stopping (e.g., if the performance of the model early within training already
-looks bad, the trial may be terminated early to free up resources).
-Here is an example of using an early stopping scheduler
-:class:`autogluon.core.scheduler.HyperbandScheduler`. `scheduler_options` is used
-to configure the scheduler. In this example, we run Hyperband with a single
-bracket, and stop/go decisions are made after 1 and 2 epochs (`grace_period`,
-`grace_period * reduction_factor`):
-
-```{.python .input}
-hyperparameters.update({
-    'search_strategy': 'hyperband',
-    'grace_period': 1
-})
-```
-
-The `fit`, `evaluate` and `predict` processes are exactly the same, so we will skip training to save some time.
-
-### Bayesian Optimization and Hyperband ###
-
-While Hyperband scheduling is normally driven by a random searcher, AutoGluon
-also provides Hyperband together with Bayesian optimization. The tuning of expensive
-DL models typically works best with this combination.
-
-```{.python .input}
-hyperparameters.update({
-    'search_strategy': 'bayesopt_hyperband',
-    'grace_period': 1
-})
-```
-
-For a comparison of different search algorithms and scheduling strategies, see :ref:`course_alg`.
-For more options using `fit`, see :class:`autogluon.vision.ImagePredictor`.
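As an aside on the retained advice above (prefer `time_limit` and drop `num_trials`), such a call might look like the following sketch, reusing the tutorial's `train_data` and `hyperparameters`; `num_trials` is simply omitted from `hyperparameter_tune_kwargs`:

```python
# Sketch: without `num_trials`, the wall-clock budget alone bounds the search,
# so HPO keeps proposing configurations until `time_limit` is exhausted.
predictor = ImagePredictor()
predictor.fit(train_data, time_limit=60*10, hyperparameters=hyperparameters,
              hyperparameter_tune_kwargs={'searcher': 'bayesopt'})
```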
10 changes: 4 additions & 6 deletions docs/tutorials/object_detection/beginner.md
@@ -7,10 +7,9 @@ Object detection is the process of identifying and localizing objects in an imag
 
 Our goal is to detect motorbike in images by [YOLOv3 model](https://pjreddie.com/media/files/papers/YOLOv3.pdf). A tiny dataset is collected from VOC dataset, which only contains the motorbike category. The model pretrained on the COCO dataset is used to fine-tune our small dataset. With the help of AutoGluon, we are able to try many models with different hyperparameters automatically, and return the best one as our final model.
 
-To start, import autogluon.vision and ObjectDetector:
+To start, import ObjectDetector:
 
 ```{.python .input}
-import autogluon.core as ag
 from autogluon.vision import ObjectDetector
 ```
 
@@ -33,16 +32,15 @@ We `fit` a classifier using AutoGluon as follows. In each experiment (one trial
 time_limit = 60*30  # at most 0.5 hour
 detector = ObjectDetector()
 hyperparameters = {'epochs': 5, 'batch_size': 8}
-hyperparamter_tune_kwargs={'num_trials': 2}
-detector.fit(dataset_train, time_limit=time_limit, hyperparameters=hyperparameters, hyperparamter_tune_kwargs=hyperparamter_tune_kwargs)
+hyperparameter_tune_kwargs={'num_trials': 2}
+detector.fit(dataset_train, time_limit=time_limit, hyperparameters=hyperparameters, hyperparameter_tune_kwargs=hyperparameter_tune_kwargs)
 ```
 
 Note that `num_trials=2` above is only used to speed up the tutorial. In normal
 practice, it is common to only use `time_limit` and drop `num_trials`. Also note
 that hyperparameter tuning defaults to random search. Model-based variants, such
-as `search_strategy='bayesopt'` or `search_strategy='bayesopt_hyperband'` can be
-a lot more sample-efficient.
+as `searcher='bayesopt'` in `hyperparameter_tune_kwargs` can be a lot more sample-efficient.
 
 After fitting, AutoGluon automatically returns the best model among all models in the searching space. From the output, we know the best model is the one trained with the second learning rate. To see how well the returned model performed on test dataset, call detector.evaluate().
 
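A sketch of the `searcher='bayesopt'` variant mentioned in the note above, reusing the tutorial's `dataset_train`, `hyperparameters`, and `time_limit`:

```python
# Sketch: Bayesian optimization instead of the default random search; usually
# more sample-efficient when every trial is a full training run.
detector = ObjectDetector()
detector.fit(dataset_train, time_limit=time_limit, hyperparameters=hyperparameters,
             hyperparameter_tune_kwargs={'searcher': 'bayesopt', 'num_trials': 2})
```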
19 changes: 13 additions & 6 deletions vision/src/autogluon/vision/detector/detector.py
@@ -11,7 +11,6 @@
 from autogluon.core.utils import set_logger_verbosity
 from gluoncv.auto.tasks import ObjectDetection as _ObjectDetection
 from ..configs.presets_configs import unpack, _check_gpu_memory_presets
-from ..utils import MXNetErrorCatcher
 
 __all__ = ['ObjectDetector']
 
@@ -203,7 +202,7 @@ def fit(self,
         num_trials : int, default = 1
             The limit of HPO trials that can be performed within `time_limit`. The HPO process will be terminated
             when `num_trials` trials have finished or wall clock `time_limit` is reached, whichever comes first.
-        search_strategy : str, default = 'random'
+        searcher : str, default = 'random'
             Searcher strategy for HPO, 'random' by default.
             Options include: ‘random’ (random search), ‘bayesopt’ (Gaussian process Bayesian optimization),
             ‘grid’ (grid search).
@@ -303,10 +302,9 @@ def fit(self,
         task._logger.propagate = True
         with warnings.catch_warnings(record=True) as w:
             warnings.simplefilter("always")
-            with MXNetErrorCatcher() as err:
-                self._detector = task.fit(train_data, tuning_data, 1 - holdout_frac, random_state)
-            if err.exc_value is not None:
-                raise RuntimeError(err.exc_value)
+            # TODO: MXNetErrorCatcher was removed because it didn't return traceback,
+            #  Re-add once it returns full traceback regardless of which exception was caught
+            self._detector = task.fit(train_data, tuning_data, 1 - holdout_frac, random_state)
             self._detector._logger.setLevel(log_level)
             self._detector._logger.propagate = True
         self._fit_summary = task.fit_summary()
@@ -356,6 +354,15 @@ def _validate_data(self, data):
 
     def _validate_kwargs(self, kwargs):
         """validate and initialize default kwargs"""
+
+        valid_kwargs = {'holdout_frac', 'random_state', 'nthreads_per_trial', 'ngpus_per_trial', 'hyperparameter_tune_kwargs'}
+        invalid_kwargs = []
+        for key in kwargs:
+            if key not in valid_kwargs:
+                invalid_kwargs.append(key)
+        if invalid_kwargs:
+            raise KeyError(f'Invalid kwargs specified: {invalid_kwargs}. Valid kwargs names: {list(valid_kwargs)}')
+
         kwargs['holdout_frac'] = kwargs.get('holdout_frac', 0.1)
         if not (0 < kwargs['holdout_frac'] < 1.0):
             raise ValueError(f'Range error for `holdout_frac`, expected to be within range (0, 1), given {kwargs["holdout_frac"]}')
20 changes: 14 additions & 6 deletions vision/src/autogluon/vision/predictor/predictor.py
@@ -16,7 +16,7 @@
 from autogluon.core.utils import verbosity2loglevel, get_gpu_count
 from autogluon.core.utils.utils import generate_train_test_split
 from ..configs.presets_configs import unpack, _check_gpu_memory_presets
-from ..utils import MXNetErrorCatcher, sanitize_batch_size
+from ..utils import sanitize_batch_size
 
 __all__ = ['ImagePredictor']
 
@@ -220,7 +220,7 @@ def fit(self,
         num_trials : int, default = 1
             The limit of HPO trials that can be performed within `time_limit`. The HPO process will be terminated
             when `num_trials` trials have finished or wall clock `time_limit` is reached, whichever comes first.
-        search_strategy : str, default = 'random'
+        searcher : str, default = 'random'
             Searcher strategy for HPO, 'random' by default.
             Options include: ‘random’ (random search), ‘bayesopt’ (Gaussian process Bayesian optimization),
             ‘grid’ (grid search).
@@ -409,10 +409,9 @@ def fit(self,
         self._train_classes = train_data.classes
         with warnings.catch_warnings(record=True) as w:
             warnings.simplefilter("always")
-            with MXNetErrorCatcher() as err:
-                self._classifier = task.fit(train_data, tuning_data, 1 - holdout_frac, random_state)
-            if err.exc_value is not None:
-                raise RuntimeError(err.exc_value + err.hint)
+            # TODO: MXNetErrorCatcher was removed because it didn't return traceback
+            #  Re-add once it returns full traceback regardless of which exception was caught
+            self._classifier = task.fit(train_data, tuning_data, 1 - holdout_frac, random_state)
             self._classifier._logger.setLevel(log_level)
             self._classifier._logger.propagate = True
         self._fit_summary = task.fit_summary()
@@ -475,6 +474,15 @@ def _validate_data(self, data):
 
     def _validate_kwargs(self, kwargs):
         """validate and initialize default kwargs"""
+
+        valid_kwargs = {'holdout_frac', 'random_state', 'nthreads_per_trial', 'ngpus_per_trial', 'hyperparameter_tune_kwargs'}
+        invalid_kwargs = []
+        for key in kwargs:
+            if key not in valid_kwargs:
+                invalid_kwargs.append(key)
+        if invalid_kwargs:
+            raise KeyError(f'Invalid kwargs specified: {invalid_kwargs}. Valid kwargs names: {list(valid_kwargs)}')
+
         kwargs['holdout_frac'] = kwargs.get('holdout_frac', 0.1)
         if not (0 < kwargs['holdout_frac'] < 1.0):
             raise ValueError(f'Range error for `holdout_frac`, expected to be within range (0, 1), given {kwargs["holdout_frac"]}')
4 changes: 2 additions & 2 deletions vision/tests/unittests/test_image_classification.py
@@ -10,7 +10,7 @@ def test_task():
     dataset, _, test_dataset = ImageDataset.from_folders('https://autogluon.s3.amazonaws.com/datasets/shopee-iet.zip')
     model_list = ImagePredictor.list_models()
     classifier = ImagePredictor()
-    classifier.fit(dataset, num_trials=2, hyperparameters={'epochs': 1, 'early_stop_patience': 3})
+    classifier.fit(dataset, hyperparameters={'epochs': 1, 'early_stop_patience': 3}, hyperparameter_tune_kwargs={'num_trials': 2})
     assert classifier.fit_summary()['valid_acc'] > 0.1, 'valid_acc is abnormal'
     test_result = classifier.predict(test_dataset)
     single_test = classifier.predict(test_dataset.iloc[0]['image'])
@@ -69,4 +69,4 @@ def test_image_predictor_presets():
     train_dataset, _, test_dataset = ImageDataset.from_folders('https://autogluon.s3.amazonaws.com/datasets/shopee-iet.zip')
     for preset in ['medium_quality_faster_train', 'medium_quality_faster_inference']:
         predictor = ImagePredictor()
-        predictor.fit(train_dataset,tuning_data=test_dataset, presets=[preset], time_limit=60, hyperparameters={'epochs':1})
+        predictor.fit(train_dataset, tuning_data=test_dataset, presets=[preset], time_limit=60, hyperparameters={'epochs': 1})
2 changes: 1 addition & 1 deletion vision/tests/unittests/test_image_regression.py
@@ -9,7 +9,7 @@ def test_task():
     dataset, _, test_dataset = Task.Dataset.from_folders('https://autogluon.s3.amazonaws.com/datasets/shopee-iet.zip')
     model_list = Task.list_models()
    predictor = Task(problem_type='regression')
-    predictor.fit(dataset, num_trials=2, hyperparameters={'epochs': 3, 'batch_size': 8})
+    predictor.fit(dataset, hyperparameters={'epochs': 3, 'batch_size': 8}, hyperparameter_tune_kwargs={'num_trials': 2})
     test_result = predictor.predict(test_dataset)
     single_test = predictor.predict(test_dataset.iloc[0]['image'])
     predictor.save('regressor.ag')
2 changes: 1 addition & 1 deletion vision/tests/unittests/test_object_detection.py
@@ -5,7 +5,7 @@ def test_task():
     train_data, _, test_data = dataset.random_split()
 
     detector = Task()
-    detector.fit(train_data, num_trials=1, hyperparameters={'batch_size': 4, 'epochs': 5, 'early_stop_max_value': 0.2})
+    detector.fit(train_data, hyperparameters={'batch_size': 4, 'epochs': 5, 'early_stop_max_value': 0.2}, hyperparameter_tune_kwargs={'num_trials': 1})
     test_result = detector.predict(test_data)
     detector.save('detector.ag')
     detector2 = Task.load('detector.ag')
