From 63208afac0a57017214f13b66e6832f9ac80f237 Mon Sep 17 00:00:00 2001 From: Aaron David Schneider Date: Mon, 27 Feb 2023 16:57:46 +0100 Subject: [PATCH 1/3] add code for multioutput regression --- m2cgen/assemblers/boosting.py | 33 ++++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/m2cgen/assemblers/boosting.py b/m2cgen/assemblers/boosting.py index 66bf8baf..0ce02948 100644 --- a/m2cgen/assemblers/boosting.py +++ b/m2cgen/assemblers/boosting.py @@ -23,20 +23,24 @@ def __init__(self, model, estimator_params, base_score=0.0): self._is_classification = False model_class_name = type(model).__name__ - if model_class_name in self.classifier_names: - self._is_classification = True + self._is_classification = model_class_name in self.classifier_names + + if self._is_classification: if model.n_classes_ > 2: self._output_size = model.n_classes_ + else: + if getattr(self, "n_multioutput_targets", 1) > 1: + self._output_size = self.n_multioutput_targets def assemble(self): - if self._is_classification: - if self._output_size == 1: + if self._output_size > 1: + return self._assemble_multi_class_output(self._all_estimator_params) + else: + if self._is_classification: return self._assemble_bin_class_output(self._all_estimator_params) else: - return self._assemble_multi_class_output(self._all_estimator_params) - else: - result_ast = self._assemble_single_output(self._all_estimator_params, base_score=self._base_score) - return self._single_convert_output(result_ast) + result_ast = self._assemble_single_output(self._all_estimator_params, base_score=self._base_score) + return self._single_convert_output(result_ast) def _assemble_single_output(self, estimator_params, base_score=0.0, split_idx=0): estimators_ast = self._assemble_estimators(estimator_params, split_idx) @@ -69,7 +73,10 @@ def _assemble_multi_class_output(self, estimator_params): for i, e in enumerate(splits) ] - return self._multi_class_convert_output(exprs) + if self._is_classification: + return self._multi_class_convert_output(exprs) + else: + return self._multi_output_regression_convert_output(exprs) def _assemble_bin_class_output(self, estimator_params): # Base score is calculated based on @@ -94,6 +101,9 @@ def _final_transform(self, ast_to_transform): def _multi_class_convert_output(self, exprs): return ast.SoftmaxExpr(exprs) + def _multi_output_regression_convert_output(self, exprs): + return ast.VectorVal(exprs) + def _bin_class_convert_output(self, expr, to_reuse=True): return ast.SigmoidExpr(expr, to_reuse=to_reuse) @@ -142,6 +152,11 @@ def __init__(self, model): # assembling (if applicable). best_ntree_limit = getattr(model, "best_ntree_limit", None) + # handle case of multi output regression + model_class_name = type(model).__name__ + if model_class_name not in self.classifier_names: + self.n_multioutput_targets = int(len(trees) / model.n_estimators) + super().__init__(model, trees, base_score=model.get_params()["base_score"], From 669cc2ad21cb2ecb2fb99032362c5386330a1b7b Mon Sep 17 00:00:00 2001 From: Aaron David Schneider Date: Tue, 28 Feb 2023 09:41:30 +0100 Subject: [PATCH 2/3] add tests --- tests/assemblers/test_boosting_xgboost.py | 40 +++++++++++++++++++++++ tests/utils.py | 6 ++-- 2 files changed, 44 insertions(+), 2 deletions(-) diff --git a/tests/assemblers/test_boosting_xgboost.py b/tests/assemblers/test_boosting_xgboost.py index d6b20bd5..294afa35 100644 --- a/tests/assemblers/test_boosting_xgboost.py +++ b/tests/assemblers/test_boosting_xgboost.py @@ -57,6 +57,46 @@ def test_multi_class(): assert utils.cmp_exprs(actual, expected) +def test_regression_multioutput(): + base_score = 0.6 + estimator = xgb.XGBRegressor(n_estimators=1, random_state=1, max_depth=1, base_score=base_score) + utils.get_multioutput_regression_model_trainer()(estimator) + + assembler = XGBoostModelAssemblerSelector(estimator) + actual = assembler.assemble() + + expected = ast.VectorVal([ + ast.BinNumExpr(ast.NumVal(base_score), + ast.IfExpr( + ast.CompExpr( + ast.FeatureRef(4), + ast.NumVal(0.09817435592412949), + ast.CompOpType.GTE), + ast.NumVal(22.866134643554688), + ast.NumVal(-10.487930297851562)), + ast.BinNumOpType.ADD), + ast.BinNumExpr(ast.NumVal(base_score), + ast.IfExpr( + ast.CompExpr( + ast.FeatureRef(1), + ast.NumVal(0.031133096665143967), + ast.CompOpType.GTE), + ast.NumVal(13.26490592956543), + ast.NumVal(-11.912125587463379)), + ast.BinNumOpType.ADD), + ast.BinNumExpr(ast.NumVal(base_score), + ast.IfExpr( + ast.CompExpr( + ast.FeatureRef(4), + ast.NumVal(-0.42966189980506897), + ast.CompOpType.GTE), + ast.NumVal(17.365192413330078), + ast.NumVal(-24.488313674926758)), + ast.BinNumOpType.ADD)]) + + assert utils.cmp_exprs(actual, expected) + + def test_regression(): base_score = 0.6 estimator = xgb.XGBRegressor(n_estimators=2, random_state=1, max_depth=1, base_score=base_score) diff --git a/tests/utils.py b/tests/utils.py index 4c5630d6..09d428ff 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -72,6 +72,9 @@ def __init__(self, dataset_name, test_fraction): if dataset_name == "boston": self.name = "train_model_regression" self.X, self.y = datasets.load_boston(return_X_y=True) + elif dataset_name == "multiout_regression": + self.name = "train_model_regression_multioutput" + self.X, self.y = datasets.make_regression(n_samples=20, n_features=5, n_targets=3, random_state=1) elif dataset_name == "boston_y_bounded": self.name = "train_model_regression_bounded" self.X, self.y = datasets.load_boston(return_X_y=True) @@ -218,13 +221,12 @@ def assert_code_equal(actual, expected): get_regression_model_trainer = partial(ModelTrainer.get_instance, "boston") +get_multioutput_regression_model_trainer = partial(ModelTrainer.get_instance, "multiout_regression") get_classification_model_trainer = partial(ModelTrainer.get_instance, "iris") - get_binary_classification_model_trainer = partial(ModelTrainer.get_instance, "breast_cancer") - get_regression_random_data_model_trainer = partial(ModelTrainer.get_instance, "regression_rnd") From 08dda59652121d8fc912983fb842b18611868107 Mon Sep 17 00:00:00 2001 From: Aaron David Schneider Date: Tue, 28 Feb 2023 10:06:27 +0100 Subject: [PATCH 3/3] make sure that multioutput regression is not used for random forests. --- m2cgen/assemblers/boosting.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/m2cgen/assemblers/boosting.py b/m2cgen/assemblers/boosting.py index 0ce02948..da5d84af 100644 --- a/m2cgen/assemblers/boosting.py +++ b/m2cgen/assemblers/boosting.py @@ -138,6 +138,10 @@ class XGBoostTreeModelAssembler(BaseTreeBoostingAssembler): "XGBRFClassifier" } + multioutput_regression_names = { + "XGBRegressor" + } + def __init__(self, model): self.multiclass_params_seq_len = model.get_params().get("num_parallel_tree", 1) feature_names = model.get_booster().feature_names @@ -154,7 +158,7 @@ def __init__(self, model): # handle case of multi output regression model_class_name = type(model).__name__ - if model_class_name not in self.classifier_names: + if model_class_name in self.multioutput_regression_names: self.n_multioutput_targets = int(len(trees) / model.n_estimators) super().__init__(model,