From ae5eb1835ed2b10b6993da0bf5c8ed8f71a26c67 Mon Sep 17 00:00:00 2001
From: StrikerRUS
Date: Thu, 14 May 2020 01:34:35 +0300
Subject: [PATCH 1/3] added sklearn GLM

---
 README.md                       |  2 +-
 m2cgen/assemblers/__init__.py   |  7 ++-
 m2cgen/assemblers/linear.py     | 77 +++++++++++++++++++++++++--------
 requirements-test.txt           |  4 +-
 tests/assemblers/test_linear.py | 49 +++++++++++++++++++++
 tests/e2e/test_e2e.py           |  7 +++
 6 files changed, 124 insertions(+), 22 deletions(-)

diff --git a/README.md b/README.md
index 55fd0014..3516947c 100644
--- a/README.md
+++ b/README.md
@@ -44,7 +44,7 @@ pip install m2cgen
 
 | | Classification | Regression |
 | --- | --- | --- |
-| **Linear** | | |
+| **Linear** | | |
 | **SVM** | | |
 | **Tree** | | |
 | **Random Forest** | | |
diff --git a/m2cgen/assemblers/__init__.py b/m2cgen/assemblers/__init__.py
index a3782fd4..edc67ede 100644
--- a/m2cgen/assemblers/__init__.py
+++ b/m2cgen/assemblers/__init__.py
@@ -2,7 +2,8 @@
                      StatsmodelsLinearModelAssembler,
                      ProcessMLEModelAssembler,
                      StatsmodelsGLMModelAssembler,
-                     StatsmodelsModelAssemblerSelector)
+                     StatsmodelsModelAssemblerSelector,
+                     SklearnGLMModelAssembler)
 from .tree import TreeModelAssembler
 from .ensemble import RandomForestModelAssembler
 from .boosting import (XGBoostModelAssemblerSelector,
@@ -27,6 +28,7 @@
     LightningSVMModelAssembler,
     StatsmodelsGLMModelAssembler,
     StatsmodelsModelAssemblerSelector,
+    SklearnGLMModelAssembler,
 ]
 
 
@@ -59,6 +61,7 @@
     "sklearn_BayesianRidge": SklearnLinearModelAssembler,
     "sklearn_ElasticNet": SklearnLinearModelAssembler,
     "sklearn_ElasticNetCV": SklearnLinearModelAssembler,
+    "sklearn_GammaRegressor": SklearnGLMModelAssembler,
     "sklearn_HuberRegressor": SklearnLinearModelAssembler,
     "sklearn_Lars": SklearnLinearModelAssembler,
     "sklearn_LarsCV": SklearnLinearModelAssembler,
@@ -71,11 +74,13 @@
     "sklearn_OrthogonalMatchingPursuit": SklearnLinearModelAssembler,
     "sklearn_OrthogonalMatchingPursuitCV": SklearnLinearModelAssembler,
     "sklearn_PassiveAggressiveRegressor": SklearnLinearModelAssembler,
+    "sklearn_PoissonRegressor": SklearnGLMModelAssembler,
     "sklearn_RANSACRegressor": RANSACModelAssembler,
     "sklearn_Ridge": SklearnLinearModelAssembler,
     "sklearn_RidgeCV": SklearnLinearModelAssembler,
     "sklearn_SGDRegressor": SklearnLinearModelAssembler,
     "sklearn_TheilSenRegressor": SklearnLinearModelAssembler,
+    "sklearn_TweedieRegressor": SklearnGLMModelAssembler,
 
     # Statsmodels Linear Regressors
     "statsmodels_GLMResultsWrapper": StatsmodelsGLMModelAssembler,
diff --git a/m2cgen/assemblers/linear.py b/m2cgen/assemblers/linear.py
index ab2d8bfd..d9f56d37 100644
--- a/m2cgen/assemblers/linear.py
+++ b/m2cgen/assemblers/linear.py
@@ -76,29 +76,24 @@ def _get_coef(self):
         return self.model.params[:self.model.k_exog]
 
 
-class StatsmodelsGLMModelAssembler(StatsmodelsLinearModelAssembler):
+class GLMMixin:
 
     def _final_transform(self, ast_to_transform):
-        link_function = type(self.model.model.family.link).__name__
+        link_function = self._get_link_function_name()
         link_function_lower = link_function.lower()
-        supported_inversed_functions = {
-            "logit": self._logit_inversed,
-            "power": self._power_inversed,
-            "inverse_power": self._inverse_power_inversed,
-            "sqrt": self._sqrt_inversed,
-            "inverse_squared": self._inverse_squared_inversed,
-            "identity": self._identity_inversed,
-            "log": self._log_inversed,
-            "cloglog": self._cloglog_inversed,
-            "negativebinomial": self._negativebinomial_inversed,
-            "nbinom": self._negativebinomial_inversed
-        }
-        if link_function_lower not in supported_inversed_functions:
+        supported_inversed_funs = self._get_supported_inversed_funs()
+        if link_function_lower not in supported_inversed_funs:
             raise ValueError(
                 "Unsupported link function '{}'".format(link_function))
-        fun = supported_inversed_functions[link_function_lower]
+        fun = supported_inversed_funs[link_function_lower]
         return fun(ast_to_transform)
 
+    def _get_link_function_name(self):
+        raise NotImplementedError
+
+    def _get_supported_inversed_funs(self):
+        raise NotImplementedError
+
     def _logit_inversed(self, ast_to_transform):
         return utils.div(
             ast.NumVal(1.0),
@@ -110,7 +105,7 @@ def _logit_inversed(self, ast_to_transform):
                         ast_to_transform))))
 
     def _power_inversed(self, ast_to_transform):
-        power = self.model.model.family.link.power
+        power = self._get_power()
         if power == 1:
             return self._identity_inversed(ast_to_transform)
         elif power == -1:
@@ -150,10 +145,11 @@ def _cloglog_inversed(self, ast_to_transform):
                     ast.ExpExpr(ast_to_transform))))
 
     def _negativebinomial_inversed(self, ast_to_transform):
+        alpha = self._get_alpha()
         return utils.div(
             ast.NumVal(-1.0),
             utils.mul(
-                ast.NumVal(self.model.model.family.link.alpha),
+                ast.NumVal(alpha),
                 utils.sub(
                     ast.NumVal(1.0),
                     ast.ExpExpr(
@@ -161,6 +157,38 @@ def _negativebinomial_inversed(self, ast_to_transform):
                             ast.NumVal(0.0),
                             ast_to_transform)))))
 
+    def _get_power(self):
+        raise NotImplementedError
+
+    def _get_alpha(self):
+        raise NotImplementedError
+
+
+class StatsmodelsGLMModelAssembler(GLMMixin, StatsmodelsLinearModelAssembler):
+
+    def _get_link_function_name(self):
+        return type(self.model.model.family.link).__name__
+
+    def _get_supported_inversed_funs(self):
+        return {
+            "logit": self._logit_inversed,
+            "power": self._power_inversed,
+            "inverse_power": self._inverse_power_inversed,
+            "sqrt": self._sqrt_inversed,
+            "inverse_squared": self._inverse_squared_inversed,
+            "identity": self._identity_inversed,
+            "log": self._log_inversed,
+            "cloglog": self._cloglog_inversed,
+            "negativebinomial": self._negativebinomial_inversed,
+            "nbinom": self._negativebinomial_inversed
+        }
+
+    def _get_power(self):
+        return self.model.model.family.link.power
+
+    def _get_alpha(self):
+        return self.model.model.family.link.alpha
+
 
 class StatsmodelsModelAssemblerSelector(ModelAssembler):
 
@@ -181,6 +209,19 @@ def assemble(self):
         return self.assembler.assemble()
 
 
+class SklearnGLMModelAssembler(GLMMixin, SklearnLinearModelAssembler):
+
+    def _get_link_function_name(self):
+        return type(self.model._link_instance).__name__
+
+    def _get_supported_inversed_funs(self):
+        return {
+            "identitylink": self._identity_inversed,
+            "loglink": self._log_inversed,
+            "logitlink": self._logit_inversed
+        }
+
+
 def _linear_to_ast(coef, intercept):
     feature_weight_mul_ops = [
         utils.mul(ast.FeatureRef(index), ast.NumVal(value))
diff --git a/requirements-test.txt b/requirements-test.txt
index bf3644dc..920b31d2 100644
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -1,9 +1,9 @@
 # Supported models
-scikit-learn==0.22.2.post1
+scikit-learn==0.23.0
 xgboost==1.0.2
 lightgbm==2.3.1
 statsmodels==0.11.1
-git+git://github.com/scikit-learn-contrib/lightning.git@b96f9c674968496e854078163c8814049a7b9f43
+git+git://github.com/scikit-learn-contrib/lightning@refs/pull/142/head
 
 # Testing tools
 flake8==3.7.9
diff --git a/tests/assemblers/test_linear.py b/tests/assemblers/test_linear.py
index 8fb01044..21a4128e 100644
--- a/tests/assemblers/test_linear.py
+++ b/tests/assemblers/test_linear.py
@@ -687,6 +687,55 @@ class ValidPowerLink(sm.families.links.Power):
     assembler.assemble()
 
 
+def test_sklearn_glm_identity_link_func():
+    estimator = linear_model.TweedieRegressor(
+        power=0, link="identity", max_iter=10)
+    estimator = estimator.fit([[1], [2]], [0.1, 0.2])
+
+    assembler = assemblers.SklearnGLMModelAssembler(estimator)
+    actual = assembler.assemble()
+
+    expected = ast.BinNumExpr(
+        ast.NumVal(0.12),
+        ast.BinNumExpr(
+            ast.FeatureRef(0),
+            ast.NumVal(0.02),
+            ast.BinNumOpType.MUL),
+        ast.BinNumOpType.ADD)
+
+    assert utils.cmp_exprs(actual, expected)
+
+
+def test_sklearn_glm_log_link_func():
+    estimator = linear_model.TweedieRegressor(
+        power=1, link="log", fit_intercept=False, max_iter=10)
+    estimator = estimator.fit([[1], [2]], [0.1, 0.2])
+
+    assembler = assemblers.SklearnGLMModelAssembler(estimator)
+    actual = assembler.assemble()
+
+    expected = ast.ExpExpr(
+        ast.BinNumExpr(
+            ast.NumVal(0.0),
+            ast.BinNumExpr(
+                ast.FeatureRef(0),
+                ast.NumVal(-0.4619711397),
+                ast.BinNumOpType.MUL),
+            ast.BinNumOpType.ADD))
+
+    assert utils.cmp_exprs(actual, expected)
+
+
+@pytest.mark.xfail(raises=ValueError, strict=True)
+def test_sklearn_glm_unknown_link_func():
+    estimator = linear_model.TweedieRegressor(
+        power=1, link="this_link_func_does_not_exist", max_iter=10)
+    estimator = estimator.fit([[1], [2]], [0.1, 0.2])
+
+    assembler = assemblers.SklearnGLMModelAssembler(estimator)
+    assembler.assemble()
+
+
 def test_lightning_regression():
     estimator = AdaGradRegressor(random_state=1)
     utils.get_regression_model_trainer()(estimator)
diff --git a/tests/e2e/test_e2e.py b/tests/e2e/test_e2e.py
index 2e2904e8..09500a59 100644
--- a/tests/e2e/test_e2e.py
+++ b/tests/e2e/test_e2e.py
@@ -262,6 +262,7 @@ def regression_bounded(model, test_fraction=0.02):
         regression(linear_model.BayesianRidge()),
         regression(linear_model.ElasticNet(random_state=RANDOM_SEED)),
         regression(linear_model.ElasticNetCV(random_state=RANDOM_SEED)),
+        regression(linear_model.GammaRegressor()),
         regression(linear_model.HuberRegressor()),
         regression(linear_model.Lars()),
         regression(linear_model.LarsCV()),
@@ -275,6 +276,7 @@ def regression_bounded(model, test_fraction=0.02):
         regression(linear_model.OrthogonalMatchingPursuitCV()),
         regression(linear_model.PassiveAggressiveRegressor(
             random_state=RANDOM_SEED)),
+        regression(linear_model.PoissonRegressor()),
         regression(linear_model.RANSACRegressor(
             base_estimator=tree.ExtraTreeRegressor(**TREE_PARAMS),
             random_state=RANDOM_SEED)),
@@ -282,6 +284,11 @@ def regression_bounded(model, test_fraction=0.02):
         regression(linear_model.RidgeCV()),
         regression(linear_model.SGDRegressor(random_state=RANDOM_SEED)),
         regression(linear_model.TheilSenRegressor(random_state=RANDOM_SEED)),
+        regression(linear_model.TweedieRegressor(power=0.0)),
+        regression(linear_model.TweedieRegressor(power=1.0)),
+        regression(linear_model.TweedieRegressor(power=1.5)),
+        regression(linear_model.TweedieRegressor(power=2.0)),
+        regression(linear_model.TweedieRegressor(power=3.0)),
 
         # Statsmodels Linear Regression
         classification_binary(utils.StatsmodelsSklearnLikeWrapper(

From 1f37c68f9e6b2bd3c4d7a46393d81a3175a87924 Mon Sep 17 00:00:00 2001
From: StrikerRUS
Date: Thu, 14 May 2020 04:40:50 +0300
Subject: [PATCH 2/3] added comment for lightning version

---
 requirements-test.txt | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/requirements-test.txt b/requirements-test.txt
index 920b31d2..5e756371 100644
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -3,7 +3,10 @@ scikit-learn==0.23.0
 xgboost==1.0.2
 lightgbm==2.3.1
 statsmodels==0.11.1
-git+git://github.com/scikit-learn-contrib/lightning@refs/pull/142/head
+# The latest master branch of lightning is incompatible with scikit-learn >=0.23,
+# but you still can install it with older scikit-learn (<0.23) via
+# git+git://github.com/scikit-learn-contrib/lightning.git@b96f9c674968496e854078163c8814049a7b9f43
+git+git://github.com/scikit-learn-contrib/lightning.git@refs/pull/142/head
 
 # Testing tools
 flake8==3.7.9

From ba9675caafefe9005c991c1294cb0f092df44bdd Mon Sep 17 00:00:00 2001
From: StrikerRUS
Date: Wed, 20 May 2020 22:48:48 +0300
Subject: [PATCH 3/3] bump scikit-learn version

---
 requirements-test.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements-test.txt b/requirements-test.txt
index 5e756371..dbd7ff16 100644
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -1,5 +1,5 @@
 # Supported models
-scikit-learn==0.23.0
+scikit-learn==0.23.1
 xgboost==1.0.2
 lightgbm==2.3.1
 statsmodels==0.11.1