Handle missing values replacement in LightGBM (#290)
Aulust committed Sep 4, 2020
1 parent 2501263 commit 34f7069
Showing 18 changed files with 161 additions and 18 deletions.
7 changes: 7 additions & 0 deletions Dockerfile
@@ -50,3 +50,10 @@ RUN update-alternatives --install /usr/bin/python python /usr/bin/python${python
python -m pip install --upgrade pip && \
pip install --no-cache-dir Cython numpy && \
pip install --no-cache-dir -r requirements-test.txt

ENV MKL_NUM_THREADS=2
ENV NUMEXPR_NUM_THREADS=2
ENV OMP_NUM_THREADS=2
ENV OPENBLAS_NUM_THREADS=2
ENV VECLIB_MAXIMUM_THREADS=2
ENV BLIS_NUM_THREADS=2
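These environment variables cap every common numerical threading backend (MKL, NumExpr, OpenMP, OpenBLAS, Accelerate/vecLib, BLIS) at two threads, presumably so parallel test runs inside the container do not oversubscribe the CPU. A minimal sketch (assuming an image built from this Dockerfile) for confirming the caps at runtime:

import os

# Each variable set above should report "2" inside the container.
for var in ("MKL_NUM_THREADS", "NUMEXPR_NUM_THREADS", "OMP_NUM_THREADS",
            "OPENBLAS_NUM_THREADS", "VECLIB_MAXIMUM_THREADS", "BLIS_NUM_THREADS"):
    print(var, os.environ.get(var))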
11 changes: 8 additions & 3 deletions m2cgen/assemblers/boosting.py
@@ -317,9 +317,14 @@ def _assemble_tree(self, tree):
op = ast.CompOpType.from_str_op(tree["decision_type"])
assert op == ast.CompOpType.LTE, "Unexpected comparison op"

# Make sure that if the "default_left" is true the left tree branch
# ends up in the "else" branch of the ast.IfExpr.
if tree["default_left"]:
missing_type = tree['missing_type']

if missing_type not in {"NaN", "None"}:
raise ValueError(f"Unknown missing_type: {missing_type}")

reverse_condition = missing_type == "NaN" and tree["default_left"]
reverse_condition |= missing_type == "None" and tree["threshold"] >= 0
if reverse_condition:
op = ast.CompOpType.GT
true_child = tree["right_child"]
false_child = tree["left_child"]
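A minimal sketch of the branch-selection rule added above, using hypothetical node dicts rather than a real LightGBM model dump. Comparisons against NaN evaluate to false in the generated code (IEEE 754 semantics), so the child that should receive a missing value has to end up in the "else" branch of the ast.IfExpr; the comparison is therefore flipped to GT when missing_type is "NaN" and default_left is set, or when missing_type is "None" and the threshold is non-negative (in which case LightGBM appears to treat the missing value as zero, and zero would go left):

def pick_child(node, feature_value):
    # Mirrors the assembled IfExpr: reverse the comparison when the missing
    # value must fall through to the "else" branch.
    reverse = (node["missing_type"] == "NaN" and node["default_left"]) or \
              (node["missing_type"] == "None" and node["threshold"] >= 0)
    if reverse:
        return node["right_child"] if feature_value > node["threshold"] \
            else node["left_child"]
    return node["left_child"] if feature_value <= node["threshold"] \
        else node["right_child"]

node = {"missing_type": "NaN", "default_left": True, "threshold": 0.5,
        "left_child": "left", "right_child": "right"}
print(pick_child(node, float("nan")))  # "left" -- NaN fails the GT check
print(pick_child(node, 0.25))          # "left" -- 0.25 > 0.5 is false
print(pick_child(node, 0.75))          # "right"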
37 changes: 37 additions & 0 deletions tests/assemblers/test_lightgbm.py
@@ -133,6 +133,43 @@ def test_regression_random_forest():
assert utils.cmp_exprs(actual, expected)


def test_regression_with_negative_values():
estimator = lightgbm.LGBMRegressor(n_estimators=3, random_state=1,
max_depth=1)
utils.get_regression_w_missing_values_model_trainer()(estimator)

assembler = assemblers.LightGBMModelAssembler(estimator)
actual = assembler.assemble()

expected = ast.BinNumExpr(
ast.BinNumExpr(
ast.IfExpr(
ast.CompExpr(
ast.FeatureRef(8),
ast.NumVal(0.0),
ast.CompOpType.GT),
ast.NumVal(155.96889994777868),
ast.NumVal(147.72971715548434)),
ast.IfExpr(
ast.CompExpr(
ast.FeatureRef(2),
ast.NumVal(0.00780560282464346),
ast.CompOpType.GT),
ast.NumVal(4.982244683562974),
ast.NumVal(-2.978315963345233)),
ast.BinNumOpType.ADD),
ast.IfExpr(
ast.CompExpr(
ast.FeatureRef(8),
ast.NumVal(-0.0010539205031971832),
ast.CompOpType.LTE),
ast.NumVal(-3.488666332734598),
ast.NumVal(3.670539900363904)),
ast.BinNumOpType.ADD)

assert utils.cmp_exprs(actual, expected)


def test_simple_sigmoid_output_transform():
estimator = lightgbm.LGBMRegressor(n_estimators=2, random_state=1,
max_depth=1, objective="cross_entropy")
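In the expected AST of test_regression_with_negative_values above, the first two splits compare with GT instead of LTE: assuming LightGBM marks these splits with missing_type "None" (the diabetes training data contains no NaN), their non-negative thresholds trigger the reversal added in boosting.py, while the third split keeps LTE because its threshold is negative. A hedged sketch of how the underlying booster dump could be inspected, reusing the trainer registered in tests/utils.py below:

import lightgbm
from tests import utils

estimator = lightgbm.LGBMRegressor(n_estimators=3, random_state=1, max_depth=1)
# Fits on the diabetes dataset; the trainer also prepends an all-NaN row
# to its held-out test split (see the ModelTrainer changes below).
utils.get_regression_w_missing_values_model_trainer()(estimator)

# Each root node exposes the fields the assembler relies on,
# e.g. "threshold", "default_left" and "missing_type".
root = estimator.booster_.dump_model()["tree_info"][0]["tree_structure"]
print(root["missing_type"], root["default_left"], root["threshold"])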
2 changes: 1 addition & 1 deletion tests/e2e/executors/c.py
@@ -54,7 +54,7 @@ def __init__(self, model):
def predict(self, X):

exec_args = [os.path.join(self._resource_tmp_dir, self.model_name)]
exec_args.extend(map(interpreters.utils.format_float, X))
exec_args.extend(map(utils.format_arg, X))
return utils.predict_from_commandline(exec_args)

def prepare(self):
2 changes: 1 addition & 1 deletion tests/e2e/executors/c_sharp.py
@@ -49,7 +49,7 @@ def __init__(self, model):

def predict(self, X):
exec_args = [os.path.join(self.target_exec_dir, self.project_name)]
exec_args.extend(map(interpreters.utils.format_float, X))
exec_args.extend(map(utils.format_arg, X))
return utils.predict_from_commandline(exec_args)

@classmethod
2 changes: 1 addition & 1 deletion tests/e2e/executors/dart.py
@@ -42,7 +42,7 @@ def predict(self, X):
f"{self.executor_name}.dart")
exec_args = [self._dart,
file_name,
*map(interpreters.utils.format_float, X)]
*map(utils.format_arg, X)]
return utils.predict_from_commandline(exec_args)

def prepare(self):
2 changes: 1 addition & 1 deletion tests/e2e/executors/f_sharp.py
@@ -35,7 +35,7 @@ def __init__(self, model):

def predict(self, X):
exec_args = [os.path.join(self.target_exec_dir, self.project_name)]
exec_args.extend(map(interpreters.utils.format_float, X))
exec_args.extend(map(utils.format_arg, X))
return utils.predict_from_commandline(exec_args)

@classmethod
2 changes: 1 addition & 1 deletion tests/e2e/executors/go.py
@@ -55,7 +55,7 @@ def __init__(self, model):
def predict(self, X):

exec_args = [os.path.join(self._resource_tmp_dir, self.model_name)]
exec_args.extend(map(interpreters.utils.format_float, X))
exec_args.extend(map(utils.format_arg, X))
return utils.predict_from_commandline(exec_args)

def prepare(self):
2 changes: 1 addition & 1 deletion tests/e2e/executors/haskell.py
@@ -40,7 +40,7 @@ def predict(self, X):
app_name = os.path.join(self._resource_tmp_dir,
self.executor_name)
exec_args = [app_name,
*map(interpreters.utils.format_float, X)]
*map(utils.format_arg, X)]
return utils.predict_from_commandline(exec_args)

def prepare(self):
2 changes: 1 addition & 1 deletion tests/e2e/executors/java.py
@@ -24,7 +24,7 @@ def predict(self, X):
self._java_bin, "-cp", self._resource_tmp_dir,
"Executor", "Model", "score"
]
exec_args.extend(map(m2c.interpreters.utils.format_float, X))
exec_args.extend(map(utils.format_arg, X))
return utils.predict_from_commandline(exec_args)

def prepare(self):
3 changes: 2 additions & 1 deletion tests/e2e/executors/javascript.py
@@ -3,6 +3,7 @@
from py_mini_racer import py_mini_racer

import m2cgen as m2c
from tests import utils
from tests.e2e.executors import base


@@ -17,7 +18,7 @@ def predict(self, X):
with open(file_name, 'r') as myfile:
code = myfile.read()

args = ",".join(map(m2c.interpreters.utils.format_float, X))
args = ",".join(map(utils.format_arg, X))
caller = f"score([{args}]);\n"

ctx = py_mini_racer.MiniRacer()
3 changes: 2 additions & 1 deletion tests/e2e/executors/php.py
@@ -47,7 +47,8 @@ def predict(self, X):
exec_args = [self._php,
"-f",
file_name,
*map(interpreters.utils.format_float, X)]
"--",
*map(utils.format_arg, X)]
return utils.predict_from_commandline(exec_args)

def prepare(self):
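The PHP executor additionally inserts a "--" separator, presumably because the missing-values datasets produce negative feature values and an argument such as "-0.5" would otherwise be parsed by the php binary as one of its own options instead of being forwarded to the script. A sketch of the resulting command line (file name and values are illustrative):

# Everything after "--" reaches the script untouched via $argv.
exec_args = ["php", "-f", "model.php", "--", "NaN", "-0.5", "0.25"]
print(" ".join(exec_args))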
2 changes: 1 addition & 1 deletion tests/e2e/executors/powershell.py
@@ -40,7 +40,7 @@ def predict(self, X):
"-File",
file_name,
"-InputArray",
",".join(map(interpreters.utils.format_float, X))]
",".join(map(utils.format_arg, X))]
return utils.predict_from_commandline(exec_args)

def prepare(self):
2 changes: 1 addition & 1 deletion tests/e2e/executors/r.py
@@ -34,7 +34,7 @@ def predict(self, X):
exec_args = [self._r,
"--vanilla",
file_name,
*map(interpreters.utils.format_float, X)]
*map(utils.format_arg, X)]
return utils.predict_from_commandline(exec_args)

def prepare(self):
2 changes: 1 addition & 1 deletion tests/e2e/executors/ruby.py
@@ -40,7 +40,7 @@ def predict(self, X):
f"{self.model_name}.rb")
exec_args = [self._ruby,
file_name,
*map(interpreters.utils.format_float, X)]
*map(utils.format_arg, X)]
return utils.predict_from_commandline(exec_args)

def prepare(self):
2 changes: 1 addition & 1 deletion tests/e2e/executors/visual_basic.py
@@ -51,7 +51,7 @@ def __init__(self, model):

def predict(self, X):
exec_args = [os.path.join(self.target_exec_dir, self.project_name)]
exec_args.extend(map(interpreters.utils.format_float, X))
exec_args.extend(map(utils.format_arg, X))
return utils.predict_from_commandline(exec_args)

@classmethod
40 changes: 40 additions & 0 deletions tests/e2e/test_e2e.py
@@ -34,6 +34,8 @@
RUBY = pytest.mark.ruby
F_SHARP = pytest.mark.f_sharp
REGRESSION = pytest.mark.regr
REGRESSION_WITH_MISSING_VALUES = pytest.mark.regr_missing_val
CLASSIFICATION_WITH_MISSING_VALUES = pytest.mark.clf_missing_val
CLASSIFICATION = pytest.mark.clf


@@ -95,6 +97,32 @@ def regression_bounded(model, test_fraction=0.02):
)


def regression_w_missing_values(model, test_fraction=0.02):
return (
model,
utils.get_regression_w_missing_values_model_trainer(test_fraction),
REGRESSION_WITH_MISSING_VALUES,
)


def classification_random_w_missing_values(model, test_fraction=0.02):
return (
model,
utils.get_classification_random_w_missing_values_model_trainer(
test_fraction),
CLASSIFICATION_WITH_MISSING_VALUES,
)


def classification_binary_random_w_missing_values(model, test_fraction=0.02):
return (
model,
utils.get_classification_binary_random_w_missing_values_model_trainer(
test_fraction),
CLASSIFICATION_WITH_MISSING_VALUES,
)


# Absolute tolerance. Used in np.isclose to compare 2 values.
# We compare 6 decimal digits.
ATOL = 1.e-6
@@ -186,6 +214,14 @@ def regression_bounded(model, test_fraction=0.02):
classification_binary_random(
lightgbm.LGBMClassifier(**LIGHTGBM_PARAMS_LARGE)),
# LightGBM (Missing values during train)
regression_w_missing_values(
lightgbm.LGBMRegressor(**LIGHTGBM_PARAMS)),
classification_random_w_missing_values(
lightgbm.LGBMClassifier(**LIGHTGBM_PARAMS)),
classification_binary_random_w_missing_values(
lightgbm.LGBMClassifier(**LIGHTGBM_PARAMS)),
# LightGBM (Different Objectives)
regression(lightgbm.LGBMRegressor(
**LIGHTGBM_PARAMS, objective="mse", reg_sqrt=True)),
@@ -549,6 +585,10 @@ def regression_bounded(model, test_fraction=0.02):
classification_binary(
ensemble.RandomForestClassifier(**FOREST_PARAMS)),
],
[
(R, REGRESSION_WITH_MISSING_VALUES),
(R, CLASSIFICATION_WITH_MISSING_VALUES),
],
# Following is the list of extra tests for languages/models which are
# not fully supported yet.
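The new third argument to cartesian_e2e_params is a collection of (executor mark, trainer mark) pairs to skip; here it excludes the R executor from both missing-values suites. A simplified sketch of how such a pair list filters the generated test matrix (names are illustrative; the real filtering lives in the cartesian_e2e_params change in tests/utils.py below):

import itertools

executors = [("r_executor", "R"), ("python_executor", "PYTHON")]
trainers = [("regr_trainer", "REGRESSION"),
            ("regr_nan_trainer", "REGRESSION_WITH_MISSING_VALUES")]
skip = {("R", "REGRESSION_WITH_MISSING_VALUES")}

params = [
    (executor, trainer)
    for (executor, executor_mark), (trainer, trainer_mark)
    in itertools.product(executors, trainers)
    if (executor_mark, trainer_mark) not in skip
]
print(params)  # the (r_executor, regr_nan_trainer) combination is dropped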
56 changes: 54 additions & 2 deletions tests/utils.py
@@ -24,6 +24,7 @@

from m2cgen import ast
from m2cgen.assemblers import _get_full_model_name
from m2cgen.interpreters.utils import format_float


class StatsmodelsSklearnLikeWrapper(BaseEstimator, RegressorMixin):
@@ -69,6 +70,7 @@ class ModelTrainer:
def __init__(self, dataset_name, test_fraction):
self.dataset_name = dataset_name
self.test_fraction = test_fraction
additional_test_data = None
np.random.seed(seed=7)
if dataset_name == "boston":
self.name = "train_model_regression"
@@ -77,6 +79,12 @@ def __init__(self, dataset_name, test_fraction):
self.name = "train_model_regression_bounded"
self.X, self.y = datasets.load_boston(return_X_y=True)
self.y = np.arctan(self.y) / np.pi + 0.5 # (0; 1)
elif dataset_name == "diabetes":
self.name = "train_model_regression_w_missing_values"
self.X, self.y = datasets.load_diabetes(return_X_y=True)
additional_test_data = np.array([
[np.NaN] * self.X.shape[1],
])
elif dataset_name == "iris":
self.name = "train_model_classification"
self.X, self.y = datasets.load_iris(return_X_y=True)
@@ -93,17 +101,36 @@ def __init__(self, dataset_name, test_fraction):
N = 1000
self.X = np.random.random(size=(N, 200))
self.y = np.random.randint(3, size=(N,))
elif dataset_name == "classification_rnd_w_missing_values":
self.name = "train_model_classification_rnd_w_missing_values"
N = 100
self.X = np.random.random(size=(N, 20)) - 0.5
self.y = np.random.randint(3, size=(N,))
additional_test_data = np.array([
[np.NaN] * self.X.shape[1],
])
elif dataset_name == "classification_binary_rnd":
self.name = "train_model_classification_binary_random_data"
N = 1000
self.X = np.random.random(size=(N, 200))
self.y = np.random.randint(2, size=(N,))
elif dataset_name == "classification_binary_rnd_w_missing_values":
self.name = \
"train_model_classification_binary_rnd_w_missing_values"
N = 100
self.X = np.random.random(size=(N, 20)) - 0.5
self.y = np.random.randint(2, size=(N,))
additional_test_data = np.array([
[np.NaN] * self.X.shape[1],
])
else:
raise ValueError(f"Unknown dataset name: {dataset_name}")

(self.X_train, self.X_test,
self.y_train, self.y_test) = train_test_split(
self.y_train, _) = train_test_split(
self.X, self.y, test_size=test_fraction, random_state=13)
if additional_test_data is not None:
self.X_test = np.vstack((additional_test_data, self.X_test))

@classmethod
def get_instance(cls, dataset_name, test_fraction=0.02):
@@ -201,10 +228,25 @@ def assert_code_equal(actual, expected):
get_classification_binary_random_data_model_trainer = functools.partial(
ModelTrainer.get_instance, "classification_binary_rnd")


get_bounded_regression_model_trainer = functools.partial(
ModelTrainer.get_instance, "boston_y_bounded")


get_regression_w_missing_values_model_trainer = functools.partial(
ModelTrainer.get_instance, "diabetes")


get_classification_random_w_missing_values_model_trainer = functools.partial(
ModelTrainer.get_instance, "classification_rnd_w_missing_values")


get_classification_binary_random_w_missing_values_model_trainer = \
functools.partial(
ModelTrainer.get_instance,
"classification_binary_rnd_w_missing_values")


@contextlib.contextmanager
def tmp_dir():
dirpath = tempfile.mkdtemp()
@@ -245,7 +287,7 @@ def predict_from_commandline(exec_args):


def cartesian_e2e_params(executors_with_marks, models_with_trainers_with_marks,
*additional_params):
skip_executor_trainer_pairs, *additional_params):
result_params = list(additional_params)

# Specifying None for additional parameters makes pytest to generate
@@ -257,6 +299,9 @@ def cartesian_e2e_params(executors_with_marks, models_with_trainers_with_marks,
executors_with_marks, models_with_trainers_with_marks)

for (executor, executor_mark), (model, trainer, trainer_mark) in prod:
if (executor_mark, trainer_mark) in skip_executor_trainer_pairs:
continue

# Since we reuse the same model across multiple tests we want it
# to be clean.
model = clone(model)
@@ -286,3 +331,10 @@ def inner(*args, **kwarg):

def _is_float(value):
return isinstance(value, (float, np.floating))


def format_arg(value):
if np.isnan(value):
return "NaN"

return format_float(value)
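A short usage sketch of the new helper (assuming the tests package is importable from the repository root; the exact float formatting comes from m2cgen.interpreters.utils.format_float):

import numpy as np

from tests import utils

row = np.array([np.nan, -0.5, 0.25])
# NaN becomes the literal "NaN" that the e2e executors now pass on the
# command line; ordinary floats still go through format_float.
print([utils.format_arg(value) for value in row])  # e.g. ['NaN', '-0.5', '0.25']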
