Handle missing values replacement in LightGBM (#290)
Aulust committed Sep 4, 2020
1 parent 2501263 commit 34f7069
Showing 18 changed files with 161 additions and 18 deletions.
7 changes: 7 additions & 0 deletions Dockerfile
@@ -50,3 +50,10 @@ RUN update-alternatives --install /usr/bin/python python /usr/bin/python${python
python -m pip install --upgrade pip && \
pip install --no-cache-dir Cython numpy && \
pip install --no-cache-dir -r requirements-test.txt

ENV MKL_NUM_THREADS=2
ENV NUMEXPR_NUM_THREADS=2
ENV OMP_NUM_THREADS=2
ENV OPENBLAS_NUM_THREADS=2
ENV VECLIB_MAXIMUM_THREADS=2
ENV BLIS_NUM_THREADS=2
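These environment variables cap every common numerical threading backend (MKL, NumExpr, OpenMP, OpenBLAS, Accelerate/vecLib, BLIS) at two threads, presumably so parallel test runs inside the container do not oversubscribe the CPU. A minimal sketch (assuming an image built from this Dockerfile) for confirming the caps at runtime:

import os

# Each variable set above should report "2" inside the container.
for var in ("MKL_NUM_THREADS", "NUMEXPR_NUM_THREADS", "OMP_NUM_THREADS",
            "OPENBLAS_NUM_THREADS", "VECLIB_MAXIMUM_THREADS", "BLIS_NUM_THREADS"):
    print(var, os.environ.get(var))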
11 changes: 8 additions & 3 deletions m2cgen/assemblers/boosting.py
@@ -317,9 +317,14 @@ def _assemble_tree(self, tree):
op = ast.CompOpType.from_str_op(tree["decision_type"])
assert op == ast.CompOpType.LTE, "Unexpected comparison op"

# Make sure that if the "default_left" is true the left tree branch
# ends up in the "else" branch of the ast.IfExpr.
if tree["default_left"]:
missing_type = tree['missing_type']

if missing_type not in {"NaN", "None"}:
raise ValueError(f"Unknown missing_type: {missing_type}")

reverse_condition = missing_type == "NaN" and tree["default_left"]
reverse_condition |= missing_type == "None" and tree["threshold"] >= 0
if reverse_condition:
op = ast.CompOpType.GT
true_child = tree["right_child"]
false_child = tree["left_child"]
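A minimal sketch of the branch-selection rule added above, using hypothetical node dicts rather than a real LightGBM model dump. Comparisons against NaN evaluate to false in the generated code (IEEE 754 semantics), so the child that should receive a missing value has to end up in the "else" branch of the ast.IfExpr; the comparison is therefore flipped to GT when missing_type is "NaN" and default_left is set, or when missing_type is "None" and the threshold is non-negative (in which case LightGBM appears to treat the missing value as zero, and zero would go left):

def pick_child(node, feature_value):
    # Mirrors the assembled IfExpr: reverse the comparison when the missing
    # value must fall through to the "else" branch.
    reverse = (node["missing_type"] == "NaN" and node["default_left"]) or \
              (node["missing_type"] == "None" and node["threshold"] >= 0)
    if reverse:
        return node["right_child"] if feature_value > node["threshold"] \
            else node["left_child"]
    return node["left_child"] if feature_value <= node["threshold"] \
        else node["right_child"]

node = {"missing_type": "NaN", "default_left": True, "threshold": 0.5,
        "left_child": "left", "right_child": "right"}
print(pick_child(node, float("nan")))  # "left" -- NaN fails the GT check
print(pick_child(node, 0.25))          # "left" -- 0.25 > 0.5 is false
print(pick_child(node, 0.75))          # "right"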
37 changes: 37 additions & 0 deletions tests/assemblers/test_lightgbm.py
@@ -133,6 +133,43 @@ def test_regression_random_forest():
assert utils.cmp_exprs(actual, expected)


def test_regression_with_negative_values():
estimator = lightgbm.LGBMRegressor(n_estimators=3, random_state=1,
max_depth=1)
utils.get_regression_w_missing_values_model_trainer()(estimator)

assembler = assemblers.LightGBMModelAssembler(estimator)
actual = assembler.assemble()

expected = ast.BinNumExpr(
ast.BinNumExpr(
ast.IfExpr(
ast.CompExpr(
ast.FeatureRef(8),
ast.NumVal(0.0),
ast.CompOpType.GT),
ast.NumVal(155.96889994777868),
ast.NumVal(147.72971715548434)),
ast.IfExpr(
ast.CompExpr(
ast.FeatureRef(2),
ast.NumVal(0.00780560282464346),
ast.CompOpType.GT),
ast.NumVal(4.982244683562974),
ast.NumVal(-2.978315963345233)),
ast.BinNumOpType.ADD),
ast.IfExpr(
ast.CompExpr(
ast.FeatureRef(8),
ast.NumVal(-0.0010539205031971832),
ast.CompOpType.LTE),
ast.NumVal(-3.488666332734598),
ast.NumVal(3.670539900363904)),
ast.BinNumOpType.ADD)

assert utils.cmp_exprs(actual, expected)


def test_simple_sigmoid_output_transform():
estimator = lightgbm.LGBMRegressor(n_estimators=2, random_state=1,
max_depth=1, objective="cross_entropy")
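In the expected AST of test_regression_with_negative_values above, the first two splits compare with GT instead of LTE: assuming LightGBM marks these splits with missing_type "None" (the diabetes training data contains no NaN), their non-negative thresholds trigger the reversal added in boosting.py, while the third split keeps LTE because its threshold is negative. A hedged sketch of how the underlying booster dump could be inspected, reusing the trainer registered in tests/utils.py below:

import lightgbm
from tests import utils

estimator = lightgbm.LGBMRegressor(n_estimators=3, random_state=1, max_depth=1)
# Fits on the diabetes dataset; the trainer also prepends an all-NaN row
# to its held-out test split (see the ModelTrainer changes below).
utils.get_regression_w_missing_values_model_trainer()(estimator)

# Each root node exposes the fields the assembler relies on,
# e.g. "threshold", "default_left" and "missing_type".
root = estimator.booster_.dump_model()["tree_info"][0]["tree_structure"]
print(root["missing_type"], root["default_left"], root["threshold"])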
2 changes: 1 addition & 1 deletion tests/e2e/executors/c.py
@@ -54,7 +54,7 @@ def __init__(self, model):
def predict(self, X):

exec_args = [os.path.join(self._resource_tmp_dir, self.model_name)]
exec_args.extend(map(interpreters.utils.format_float, X))
exec_args.extend(map(utils.format_arg, X))
return utils.predict_from_commandline(exec_args)

def prepare(self):
2 changes: 1 addition & 1 deletion tests/e2e/executors/c_sharp.py
@@ -49,7 +49,7 @@ def __init__(self, model):

def predict(self, X):
exec_args = [os.path.join(self.target_exec_dir, self.project_name)]
exec_args.extend(map(interpreters.utils.format_float, X))
exec_args.extend(map(utils.format_arg, X))
return utils.predict_from_commandline(exec_args)

@classmethod
2 changes: 1 addition & 1 deletion tests/e2e/executors/dart.py
@@ -42,7 +42,7 @@ def predict(self, X):
f"{self.executor_name}.dart")
exec_args = [self._dart,
file_name,
*map(interpreters.utils.format_float, X)]
*map(utils.format_arg, X)]
return utils.predict_from_commandline(exec_args)

def prepare(self):
2 changes: 1 addition & 1 deletion tests/e2e/executors/f_sharp.py
@@ -35,7 +35,7 @@ def __init__(self, model):

def predict(self, X):
exec_args = [os.path.join(self.target_exec_dir, self.project_name)]
exec_args.extend(map(interpreters.utils.format_float, X))
exec_args.extend(map(utils.format_arg, X))
return utils.predict_from_commandline(exec_args)

@classmethod
2 changes: 1 addition & 1 deletion tests/e2e/executors/go.py
@@ -55,7 +55,7 @@ def __init__(self, model):
def predict(self, X):

exec_args = [os.path.join(self._resource_tmp_dir, self.model_name)]
exec_args.extend(map(interpreters.utils.format_float, X))
exec_args.extend(map(utils.format_arg, X))
return utils.predict_from_commandline(exec_args)

def prepare(self):
2 changes: 1 addition & 1 deletion tests/e2e/executors/haskell.py
@@ -40,7 +40,7 @@ def predict(self, X):
app_name = os.path.join(self._resource_tmp_dir,
self.executor_name)
exec_args = [app_name,
*map(interpreters.utils.format_float, X)]
*map(utils.format_arg, X)]
return utils.predict_from_commandline(exec_args)

def prepare(self):
2 changes: 1 addition & 1 deletion tests/e2e/executors/java.py
@@ -24,7 +24,7 @@ def predict(self, X):
self._java_bin, "-cp", self._resource_tmp_dir,
"Executor", "Model", "score"
]
exec_args.extend(map(m2c.interpreters.utils.format_float, X))
exec_args.extend(map(utils.format_arg, X))
return utils.predict_from_commandline(exec_args)

def prepare(self):
3 changes: 2 additions & 1 deletion tests/e2e/executors/javascript.py
@@ -3,6 +3,7 @@
from py_mini_racer import py_mini_racer

import m2cgen as m2c
from tests import utils
from tests.e2e.executors import base


@@ -17,7 +18,7 @@ def predict(self, X):
with open(file_name, 'r') as myfile:
code = myfile.read()

args = ",".join(map(m2c.interpreters.utils.format_float, X))
args = ",".join(map(utils.format_arg, X))
caller = f"score([{args}]);\n"

ctx = py_mini_racer.MiniRacer()
3 changes: 2 additions & 1 deletion tests/e2e/executors/php.py
@@ -47,7 +47,8 @@ def predict(self, X):
exec_args = [self._php,
"-f",
file_name,
*map(interpreters.utils.format_float, X)]
"--",
*map(utils.format_arg, X)]
return utils.predict_from_commandline(exec_args)

def prepare(self):
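The PHP executor additionally inserts a "--" separator, presumably because the missing-values datasets produce negative feature values and an argument such as "-0.5" would otherwise be parsed by the php binary as one of its own options instead of being forwarded to the script. A sketch of the resulting command line (file name and values are illustrative):

# Everything after "--" reaches the script untouched via $argv.
exec_args = ["php", "-f", "model.php", "--", "NaN", "-0.5", "0.25"]
print(" ".join(exec_args))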
2 changes: 1 addition & 1 deletion tests/e2e/executors/powershell.py
@@ -40,7 +40,7 @@ def predict(self, X):
"-File",
file_name,
"-InputArray",
",".join(map(interpreters.utils.format_float, X))]
",".join(map(utils.format_arg, X))]
return utils.predict_from_commandline(exec_args)

def prepare(self):
2 changes: 1 addition & 1 deletion tests/e2e/executors/r.py
@@ -34,7 +34,7 @@ def predict(self, X):
exec_args = [self._r,
"--vanilla",
file_name,
*map(interpreters.utils.format_float, X)]
*map(utils.format_arg, X)]
return utils.predict_from_commandline(exec_args)

def prepare(self):
2 changes: 1 addition & 1 deletion tests/e2e/executors/ruby.py
@@ -40,7 +40,7 @@ def predict(self, X):
f"{self.model_name}.rb")
exec_args = [self._ruby,
file_name,
*map(interpreters.utils.format_float, X)]
*map(utils.format_arg, X)]
return utils.predict_from_commandline(exec_args)

def prepare(self):
2 changes: 1 addition & 1 deletion tests/e2e/executors/visual_basic.py
@@ -51,7 +51,7 @@ def __init__(self, model):

def predict(self, X):
exec_args = [os.path.join(self.target_exec_dir, self.project_name)]
exec_args.extend(map(interpreters.utils.format_float, X))
exec_args.extend(map(utils.format_arg, X))
return utils.predict_from_commandline(exec_args)

@classmethod
40 changes: 40 additions & 0 deletions tests/e2e/test_e2e.py
@@ -34,6 +34,8 @@
RUBY = pytest.mark.ruby
F_SHARP = pytest.mark.f_sharp
REGRESSION = pytest.mark.regr
REGRESSION_WITH_MISSING_VALUES = pytest.mark.regr_missing_val
CLASSIFICATION_WITH_MISSING_VALUES = pytest.mark.clf_missing_val
CLASSIFICATION = pytest.mark.clf


@@ -95,6 +97,32 @@ def regression_bounded(model, test_fraction=0.02):
)


def regression_w_missing_values(model, test_fraction=0.02):
return (
model,
utils.get_regression_w_missing_values_model_trainer(test_fraction),
REGRESSION_WITH_MISSING_VALUES,
)


def classification_random_w_missing_values(model, test_fraction=0.02):
return (
model,
utils.get_classification_random_w_missing_values_model_trainer(
test_fraction),
CLASSIFICATION_WITH_MISSING_VALUES,
)


def classification_binary_random_w_missing_values(model, test_fraction=0.02):
return (
model,
utils.get_classification_binary_random_w_missing_values_model_trainer(
test_fraction),
CLASSIFICATION_WITH_MISSING_VALUES,
)


# Absolute tolerance. Used in np.isclose to compare 2 values.
# We compare 6 decimal digits.
ATOL = 1.e-6
@@ -186,6 +214,14 @@ def regression_bounded(model, test_fraction=0.02):
classification_binary_random(
lightgbm.LGBMClassifier(**LIGHTGBM_PARAMS_LARGE)),
# LightGBM (Missing values during train)
regression_w_missing_values(
lightgbm.LGBMRegressor(**LIGHTGBM_PARAMS)),
classification_random_w_missing_values(
lightgbm.LGBMClassifier(**LIGHTGBM_PARAMS)),
classification_binary_random_w_missing_values(
lightgbm.LGBMClassifier(**LIGHTGBM_PARAMS)),
# LightGBM (Different Objectives)
regression(lightgbm.LGBMRegressor(
**LIGHTGBM_PARAMS, objective="mse", reg_sqrt=True)),
@@ -549,6 +585,10 @@ def regression_bounded(model, test_fraction=0.02):
classification_binary(
ensemble.RandomForestClassifier(**FOREST_PARAMS)),
],
[
(R, REGRESSION_WITH_MISSING_VALUES),
(R, CLASSIFICATION_WITH_MISSING_VALUES),
],
# Following is the list of extra tests for languages/models which are
# not fully supported yet.
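The new third argument to cartesian_e2e_params is a collection of (executor mark, trainer mark) pairs to skip; here it excludes the R executor from both missing-values suites. A simplified sketch of how such a pair list filters the generated test matrix (names are illustrative; the real filtering lives in the cartesian_e2e_params change in tests/utils.py below):

import itertools

executors = [("r_executor", "R"), ("python_executor", "PYTHON")]
trainers = [("regr_trainer", "REGRESSION"),
            ("regr_nan_trainer", "REGRESSION_WITH_MISSING_VALUES")]
skip = {("R", "REGRESSION_WITH_MISSING_VALUES")}

params = [
    (executor, trainer)
    for (executor, executor_mark), (trainer, trainer_mark)
    in itertools.product(executors, trainers)
    if (executor_mark, trainer_mark) not in skip
]
print(params)  # the (r_executor, regr_nan_trainer) combination is dropped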
56 changes: 54 additions & 2 deletions tests/utils.py
@@ -24,6 +24,7 @@

from m2cgen import ast
from m2cgen.assemblers import _get_full_model_name
from m2cgen.interpreters.utils import format_float


class StatsmodelsSklearnLikeWrapper(BaseEstimator, RegressorMixin):
@@ -69,6 +70,7 @@ class ModelTrainer:
def __init__(self, dataset_name, test_fraction):
self.dataset_name = dataset_name
self.test_fraction = test_fraction
additional_test_data = None
np.random.seed(seed=7)
if dataset_name == "boston":
self.name = "train_model_regression"
@@ -77,6 +79,12 @@ def __init__(self, dataset_name, test_fraction):
self.name = "train_model_regression_bounded"
self.X, self.y = datasets.load_boston(return_X_y=True)
self.y = np.arctan(self.y) / np.pi + 0.5 # (0; 1)
elif dataset_name == "diabetes":
self.name = "train_model_regression_w_missing_values"
self.X, self.y = datasets.load_diabetes(return_X_y=True)
additional_test_data = np.array([
[np.NaN] * self.X.shape[1],
])
elif dataset_name == "iris":
self.name = "train_model_classification"
self.X, self.y = datasets.load_iris(return_X_y=True)
@@ -93,17 +101,36 @@ def __init__(self, dataset_name, test_fraction):
N = 1000
self.X = np.random.random(size=(N, 200))
self.y = np.random.randint(3, size=(N,))
elif dataset_name == "classification_rnd_w_missing_values":
self.name = "train_model_classification_rnd_w_missing_values"
N = 100
self.X = np.random.random(size=(N, 20)) - 0.5
self.y = np.random.randint(3, size=(N,))
additional_test_data = np.array([
[np.NaN] * self.X.shape[1],
])
elif dataset_name == "classification_binary_rnd":
self.name = "train_model_classification_binary_random_data"
N = 1000
self.X = np.random.random(size=(N, 200))
self.y = np.random.randint(2, size=(N,))
elif dataset_name == "classification_binary_rnd_w_missing_values":
self.name = \
"train_model_classification_binary_rnd_w_missing_values"
N = 100
self.X = np.random.random(size=(N, 20)) - 0.5
self.y = np.random.randint(2, size=(N,))
additional_test_data = np.array([
[np.NaN] * self.X.shape[1],
])
else:
raise ValueError(f"Unknown dataset name: {dataset_name}")

(self.X_train, self.X_test,
self.y_train, self.y_test) = train_test_split(
self.y_train, _) = train_test_split(
self.X, self.y, test_size=test_fraction, random_state=13)
if additional_test_data is not None:
self.X_test = np.vstack((additional_test_data, self.X_test))

@classmethod
def get_instance(cls, dataset_name, test_fraction=0.02):
@@ -201,10 +228,25 @@ def assert_code_equal(actual, expected):
get_classification_binary_random_data_model_trainer = functools.partial(
ModelTrainer.get_instance, "classification_binary_rnd")


get_bounded_regression_model_trainer = functools.partial(
ModelTrainer.get_instance, "boston_y_bounded")


get_regression_w_missing_values_model_trainer = functools.partial(
ModelTrainer.get_instance, "diabetes")


get_classification_random_w_missing_values_model_trainer = functools.partial(
ModelTrainer.get_instance, "classification_rnd_w_missing_values")


get_classification_binary_random_w_missing_values_model_trainer = \
functools.partial(
ModelTrainer.get_instance,
"classification_binary_rnd_w_missing_values")


@contextlib.contextmanager
def tmp_dir():
dirpath = tempfile.mkdtemp()
@@ -245,7 +287,7 @@ def predict_from_commandline(exec_args):


def cartesian_e2e_params(executors_with_marks, models_with_trainers_with_marks,
*additional_params):
skip_executor_trainer_pairs, *additional_params):
result_params = list(additional_params)

# Specifying None for additional parameters makes pytest to generate
@@ -257,6 +299,9 @@ def cartesian_e2e_params(executors_with_marks, models_with_trainers_with_marks,
executors_with_marks, models_with_trainers_with_marks)

for (executor, executor_mark), (model, trainer, trainer_mark) in prod:
if (executor_mark, trainer_mark) in skip_executor_trainer_pairs:
continue

# Since we reuse the same model across multiple tests we want it
# to be clean.
model = clone(model)
@@ -286,3 +331,10 @@ def inner(*args, **kwarg):

def _is_float(value):
return isinstance(value, (float, np.floating))


def format_arg(value):
if np.isnan(value):
return "NaN"

return format_float(value)
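A short usage sketch of the new helper (assuming the tests package is importable from the repository root; the exact float formatting comes from m2cgen.interpreters.utils.format_float):

import numpy as np

from tests import utils

row = np.array([np.nan, -0.5, 0.25])
# NaN becomes the literal "NaN" that the e2e executors now pass on the
# command line; ordinary floats still go through format_float.
print([utils.format_arg(value) for value in row])  # e.g. ['NaN', '-0.5', '0.25']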
