Skip to content

Commit

Permalink
Add tests. Cosmetic changes
Browse files Browse the repository at this point in the history
  • Loading branch information
izeigerman committed Nov 30, 2019
1 parent 0a1b462 commit 041f780
Show file tree
Hide file tree
Showing 3 changed files with 141 additions and 38 deletions.
79 changes: 41 additions & 38 deletions m2cgen/assemblers/boosting.py
Expand Up @@ -4,17 +4,17 @@
from m2cgen.assemblers import utils
from m2cgen.assemblers.base import ModelAssembler

MAX_LEAVES_PER_METHOD = 3000


class BaseBoostingAssembler(ModelAssembler):

classifier_name = None

def __init__(self, model, trees, base_score=0, tree_limit=None):
def __init__(self, model, trees, base_score=0, tree_limit=None,
leaves_cutoff_threshold=3000):
super().__init__(model)
self.all_trees = trees
self._base_score = base_score
self._leaves_cutoff_threshold = leaves_cutoff_threshold

self._output_size = 1
self._is_classification = False
Expand Down Expand Up @@ -45,38 +45,11 @@ def _assemble_single_output(self, trees, base_score=0):
trees_ast = [self._assemble_tree(t) for t in trees]
to_sum = trees_ast

# in a large tree we need to generate multiple subroutines to avoid
# java limitations https://github.com/BayesWitnesses/m2cgen/issues/103
# In a large tree we need to generate multiple subroutines to avoid
# java limitations https://github.com/BayesWitnesses/m2cgen/issues/103.
trees_num_leaves = [self._count_leaves(t) for t in trees]

if sum(trees_num_leaves) > MAX_LEAVES_PER_METHOD:
to_sum = []

subroutine_trees = []
subroutine_sum_leaves = 0
for tree, num_leaves in zip(trees_ast, trees_num_leaves):

result_total = subroutine_sum_leaves + num_leaves
if subroutine_trees and result_total > MAX_LEAVES_PER_METHOD:
# exceeded the max leaves in the current subroutine,
# finalize this one and start a new one
partial_result = utils.apply_op_to_expressions(
ast.BinNumOpType.ADD,
*subroutine_trees)

to_sum.append(ast.SubroutineExpr(partial_result))

subroutine_trees = []
subroutine_sum_leaves = 0

subroutine_sum_leaves += num_leaves
subroutine_trees.append(tree)

if len(subroutine_trees) != 0:
partial_result = utils.apply_op_to_expressions(
ast.BinNumOpType.ADD,
*subroutine_trees)
to_sum.append(ast.SubroutineExpr(partial_result))
if sum(trees_num_leaves) > self._leaves_cutoff_threshold:
to_sum = self._split_into_subroutines(trees_ast, trees_num_leaves)

result_ast = utils.apply_op_to_expressions(
ast.BinNumOpType.ADD,
Expand Down Expand Up @@ -112,6 +85,34 @@ def _assemble_bin_class_output(self, trees):
proba_expr
])

def _split_into_subroutines(self, trees_ast, trees_num_leaves):
    """Partition tree expressions into subroutine expressions.

    Greedily groups consecutive trees so that each group holds at most
    ``self._leaves_cutoff_threshold`` leaves (a group always keeps at
    least one tree, even if that single tree exceeds the threshold).
    Returns a list of ``ast.SubroutineExpr`` nodes, one per group.
    """
    def _finalize(group):
        # Sum the collected trees and wrap the sum into one subroutine.
        summed = utils.apply_op_to_expressions(
            ast.BinNumOpType.ADD, *group)
        return ast.SubroutineExpr(summed)

    subroutines = []
    current_group = []
    current_leaves = 0
    for tree_expr, leaves in zip(trees_ast, trees_num_leaves):
        would_exceed = current_leaves + leaves > self._leaves_cutoff_threshold
        if current_group and would_exceed:
            # Adding this tree would exceed the leaves cutoff:
            # close the current subroutine and start a new one.
            subroutines.append(_finalize(current_group))
            current_group = []
            current_leaves = 0
        current_group.append(tree_expr)
        current_leaves += leaves

    if current_group:
        subroutines.append(_finalize(current_group))
    return subroutines

def _assemble_tree(self, tree):
    """Convert a single raw tree into an AST expression.

    Must be implemented by concrete assembler subclasses; the raw
    ``tree`` format is model-specific (e.g. an XGBoost dump dict or a
    LightGBM ``tree_structure`` dict).
    """
    raise NotImplementedError

Expand All @@ -124,7 +125,7 @@ class XGBoostModelAssembler(BaseBoostingAssembler):

classifier_name = "XGBClassifier"

def __init__(self, model):
def __init__(self, model, leaves_cutoff_threshold=3000):
feature_names = model.get_booster().feature_names
self._feature_name_to_idx = {
name: idx for idx, name in enumerate(feature_names or [])
Expand All @@ -138,7 +139,8 @@ def __init__(self, model):
best_ntree_limit = getattr(model, "best_ntree_limit", None)

super().__init__(model, trees, base_score=model.base_score,
tree_limit=best_ntree_limit)
tree_limit=best_ntree_limit,
leaves_cutoff_threshold=leaves_cutoff_threshold)

def _assemble_tree(self, tree):
if "leaf" in tree:
Expand Down Expand Up @@ -191,11 +193,12 @@ class LightGBMModelAssembler(BaseBoostingAssembler):

classifier_name = "LGBMClassifier"

def __init__(self, model):
def __init__(self, model, leaves_cutoff_threshold=3000):
model_dump = model.booster_.dump_model()
trees = [m["tree_structure"] for m in model_dump["tree_info"]]

super().__init__(model, trees)
super().__init__(model, trees,
leaves_cutoff_threshold=leaves_cutoff_threshold)

def _assemble_tree(self, tree):
if "leaf_value" in tree:
Expand Down
50 changes: 50 additions & 0 deletions tests/assemblers/test_lightgbm.py
Expand Up @@ -110,3 +110,53 @@ def test_regression():
ast.BinNumOpType.ADD))

assert utils.cmp_exprs(actual, expected)


def test_leaves_cutoff_threshold():
    """With a 1-leaf cutoff every LightGBM tree lands in its own subroutine."""
    estimator = lightgbm.LGBMClassifier(n_estimators=2, random_state=1,
                                        max_depth=1)
    utils.train_model_classification_binary(estimator)

    assembler = assemblers.LightGBMModelAssembler(estimator,
                                                  leaves_cutoff_threshold=1)
    actual = assembler.assemble()

    # Each single-split tree becomes its own subroutine.
    first_tree = ast.SubroutineExpr(
        ast.IfExpr(
            ast.CompExpr(
                ast.FeatureRef(23),
                ast.NumVal(868.2000000000002),
                ast.CompOpType.GT),
            ast.NumVal(0.2762557140263451),
            ast.NumVal(0.6399134166614473)))

    second_tree = ast.SubroutineExpr(
        ast.IfExpr(
            ast.CompExpr(
                ast.FeatureRef(27),
                ast.NumVal(0.14205000000000004),
                ast.CompOpType.GT),
            ast.NumVal(-0.2139321843285849),
            ast.NumVal(0.1151466338793227)))

    # Raw score is the base score (0) plus both tree subroutines.
    raw_score = ast.SubroutineExpr(
        ast.BinNumExpr(
            ast.BinNumExpr(ast.NumVal(0), first_tree,
                           ast.BinNumOpType.ADD),
            second_tree,
            ast.BinNumOpType.ADD))

    # sigmoid(raw_score) = 1 / (1 + exp(-raw_score)), reused for both classes.
    sigmoid = ast.BinNumExpr(
        ast.NumVal(1),
        ast.BinNumExpr(
            ast.NumVal(1),
            ast.ExpExpr(
                ast.BinNumExpr(ast.NumVal(0), raw_score,
                               ast.BinNumOpType.SUB)),
            ast.BinNumOpType.ADD),
        ast.BinNumOpType.DIV,
        to_reuse=True)

    expected = ast.VectorVal([
        ast.BinNumExpr(ast.NumVal(1), sigmoid, ast.BinNumOpType.SUB),
        sigmoid])

    assert utils.cmp_exprs(actual, expected)
50 changes: 50 additions & 0 deletions tests/assemblers/test_xgboost.py
Expand Up @@ -268,3 +268,53 @@ def test_regression_saved_without_feature_names():
ast.BinNumOpType.ADD))

assert utils.cmp_exprs(actual, expected)


def test_leaves_cutoff_threshold():
    """With a 1-leaf cutoff every XGBoost tree lands in its own subroutine."""
    estimator = xgboost.XGBClassifier(n_estimators=2, random_state=1,
                                      max_depth=1)
    utils.train_model_classification_binary(estimator)

    assembler = assemblers.XGBoostModelAssembler(estimator,
                                                 leaves_cutoff_threshold=1)
    actual = assembler.assemble()

    # Each single-split tree becomes its own subroutine.
    first_tree = ast.SubroutineExpr(
        ast.IfExpr(
            ast.CompExpr(
                ast.FeatureRef(20),
                ast.NumVal(16.7950001),
                ast.CompOpType.GTE),
            ast.NumVal(-0.17062147),
            ast.NumVal(0.1638484)))

    second_tree = ast.SubroutineExpr(
        ast.IfExpr(
            ast.CompExpr(
                ast.FeatureRef(27),
                ast.NumVal(0.142349988),
                ast.CompOpType.GTE),
            ast.NumVal(-0.16087772),
            ast.NumVal(0.149866998)))

    # Raw score is the base score (-0.0) plus both tree subroutines.
    raw_score = ast.SubroutineExpr(
        ast.BinNumExpr(
            ast.BinNumExpr(ast.NumVal(-0.0), first_tree,
                           ast.BinNumOpType.ADD),
            second_tree,
            ast.BinNumOpType.ADD))

    # sigmoid(raw_score) = 1 / (1 + exp(-raw_score)), reused for both classes.
    sigmoid = ast.BinNumExpr(
        ast.NumVal(1),
        ast.BinNumExpr(
            ast.NumVal(1),
            ast.ExpExpr(
                ast.BinNumExpr(ast.NumVal(0), raw_score,
                               ast.BinNumOpType.SUB)),
            ast.BinNumOpType.ADD),
        ast.BinNumOpType.DIV,
        to_reuse=True)

    expected = ast.VectorVal([
        ast.BinNumExpr(ast.NumVal(1), sigmoid, ast.BinNumOpType.SUB),
        sigmoid])

    assert utils.cmp_exprs(actual, expected)

0 comments on commit 041f780

Please sign in to comment.