Limit the number of leaves in each subroutine for gradient boosted trees
Fixes #103
chris-smith-zocdoc authored and izeigerman committed Nov 30, 2019
1 parent 30abeef commit 02388fa
Showing 5 changed files with 269 additions and 11 deletions.
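In short: once the total number of leaves across the assembled trees exceeds a configurable leaves_cutoff_threshold (default 3000), the sum of trees is split into several ast.SubroutineExpr chunks, so interpreters can emit multiple small methods instead of one method that trips Java's limits on generated method size. A minimal usage sketch follows (the dataset and fit step are illustrative placeholders; the assembler API is the one exercised by the new tests below):

import lightgbm
from sklearn.datasets import load_breast_cancer

from m2cgen import assemblers

# Illustrative training step; any fitted LightGBM model works here.
X, y = load_breast_cancer(return_X_y=True)
estimator = lightgbm.LGBMClassifier(n_estimators=100, num_leaves=100)
estimator.fit(X, y)

# Lower the cutoff so the assembled AST is chunked into subroutines.
assembler = assemblers.LightGBMModelAssembler(estimator,
                                              leaves_cutoff_threshold=1000)
model_ast = assembler.assemble()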
89 changes: 81 additions & 8 deletions m2cgen/assemblers/boosting.py
@@ -9,10 +9,12 @@ class BaseBoostingAssembler(ModelAssembler):
 
     classifier_name = None
 
-    def __init__(self, model, trees, base_score=0, tree_limit=None):
+    def __init__(self, model, trees, base_score=0, tree_limit=None,
+                 leaves_cutoff_threshold=3000):
         super().__init__(model)
         self.all_trees = trees
         self._base_score = base_score
+        self._leaves_cutoff_threshold = leaves_cutoff_threshold
 
         self._output_size = 1
         self._is_classification = False
@@ -41,10 +43,19 @@ def _assemble_single_output(self, trees, base_score=0):
             trees = trees[:self._tree_limit]
 
         trees_ast = [self._assemble_tree(t) for t in trees]
+        to_sum = trees_ast
+
+        # In a large model we need to generate multiple subroutines to avoid
+        # Java limitations, https://github.com/BayesWitnesses/m2cgen/issues/103.
+        trees_num_leaves = [self._count_leaves(t) for t in trees]
+        if sum(trees_num_leaves) > self._leaves_cutoff_threshold:
+            to_sum = self._split_into_subroutines(trees_ast, trees_num_leaves)
+
         result_ast = utils.apply_op_to_expressions(
             ast.BinNumOpType.ADD,
             ast.NumVal(base_score),
-            *trees_ast)
+            *to_sum)
 
         return ast.SubroutineExpr(result_ast)
 
     def _assemble_multi_class_output(self, trees):
@@ -74,15 +85,47 @@ def _assemble_bin_class_output(self, trees):
             proba_expr
         ])
 
+    def _split_into_subroutines(self, trees_ast, trees_num_leaves):
+        result = []
+        subroutine_trees = []
+        subroutine_sum_leaves = 0
+        for tree, num_leaves in zip(trees_ast, trees_num_leaves):
+            next_sum = subroutine_sum_leaves + num_leaves
+            if subroutine_trees and next_sum > self._leaves_cutoff_threshold:
+                # Exceeded the max number of leaves for the current
+                # subroutine; finalize it and start a new one.
+                partial_result = utils.apply_op_to_expressions(
+                    ast.BinNumOpType.ADD,
+                    *subroutine_trees)
+
+                result.append(ast.SubroutineExpr(partial_result))
+
+                subroutine_trees = []
+                subroutine_sum_leaves = 0
+
+            subroutine_sum_leaves += num_leaves
+            subroutine_trees.append(tree)
+
+        if subroutine_trees:
+            partial_result = utils.apply_op_to_expressions(
+                ast.BinNumOpType.ADD,
+                *subroutine_trees)
+            result.append(ast.SubroutineExpr(partial_result))
+        return result
+
     def _assemble_tree(self, tree):
         raise NotImplementedError
 
+    @staticmethod
+    def _count_leaves(tree):
+        raise NotImplementedError
+
 
 class XGBoostModelAssembler(BaseBoostingAssembler):
 
     classifier_name = "XGBClassifier"
 
-    def __init__(self, model):
+    def __init__(self, model, leaves_cutoff_threshold=3000):
         feature_names = model.get_booster().feature_names
         self._feature_name_to_idx = {
             name: idx for idx, name in enumerate(feature_names or [])
@@ -96,7 +139,8 @@ def __init__(self, model):
         best_ntree_limit = getattr(model, "best_ntree_limit", None)
 
         super().__init__(model, trees, base_score=model.base_score,
-                         tree_limit=best_ntree_limit)
+                         tree_limit=best_ntree_limit,
+                         leaves_cutoff_threshold=leaves_cutoff_threshold)
 
     def _assemble_tree(self, tree):
         if "leaf" in tree:
@@ -130,16 +174,31 @@ def _assemble_child_tree(self, tree, child_id):
                 return self._assemble_tree(child)
         assert False, "Unexpected child ID {}".format(child_id)
 
+    @staticmethod
+    def _count_leaves(tree):
+        queue = [tree]
+        num_leaves = 0
+
+        while queue:
+            tree = queue.pop()
+            if "leaf" in tree:
+                num_leaves += 1
+            elif "children" in tree:
+                for child in tree["children"]:
+                    queue.append(child)
+        return num_leaves
+
 
 class LightGBMModelAssembler(BaseBoostingAssembler):
 
     classifier_name = "LGBMClassifier"
 
-    def __init__(self, model):
+    def __init__(self, model, leaves_cutoff_threshold=3000):
         model_dump = model.booster_.dump_model()
         trees = [m["tree_structure"] for m in model_dump["tree_info"]]
 
-        super().__init__(model, trees)
+        super().__init__(model, trees,
+                         leaves_cutoff_threshold=leaves_cutoff_threshold)
 
     def _assemble_tree(self, tree):
         if "leaf_value" in tree:
@@ -151,9 +210,9 @@ def _assemble_tree(self, tree):
         op = ast.CompOpType.from_str_op(tree["decision_type"])
         assert op == ast.CompOpType.LTE, "Unexpected comparison op"
 
-        # Make sure that if the 'default_left' is true the left tree branch
+        # Make sure that if the "default_left" is true the left tree branch
         # ends up in the "else" branch of the ast.IfExpr.
-        if tree['default_left']:
+        if tree["default_left"]:
             op = ast.CompOpType.GT
             true_child = tree["right_child"]
             false_child = tree["left_child"]
@@ -166,6 +225,20 @@ def _assemble_tree(self, tree):
             self._assemble_tree(true_child),
             self._assemble_tree(false_child))
 
+    @staticmethod
+    def _count_leaves(tree):
+        queue = [tree]
+        num_leaves = 0
+
+        while queue:
+            tree = queue.pop()
+            if "leaf_value" in tree:
+                num_leaves += 1
+            else:
+                queue.append(tree["left_child"])
+                queue.append(tree["right_child"])
+        return num_leaves
+
 
 def _split_trees_by_classes(trees, n_classes):
     # Splits are computed based on a comment
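To make the grouping performed by _split_into_subroutines concrete, here is a standalone sketch of the same greedy strategy (the function name and sample values are illustrative, not part of the diff): trees accumulate into the current subroutine until adding the next one would push the running leaf total past the threshold, at which point the group is finalized and a new one begins.

def split_by_leaves(num_leaves, threshold):
    # Greedy grouping, mirroring _split_into_subroutines but over plain
    # leaf counts instead of tree ASTs.
    groups, current, total = [], [], 0
    for n in num_leaves:
        if current and total + n > threshold:
            groups.append(current)
            current, total = [], 0
        current.append(n)
        total += n
    if current:
        groups.append(current)
    return groups

assert split_by_leaves([5, 4, 3], threshold=8) == [[5], [4, 3]]
# A tree that alone exceeds the threshold is not split further;
# it simply occupies a subroutine of its own.
assert split_by_leaves([10, 2], threshold=8) == [[10], [2]]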
50 changes: 50 additions & 0 deletions tests/assemblers/test_lightgbm.py
@@ -110,3 +110,53 @@ def test_regression():
             ast.BinNumOpType.ADD))
 
     assert utils.cmp_exprs(actual, expected)
+
+
+def test_leaves_cutoff_threshold():
+    estimator = lightgbm.LGBMClassifier(n_estimators=2, random_state=1,
+                                        max_depth=1)
+    utils.train_model_classification_binary(estimator)
+
+    assembler = assemblers.LightGBMModelAssembler(estimator,
+                                                  leaves_cutoff_threshold=1)
+    actual = assembler.assemble()
+
+    sigmoid = ast.BinNumExpr(
+        ast.NumVal(1),
+        ast.BinNumExpr(
+            ast.NumVal(1),
+            ast.ExpExpr(
+                ast.BinNumExpr(
+                    ast.NumVal(0),
+                    ast.SubroutineExpr(
+                        ast.BinNumExpr(
+                            ast.BinNumExpr(
+                                ast.NumVal(0),
+                                ast.SubroutineExpr(
+                                    ast.IfExpr(
+                                        ast.CompExpr(
+                                            ast.FeatureRef(23),
+                                            ast.NumVal(868.2000000000002),
+                                            ast.CompOpType.GT),
+                                        ast.NumVal(0.2762557140263451),
+                                        ast.NumVal(0.6399134166614473))),
+                                ast.BinNumOpType.ADD),
+                            ast.SubroutineExpr(
+                                ast.IfExpr(
+                                    ast.CompExpr(
+                                        ast.FeatureRef(27),
+                                        ast.NumVal(0.14205000000000004),
+                                        ast.CompOpType.GT),
+                                    ast.NumVal(-0.2139321843285849),
+                                    ast.NumVal(0.1151466338793227))),
+                            ast.BinNumOpType.ADD)),
+                    ast.BinNumOpType.SUB)),
+            ast.BinNumOpType.ADD),
+        ast.BinNumOpType.DIV,
+        to_reuse=True)
+
+    expected = ast.VectorVal([
+        ast.BinNumExpr(ast.NumVal(1), sigmoid, ast.BinNumOpType.SUB),
+        sigmoid])
+
+    assert utils.cmp_exprs(actual, expected)
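Each of the two stumps above has two leaves, so with leaves_cutoff_threshold=1 every tree exceeds the cutoff and lands in its own nested SubroutineExpr, as the expected AST shows. The counting itself walks the tree_structure dicts dumped by LightGBM; a hand-written illustration (a minimal assumed shape, real dumps carry many more keys):

from m2cgen import assemblers

tree = {
    "left_child": {"leaf_value": 0.64},
    "right_child": {"leaf_value": 0.28},
}
# _count_leaves is a staticmethod, so no fitted model is required.
assert assemblers.LightGBMModelAssembler._count_leaves(tree) == 2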
50 changes: 50 additions & 0 deletions tests/assemblers/test_xgboost.py
@@ -268,3 +268,53 @@ def test_regression_saved_without_feature_names():
             ast.BinNumOpType.ADD))
 
     assert utils.cmp_exprs(actual, expected)
+
+
+def test_leaves_cutoff_threshold():
+    estimator = xgboost.XGBClassifier(n_estimators=2, random_state=1,
+                                      max_depth=1)
+    utils.train_model_classification_binary(estimator)
+
+    assembler = assemblers.XGBoostModelAssembler(estimator,
+                                                 leaves_cutoff_threshold=1)
+    actual = assembler.assemble()
+
+    sigmoid = ast.BinNumExpr(
+        ast.NumVal(1),
+        ast.BinNumExpr(
+            ast.NumVal(1),
+            ast.ExpExpr(
+                ast.BinNumExpr(
+                    ast.NumVal(0),
+                    ast.SubroutineExpr(
+                        ast.BinNumExpr(
+                            ast.BinNumExpr(
+                                ast.NumVal(-0.0),
+                                ast.SubroutineExpr(
+                                    ast.IfExpr(
+                                        ast.CompExpr(
+                                            ast.FeatureRef(20),
+                                            ast.NumVal(16.7950001),
+                                            ast.CompOpType.GTE),
+                                        ast.NumVal(-0.17062147),
+                                        ast.NumVal(0.1638484))),
+                                ast.BinNumOpType.ADD),
+                            ast.SubroutineExpr(
+                                ast.IfExpr(
+                                    ast.CompExpr(
+                                        ast.FeatureRef(27),
+                                        ast.NumVal(0.142349988),
+                                        ast.CompOpType.GTE),
+                                    ast.NumVal(-0.16087772),
+                                    ast.NumVal(0.149866998))),
+                            ast.BinNumOpType.ADD)),
+                    ast.BinNumOpType.SUB)),
+            ast.BinNumOpType.ADD),
+        ast.BinNumOpType.DIV,
+        to_reuse=True)
+
+    expected = ast.VectorVal([
+        ast.BinNumExpr(ast.NumVal(1), sigmoid, ast.BinNumOpType.SUB),
+        sigmoid])
+
+    assert utils.cmp_exprs(actual, expected)
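The XGBoost counterpart differs only in the dump format it walks: leaves carry a "leaf" key and internal nodes a "children" list. Again, a hand-written minimal dict (assumed shape) illustrates the count:

from m2cgen import assemblers

tree = {
    "children": [
        {"leaf": 0.1638484},
        {"children": [{"leaf": -0.17062147}, {"leaf": 0.149866998}]},
    ]
}
assert assemblers.XGBoostModelAssembler._count_leaves(tree) == 3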
45 changes: 45 additions & 0 deletions tests/e2e/test_e2e.py
@@ -49,6 +49,30 @@ def classification_binary(model):
     )
 
 
+def regression_random(model):
+    return (
+        model,
+        utils.train_model_regression_random_data,
+        REGRESSION,
+    )
+
+
+def classification_random(model):
+    return (
+        model,
+        utils.train_model_classification_random_data,
+        CLASSIFICATION,
+    )
+
+
+def classification_binary_random(model):
+    return (
+        model,
+        utils.train_model_classification_binary_random_data,
+        CLASSIFICATION,
+    )
+
+
 # Absolute tolerance. Used in np.isclose to compare 2 values.
 # We compare 6 decimal digits.
 ATOL = 1.e-6
@@ -61,6 +85,11 @@ def classification_binary(model):
 LIGHT_GBM_PARAMS = dict(n_estimators=10, random_state=RANDOM_SEED)
 SVC_PARAMS = dict(random_state=RANDOM_SEED, decision_function_shape="ovo")
 
+XGBOOST_PARAMS_LARGE = dict(base_score=0.6, n_estimators=100, max_depth=12,
+                            random_state=RANDOM_SEED)
+LIGHT_GBM_PARAMS_LARGE = dict(n_estimators=100, num_leaves=100, max_depth=64,
+                              random_state=RANDOM_SEED)
+
 
 @utils.cartesian_e2e_params(
     # These are the languages which support all models specified in the
@@ -81,11 +110,27 @@
     classification(lightgbm.LGBMClassifier(**LIGHT_GBM_PARAMS)),
     classification_binary(lightgbm.LGBMClassifier(**LIGHT_GBM_PARAMS)),
+    # LightGBM (Large Trees)
+    regression_random(
+        lightgbm.LGBMRegressor(**LIGHT_GBM_PARAMS_LARGE)),
+    classification_random(
+        lightgbm.LGBMClassifier(**LIGHT_GBM_PARAMS_LARGE)),
+    classification_binary_random(
+        lightgbm.LGBMClassifier(**LIGHT_GBM_PARAMS_LARGE)),
     # XGBoost
     regression(xgboost.XGBRegressor(**XGBOOST_PARAMS)),
     classification(xgboost.XGBClassifier(**XGBOOST_PARAMS)),
     classification_binary(xgboost.XGBClassifier(**XGBOOST_PARAMS)),
+    # XGBoost (Large Trees)
+    regression_random(
+        xgboost.XGBRegressor(**XGBOOST_PARAMS_LARGE)),
+    classification_random(
+        xgboost.XGBClassifier(**XGBOOST_PARAMS_LARGE)),
+    classification_binary_random(
+        xgboost.XGBClassifier(**XGBOOST_PARAMS_LARGE)),
     # Linear SVM
     regression(svm.LinearSVR(random_state=RANDOM_SEED)),
     classification(svm.LinearSVC(random_state=RANDOM_SEED)),
