Limit the number of leaves in each subroutine for gradient boosted trees #123

Merged
merged 1 commit on Nov 30, 2019
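The new leaves_cutoff_threshold keyword (default 3000) can also be handed to an assembler directly, which is exactly what the new unit tests below do. A minimal usage sketch, assuming a toy random dataset and an arbitrary threshold of 500 (both are placeholders, not values used anywhere in this change):

import numpy as np
import lightgbm
from m2cgen import assemblers

# Toy data; in practice this would be a real training set.
rng = np.random.RandomState(1)
X, y = rng.rand(200, 5), rng.rand(200)

estimator = lightgbm.LGBMRegressor(n_estimators=100, num_leaves=100,
                                   random_state=1)
estimator.fit(X, y)

# Once the trees assigned to the current subroutine exceed ~500 leaves in
# total, the assembler closes that subroutine and starts a new one.
assembler = assemblers.LightGBMModelAssembler(
    estimator, leaves_cutoff_threshold=500)
model_ast = assembler.assemble()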
89 changes: 81 additions & 8 deletions m2cgen/assemblers/boosting.py
@@ -9,10 +9,12 @@ class BaseBoostingAssembler(ModelAssembler):

classifier_name = None

def __init__(self, model, trees, base_score=0, tree_limit=None):
def __init__(self, model, trees, base_score=0, tree_limit=None,
leaves_cutoff_threshold=3000):
super().__init__(model)
self.all_trees = trees
self._base_score = base_score
self._leaves_cutoff_threshold = leaves_cutoff_threshold

self._output_size = 1
self._is_classification = False
@@ -41,10 +43,19 @@ def _assemble_single_output(self, trees, base_score=0):
trees = trees[:self._tree_limit]

trees_ast = [self._assemble_tree(t) for t in trees]
to_sum = trees_ast

# In a large model we need to generate multiple subroutines to avoid
# Java limitations, see https://github.com/BayesWitnesses/m2cgen/issues/103.
trees_num_leaves = [self._count_leaves(t) for t in trees]
if sum(trees_num_leaves) > self._leaves_cutoff_threshold:
to_sum = self._split_into_subroutines(trees_ast, trees_num_leaves)

result_ast = utils.apply_op_to_expressions(
ast.BinNumOpType.ADD,
ast.NumVal(base_score),
*trees_ast)
*to_sum)

return ast.SubroutineExpr(result_ast)

def _assemble_multi_class_output(self, trees):
@@ -74,15 +85,47 @@ def _assemble_bin_class_output(self, trees):
proba_expr
])

def _split_into_subroutines(self, trees_ast, trees_num_leaves):
result = []
subroutine_trees = []
subroutine_sum_leaves = 0
for tree, num_leaves in zip(trees_ast, trees_num_leaves):
next_sum = subroutine_sum_leaves + num_leaves
if subroutine_trees and next_sum > self._leaves_cutoff_threshold:
# Exceeded the max leaves in the current subroutine,
# finalize this one and start a new one.
partial_result = utils.apply_op_to_expressions(
ast.BinNumOpType.ADD,
*subroutine_trees)

result.append(ast.SubroutineExpr(partial_result))

subroutine_trees = []
subroutine_sum_leaves = 0

subroutine_sum_leaves += num_leaves
subroutine_trees.append(tree)

if subroutine_trees:
partial_result = utils.apply_op_to_expressions(
ast.BinNumOpType.ADD,
*subroutine_trees)
result.append(ast.SubroutineExpr(partial_result))
return result

def _assemble_tree(self, tree):
raise NotImplementedError

@staticmethod
def _count_leaves(tree):
raise NotImplementedError


class XGBoostModelAssembler(BaseBoostingAssembler):

classifier_name = "XGBClassifier"

def __init__(self, model):
def __init__(self, model, leaves_cutoff_threshold=3000):
feature_names = model.get_booster().feature_names
self._feature_name_to_idx = {
name: idx for idx, name in enumerate(feature_names or [])
@@ -96,7 +139,8 @@ def __init__(self, model):
best_ntree_limit = getattr(model, "best_ntree_limit", None)

super().__init__(model, trees, base_score=model.base_score,
tree_limit=best_ntree_limit)
tree_limit=best_ntree_limit,
leaves_cutoff_threshold=leaves_cutoff_threshold)

def _assemble_tree(self, tree):
if "leaf" in tree:
@@ -130,16 +174,31 @@ def _assemble_child_tree(self, tree, child_id):
return self._assemble_tree(child)
assert False, "Unexpected child ID {}".format(child_id)

@staticmethod
def _count_leaves(tree):
queue = [tree]
num_leaves = 0

while queue:
tree = queue.pop()
if "leaf" in tree:
num_leaves += 1
elif "children" in tree:
for child in tree["children"]:
queue.append(child)
return num_leaves
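For reference, a tiny hand-written node in the XGBoost JSON dump format (the same shape _count_leaves walks above) and the count it yields; the structure and numbers are made up purely for illustration:

from m2cgen.assemblers.boosting import XGBoostModelAssembler

# A root split with one leaf child and one nested split: three leaves in total.
toy_tree = {
    "split": "f0",
    "children": [
        {"leaf": 0.5},
        {"split": "f1", "children": [{"leaf": -0.25}, {"leaf": 0.75}]},
    ],
}

assert XGBoostModelAssembler._count_leaves(toy_tree) == 3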


class LightGBMModelAssembler(BaseBoostingAssembler):

classifier_name = "LGBMClassifier"

def __init__(self, model):
def __init__(self, model, leaves_cutoff_threshold=3000):
model_dump = model.booster_.dump_model()
trees = [m["tree_structure"] for m in model_dump["tree_info"]]

super().__init__(model, trees)
super().__init__(model, trees,
leaves_cutoff_threshold=leaves_cutoff_threshold)

def _assemble_tree(self, tree):
if "leaf_value" in tree:
@@ -151,9 +210,9 @@ def _assemble_tree(self, tree):
op = ast.CompOpType.from_str_op(tree["decision_type"])
assert op == ast.CompOpType.LTE, "Unexpected comparison op"

# Make sure that if the 'default_left' is true the left tree branch
# Make sure that if the "default_left" is true the left tree branch
# ends up in the "else" branch of the ast.IfExpr.
if tree['default_left']:
if tree["default_left"]:
op = ast.CompOpType.GT
true_child = tree["right_child"]
false_child = tree["left_child"]
@@ -166,6 +225,20 @@ def _assemble_tree(self, tree):
self._assemble_tree(true_child),
self._assemble_tree(false_child))

@staticmethod
def _count_leaves(tree):
queue = [tree]
num_leaves = 0

while queue:
tree = queue.pop()
if "leaf_value" in tree:
num_leaves += 1
else:
queue.append(tree["left_child"])
queue.append(tree["right_child"])
return num_leaves


def _split_trees_by_classes(trees, n_classes):
# Splits are computed based on a comment
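To make the grouping performed by _split_into_subroutines easier to follow outside the diff, here is a standalone sketch of the same greedy strategy with plain integers standing in for tree ASTs; the helper name split_by_leaf_count is made up for this note and is not part of the change:

def split_by_leaf_count(tree_leaf_counts, threshold):
    # Greedily group consecutive trees so that each group stays at or below
    # the threshold of total leaves; a single oversized tree still forms its
    # own group, mirroring the assembler's behaviour.
    groups, current, current_leaves = [], [], 0
    for idx, num_leaves in enumerate(tree_leaf_counts):
        if current and current_leaves + num_leaves > threshold:
            groups.append(current)
            current, current_leaves = [], 0
        current.append(idx)
        current_leaves += num_leaves
    if current:
        groups.append(current)
    return groups

# With a threshold of 5, trees holding 3, 3, 2, 4 and 6 leaves end up in
# four groups: [0], [1, 2], [3] and [4].
print(split_by_leaf_count([3, 3, 2, 4, 6], threshold=5))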
50 changes: 50 additions & 0 deletions tests/assemblers/test_lightgbm.py
@@ -110,3 +110,53 @@ def test_regression():
ast.BinNumOpType.ADD))

assert utils.cmp_exprs(actual, expected)


def test_leaves_cutoff_threshold():
estimator = lightgbm.LGBMClassifier(n_estimators=2, random_state=1,
max_depth=1)
utils.train_model_classification_binary(estimator)

assembler = assemblers.LightGBMModelAssembler(estimator,
leaves_cutoff_threshold=1)
actual = assembler.assemble()

sigmoid = ast.BinNumExpr(
ast.NumVal(1),
ast.BinNumExpr(
ast.NumVal(1),
ast.ExpExpr(
ast.BinNumExpr(
ast.NumVal(0),
ast.SubroutineExpr(
ast.BinNumExpr(
ast.BinNumExpr(
ast.NumVal(0),
ast.SubroutineExpr(
ast.IfExpr(
ast.CompExpr(
ast.FeatureRef(23),
ast.NumVal(868.2000000000002),
ast.CompOpType.GT),
ast.NumVal(0.2762557140263451),
ast.NumVal(0.6399134166614473))),
ast.BinNumOpType.ADD),
ast.SubroutineExpr(
ast.IfExpr(
ast.CompExpr(
ast.FeatureRef(27),
ast.NumVal(0.14205000000000004),
ast.CompOpType.GT),
ast.NumVal(-0.2139321843285849),
ast.NumVal(0.1151466338793227))),
ast.BinNumOpType.ADD)),
ast.BinNumOpType.SUB)),
ast.BinNumOpType.ADD),
ast.BinNumOpType.DIV,
to_reuse=True)

expected = ast.VectorVal([
ast.BinNumExpr(ast.NumVal(1), sigmoid, ast.BinNumOpType.SUB),
sigmoid])

assert utils.cmp_exprs(actual, expected)
50 changes: 50 additions & 0 deletions tests/assemblers/test_xgboost.py
@@ -268,3 +268,53 @@ def test_regression_saved_without_feature_names():
ast.BinNumOpType.ADD))

assert utils.cmp_exprs(actual, expected)


def test_leaves_cutoff_threshold():
estimator = xgboost.XGBClassifier(n_estimators=2, random_state=1,
max_depth=1)
utils.train_model_classification_binary(estimator)

assembler = assemblers.XGBoostModelAssembler(estimator,
leaves_cutoff_threshold=1)
actual = assembler.assemble()

sigmoid = ast.BinNumExpr(
ast.NumVal(1),
ast.BinNumExpr(
ast.NumVal(1),
ast.ExpExpr(
ast.BinNumExpr(
ast.NumVal(0),
ast.SubroutineExpr(
ast.BinNumExpr(
ast.BinNumExpr(
ast.NumVal(-0.0),
ast.SubroutineExpr(
ast.IfExpr(
ast.CompExpr(
ast.FeatureRef(20),
ast.NumVal(16.7950001),
ast.CompOpType.GTE),
ast.NumVal(-0.17062147),
ast.NumVal(0.1638484))),
ast.BinNumOpType.ADD),
ast.SubroutineExpr(
ast.IfExpr(
ast.CompExpr(
ast.FeatureRef(27),
ast.NumVal(0.142349988),
ast.CompOpType.GTE),
ast.NumVal(-0.16087772),
ast.NumVal(0.149866998))),
ast.BinNumOpType.ADD)),
ast.BinNumOpType.SUB)),
ast.BinNumOpType.ADD),
ast.BinNumOpType.DIV,
to_reuse=True)

expected = ast.VectorVal([
ast.BinNumExpr(ast.NumVal(1), sigmoid, ast.BinNumOpType.SUB),
sigmoid])

assert utils.cmp_exprs(actual, expected)
45 changes: 45 additions & 0 deletions tests/e2e/test_e2e.py
@@ -51,6 +51,30 @@ def classification_binary(model):
)


def regression_random(model):
return (
model,
utils.train_model_regression_random_data,
REGRESSION,
)


def classification_random(model):
return (
model,
utils.train_model_classification_random_data,
CLASSIFICATION,
)


def classification_binary_random(model):
return (
model,
utils.train_model_classification_binary_random_data,
CLASSIFICATION,
)


# Absolute tolerance. Used in np.isclose to compare 2 values.
# We compare 6 decimal digits.
ATOL = 1.e-6
@@ -63,6 +87,11 @@ def classification_binary(model):
LIGHT_GBM_PARAMS = dict(n_estimators=10, random_state=RANDOM_SEED)
SVC_PARAMS = dict(random_state=RANDOM_SEED, decision_function_shape="ovo")

XGBOOST_PARAMS_LARGE = dict(base_score=0.6, n_estimators=100, max_depth=12,
random_state=RANDOM_SEED)
LIGHT_GBM_PARAMS_LARGE = dict(n_estimators=100, num_leaves=100, max_depth=64,
random_state=RANDOM_SEED)


@utils.cartesian_e2e_params(
# These are the languages which support all models specified in the
@@ -85,11 +114,27 @@ def classification_binary(model):
classification(lightgbm.LGBMClassifier(**LIGHT_GBM_PARAMS)),
classification_binary(lightgbm.LGBMClassifier(**LIGHT_GBM_PARAMS)),

# LightGBM (Large Trees)
regression_random(
lightgbm.LGBMRegressor(**LIGHT_GBM_PARAMS_LARGE)),
classification_random(
lightgbm.LGBMClassifier(**LIGHT_GBM_PARAMS_LARGE)),
classification_binary_random(
lightgbm.LGBMClassifier(**LIGHT_GBM_PARAMS_LARGE)),

# XGBoost
regression(xgboost.XGBRegressor(**XGBOOST_PARAMS)),
classification(xgboost.XGBClassifier(**XGBOOST_PARAMS)),
classification_binary(xgboost.XGBClassifier(**XGBOOST_PARAMS)),

# XGBoost (Large Trees)
regression_random(
xgboost.XGBRegressor(**XGBOOST_PARAMS_LARGE)),
classification_random(
xgboost.XGBClassifier(**XGBOOST_PARAMS_LARGE)),
classification_binary_random(
xgboost.XGBClassifier(**XGBOOST_PARAMS_LARGE)),

# Linear SVM
regression(svm.LinearSVR(random_state=RANDOM_SEED)),
classification(svm.LinearSVC(random_state=RANDOM_SEED)),