Skip to content

Commit

Permalink
Merge c68ecfe into c441b7f
Browse files Browse the repository at this point in the history
  • Loading branch information
izeigerman committed Feb 11, 2019
2 parents c441b7f + c68ecfe commit 3594f60
Show file tree
Hide file tree
Showing 21 changed files with 464 additions and 18 deletions.
5 changes: 5 additions & 0 deletions README.md
Expand Up @@ -42,6 +42,11 @@
<td>RandomForestClassifier, ExtraTreesClassifier</td>
<td>RandomForestRegressor, ExtraTreesRegressor</td>
</tr>
<tr>
<th>Boosting</th>
<td>XGBClassifier (gbtree/dart booster only)</td>
<td>XGBRegressor (gbtree/dart booster only)</td>
</tr>
</tbody>
</table>

Expand Down
8 changes: 7 additions & 1 deletion m2cgen/assemblers/__init__.py
@@ -1,15 +1,21 @@
from .linear import LinearModelAssembler
from .tree import TreeModelAssembler
from .ensemble import RandomForestModelAssembler
from .xgboost import XGBoostModelAssembler

# Public names exported by ``from m2cgen.assemblers import *``.
# ``__all__`` must contain the *names* as strings: listing the class objects
# themselves makes a star-import fail with a TypeError.
__all__ = [
    "LinearModelAssembler",
    "TreeModelAssembler",
    "RandomForestModelAssembler",
    "XGBoostModelAssembler",
]


SUPPORTED_MODELS = {
# XGBoost
"XGBClassifier": XGBoostModelAssembler,
"XGBRegressor": XGBoostModelAssembler,

# SVM
"LinearSVC": LinearModelAssembler,
"LinearSVR": LinearModelAssembler,
Expand Down
26 changes: 24 additions & 2 deletions m2cgen/assemblers/utils.py
Expand Up @@ -33,7 +33,7 @@ def apply_bin_op(left, right, op):
return exr_class(left, right, op)


def apply_op_to_expressions(op, *exprs):
def apply_op_to_expressions(op, *exprs, to_reuse=False):
if len(exprs) < 2:
raise ValueError("At least two expressions are required")

Expand All @@ -44,7 +44,9 @@ def _inner(current_expr, *rest_exprs):
return _inner(
apply_bin_op(current_expr, rest_exprs[0], op), *rest_exprs[1:])

return _inner(apply_bin_op(exprs[0], exprs[1], op), *exprs[2:])
result = _inner(apply_bin_op(exprs[0], exprs[1], op), *exprs[2:])
result.to_reuse = to_reuse
return result


def to_1d_array(var):
Expand All @@ -57,3 +59,23 @@ def to_2d_array(var):
else:
x, y = 1, np.size(var)
return np.reshape(np.asarray(var), (x, y))


def sigmoid_expr(expr, to_reuse=False):
    """Build the AST for the logistic function ``1 / (1 + exp(-expr))``.

    The negation is expressed as ``0 - expr``. ``to_reuse`` is forwarded
    only to the outermost division node.
    """
    negated = ast.BinNumExpr(ast.NumVal(0), expr, ast.BinNumOpType.SUB)
    denominator = ast.BinNumExpr(
        ast.NumVal(1), ast.ExpExpr(negated), ast.BinNumOpType.ADD)
    return ast.BinNumExpr(
        ast.NumVal(1), denominator, ast.BinNumOpType.DIV, to_reuse=to_reuse)


def softmax_exprs(exprs):
    """Build softmax ASTs: ``exp(e_i) / sum_j exp(e_j)`` for every input.

    Each exponential node and the shared sum node are created with
    ``to_reuse=True`` because every output expression refers to them.
    Returns a list with one division expression per input expression.
    """
    exponentials = [ast.ExpExpr(item, to_reuse=True) for item in exprs]
    total = apply_op_to_expressions(
        ast.BinNumOpType.ADD, *exponentials, to_reuse=True)

    normalized = []
    for numerator in exponentials:
        normalized.append(
            ast.BinNumExpr(numerator, total, ast.BinNumOpType.DIV))
    return normalized
110 changes: 110 additions & 0 deletions m2cgen/assemblers/xgboost.py
@@ -0,0 +1,110 @@
import json
import numpy as np
from m2cgen import ast
from m2cgen.assemblers import utils
from m2cgen.assemblers.base import ModelAssembler


class XGBoostModelAssembler(ModelAssembler):
    """Assembles an AST from an XGBoost scikit-learn style model.

    The booster is dumped as JSON, every tree is turned into nested
    ``ast.IfExpr`` nodes, and the trees are summed together with a base
    score. For classifiers the sum is additionally passed through a sigmoid
    (two classes) or a softmax (more than two classes).
    """

    def __init__(self, model):
        super().__init__(model)
        self._base_score = self.model.base_score

        # NOTE(review): some XGBoost versions appear to report ``None`` here
        # when the model was fitted on unnamed arrays -- confirm. Treat that
        # as "no names known" instead of crashing in enumerate().
        feature_names = self.model.get_booster().feature_names or []
        self._feature_name_to_idx = {
            name: idx for idx, name in enumerate(feature_names)
        }

        # A single output is used for regression and binary classification;
        # multi-class models get one output per class.
        self._output_size = 1
        self._is_classification = False
        model_class_name = type(model).__name__
        if model_class_name == "XGBClassifier":
            self._is_classification = True
            if self.model.n_classes_ > 2:
                self._output_size = self.model.n_classes_

    def assemble(self):
        """Return the AST for the whole model."""
        model_dump = self.model.get_booster().get_dump(dump_format="json")
        trees = [json.loads(d) for d in model_dump]

        if self._is_classification:
            if self._output_size == 1:
                return self._assemble_bin_class_output(trees)
            else:
                return self._assemble_multi_class_output(trees)
        else:
            return self._assemble_single_output(trees, self._base_score)

    def _assemble_multi_class_output(self, trees):
        """Return a vector of per-class probabilities (softmax of sums)."""
        # Multi-class output is calculated based on discussion in
        # https://github.com/dmlc/xgboost/issues/1746#issuecomment-295962863
        splits = _split_trees_by_classes(trees, self._output_size)

        base_score = self._base_score
        exprs = [self._assemble_single_output(t, base_score) for t in splits]

        proba_exprs = utils.softmax_exprs(exprs)
        return ast.VectorVal(proba_exprs)

    def _assemble_bin_class_output(self, trees):
        """Return the two-element vector ``[1 - p, p]`` with p = sigmoid."""
        # Base score is calculated based on https://github.com/dmlc/xgboost/blob/master/src/objective/regression_loss.h#L64 # noqa
        # return -logf(1.0f / base_score - 1.0f);
        base_score = -np.log(1.0 / self._base_score - 1.0)
        expr = self._assemble_single_output(trees, base_score)

        # The probability appears in both vector elements, hence to_reuse.
        proba_expr = utils.sigmoid_expr(expr, to_reuse=True)

        return ast.VectorVal([
            ast.BinNumExpr(ast.NumVal(1), proba_expr, ast.BinNumOpType.SUB),
            proba_expr
        ])

    def _assemble_single_output(self, trees, base_score):
        """Return a subroutine computing ``base_score + sum(trees)``."""
        trees_ast = [self._assemble_tree(t) for t in trees]
        result_ast = utils.apply_op_to_expressions(
            ast.BinNumOpType.ADD,
            ast.NumVal(base_score),
            *trees_ast)
        return ast.SubroutineExpr(result_ast)

    def _assemble_tree(self, tree):
        """Recursively convert one dumped tree node (a dict) into an AST."""
        if "leaf" in tree:
            return ast.NumVal(tree["leaf"])

        threshold = ast.NumVal(tree["split_condition"])
        feature_idx = self._feature_index(tree["split"])
        feature_ref = ast.FeatureRef(feature_idx)

        # Since comparison with NaN (missing) value always returns false we
        # should make sure that the node ID specified in the "missing" field
        # always ends up in the "else" branch of the ast.IfExpr.
        use_lt_comp = tree["missing"] == tree["no"]
        if use_lt_comp:
            comp_op = ast.CompOpType.LT
            true_child_id = tree["yes"]
            false_child_id = tree["no"]
        else:
            comp_op = ast.CompOpType.GTE
            true_child_id = tree["no"]
            false_child_id = tree["yes"]

        return ast.IfExpr(ast.CompExpr(feature_ref, threshold, comp_op),
                          self._assemble_child_tree(tree, true_child_id),
                          self._assemble_child_tree(tree, false_child_id))

    def _feature_index(self, name):
        """Map a feature name found in the dump to its positional index."""
        try:
            return self._feature_name_to_idx[name]
        except KeyError:
            # Fall back only when the booster supplied no names at all.
            # NOTE(review): relies on XGBoost naming unnamed features
            # "f<index>" in its dumps -- confirm against the XGBoost docs.
            if (not self._feature_name_to_idx
                    and name.startswith("f") and name[1:].isdecimal()):
                return int(name[1:])
            raise

    def _assemble_child_tree(self, tree, child_id):
        """Assemble the child of ``tree`` whose ``nodeid`` is ``child_id``.

        Raises AssertionError if no such child exists.
        """
        for child in tree["children"]:
            if child["nodeid"] == child_id:
                return self._assemble_tree(child)
        # Raised explicitly (rather than ``assert False``) so the check is
        # not stripped when Python runs with -O, which would otherwise make
        # this method silently return None.
        raise AssertionError("Unexpected child ID {}".format(child_id))


def _split_trees_by_classes(trees, n_classes):
    """Distribute ``trees`` round-robin over ``n_classes`` groups.

    The tree at position ``i`` is assigned to class ``i % n_classes``, so
    the relative order of trees within each class is preserved. ``trees``
    may be any iterable. Returns a list of ``n_classes`` lists.
    """
    # Splits are computed based on a comment
    # https://github.com/dmlc/xgboost/issues/1746#issuecomment-267400592.
    trees_by_classes = [[] for _ in range(n_classes)]
    for tree_idx, tree in enumerate(trees):
        trees_by_classes[tree_idx % n_classes].append(tree)
    return trees_by_classes
26 changes: 24 additions & 2 deletions m2cgen/ast.py
Expand Up @@ -3,6 +3,12 @@

class Expr:
output_size = 1
# Setting this value to true serves as an indication that the result
# of evaluation of this expression is being used in other expressions
# and it's recommended to persist or cache it in some way.
# The actual caching mechanism (if any) is left up to a specific
# interpreter implementation to provide.
to_reuse = False


class FeatureRef(Expr):
Expand Down Expand Up @@ -31,6 +37,16 @@ def __str__(self):
return "NumVal(" + str(self.value) + ")"


class ExpExpr(NumExpr):
    """Numeric expression wrapping a nested expression for exponentiation.

    ``to_reuse`` marks the node's result as worth caching by interpreters.
    """

    def __init__(self, expr, to_reuse=False):
        self.expr, self.to_reuse = expr, to_reuse

    def __str__(self):
        return "ExpExpr({},to_reuse={})".format(self.expr, self.to_reuse)


class BinNumOpType(Enum):
ADD = '+'
SUB = '-'
Expand All @@ -39,16 +55,22 @@ class BinNumOpType(Enum):


class BinNumExpr(NumExpr, BinExpr):
def __init__(self, left, right, op):
def __init__(self, left, right, op, to_reuse=False):
assert left.output_size == 1, "Only scalars are supported"
assert right.output_size == 1, "Only scalars are supported"

self.left = left
self.right = right
self.op = op
self.to_reuse = to_reuse

def __str__(self):
args = ",".join([str(self.left), str(self.right), self.op.name])
args = ",".join([
str(self.left),
str(self.right),
self.op.name,
"to_reuse=" + str(self.to_reuse)
])
return "BinNumExpr(" + args + ")"


Expand Down
4 changes: 4 additions & 0 deletions m2cgen/interpreters/c/code_generator.py
Expand Up @@ -55,6 +55,10 @@ def add_assign_array_statement(self, source_var, target_var, size):
self.add_code_line("assign_array({}, {}, {});".format(
source_var, target_var, size))

def add_dependency(self, dep):
    """Prepend an ``#include`` line for ``dep`` to the generated code.

    ``dep`` is inserted verbatim, so it must already carry its delimiters
    (for example ``<math.h>``).
    """
    super().prepend_code_line("#include " + dep)

def vector_init(self, values):
return "(double[]){" + ", ".join(values) + "}"

Expand Down
10 changes: 10 additions & 0 deletions m2cgen/interpreters/c/interpreter.py
Expand Up @@ -20,9 +20,11 @@ class CInterpreter(ToCodeInterpreter,
def __init__(self, indent=4, *args, **kwargs):
    """Create the interpreter with a C code generator using ``indent``.

    Remaining positional and keyword arguments are passed to the parent
    initializer unchanged.
    """
    super(CInterpreter, self).__init__(
        CCodeGenerator(indent=indent), *args, **kwargs)
    # Set to True by interpret_exp_expr; interpret() checks it to decide
    # whether <math.h> has to be included.
    self.with_exponent = False

def interpret(self, expr):
self._cg.reset_state()
self._reset_reused_expr_cache()

args = [(True, self._feature_array_name)]

Expand Down Expand Up @@ -54,6 +56,9 @@ def interpret(self, expr):
os.path.dirname(__file__), "assign_array.c")
self._cg.prepend_code_lines(utils.get_file_content(filename))

if self.with_exponent:
self._cg.add_dependency("<math.h>")

return self._cg.code

# Both methods supporting linear algebra do several things:
Expand Down Expand Up @@ -86,3 +91,8 @@ def interpret_bin_vector_num_expr(self, expr, **kwargs):
self._cg.add_code_line(func_inv + ";")

return var_name

def interpret_exp_expr(self, expr):
    """Render an exponent expression as a call to C's ``exp``."""
    # Remember that exp() was emitted so the <math.h> include gets added.
    self.with_exponent = True
    argument = self._do_interpret(expr.expr)
    return self._cg.function_invocation("exp", argument)
6 changes: 3 additions & 3 deletions m2cgen/interpreters/code_generator.py
Expand Up @@ -123,6 +123,9 @@ def array_index_access(self, array_name, index):
return self.tpl_array_index_access(
array_name=array_name, index=index)

def function_invocation(self, function_name, *args):
    """Render a call such as ``name(arg1, arg2)`` as a string.

    Every argument is converted with ``str`` and the arguments are
    separated by a comma followed by a space.
    """
    rendered_args = ", ".join(str(arg) for arg in args)
    return function_name + "(" + rendered_args + ")"

# Helpers

def _get_var_declare_type(self, expr):
Expand All @@ -144,6 +147,3 @@ class CLikeCodeGenerator(BaseCodeGenerator):
tpl_else_statement = CodeTemplate("} else {")
tpl_block_termination = CodeTemplate("}")
tpl_var_assignment = CodeTemplate("${var_name} = ${value};")

def function_invocation(self, function_name, *args):
return function_name + "(" + ", ".join(map(str, args)) + ")"
32 changes: 29 additions & 3 deletions m2cgen/interpreters/interpreter.py
Expand Up @@ -9,8 +9,11 @@ class BaseInterpreter:
which takes instance of AST expression and recursively applies method
_do_interpret() to it.
"""
def __init__(self):
    """Start with an empty cache of results for reusable expressions."""
    self._cached_expr_results = dict()

def interpret(self, expr):
    """Interpret ``expr`` after clearing the reused-expression cache."""
    self._reset_reused_expr_cache()
    result = self._do_interpret(expr)
    return result

# Private methods implementing Visitor pattern
Expand All @@ -30,10 +33,24 @@ def _do_interpret(self, expr, **kwargs):
handler = self._select_handler(expr)
except NotImplementedError:
if isinstance(expr, ast.TransparentExpr):
return self._do_interpret(expr.expr, **kwargs)
return self._do_interpret(expr.expr, **kwargs)
raise

return handler(expr, **kwargs)
if not expr.to_reuse:
return handler(expr, **kwargs)

if expr in self._cached_expr_results:
return self._cached_expr_results[expr]

result = handler(expr, **kwargs)
return self._cache_reused_expr(expr, result)

def _cache_reused_expr(self, expr, expr_result):
    """Hook invoked with the result of an expression marked ``to_reuse``.

    This implementation performs no caching and returns ``expr_result``
    unchanged.
    """
    return expr_result

def _reset_reused_expr_cache(self):
    """Replace the reused-expression cache with a fresh empty dict."""
    self._cached_expr_results = dict()

def _select_handler(self, expr):
handler_name = self._handler_name(type(expr))
Expand All @@ -55,6 +72,7 @@ def _normalize_expr_name(name):
class BaseToCodeInterpreter(BaseInterpreter):

def __init__(self, cg, feature_array_name="input"):
super().__init__()
self._cg = cg
self._feature_array_name = feature_array_name

Expand All @@ -71,7 +89,9 @@ class ToCodeInterpreter(BaseToCodeInterpreter):
about AST.
"""

with_vectors = False
def __init__(self, cg, feature_array_name="input"):
    """Initialize the parent and the per-instance ``with_vectors`` flag."""
    super().__init__(cg, feature_array_name=feature_array_name)
    # Switched to True by interpret_vector_val when a vector is emitted.
    self.with_vectors = False

def interpret_if_expr(self, expr, if_var_name=None, **kwargs):
if if_var_name is not None:
Expand Down Expand Up @@ -119,3 +139,9 @@ def interpret_vector_val(self, expr, **kwargs):
self.with_vectors = True
nested = [self._do_interpret(expr, **kwargs) for expr in expr.exprs]
return self._cg.vector_init(nested)

def _cache_reused_expr(self, expr, expr_result):
    """Store ``expr_result`` in a new variable and cache that variable.

    A variable sized by ``expr.output_size`` is declared and assigned
    ``expr_result``; its name is recorded under ``expr`` in the cache and
    returned so callers reference the variable, not the raw result.
    """
    size = expr.output_size
    cached_var = self._cg.add_var_declaration(size)
    self._cg.add_var_assignment(cached_var, expr_result, size)
    self._cached_expr_results[expr] = cached_var
    return cached_var
4 changes: 4 additions & 0 deletions m2cgen/interpreters/java/interpreter.py
Expand Up @@ -51,6 +51,10 @@ def interpret(self, expr):

return top_cg.code

def interpret_exp_expr(self, expr):
    """Render an exponent expression as a call to Java's ``Math.exp``."""
    argument = self._do_interpret(expr.expr)
    return self._cg.function_invocation("Math.exp", argument)

# Required by SubroutinesAsFunctionsMixin to create new code generator for
# each subroutine.
def create_code_generator(self):
Expand Down
1 change: 1 addition & 0 deletions m2cgen/interpreters/mixins.py
Expand Up @@ -121,6 +121,7 @@ def process_subroutine_queue(self, top_code_generator):
self._subroutine_idx = 0

while len(self.subroutine_expr_queue):
self._reset_reused_expr_cache()
subroutine = self.subroutine_expr_queue.pop(0)
subroutine_code = self.process_subroutine(subroutine)
top_code_generator.add_code_lines(subroutine_code)
Expand Down

0 comments on commit 3594f60

Please sign in to comment.