# Forest Inference Library
The forest inference library is used to load saved forest models of xgboost, lightgbm or protobuf and perform inference on them. It can be used to perform both classification and regression. This notebook shows how to use the Forest Inference library with xgboost and lightgbm models.

The model accepts both numpy arrays and cuDF dataframes. In order to convert your dataset to cudf format please read the cudf documentation on https://rapidsai.github.io/projects/cudf/en/latest/. 

For additional information on the forest inference library please refer to the documentation on https://rapidsai.github.io/projects/cuml/en/latest/index.html

In [1]:
import numpy as np
import pytest
import os

from cuml import ForestInference
from cuml.test.utils import array_equal
from cuml.utils.import_utils import has_xgboost, has_lightgbm

from sklearn.datasets import make_classification, make_regression
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.model_selection import train_test_split

Environment variables with the 'NUMBAPRO' prefix are deprecated, found use of NUMBAPRO_NVVM=/usr/local/cuda/nvvm/lib64/libnvvm.so.

For more information visit http://numba.pydata.org/numba-doc/latest/reference/deprecation.html#deprecation-of-numbapro-environment-variables
Environment variables with the 'NUMBAPRO' prefix are deprecated, found use of NUMBAPRO_LIBDEVICE=/usr/local/cuda/nvvm/libdevice.

For more information visit http://numba.pydata.org/numba-doc/latest/reference/deprecation.html#deprecation-of-numbapro-environment-variables


In [2]:
if has_xgboost():
    import xgboost as xgb
else:
    raise("Please install xgboost using the conda package,"
          " Use conda install -c conda-forge xgboost "
          "command to install xgboost")
    
if has_lightgbm():
    import lightgbm as lgb
else:
    raise("Please install lightgbm using the conda package,"
          " Use conda install -c conda-forge lightgbm "
          "command to install lightgbm")
    

In [3]:
# Create classification and regression data
def simulate_data(m, n, k=2, random_state=None, classification=True):
    if classification:
        features, labels = make_classification(n_samples=m,
                                               n_features=n,
                                               n_informative=int(n/5),
                                               n_classes=k,
                                               random_state=random_state)
    else:
        features, labels = make_regression(n_samples=m,
                                           n_features=n,
                                           n_informative=int(n/5),
                                           n_targets=1,
                                           random_state=random_state)
    return np.c_[features].astype(np.float32), \
        np.c_[labels].astype(np.float32).flatten()

#### for additional information on the xgboost library please refer to the documentation on : 
#### https://xgboost.readthedocs.io/en/latest/parameter.html

In [4]:
# function that trains the xgboost model and performs prediction on it as well
def train_xgboost_model(X_train, y_train,
                        X_validation,
                        y_validation,
                        num_rounds, classification):

    # set the xgboost model parameters
    xgboost_params={}
    params = {'silent': 1}
    if classification:
        params['eval_metric'] = 'error'
        params['objective'] = 'binary:logistic'
    else:
        params['eval_metric'] = 'error'
        params['objective'] = 'reg:squarederror'
        params['base_score'] = 0.0
    params['max_depth'] = 25
    params.update(xgboost_params)
    model_path = "xgb.model"
    dtrain = xgb.DMatrix(X_train, label=y_train)
    bst = xgb.train(params, dtrain, num_rounds)

    # save the trained xgboost model
    bst.save_model(model_path)

    # predict the xgboost model
    dvalidation = xgb.DMatrix(X_validation, label=y_validation)
    xgb_preds = bst.predict(dvalidation)

    # if the model is used for classification then convert
    # the predicted values into class labels
    if classification:
        xgb_preds = np.around(xgb_preds)

    return xgb_preds, model_path

#### for additional information on the lightgbm library please refer to the documentation on : 
#### https://lightgbm.readthedocs.io/en/latest/

In [5]:
# function that trains the lightgbm model and performs prediction on it as well
def train_lightgbm_model(X_train, y_train,
                        X_validation,
                        num_round):
    # convert the data into the lightgbm input format
    train_data = lgb.Dataset(X_train, label=y_train)
    # select the params for the lightgbm model
    param = {'objective': 'binary',
             'metric': 'binary_logloss'}

    # train the lightgbm model
    bst = lgb.train(param, train_data, num_round)
    # perform prediction on the lightgbm model
    gbm_preds = bst.predict(X_validation)

    # path where the model is saved
    model_path = "lgb.model"
    bst.save_model(model_path)

    return gbm_preds, model_path

In [6]:
# set parameters for creating the dataset
classification = False
n_rows = 10000
n_columns = 100
n_categories = 2
random_state = np.random.RandomState(43210)

# select the model on which you want to perform
# inference
select_model = 'xgboost'

# num of iterations for which the model is trained
num_rounds = 15

In [7]:
# create the dataset
X, y = simulate_data(n_rows, n_columns, n_categories,
                     random_state=random_state,
                     classification=classification)
n_rows, n_columns = X.shape
train_size = 0.8

#split the dataset into training and validation splits
X_train, X_validation, y_train, y_validation = train_test_split(
    X, y, train_size=train_size)

if select_model == 'xgboost':
    trained_model_preds, model_path = train_xgboost_model(X_train, y_train,
                                                          X_validation,
                                                          y_validation,
                                                          num_rounds,
                                                          classification)
elif select_model == 'lightgbm':
    trained_model_preds, model_path = train_lightgbm_model(X_train,
                                                           y_train,
                                                           X_validation,
                                                           num_rounds)
else:
    raise(" This model is not supported, please choose either"
          " an xgboost model or lightgbm model")

#### The load function of the ForestInference class accepts the following parameters:
        filename : str
           Path to saved model file in a treelite-compatible format
           (See https://treelite.readthedocs.io/en/latest/treelite-api.html
        output_class : bool
           If true, return a 1 or 0 depending on whether the raw prediction
           exceeds the threshold. If False, just return the raw prediction.
        threshold : float
           Cutoff value above which a prediction is set to 1.0
           Only used if the model is classification and output_class is True
        algo : string
           Which inference algorithm to use.
           See documentation in FIL.load_from_treelite_model
        model_type : str
            Format of saved treelite model to load.
            Can be 'xgboost', 'lightgbm', or 'protobuf'

In [8]:
# load the saved xgboost model to return the forest in the format used as an input by the forest inference library
fm = ForestInference.load(filename=model_path,
                          algo='BATCH_TREE_REORG',
                          output_class=classification,
                          threshold=0.50,
                          model_type=select_model)
# perform prediction on the model loaded from path
fil_preds = fm.predict(X_validation)

In [9]:
# check if the labels predicted with the selected models and the 
# labels predicted by the ForestInference library are similar or not
array_equal(trained_model_preds, fil_preds, tol=1e-3)

True