In [1]:
import numpy as np
import pandas as pd
import catboost
import ngboost
import KTBoost.KTBoost as KTBoost
import xgboost
import matplotlib
from matplotlib import pyplot as plt
import sklearn
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression

from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
from IPython.display import display
import IPython

# Functions to get data

In [2]:
def linear_data(nobs, noise_std, low=0, high=1, rng=None):
    """Draw a synthetic regression data set with a purely linear signal.

    Three features are sampled uniformly between ``low`` and ``high`` and the
    target is ``13*x0 - 10*x1 + 15*x2`` plus Gaussian noise scaled by
    ``noise_std``. The rows are returned ordered by increasing target value.

    Parameters
    ----------
    nobs : int
        Number of observations to generate.
    noise_std : float
        Standard deviation of the additive Gaussian noise.
    low, high : float, optional
        Bounds of the uniform distribution the features are drawn from.
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Source of randomness. Pass e.g. ``np.random.default_rng(seed)`` to get
        reproducible data. When omitted, the global ``np.random`` state is
        used, exactly as before this parameter existed.

    Returns
    -------
    x : numpy.ndarray, shape (nobs, 3)
        Feature matrix, rows permuted so that ``y`` is ascending.
    y : numpy.ndarray, shape (nobs,)
        Noisy targets in ascending order.
    """
    # The np.random module exposes the same uniform/normal API as a generator
    # object, so it can stand in as the default source of randomness.
    source = np.random if rng is None else rng

    coefficients = np.array([13, -10, 15])
    x = source.uniform(low=low, high=high, size=(nobs, 3))
    y = x @ coefficients + source.normal(size=nobs) * noise_std

    # Sort by target so that a plot of y against the row index is monotone.
    order = np.argsort(y)
    return x[order, ...], y[order]

def nonlinear_data(nobs, noise_std, low=0, high=1, rng=None):
    """Draw a synthetic regression data set with a nonlinear signal.

    Three features are sampled uniformly between ``low`` and ``high`` and the
    target is ``x0**2 + 2*sin(x1) + x1*x2`` plus Gaussian noise scaled by
    ``noise_std``. The rows are returned ordered by increasing target value.

    Parameters
    ----------
    nobs : int
        Number of observations to generate.
    noise_std : float
        Standard deviation of the additive Gaussian noise.
    low, high : float, optional
        Bounds of the uniform distribution the features are drawn from.
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Source of randomness. Pass e.g. ``np.random.default_rng(seed)`` to get
        reproducible data. When omitted, the global ``np.random`` state is
        used, exactly as before this parameter existed.

    Returns
    -------
    x : numpy.ndarray, shape (nobs, 3)
        Feature matrix, rows permuted so that ``y`` is ascending.
    y : numpy.ndarray, shape (nobs,)
        Noisy targets in ascending order.
    """
    # The np.random module exposes the same uniform/normal API as a generator
    # object, so it can stand in as the default source of randomness.
    source = np.random if rng is None else rng

    x = source.uniform(low=low, high=high, size=(nobs, 3))
    signal = x[:, 0]**2 + 2*np.sin(x[:, 1]) + x[:, 1]*x[:, 2]
    y = signal + source.normal(size=nobs) * noise_std

    # Sort by target so that a plot of y against the row index is monotone.
    order = np.argsort(y)
    return x[order, ...], y[order]

def get_train_test_data(nobs, noise_std, linear):
    """Generate a training set and a wider-ranged test set of equal size.

    Training features are drawn from [-0.7, 0.7] and test features from
    [-1, 1], so part of the test set lies outside the training range.

    Parameters
    ----------
    nobs : int
        Number of observations in each of the two sets.
    noise_std : float
        Standard deviation of the noise passed on to the generator.
    linear : bool
        Truthy selects ``linear_data``, falsy selects ``nonlinear_data``.

    Returns
    -------
    tuple
        ``(x, y, xtest, ytest)`` as produced by the selected generator.
    """
    sampler = linear_data if linear else nonlinear_data
    train_x, train_y = sampler(nobs, noise_std, low=-.7, high=.7)
    test_x, test_y = sampler(nobs, noise_std, low=-1, high=1)
    return train_x, train_y, test_x, test_y

# Compare predictions from different methods

In [3]:
%matplotlib widget
def func(nobs, noise_std, linear):
    """Fit several regressors on one synthetic data set and plot their predictions.

    A fresh train/test pair is generated, every model is fitted on the
    training part, and a two-panel figure is drawn: predictions against the
    training observations on the left and against the test observations on
    the right.

    Parameters
    ----------
    nobs : int
        Number of observations in each of the training and test sets.
    noise_std : float
        Standard deviation of the noise added to the targets.
    linear : bool
        Whether to use the linear (True) or nonlinear (False) generator.
    """
    # Each entry pairs an unfitted estimator with the extra keyword arguments
    # that are forwarded to its fit() call.
    methods = {
        'Linear regression': [LinearRegression(), {}],
        'SKLearn GB': [GradientBoostingRegressor(), {}],
        'NGB': [ngboost.NGBRegressor(), {}],
        'Catboost': [catboost.CatBoostRegressor(), {'verbose': False}],
        'KTBoost': [KTBoost.BoostingRegressor(), {}],
        'XGBoost': [xgboost.XGBRegressor(), {}],
    }

    x, y, xtest, ytest = get_train_test_data(nobs, noise_std, linear)

    for model, fit_kwargs in methods.values():
        model.fit(x, y, **fit_kwargs)

    plt.figure(figsize=[10, 5])
    panels = [('Training', x, y), ('Testing', xtest, ytest)]
    for position, (title, features, targets) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.plot(targets, 'ok', label='Observations')
        for name, (model, _) in methods.items():
            plt.plot(model.predict(features), label=name)
        plt.title(title)
        plt.legend()
    
# Build the controls for func: range tuples for the numeric arguments and a
# (label, value) list for the boolean one. The 'manual' option is meant to
# defer running func until the user asks for it.
compare_methods_widget = interactive(
    func,
    {'manual': True},
    nobs=(10, 1000),
    noise_std=(0, 1, 0.1),
    linear=[('True', True), ('False', False)],
)
display(compare_methods_widget)

interactive(children=(IntSlider(value=505, description='nobs', max=1000, min=10), FloatSlider(value=0.0, descr…

# Compare probabilistic predictions

In [4]:
%matplotlib widget
def _plot_probabilistic_panel(position, title, features, targets, ngb, catgb):
    """Draw one subplot with the observations and both models' mean +/- 2 sigma bands."""
    plt.subplot(1, 2, position)
    plt.plot(targets, 'ok', label='Observations')

    # NGBoost: location and scale are read from the predicted distribution.
    pred_dist = ngb.pred_dist(features)
    mu = pred_dist.params['loc']
    std = pred_dist.params['scale']
    # Raw strings keep the LaTeX backslashes without invalid-escape warnings.
    line, = plt.plot(mu, lw=2, label=r'NGB Predicted $\mu$')
    plt.fill_between(
        line.get_xdata(), (mu - 2*std), (mu + 2*std),
        color=line.get_color(), alpha=0.4,
        label=r'NGB Predicted $\mu \pm 2*\sigma$')

    # CatBoost: column 0 is plotted as the mean; column 1 is treated as a
    # variance, so its square root gives the sigma used for the band.
    # A min/max band from catgb.virtual_ensembles_predict(...,
    # prediction_type='VirtEnsembles') would be a possible alternative; it is
    # not drawn here.
    predictions = catgb.predict(features)
    mean = predictions[:, 0]
    half_width = 2*np.sqrt(predictions[:, 1])
    line, = plt.plot(mean, lw=2, label='Conditional mean, Catboost')
    plt.fill_between(
        line.get_xdata(), mean - half_width, mean + half_width,
        color=line.get_color(), alpha=0.4,
        label=r'Catboost $\mu \pm 2*\sigma$')

    plt.title(title)
    plt.legend()


def func(nobs, noise_std, linear, posterior_sampling):
    """Fit NGBoost and CatBoost uncertainty models and plot their predictive bands.

    A fresh train/test pair is generated, both models are fitted on the
    training part, and a two-panel figure shows the predicted mean with a
    +/- 2 sigma band for each model on the training data (left) and on the
    test data (right).

    Parameters
    ----------
    nobs : int
        Number of observations in each of the training and test sets.
    noise_std : float
        Standard deviation of the noise added to the targets.
    linear : bool
        Whether to use the linear (True) or nonlinear (False) generator.
    posterior_sampling : bool
        Forwarded to ``CatBoostRegressor(posterior_sampling=...)``.
    """
    ngb = ngboost.NGBRegressor()
    catgb = catboost.CatBoostRegressor(
        posterior_sampling=posterior_sampling, loss_function='RMSEWithUncertainty')

    # Single source of data; it honours the `linear` switch.
    x, y, xtest, ytest = get_train_test_data(nobs, noise_std, linear)

    ngb.fit(x, y)
    catgb.fit(x, y, verbose=False)

    plt.figure(figsize=[10, 5])
    _plot_probabilistic_panel(1, 'Training', x, y, ngb, catgb)
    _plot_probabilistic_panel(2, 'Test', xtest, ytest, ngb, catgb)
    
# Build the controls for func: range tuples for the numeric arguments and
# (label, value) lists for the two boolean ones. The 'manual' option is meant
# to defer running func until the user asks for it.
prob_methods_widget = interactive(
    func,
    {'manual': True},
    nobs=(10, 1000),
    noise_std=(0, 1, 0.1),
    linear=[('True', True), ('False', False)],
    posterior_sampling=[('True', True), ('False', False)],
)
display(prob_methods_widget)

interactive(children=(IntSlider(value=505, description='nobs', max=1000, min=10), FloatSlider(value=0.0, descr…

# Time to fit methods

In [5]:
# Data set used by the timing cells that follow: 10,000 observations from the
# linear generator with noise_std=0.5. Only x and y are used for fitting.
x, y, xtest, ytest = get_train_test_data(nobs=10000, noise_std=0.5, linear=True)

## NGBoost

In [6]:
%%time
# Time fitting an NGBoost regressor built with default constructor arguments
# on the training data (x, y).
ngb = ngboost.NGBRegressor()
ngb.fit(x, y)

[iter 0] loss=3.6073 val_loss=0.0000 scale=1.0000 norm=7.3383
[iter 100] loss=3.0563 val_loss=0.0000 scale=1.0000 norm=3.7772
[iter 200] loss=2.2904 val_loss=0.0000 scale=2.0000 norm=3.1227
[iter 300] loss=1.5174 val_loss=0.0000 scale=2.0000 norm=1.7511
[iter 400] loss=0.9957 val_loss=0.0000 scale=2.0000 norm=1.3642
Wall time: 22.8 s


NGBRegressor(random_state=RandomState(MT19937) at 0x1E17FD30940)

## Catboost

In [7]:
%%time
# CatBoost timing, variant 1 of 4: posterior sampling enabled with the
# RMSEWithUncertainty loss. verbose=False silences the training log.
catgb = catboost.CatBoostRegressor(posterior_sampling=True, loss_function='RMSEWithUncertainty')
catgb.fit(x, y, verbose=False)

Wall time: 5.94 s


<catboost.core.CatBoostRegressor at 0x1e12369f580>

In [8]:
%%time
# CatBoost timing, variant 2 of 4: posterior sampling disabled with the
# RMSEWithUncertainty loss. verbose=False silences the training log.
catgb = catboost.CatBoostRegressor(posterior_sampling=False, loss_function='RMSEWithUncertainty')
catgb.fit(x, y, verbose=False)

Wall time: 5.49 s


<catboost.core.CatBoostRegressor at 0x1e1236626a0>

In [9]:
%%time
# CatBoost timing, variant 3 of 4: posterior sampling enabled with the plain
# RMSE loss. verbose=False silences the training log.
catgb = catboost.CatBoostRegressor(posterior_sampling=True, loss_function='RMSE')
catgb.fit(x, y, verbose=False)

Wall time: 4.86 s


<catboost.core.CatBoostRegressor at 0x1e1236528b0>

In [14]:
%%time
# CatBoost timing, variant 4 of 4: posterior sampling disabled with the plain
# RMSE loss. verbose=False silences the training log.
catgb = catboost.CatBoostRegressor(posterior_sampling=False, loss_function='RMSE')
catgb.fit(x, y, verbose=False)

Wall time: 4.9 s


<catboost.core.CatBoostRegressor at 0x1e121562ca0>

## SKLearn

In [11]:
%%time
# Time fitting scikit-learn's GradientBoostingRegressor built with default
# constructor arguments on the training data (x, y).
skgb = GradientBoostingRegressor()
skgb.fit(x, y)

Wall time: 1.27 s


GradientBoostingRegressor()

## KTBoost

In [12]:
%%time
# Time fitting a KTBoost BoostingRegressor built with default constructor
# arguments on the training data (x, y).
ktb = KTBoost.BoostingRegressor()
ktb.fit(x, y)

Wall time: 1.98 s


BoostingRegressor()

## XGBoost

In [13]:
%%time
# Time fitting an XGBoost XGBRegressor built with default constructor
# arguments on the training data (x, y).
xgb = xgboost.XGBRegressor()
xgb.fit(x, y)

Wall time: 1.64 s


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=12, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)