# Regression using Cyclic Boosting

First, install the `cyclic-boosting` package and its dependencies:

```sh
pip install cyclic-boosting
```

In [1]:
# Optional: enable automatic cell formatting when jupyter-black is installed.
# The notebook works the same without it, so a missing package is ignored.
try:
    import jupyter_black

    jupyter_black.load(line_length=80)
except ImportError:
    pass

In [2]:
import pandas as pd
import numpy as np


from sklearn.preprocessing import OrdinalEncoder

from cyclic_boosting import flags, common_smoothers, observers
from cyclic_boosting.plots import plot_analysis
from cyclic_boosting.pipelines import (
    pipeline_CBPoissonRegressor,
    pipeline_CBNBinomRegressor,
)

from cyclic_boosting.smoothing.onedim import SeasonalSmoother, IsotonicRegressor

Let's load a simulated demand dataset from the remote repository.

In [3]:
import urllib.request

# Remote location of the simulated demand data and the local file it is saved to
DATA_URL = "https://raw.githubusercontent.com/Blue-Yonder-OSS/cyclic-boosting/main/tests/integration_test_data.csv"
DATA_FILE = "data.csv"

# Download the CSV next to the notebook, then read it into a DataFrame
urllib.request.urlretrieve(DATA_URL, DATA_FILE)
df = pd.read_csv(DATA_FILE)

In [4]:
# Show the first rows of the raw data as a quick sanity check
df.head()

Unnamed: 0,P_ID,PG_ID_3,PG_ID_2,PG_ID_1,NORMAL_PRICE,L_ID,SALES_AREA,DATE,SCHOOL_HOLIDAY,EVENT,PROMOTION_TYPE,SALES_PRICE,LAMBDA,SALES
0,15,2,1,1,19.11,2,6321.680089,2020-09-19,0.0,,0,19.11,1.405019,0
1,15,2,1,1,19.11,2,6321.680089,2020-08-14,0.0,,0,19.11,2.815779,4
2,5,2,1,1,10.21,2,6321.680089,2021-11-28,0.0,,1,6.774357,4.716399,5
3,20,1,1,1,10.34,1,6993.30121,2021-11-06,0.0,,0,10.34,1.783195,2
4,10,1,1,1,9.89,1,6993.30121,2019-10-17,0.0,,0,9.89,2.747644,0


# Prepare the data

The data has to be prepared for training. We convert the categorical variables into numerical values using the scikit-learn `OrdinalEncoder` (guess who contributed this 😜).

In [5]:
def prepare_data(df):
    """Derive the model features from the raw demand data and split off the target.

    The input frame is not modified: all work happens on a copy, so the cell
    can be re-run on the same ``df`` and always gives the same result.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data containing at least the columns ``DATE``, ``SALES_PRICE``,
        ``NORMAL_PRICE``, ``L_ID``, ``P_ID``, ``PG_ID_3`` and ``SALES``.

    Returns
    -------
    X : pandas.DataFrame
        Copy of ``df`` without ``SALES``, with ``DATE`` parsed to datetime,
        the added columns ``dayofweek``, ``dayofyear`` and ``price_ratio``,
        and ordinal-encoded ``L_ID``, ``P_ID`` and ``PG_ID_3``.
    y : numpy.ndarray
        The ``SALES`` column as an array.
    """
    # Work on a copy so the caller's frame stays untouched.
    df = df.copy()

    df["DATE"] = pd.to_datetime(df["DATE"])
    df["dayofweek"] = df["DATE"].dt.dayofweek
    df["dayofyear"] = df["DATE"].dt.dayofyear

    # Ratio of actual to regular price: undefined ratios count as "no discount"
    # (1) and the value is limited to [0, 1]. Assignment is used instead of
    # chained ``inplace=True`` calls, which do not update the frame under
    # pandas Copy-on-Write.
    price_ratio = (df["SALES_PRICE"] / df["NORMAL_PRICE"]).fillna(1).clip(0, 1)
    # A ratio of exactly 1 (no discount) is stored as a missing value.
    df["price_ratio"] = price_ratio.mask(price_ratio == 1.0)

    id_columns = ["L_ID", "P_ID", "PG_ID_3"]
    enc = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=np.nan)
    df[id_columns] = enc.fit_transform(df[id_columns])

    y = np.asarray(df["SALES"])
    X = df.drop(columns="SALES")
    return X, y

# Set the feature properties

We need to tell Cyclic Boosting which features to use, what type each feature is, and how to handle it.

We want the continuous features to be `IS_CONTINUOUS`, with support for missing values (very handy, isn't it 😎), and the categorical features to be treated as unordered classes (with no neighboring relation, unlike weekdays for example).

Note: there is deliberately next to no feature engineering done here. Looking at the features carefully, the model could potentially be improved a lot by treating the features individually and maybe even combining them into 2D features (see documentation). We just want to get it up and running here.

In [6]:
# Flags describing how each feature should be treated, keyed by column name.
# The three ID columns share the same "unordered category" flag.
feature_properties = {
    **dict.fromkeys(["P_ID", "PG_ID_3", "L_ID"], flags.IS_UNORDERED),
    "dayofweek": flags.IS_ORDERED,
    "dayofyear": flags.IS_CONTINUOUS | flags.IS_LINEAR,
    # price_ratio contains NaN by construction, hence the missing-value flags
    "price_ratio": (
        flags.IS_CONTINUOUS | flags.HAS_MISSING | flags.MISSING_NOT_LEARNED
    ),
    "PROMOTION_TYPE": flags.IS_ORDERED,
}

# Feature groups handed to the model: single columns are plain strings,
# a tuple denotes a two-dimensional feature built from both columns.
features = [
    "dayofweek",
    "L_ID",
    "PG_ID_3",
    "P_ID",
    "PROMOTION_TYPE",
    "price_ratio",
    "dayofyear",
    ("P_ID", "PG_ID_3"),
]

# Build the model

The model is implemented as a scikit-learn pipeline, stitching together a Binner and the CB regressor estimator. Most notably, we reduce the number of bins used for all continuous features to 10 instead of 100, which should be plenty. (Note: the code below does not set the number of bins explicitly.)

In [7]:
def cb_poisson_regressor_model():
    """Build the (unfitted) Cyclic Boosting pipeline used in this notebook.

    Despite the function name, the pipeline is created with
    ``pipeline_CBNBinomRegressor``. It reads the module-level
    ``feature_properties`` and ``features`` definitions.

    Returns
    -------
    The pipeline object returned by ``pipeline_CBNBinomRegressor``, configured
    with two plotting observers (iterations 1 and -1), at most 50 iterations
    and explicit smoothers for ``dayofyear`` and ``price_ratio``.
    """
    # One observer per iteration of interest; -1 presumably refers to the
    # final iteration (the later plotting cell names its output "iterlast").
    plotting_observers = [
        observers.PlottingObserver(iteration=step) for step in (1, -1)
    ]

    smoother_choice = common_smoothers.SmootherChoiceGroupBy(
        use_regression_type=True,
        use_normalization=False,
        # Smoothers set explicitly for these two one-dimensional features
        explicit_smoothers={
            ("dayofyear",): SeasonalSmoother(order=3),
            ("price_ratio",): IsotonicRegressor(increasing=False),
        },
    )

    return pipeline_CBNBinomRegressor(
        feature_properties=feature_properties,
        feature_groups=features,
        observers=plotting_observers,
        maximal_iterations=50,
        smoother_choice=smoother_choice,
    )


# Build a throw-away pipeline so its repr is shown as the cell output
cb_poisson_regressor_model()

# The training

In [13]:
# Derive the feature frame X and the target array y from the raw data
X, y = prepare_data(df)

# Create a fresh pipeline, print its configuration, and fit it.
CB_est = cb_poisson_regressor_model()
print(CB_est)
# A copy of X is passed, presumably so fitting cannot alter X, which is
# reused for prediction below — TODO confirm. The return value is bound to
# `_` so it is not echoed as cell output.
_ = CB_est.fit(X.copy(), y)





Pipeline(steps=[('binning',
                 BinNumberTransformer(feature_properties={'L_ID': 4,
                                                          'PG_ID_3': 4,
                                                          'PROMOTION_TYPE': 2,
                                                          'P_ID': 4,
                                                          'dayofweek': 2,
                                                          'dayofyear': 257,
                                                          'price_ratio': 49})),
                ('CB',
                 CBNBinomRegressor(feature_groups=['dayofweek', 'L_ID',
                                                   'PG_ID_3', 'P_ID',
                                                   'PROMOTION_TYPE',
                                                   'price_ratio', 'dayofyear',
                                                   ('P_ID', 'PG_ID_3')],
                                   feature_properties={'L_ID': 4, '

## Evaluation

Now we can do the inference for all samples.

In [9]:
# Predict on the same samples the model was trained on (in-sample predictions)
yhat = CB_est.predict(X.copy())

With this we can calculate the mean absolute deviation between the observed and the predicted values (i.e. the mean absolute error).

In [10]:
# Mean of |y - yhat|; np.nanmean skips entries where the difference is NaN
mad = np.nanmean(np.abs(y - yhat))
mad

1.7084460715918635

# Some nice plots

Cyclic Boosting includes some useful reporting of the training. We can create a PDF report with this code:

In [11]:
def plot_CB(filename, plobs, binner):
    """Create one analysis report per plotting observer.

    Parameters
    ----------
    filename : str
        Base name of the output; the observer's position in ``plobs`` is
        appended as ``_<index>``.
    plobs : iterable
        Plotting observers to report on, one ``plot_analysis`` call each.
    binner
        Binner passed to ``plot_analysis`` as a single-element list.
    """
    for index, observer in enumerate(plobs):
        # Each observer gets its own output, told apart by the index suffix
        target = filename + f"_{index}"
        plot_analysis(
            plot_observer=observer,
            file_obj=target,
            use_tightlayout=False,
            binners=[binner],
        )

In [12]:
# Report on the last observer of the final pipeline step (the CB estimator);
# the step before it (the binning step) is passed as the binner.
plot_CB("analysis_CB_iterlast", [CB_est[-1].observers[-1]], CB_est[-2])