# Classification using Cyclic Boosting

First, install the `cyclic-boosting` package and its dependencies:

```sh
!pip install cyclic-boosting
```

In [1]:
# Optional: auto-format cells when jupyter-black is installed.
try:
    import jupyter_black

    jupyter_black.load(line_length=80)
except ImportError:
    # jupyter-black is not available; the notebook works fine without it.
    pass

In [2]:
import pandas as pd
import numpy as np

Let's load the integration test dataset from the Blue-Yonder-OSS `cyclic-boosting` repository.

In [3]:
import os
import urllib.request

# Location of the sample dataset in the Blue-Yonder-OSS repository.
DATA_URL = (
    "https://raw.githubusercontent.com/Blue-Yonder-OSS/"
    "cyclic-boosting/main/tests/integration_test_data.csv"
)

path = "./data.csv"

# Download only when no local copy exists, so that a fresh checkout can run
# the notebook top to bottom while re-runs do not hit the network again.
if not os.path.exists(path):
    urllib.request.urlretrieve(DATA_URL, path)

df = pd.read_csv(path)

In [4]:
# Preview the first rows of the raw dataset.
df.head()

Unnamed: 0,P_ID,PG_ID_3,PG_ID_2,PG_ID_1,NORMAL_PRICE,L_ID,SALES_AREA,DATE,SCHOOL_HOLIDAY,EVENT,PROMOTION_TYPE,SALES_PRICE,LAMBDA,SALES
0,15,2,1,1,19.11,2,6321.680089,2020-09-19,0.0,,0,19.11,1.405019,0
1,15,2,1,1,19.11,2,6321.680089,2020-08-14,0.0,,0,19.11,2.815779,4
2,5,2,1,1,10.21,2,6321.680089,2021-11-28,0.0,,1,6.774357,4.716399,5
3,20,1,1,1,10.34,1,6993.30121,2021-11-06,0.0,,0,10.34,1.783195,2
4,10,1,1,1,9.89,1,6993.30121,2019-10-17,0.0,,0,9.89,2.747644,0


# Prepare Data

The `LAMBDA` column holds the potential demand, which normally cannot be observed in real data, so it should be dropped.

Categorical variables must be converted to int type and continuous variables to float type.

In [5]:
def drop_LAMBDA(df):
    """Return a new frame without the ``LAMBDA`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains a ``LAMBDA`` column.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``LAMBDA`` removed; ``df`` itself is not modified.

    Raises
    ------
    KeyError
        If ``df`` has no ``LAMBDA`` column.
    """
    return df.drop(columns="LAMBDA")


def convert_datatype(df, col):
    """Toggle column ``col`` between ``float64`` and ``int64``.

    A ``float64`` column is cast to ``int64`` and an ``int64`` column is cast
    to ``float64``. Columns of any other dtype are left alone and ``df`` is
    returned unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame.
    col : str
        Name of the column whose dtype should be toggled.

    Returns
    -------
    pandas.DataFrame
        A new frame with the converted column, or ``df`` itself when the
        column is neither ``float64`` nor ``int64``.
    """
    current = df[col].dtype

    if current == np.float64:
        target = np.int64
    elif current == np.int64:
        target = np.float64
    else:
        # Nothing to convert for other dtypes.
        return df

    return df.astype({col: target})


# Work on a copy so the raw frame `df` stays intact: drop LAMBDA, toggle the
# dtype of SCHOOL_HOLIDAY, then write the prepared data to disk.
df_test = convert_datatype(drop_LAMBDA(df.copy()), col="SCHOOL_HOLIDAY")
df_test.to_csv("./data_test.csv", index=False)

In [6]:
# Preview the prepared data: LAMBDA is gone and SCHOOL_HOLIDAY has been cast.
df_test.head()

Unnamed: 0,P_ID,PG_ID_3,PG_ID_2,PG_ID_1,NORMAL_PRICE,L_ID,SALES_AREA,DATE,SCHOOL_HOLIDAY,EVENT,PROMOTION_TYPE,SALES_PRICE,SALES
0,15,2,1,1,19.11,2,6321.680089,2020-09-19,0,,0,19.11,0
1,15,2,1,1,19.11,2,6321.680089,2020-08-14,0,,0,19.11,4
2,5,2,1,1,10.21,2,6321.680089,2021-11-28,0,,1,6.774357,5
3,20,1,1,1,10.34,1,6993.30121,2021-11-06,0,,0,10.34,2
4,10,1,1,1,9.89,1,6993.30121,2019-10-17,0,,0,9.89,0


# Automated Machine Learning with Tornado
With Tornado, you can automatically perform data preparation, feature property setting, hyperparameter tuning, model building, training, evaluation, and plotting!

In [7]:
from cyclic_boosting.tornado import Generator, Manager, Trainer

# Data module pointed at the CSV written by the data preparation cell.
data_deliverler = Generator.TornadoDataModule("data_test.csv")
# Variable-selection manager, handed to the trainer together with the data.
manager = Manager.TornadoVariableSelectionModule()
trainer = Trainer.SqueezeTrainer(data_deliverler, manager)
# NOTE(review): the target is given in lower case ("sales") although the CSV
# column is "SALES"; the logged feature names are lower-cased as well, so
# Tornado presumably normalizes column names -- confirm in the library docs.
trainer.run(target="sales", log_policy="compute_COD", verbose=False)

Auto analysis target ['normal_price', 'sales_area', 'sales_price']
{'has_trend': [], 'has_seasonality': [], 'has_up_monotonicity': [], 'has_down_monotonicity': [], 'has_linearity': [], 'has_missing': []}
iter: 3 / 90



iter: 38 / 90



iter: 47 / 90



iter: 90 / 90
TRUNCATED
['sales_price', 'p_id', 'promotion_type', 'normal_price', 'pg_id_3', 'l_id', 'dayofyear', 'dayofweek', 'event', 'school_holiday', 'pg_id_2', 'pg_id_1', 'sales_area', ('promotion_type', 'sales_price'), ('p_id', 'promotion_type'), ('normal_price', 'promotion_type'), ('l_id', 'sales_price'), ('sales_area', 'sales_price'), ('p_id', 'sales_price'), ('normal_price', 'sales_price'), ('sales_price', 'dayofweek'), ('p_id', 'l_id'), ('normal_price', 'l_id'), ('pg_id_3', 'sales_price'), ('pg_id_2', 'sales_price'), ('pg_id_1', 'sales_price'), ('event', 'sales_price'), ('school_holiday', 'sales_price'), ('p_id', 'dayofweek'), ('normal_price', 'dayofweek'), ('pg_id_3', 'normal_price'), ('p_id', 'pg_id_2'), ('p_id', 'pg_id_1'), ('p_id', 'event'), ('normal_price', 'event'), ('p_id', 'sales_area'), ('normal_price', 'sales_area'), ('p_id', 'school_holiday'), ('normal_price', 'school_holiday'), ('p_id', 'pg_id_3'), ('sales_price', 'dayofyear'), ('p_id', 'dayofyear'), ('normal_pric

  plt.figure(figsize=figsize)


Now, you can make a forecasting analysis with the best modelusing the pickle file in the ./models directory!For instructions, please refer to the file tornado.ipynb inthe examples/regression/tornado directory.


# Load the best model and make predictions.

Get the best model path.

In [8]:
import pickle
import re
from pathlib import Path

# Collect the numeric suffix of every "model_<n>" entry below ./models/.
# Matching on the entry name (not the full path string) keeps the lookup
# independent of underscores elsewhere in the path.
model_nos = []
for p in Path("./models/").glob("model_*"):
    match = re.fullmatch(r"model_(\d+)", p.name)
    if match:
        model_nos.append(match.group(1))

# Sort numerically: a plain string sort would rank "model_9" after "model_66"
# and "model_100" before it, so the last entry could be the wrong model.
model_nos.sort(key=int)

if not model_nos:
    raise FileNotFoundError(
        "No model_<n> entries found in ./models/ - run the training cell first."
    )

# The highest-numbered entry is used as the best model.
model_path = f"./models/model_{model_nos[-1]}/model_{model_nos[-1]}.pkl"
print(model_path)

./models/model_66/model_66.pkl


Make predictions with the best model.

In [40]:
# One hand-written sample row; every feature is given as a one-element list
# so that the dict maps directly onto a single-row DataFrame.
data = {
    "dayofweek": [4],
    "dayofyear": [190],
    "event": [0],
    "l_id": [1],
    "normal_price": [10.34],
    "p_id": [20],
    "pg_id_1": [1],
    "pg_id_2": [1],
    "pg_id_3": [1],
    "promotion_type": [1],
    "sales_area": [6321.6800893695445],
    "sales_price": [10.34],
    "school_holiday": [0],
}

X = pd.DataFrame(data)
print(X)

# NOTE: pickle.load can execute arbitrary code. Only load model files that
# were produced by your own training run.
with open(model_path, "rb") as f:
    CB_est = pickle.load(f)

# Predict on a copy so that X is left untouched by the estimator.
yhat = CB_est.predict(X.copy())
print(yhat)

   dayofweek  dayofyear  event  l_id  normal_price  p_id  pg_id_1  pg_id_2  \
0          4        190      0     1         10.34    20        1        1   

   pg_id_3  promotion_type   sales_area  sales_price  school_holiday  
0        1               1  6321.680089        10.34               0  
[1.97384934]
