# Regression using Cyclic Boosting

First, install the `cyclic-boosting` package and its dependencies.

```sh
!pip install cyclic-boosting
```

In [None]:
import pandas as pd
import numpy as np
import os
import datetime

# Let's use the test dataset from Kaggle

Sign in to Kaggle at the URL below and download the dataset.  
https://www.kaggle.com/datasets/lakshmi25npathi/bike-sharing-dataset

Place the downloaded dataset in the following directory.  
examples/regression/tornado/test_1/bike_sharing_data/

For time-series data, a "date" column must be included to indicate the date and time at which each record was obtained. The column name and format must be consistent. The "dayofweek" column (day of the week) and the "dayofyear" column (ordinal day within the year) are created automatically if they are not present; if they are already present, they must use exactly these column names.

This dataset contains hourly data. The "instant" column is just a running record number, and the "casual" and "registered" columns are a breakdown of the target count ("cnt"), so all three should be dropped.

In [None]:
# Load the hourly bike-sharing data from
# <parent of the working directory>/bike_sharing_data/hour.csv.
parpath = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
df = pd.read_csv(os.path.join(parpath, "bike_sharing_data", "hour.csv"))

# Rename the date and weekday columns to "date" / "dayofweek", then drop the
# row counter and the two partial counts.
df = df.rename(columns={'dteday': 'date', 'weekday': 'dayofweek'})
df = df.drop(columns=['instant', 'casual', 'registered'])

# Fold the hour of day into the timestamp. pd.to_timedelta converts the whole
# "hr" column in one call instead of building one timedelta object per row.
df["date"] = pd.to_datetime(df["date"]) + pd.to_timedelta(df["hr"], unit="h")

# Persist the prepared frame next to the notebook.
df.to_csv("./bike_sharing_hour.csv", index=False)

In [None]:
# Preview the first five rows of the prepared frame.
df.head(n=5)

# Automated Machine Learning with Tornado
With Tornado, you can automatically perform data preparation, feature property setting, hyperparameter tuning, model building, training, evaluation, and plotting!

In [None]:
from cyclic_boosting.tornado import Generator, Manager, Trainer

# Assemble the Tornado pipeline: the data module is given the path of the
# prepared CSV, and the trainer is built from the data module and the
# variable-selection manager.
data_deliverer = Generator.TornadoDataModule("./bike_sharing_hour.csv")
manager = Manager.TornadoVariableSelectionModule()
trainer = Trainer.SqueezeTrainer(data_deliverer, manager)

# Train with "cnt" as the target column.
trainer.run(
    target="cnt",
    log_policy="compute_COD",
    verbose=False,
)

# Load the best model and make predictions.

Get the best model path.

In [None]:
import pickle
from pathlib import Path

def _model_sort_key(model_no):
    """Order numeric suffixes by value so that "10" sorts after "9"."""
    try:
        return (1, int(model_no), model_no)
    except ValueError:
        # Non-numeric suffixes keep plain string order, ahead of numeric ones.
        return (0, 0, model_no)


# Collect the "<suffix>" part of every "model_<suffix>" entry in ./models/.
model_nos = [p.name.partition("_")[2] for p in Path("./models/").glob("model*")]
if not model_nos:
    raise FileNotFoundError(
        "No saved models found under ./models/; run the Tornado training cell first."
    )

# The last entry is the one this notebook treats as the best model, so the
# suffixes must be ordered numerically: a plain lexical sort would place
# "model_10" before "model_9".
model_nos.sort(key=_model_sort_key)

model_path = f"./models/model_{model_nos[-1]}/model_{model_nos[-1]}.pkl"
print(model_path)

Make predictions with the best model.

In [None]:
# A single hand-written observation; every column holds a one-element list so
# the resulting frame has exactly one row.
data = dict(
    season=[4],
    yr=[0],
    mnth=[11],
    hr=[18],
    holiday=[0],
    workingday=[1],
    weathersit=[2],
    temp=[0.341667],
    atemp=[0.323221],
    hum=[0.575833],
    windspeed=[0.305362],
    dayofweek=[4],
    dayofyear=[180],
)
X = pd.DataFrame.from_dict(data)

# Unpickle the estimator stored at model_path. Unpickling can execute
# arbitrary code, so only load model files produced by your own training run.
with open(model_path, "rb") as model_file:
    CB_est = pickle.load(model_file)

# The file handle is not needed any more once the estimator is in memory.
yhat = CB_est.predict(X.copy())
print(yhat)

# Accuracy comparison with base Cyclic Boosting

In [None]:
from cyclic_boosting import flags, common_smoothers, observers
from cyclic_boosting.pipelines import pipeline_CBPoissonRegressor

from cyclic_boosting.smoothing.onedim import SeasonalSmoother

from sklearn.model_selection import train_test_split


# Add the day-of-year feature, then hold out 20 % of the rows for validation.
df["dayofyear"] = df["date"].dt.dayofyear
train, validation = train_test_split(df, test_size=0.2, random_state=0)
y = np.asarray(train["cnt"])
X = train.drop(columns="cnt")
y_val = np.asarray(validation["cnt"])
X_val = validation.drop(columns="cnt")

# The comparison below is only meaningful when this split has the same
# training targets as the ones held by the Tornado manager.
# np.array_equal also returns False when the lengths differ, whereas an
# element-wise `==` would fail on the shape mismatch before the explanatory
# message could be raised.
if not np.array_equal(y, np.asarray(manager.y)):
    raise ValueError("Accuracy comparison is not available because\n"
                     "the data split is not the same as that of tornado.")

# Preprocessing flag assigned to every feature of the base model.
feature_properties = {
    # calendar features
    "season": flags.IS_UNORDERED,
    "dayofweek": flags.IS_ORDERED,
    "dayofyear": flags.IS_CONTINUOUS | flags.IS_LINEAR,
    "yr": flags.IS_ORDERED,
    "mnth": flags.IS_ORDERED,
    "hr": flags.IS_ORDERED,
    # day-type indicators
    "holiday": flags.IS_UNORDERED,
    "workingday": flags.IS_UNORDERED,
    # weather features
    "weathersit": flags.IS_UNORDERED,
    "temp": flags.IS_CONTINUOUS,
    "atemp": flags.IS_CONTINUOUS,
    "hum": flags.IS_CONTINUOUS,
    "windspeed": flags.IS_CONTINUOUS,
}

# Every flagged feature is used, in the order in which it is declared above.
features = list(feature_properties)

# "dayofyear" gets a dedicated seasonal smoother instead of the default choice.
explicit_smoothers = {("dayofyear",): SeasonalSmoother(order=3)}

# Plotting observers for iteration 1 and iteration -1
# (presumably the first and the final iteration; confirm against the library).
plobs = [observers.PlottingObserver(iteration=step) for step in (1, -1)]

smoother_choice = common_smoothers.SmootherChoiceGroupBy(
    use_regression_type=True,
    use_normalization=False,
    explicit_smoothers=explicit_smoothers,
)

CB_est = pipeline_CBPoissonRegressor(
    feature_properties=feature_properties,
    feature_groups=features,
    observers=plobs,
    maximal_iterations=50,
    smoother_choice=smoother_choice,
)

# Fit on a copy so the training frame itself is left untouched.
_ = CB_est.fit(X.copy(), y)

# Metrics file written next to the model selected earlier.
metrics_path = f"./models/model_{model_nos[-1]}/metrics_{model_nos[-1]}.txt"

with open(metrics_path, "rb") as f:
    metrics_tornado = f.read().decode("utf-8")

metric_lines = metrics_tornado.split("\n")


def _read_metric(label):
    """Return the number after the first ':' on the first line containing *label*."""
    # Substring match: the first line that mentions the label wins.
    matching = [line for line in metric_lines if label in line]
    return float(matching[0].split(":")[1])


mse_tornado = _read_metric("MSE")
mae_tornado = _read_metric("MAE")
wmape_tornado = _read_metric("WMAPE")


# Score the base model on the held-out rows.
yhat = CB_est.predict(X_val.copy())
residual = y_val - yhat
abs_error = np.abs(residual)

mse = np.nanmean(np.square(residual))
mae = np.nanmean(abs_error)
# NOTE(review): the absolute error is multiplied by y_val before normalising,
# which differs from the textbook WMAPE sum(|e|) / sum(y). Presumably this
# mirrors the definition behind the Tornado metrics file; confirm before
# changing it, otherwise the two rows are no longer comparable.
wmape = np.nansum(abs_error * y_val) / np.nansum(y_val)

# Show two decimals in the printed table (this is a global pandas option).
pd.options.display.float_format = '{:.2f}'.format

tornado_row = [np.sqrt(mse_tornado), mae_tornado, wmape_tornado]
base_row = [np.sqrt(mse), mae, wmape]
val_results = pd.DataFrame(
    [tornado_row, base_row],
    columns=["RMSE", "MAE", "WMAPE"],
    index=["CB_tornado", "Base CB"],
)
print(val_results)