# Hierarchical model

In [16]:
import pandas as pd
import numpy as np
import sklearn.metrics
import xgboost as xgb
import mlflow

from sklearn.metrics import mean_absolute_error as mape
from sklearn.metrics import mean_squared_error as mse
from category_encoders import *

In [21]:
train_df = pd.read_csv('../data/preprocessed/train_df.csv',
                       sep=',', index_col=0)
test_df = pd.read_csv('../data/preprocessed/test_df.csv',
                       sep=',', index_col=0)

train_df.head()

  and should_run_async(code)


Unnamed: 0,game_code,ops_num,last_year_sales,general_sales,game_code_sales,ops_num_sales,sales,mean_w12__benford_correlation,mean_w8__benford_correlation,mean_w8__abs_energy,...,mean_w12__minimum,previous__benford_correlation,deviation__benford_correlation,"deviation__fft_coefficient__attr_""angle""__coeff_0",deviation__range_count__max_1000000000000.0__min_0,deviation__count_above__t_0,deviation__count_below__t_0,deviation__range_count__max_1__min_-1,"deviation__fft_coefficient__attr_""abs""__coeff_0",deviation__abs_energy
2017-02-12,7101,105005,,719.0,,,25.0,,,,...,,,,,,,,,,
2017-02-19,7101,105005,,731.0,,,16.0,,,,...,,,,,,,,,,
2017-02-26,7101,105005,,1092.0,,,17.0,,,,...,,,,,,,,,,
2017-03-05,7101,105005,,1614.0,,,15.0,,,,...,,,,,,,,,,
2017-03-12,7101,105005,,1296.0,,,19.0,,,,...,,,,,,,,,,


In [22]:
y_train = train_df["sales"]
y_test = test_df["sales"]
X_train = train_df.drop(["sales"], axis=1)
X_test = test_df.drop(["sales"], axis=1)

  and should_run_async(code)


In [23]:
# use binary encoding to encode two categorical features
enc = BinaryEncoder(cols=['game_code', 'ops_num']).fit(X_train)

# transform the dataset
train_numeric_dataset = enc.transform(X_train)
test_numeric_dataset = enc.transform(X_test)

print(test_numeric_dataset.head())

  elif pd.api.types.is_categorical(cols):


            game_code_0  game_code_1  game_code_2  game_code_3  ops_num_0  \
2018-11-04            0            0            0            1          0   
2018-11-11            0            0            0            1          0   
2018-11-18            0            0            0            1          0   
2018-11-25            0            0            0            1          0   
2018-12-02            0            0            0            1          0   

            ops_num_1  ops_num_2  ops_num_3  ops_num_4  ops_num_5  ...  \
2018-11-04          0          0          0          0          0  ...   
2018-11-11          0          0          0          0          0  ...   
2018-11-18          0          0          0          0          0  ...   
2018-11-25          0          0          0          0          0  ...   
2018-12-02          0          0          0          0          0  ...   

            mean_w12__minimum  previous__benford_correlation  \
2018-11-04                Na

In [24]:
%time
from mlflow.models.signature import infer_signature

with mlflow.start_run(run_name='XGBoost_base_all_ts'):
    xgb_regressor = xgb.XGBRegressor(
            n_estimators=100,
            reg_lambda=1,
            gamma=0,
            max_depth=6
        )
    xgb_regressor.fit(train_numeric_dataset, y_train)
    
    signature = infer_signature(train_numeric_dataset, xgb_regressor.predict(train_numeric_dataset))
    mlflow.sklearn.log_model(xgb_regressor, "XGB_base_all_ts", signature=signature)
    mape_base_xgb = mape(y_test.to_numpy(), xgb_regressor.predict(test_numeric_dataset))
    rmse_base_xgb = mse(y_test.to_numpy(), xgb_regressor.predict(test_numeric_dataset), squared=False)
    mlflow.log_metric("mape", mape_base_xgb)
    mlflow.log_metric("rmse", rmse_base_xgb)

print(mape_base_xgb)
print(rmse_base_xgb)

  and should_run_async(code)


CPU times: user 21 µs, sys: 16 µs, total: 37 µs
Wall time: 1.6 ms
43.24498949672859
195.78439633625948


In [26]:
import optuna

optuna.logging.set_verbosity(optuna.logging.WARNING)
    
def objective(trial):
    dtrain = xgb.DMatrix(train_numeric_dataset, label=y_train)
    dtest = xgb.DMatrix(test_numeric_dataset, label=y_test)

    param = {
        "silent": 1,
        "objective": "reg:squarederror",
        "eval_metric": "rmse",
        "booster": trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"]),
        "lambda": trial.suggest_loguniform("lambda", 1e-8, 1.0),
        "alpha": trial.suggest_loguniform("alpha", 1e-8, 1.0),
    }

    if param["booster"] == "gbtree" or param["booster"] == "dart":
        param["max_depth"] = trial.suggest_int("max_depth", 1, 9)
        param["eta"] = trial.suggest_loguniform("eta", 1e-8, 1.0)
        param["gamma"] = trial.suggest_loguniform("gamma", 1e-8, 1.0)
        param["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])
    if param["booster"] == "dart":
        param["sample_type"] = trial.suggest_categorical("sample_type", ["uniform", "weighted"])
        param["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"])
        param["rate_drop"] = trial.suggest_loguniform("rate_drop", 1e-8, 1.0)
        param["skip_drop"] = trial.suggest_loguniform("skip_drop", 1e-8, 1.0)

    # Add a callback for pruning.
    pruning_callback = optuna.integration.XGBoostPruningCallback(trial, "validation-rmse")
    bst = xgb.train(param, dtrain, evals=[(dtest, "validation")], callbacks=[pruning_callback])
    preds = bst.predict(dtest)
    metric = mse(y_test, preds, squared=False)
    return metric

study = optuna.create_study()
study.optimize(objective, n_trials=100)

print(study.best_value)

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	validation-rmse:231.95392
[1]	validation-rmse:231.95342
[2]	validation-rmse:231.95287
[3]	validation-rmse:231.95227
[4]	validation-rmse:231.95167
[5]	validation-rmse:231.95113
[6]	validation-rmse:231.95062
[7]	validation-rmse:231.95007
[8]	validation-rmse:231.94959
[9]	validation-rmse:231.94908
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	validation-rmse:230.52298
[1]	validation-rmse:229.16280
[2]	validation-rmse:227.86995
[3]	validation-rmse:226.64146
[4]	validation-r

[7]	validation-rmse:198.20425
[8]	validation-rmse:198.19672
[9]	validation-rmse:198.20514
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	validation-rmse:199.66806
[1]	validation-rmse:198.90958
[2]	validation-rmse:198.53633
[3]	validation-rmse:198.37483
[4]	validation-rmse:198.27269
[5]	validation-rmse:198.24712
[6]	validation-rmse:198.20409
[7]	validation-rmse:198.20677
[8]	validation-rmse:198.17940
[9]	validation-rmse:198.17842
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  T

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	validation-rmse:199.31868
[1]	validation-rmse:198.79068
[2]	validation-rmse:198.48398
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	validation-rmse:199.66771
[1]	validation-rmse:198.90782
[2]	validation-rmse:198.55756
[3]	validation-rmse:198.37701
[4]	validation-rmse:198.29234
[5]	validation-rmse:198.24197
[6]	validation-rmse:198.21603
[7]	validation-rmse:198.19287
[8]	validation-rmse:198.18750
[9]	validation-rmse:198.17683
Parameters: { silent } might not be used.

  T

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	validation-rmse:199.69058
[1]	validation-rmse:198.90683
[2]	validation-rmse:198.56277
[3]	validation-rmse:198.36426
[4]	validation-rmse:198.28252
[5]	

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoo

[2]	validation-rmse:198.55212
[3]	validation-rmse:198.37492
[4]	validation-rmse:198.28757
[5]	validation-rmse:198.23102
[6]	validation-rmse:198.20810
[7]	validation-rmse:198.19394
[8]	validation-rmse:198.18401
[9]	validation-rmse:198.17650
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used 

In [35]:
print(study.best_params)

{'booster': 'gblinear', 'lambda': 1.2579329387874718e-06, 'alpha': 7.818178345020804e-05}


  and should_run_async(code)


In [34]:
with mlflow.start_run(run_name='XGBoost_bst_all_ts'):
    xgb_bst = xgb.XGBRegressor(booster= 'gblinear', reg_lambda=1.2579329387874718e-06, alpha= 7.818178345020804e-05)
    xgb_bst.fit(train_numeric_dataset, y_train)
    
    signature = infer_signature(train_numeric_dataset, xgb_bst.predict(train_numeric_dataset))
    mlflow.sklearn.log_model(xgb_bst, "XGB_bst_all_ts", signature=signature)
    
    mape_bst_xgb = mape(y_test.to_numpy(), xgb_bst.predict(test_numeric_dataset))
    rmse_bst_xgb = mse(y_test.to_numpy(), xgb_bst.predict(test_numeric_dataset), squared=False)
    mlflow.log_metric("mape", mape_bst_xgb)
    mlflow.log_metric("rmse", rmse_bst_xgb)

print(mape_bst_xgb)
print(rmse_bst_xgb)

44.49013728294498
197.74307846555098
