# Model training: [CatBoost](https://github.com/catboost/catboost)
---
Description:

In this notebook, you can generate two solutions.
The first uses six `single-output regressors`, each predicting one of the 6 target variables; the second uses one `multi-output regressor` that outputs predictions for all 6 target variables at once. The notebook relies on `model_train_config.py` to drive the data flow.

---

## 1. Import Libraries

In [1]:
# basic
import pandas as pd
import numpy as np
import pickle
import model_train_config
# utils
from utils.time_utils import current_time
from utils.metric_utils import rmse_score
# model
from catboost import CatBoostRegressor
from catboost import cv, Pool
# config
import feature_engineering_config as fe_config
# feature engineering
from feature_engineering import Standardization

## 2. Config

In [2]:
# Shared seed read from the project config; used below as CatBoost's
# `random_seed` and as `partition_random_seed` in the hyper-parameter search.
random_state = model_train_config.random_state

## 3. Load Data

In [3]:
# Load the pickled feature matrix (the "after feature engineering" train X).
# NOTE: unpickling can execute arbitrary code; only load files written by this project.
with open('../Data/meta/train_x_after_feature_engineering.pickle', 'rb') as f:
    train_x_after_fe = pickle.load(f)

In [4]:
# Load the pickled target frame (the "after feature engineering" train y),
# one column per target variable.
# NOTE: unpickling can execute arbitrary code; only load files written by this project.
with open('../Data/meta/train_y_after_feature_engineering.pickle', 'rb') as f:
    train_y_after_fe = pickle.load(f)

In [5]:
# RMSE must be evaluated against the original (non-scaled) target values,
# so load them separately from the scaled targets used for training.
# NOTE: unpickling can execute arbitrary code; only load files written by this project.
with open('../Data/meta/non_scaled_train_y_after_feature_engineering.pickle', 'rb') as f:
    non_scaled_train_y = pickle.load(f)

In [6]:
# Preview the first rows of the training targets (one column per sensor point).
train_y_after_fe.head()

Unnamed: 0,sensor_point5_i_value,sensor_point6_i_value,sensor_point7_i_value,sensor_point8_i_value,sensor_point9_i_value,sensor_point10_i_value
0,-2.818367,-1.363911,-1.094471,-1.636244,-0.747381,-2.730355
1,-3.198302,-1.213496,-2.02931,-1.6786,-1.199743,-2.905334
2,-3.350275,-0.987873,-1.873504,-1.720955,-0.093969,-2.730355
3,-3.198302,-1.288703,-2.02931,-1.593889,-1.802893,-3.080313
4,-1.678564,-0.536627,-1.873504,-1.21269,-1.199743,-1.563825


In [7]:
# Preview the first rows of the training features.
train_x_after_fe.head()

Unnamed: 0,clean_pressure11,clean_pressure23,clean_pressure31,clean_pressure33,clean_pressure41,clean_pressure51,clean_pressure52,clean_pressure61,clean_pressure62,clean_pressure71,...,painting_g2_act_hvc_group,painting_g4_act_a_air_group,painting_g4_act_hvv_group,painting_g8_act_f_air_group,painting_g8_act_t_air_group,painting_g10_act_t_air_group,painting_g11_act_hvc_group,env_rpi05_pm1_group,env_rpi07_pm1_group,env_rpi15_pm1_group
0,0.997022,-2.388433,-0.843851,-2.240894,2.296172,2.122483,-2.895754,1.899981,-0.765094,0.433714,...,1,3,3,1,0,4,1,4,2,2
1,0.995513,-2.388208,-0.844471,-2.240395,2.296172,2.131458,-2.895754,1.899981,-0.765295,0.425788,...,1,3,3,1,2,4,1,4,2,2
2,0.995513,-2.388208,-0.844471,-2.240395,2.296172,2.131458,-2.895754,1.899981,-0.765295,0.425788,...,1,3,3,1,2,4,1,4,2,2
3,0.995513,-2.388208,-0.844471,-2.240395,2.296172,2.131458,-2.895754,1.899981,-0.765295,0.425788,...,1,3,3,1,2,4,1,4,2,2
4,1.006083,-2.390687,-0.854257,-2.261857,2.296172,2.126968,-2.894711,1.899981,-0.774144,0.37886,...,1,3,3,1,2,4,1,4,4,2


In [8]:
# Positional indices of every feature column whose dtype is not float
# (presumably the integer-coded `*_group` columns previewed above).
# NOTE(review): this variable is not referenced by any cell visible in this
# notebook, i.e. it is never handed to CatBoost as `cat_features` - confirm intent.
categorical_features_indices = np.flatnonzero(train_x_after_fe.dtypes != float)

## 4. Single Output Regressor

Train one model for each target variable.

In [9]:
# Fixed training options shared by every per-target regressor. They do not
# depend on the target, so they are built once instead of on every iteration.
params = {
    'loss_function': 'RMSE',
    'random_seed': random_state,
    'early_stopping_rounds': 100,
    'task_type': 'CPU',
    'verbose': False,
    'has_time': True,  # per CatBoost docs: use the row order of the input data
}

# Hyper-parameter candidates sampled by the randomized search.
grid = {
    'learning_rate': [0.05, 0.07, 0.09, 0.3],
    'iterations': [50, 60, 70, 80, 90, 100, 120],
    'depth': [3, 4, 5, 6, 7],
    'l2_leaf_reg': [1, 3, 5, 7, 9],
    'grow_policy': ['SymmetricTree', 'Depthwise', 'Lossguide'],
}

# Maps each target column name to
# {'model': the regressor searched on that target, 'randomized_search_result': search output}.
single_output_model_dic = {}
for col_y in train_y_after_fe.columns:
    # A new estimator per target: each one is stored (and later used for
    # prediction) independently of the others.
    model = CatBoostRegressor(**params)

    randomized_search_result = model.randomized_search(
        grid,
        X=train_x_after_fe,
        y=train_y_after_fe[col_y],
        cv=3,
        # Seed the data partition from the project config, exactly as the
        # multi-output search in section 5 does, so both solutions are tuned
        # on reproducible, config-controlled splits.
        partition_random_seed=random_state,
        plot=False,
        verbose=False
    )

    single_output_model_dic[col_y] = {'model': model, 'randomized_search_result': randomized_search_result}


bestTest = 0.7168204577
bestIteration = 97


bestTest = 0.6982473377
bestIteration = 93


bestTest = 0.711567168
bestIteration = 37


bestTest = 0.725529971
bestIteration = 98


bestTest = 0.7035183267
bestIteration = 34


bestTest = 0.7077820346
bestIteration = 68


bestTest = 0.7161348069
bestIteration = 89


bestTest = 0.7138894293
bestIteration = 31

Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.7231186067
bestIteration = 6


bestTest = 0.6990322579
bestIteration = 68

Training on fold [0/3]

bestTest = 0.6016065544
bestIteration = 99

Training on fold [1/3]

bestTest = 0.6928381088
bestIteration = 97

Training on fold [2/3]

bestTest = 0.6898138804
bestIteration = 85


bestTest = 0.4692543269
bestIteration = 98


bestTest = 0.4673527841
bestIteration = 98


bestTest = 0.4542795728
bestIteration = 43


bestTest = 0.4676451741
bestIteration = 99


bestTest = 0.4580370056
bestIteration = 78


bestTest = 0.4726047788
bestIteration = 79


bestTest = 0.472901102


### Save single output model

In [10]:
# Persist the whole per-target dictionary (models plus their search results).
with open('../Model/catboost_single.pickle', 'wb') as f:
    pickle.dump(single_output_model_dic, f)

```python
single_output_model_dic[col_y]['model'].predict(train_x_after_fe)  # get predictions
single_output_model_dic[col_y]['randomized_search_result']['params']  # get parameters
single_output_model_dic[col_y]['randomized_search_result']['cv_results']['test-RMSE-mean']  # get metric result
```

* ### Save the list of target variable names

In [11]:
# Target column names, in the column order of the training target frame;
# saved so the column order can be recovered wherever the pickle is loaded.
col_y_li = train_y_after_fe.columns.tolist()
with open('../Model/target_col_list.pickle', 'wb') as f:
    pickle.dump(col_y_li, f)

* ### Predict the 6 target variables

In [12]:
# One (n_rows, 1) prediction column per target, produced by that target's own
# model on the training features; the column order follows col_y_li.
array_li = [
    single_output_model_dic[target_name]['model']
    .predict(train_x_after_fe)
    .reshape(-1, 1)
    for target_name in col_y_li
]
# Join the per-target columns side by side into one (n_rows, n_targets) array.
all_res = np.concatenate(array_li, axis=1)

In [13]:
# Label the stacked predictions with the target names and persist them.
# These are the raw model outputs: scaler_y.inverse_transform has not been
# applied to them at this point.
temp = pd.DataFrame(data=all_res, columns=col_y_li)
with open('../Data/meta/catboost_single_predict_train_y.pickle', mode='wb') as f:
    pickle.dump(temp, f)

In [14]:
# Load the fitted target scaler; its inverse_transform is used below to map
# predictions back to the original target units.
# NOTE: unpickling can execute arbitrary code; only load files written by this project.
with open('../Model/standard_scaler_y.pickle', 'rb') as f:
    scaler_y = pickle.load(f)

In [15]:
# Score the single-output solution on the training data: undo the target
# scaling on the stacked predictions and compare with the unscaled targets.
targets = non_scaled_train_y
predictions = scaler_y.inverse_transform(all_res)
print(rmse_score(predictions, targets))

sensor_point5_i_value      7.177684
sensor_point6_i_value      7.737494
sensor_point7_i_value     13.083644
sensor_point8_i_value     10.410481
sensor_point9_i_value     12.000323
sensor_point10_i_value     9.599019
dtype: float64


## 5. Multi-output Regressor
Train one model that outputs all target variables at once.

In [16]:
# Training options for the joint model; the MultiRMSE loss lets one regressor
# fit every target column at once.
params = dict(
    loss_function='MultiRMSE',
    random_seed=random_state,
    early_stopping_rounds=100,
    task_type='CPU',
    verbose=False,
    has_time=True,
)
model = CatBoostRegressor(**params)

# Hyper-parameter candidates sampled by the randomized search.
grid = dict(
    learning_rate=[0.05, 0.07, 0.09, 0.3],
    iterations=[50, 60, 70, 80, 90, 100],
    depth=[3, 4, 5, 6, 7],
    l2_leaf_reg=[1, 3, 5, 7, 9],
    grow_policy=['SymmetricTree', 'Depthwise', 'Lossguide'],
)

# Search over the grid using the full target frame as y; the data partition
# is seeded from the project config.
randomized_search_result = model.randomized_search(
    grid,
    X=train_x_after_fe,
    y=train_y_after_fe,
    cv=3,
    partition_random_seed=random_state,
    verbose=False,
    plot=False,
)

# Keep the model together with its search output, mirroring the per-target
# entries stored for the single-output solution.
model_muti_output_dic = {
    'model': model,
    'randomized_search_result': randomized_search_result,
}


bestTest = 1.721603953
bestIteration = 79


bestTest = 1.66595969
bestIteration = 35


bestTest = 1.657399866
bestIteration = 69


bestTest = 1.649586526
bestIteration = 59


bestTest = 1.597904734
bestIteration = 62


bestTest = 1.613736376
bestIteration = 72


bestTest = 1.672710653
bestIteration = 99


bestTest = 1.62740129
bestIteration = 54


bestTest = 1.678627738
bestIteration = 99


bestTest = 1.643451628
bestIteration = 79

Training on fold [0/3]

bestTest = 1.530720673
bestIteration = 67

Training on fold [1/3]

bestTest = 1.639366844
bestIteration = 68

Training on fold [2/3]

bestTest = 1.54810075
bestIteration = 56



In [17]:
# Score the multi-output solution on the training data: predict all targets,
# undo the target scaling, and compare with the unscaled targets.
targets = non_scaled_train_y
predictions = scaler_y.inverse_transform(
    model_muti_output_dic['model'].predict(train_x_after_fe)
)
print(rmse_score(predictions, targets))

sensor_point5_i_value      6.613232
sensor_point6_i_value      9.813324
sensor_point7_i_value     12.953555
sensor_point8_i_value     11.020815
sensor_point9_i_value     10.321013
sensor_point10_i_value     9.293708
dtype: float64


In [18]:
# Persist only the multi-output model itself (the search result is not saved here).
with open('../Model/catboost_all.pickle', 'wb') as f:
    pickle.dump(model_muti_output_dic['model'], f)

In [19]:
# Label the multi-output predictions on the training features with the target
# names and persist them. These are the raw model outputs:
# scaler_y.inverse_transform is not applied here.
temp = pd.DataFrame(
    data=model_muti_output_dic['model'].predict(train_x_after_fe),
    columns=col_y_li,
)
with open('../Data/meta/catboost_all_predict_train_y.pickle', mode='wb') as f:
    pickle.dump(temp, f)