In [1]:
import pandas as pd
import numpy as np
import pickle
import model_config

from catboost import CatBoostRegressor
from bayes_opt import BayesianOptimization


from sklearn.model_selection import train_test_split
from catboost import cv, Pool

## Config

In [2]:
random_state = model_config.random_state

## Load Data

In [3]:
with open(f'../Data/meta/train_x_after_feature_engineering.pickle', 'rb') as f:
    x_df = pickle.load(f)

In [4]:
with open(f'../Data/meta/train_y_after_feature_engineering.pickle', 'rb') as f:
    y_df = pickle.load(f)

In [5]:
train_x, val_x, train_y, val_y = train_test_split(x_df, y_df, test_size=0.33, random_state=random_state)

In [6]:
train_y.head()

Unnamed: 0,sensor_point5_i_value,sensor_point6_i_value,sensor_point7_i_value,sensor_point8_i_value,sensor_point9_i_value,sensor_point10_i_value
172,58,90,96,92,82,142
564,83,96,112,82,121,94
113,70,88,76,80,79,93
18,67,76,80,48,72,74
76,86,82,112,66,58,82


In [7]:
train_x.head()

Unnamed: 0,clean_pressure11,clean_pressure23,clean_pressure31,clean_pressure41,clean_pressure51,clean_pressure52,clean_pressure61,clean_pressure62,clean_pressure72,clean_pressure81,...,painting_g4_act_hvv_group,painting_g7_act_hvv_group,painting_g8_act_a_air_group,painting_g8_act_t_air_group,painting_g9_act_t_air_group,painting_g10_act_hvc_group,painting_g11_act_hvc_group,env_rpi05_temp_group,env_rpi07_hum_group,env_rpi15_pm1_group
172,0.932984,0.012026,-0.394486,-0.152276,0.535788,-0.108904,0.695881,-0.759566,0.697026,-0.461315,...,1,1,2,4,0,1,1,1,1,3
564,-1.131781,1.496023,1.572485,-0.152276,1.211348,0.635416,-0.661092,0.586983,0.049092,0.615458,...,0,3,1,2,0,0,0,0,0,3
113,1.379899,-0.275836,-0.467823,-0.152276,0.67896,0.995332,1.068148,-0.799203,0.867246,-0.647959,...,3,1,2,4,0,5,0,1,3,1
18,0.822923,-2.344591,-3.21392,2.154155,1.079582,-2.891692,0.707889,-0.792437,2.657301,1.459564,...,1,1,2,4,0,0,1,4,1,1
76,1.675822,0.110349,-0.605748,2.154155,0.661383,0.036348,1.464432,-0.957793,-0.439605,1.868534,...,0,0,2,0,4,5,3,4,0,3


In [8]:
categorical_features_indices = np.where(train_x.dtypes != float)[0]

In [17]:
col_y = 'sensor_point5_i_value'

In [18]:
params = {
    'loss_function': 'RMSE',
    'iterations': 80,
    'random_seed': random_state,
    'learning_rate': 0.5,
    # 'use_best_model': True
}

cv_data = cv(
    params = params,
    pool = Pool(train_x, label=train_y[col_y], cat_features=categorical_features_indices),
    fold_count=5,
    type = 'Classical',  # The method to split the dataset into folds.
    shuffle=True,
    partition_random_seed=random_state,
    plot=True,
    stratified=False,
    verbose=False
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Training on fold [0/5]

bestTest = 10.59142865
bestIteration = 26

Training on fold [1/5]

bestTest = 9.81059736
bestIteration = 17

Training on fold [2/5]

bestTest = 9.6041228
bestIteration = 19

Training on fold [3/5]

bestTest = 10.86144611
bestIteration = 18

Training on fold [4/5]

bestTest = 9.824769321
bestIteration = 32



In [19]:
best_value = np.argmin(cv_data['test-RMSE-mean'])
best_iter = np.argmin(cv_data['test-RMSE-mean'])

print('Best validation RMSE score: {:.4f}±{:.4f} on step {}'.format(
    best_value,
    cv_data['test-RMSE-std'][best_iter],
    best_iter)
)

Best validation RMSE score: 19.0000±0.5469 on step 19


In [20]:
params['iterations'] = best_iter

In [21]:
model = CatBoostRegressor(
    **params,
    eval_metric='RMSE',
    verbose=False
)

model.fit(
    train_x, train_y[col_y],
    cat_features=categorical_features_indices,
    eval_set=(val_x, val_y[col_y]),
    # logging_level='Verbose',
    plot=False
)

<catboost.core.CatBoostRegressor at 0x260aca17220>

In [22]:
# model.get_evals_result()

In [23]:
predict_y = model.predict(val_x)

In [24]:
def rmse_score(predictions, targets):
    return np.sqrt(np.mean((predictions-targets)**2))

In [27]:
rmse_score(predict_y, val_y[col_y])

7.552117927213621

In [None]:
# from sklearn.multioutput import MultiOutputRegressor
# from sklearn.model_selection import RepeatedKFold
# from sklearn.model_selection import cross_val_score

In [29]:
params = {
    'loss_function': 'MultiRMSE',
    'iterations': 150,
    'random_seed': random_state,
    'learning_rate': 0.5,
    # 'use_best_model': True
}

cv_data = cv(
    params = params,
    pool = Pool(train_x, label=train_y, cat_features=categorical_features_indices),
    fold_count=5,
    type = 'Classical',  # The method to split the dataset into folds.
    shuffle=True,
    partition_random_seed=random_state,
    plot=True,
    stratified=False,
    verbose=False
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Training on fold [0/5]

bestTest = 35.30689052
bestIteration = 19

Training on fold [1/5]

bestTest = 41.00472506
bestIteration = 71

Training on fold [2/5]

bestTest = 40.13883252
bestIteration = 13

Training on fold [3/5]

bestTest = 35.21158011
bestIteration = 57

Training on fold [4/5]

bestTest = 35.85192104
bestIteration = 53



In [32]:
best_value = np.argmin(cv_data['test-MultiRMSE-mean'])
best_iter = np.argmin(cv_data['test-MultiRMSE-mean'])

print('Best validation MultiRMSE score: {:.4f}±{:.4f} on step {}'.format(
    best_value,
    cv_data['test-MultiRMSE-std'][best_iter],
    best_iter)
)

Best validation MultiRMSE score: 32.0000±2.8392 on step 32


In [33]:
params['iterations'] = best_iter

In [34]:
model = CatBoostRegressor(
    **params,
    eval_metric='MultiRMSE',
    verbose=False
)

model.fit(
    train_x, train_y,
    cat_features=categorical_features_indices,
    eval_set=(val_x, val_y),
    # logging_level='Verbose',
    plot=False
)

<catboost.core.CatBoostRegressor at 0x260b181e4c0>

In [22]:
# model.get_evals_result()

In [35]:
predict_y = model.predict(val_x)

In [39]:
def rmse_score(predictions, targets):
    return np.sqrt(np.mean((predictions-targets)**2, axis=0))

In [40]:
rmse_score(predict_y, val_y)

sensor_point5_i_value      8.514762
sensor_point6_i_value     13.133348
sensor_point7_i_value     16.337178
sensor_point8_i_value     13.039559
sensor_point9_i_value     13.622460
sensor_point10_i_value    12.808895
dtype: float64