# Model training
---
Description:

In this notebook, you can generate two solutions.
One is using 6 `single output regressor` to predict 6 target variable respectively, and the other is `muti-output regressor` that can output 6 predictions of target variable. It depends on `model_train_config.py` to lead the data flow.

---

## 1. Import Libraries

In [1]:
# basic
import pandas as pd
import numpy as np
import pickle
import model_train_config
# utils
from utils.time_utils import current_time
from utils.metric_utils import rmse_score
# model
from catboost import CatBoostRegressor
from catboost import cv, Pool

import feature_engineering_config as fe_config  
from feature_engineering import Standardization

## 2. Config

In [2]:
random_state = model_train_config.random_state

## 3. Load Data

In [3]:
with open(f'../Data/meta/train_x_after_feature_engineering.pickle', 'rb') as f:
    train_x_after_fe = pickle.load(f)

In [4]:
with open(f'../Data/meta/train_y_after_feature_engineering.pickle', 'rb') as f:
    train_y_after_fe = pickle.load(f)

In [5]:
# You need to use original values of target variable to evaluate RMSE
with open(f'../Data/meta/non_scaled_train_y_after_feature_engineering.pickle', 'rb') as f:
    non_scaled_train_y = pickle.load(f)

In [6]:
train_y_after_fe.head()

Unnamed: 0,sensor_point5_i_value,sensor_point6_i_value,sensor_point7_i_value,sensor_point8_i_value,sensor_point9_i_value,sensor_point10_i_value
0,-2.808926,-1.356983,-1.095099,-1.683992,-0.748292,-2.752699
1,-3.187453,-1.207992,-2.033918,-1.728135,-1.198188,-2.929558
2,-3.338863,-0.984506,-1.877448,-1.772278,-0.098443,-2.752699
3,-3.187453,-1.282487,-2.033918,-1.639849,-1.79805,-3.106417
4,-1.673345,-0.537534,-1.877448,-1.242563,-1.198188,-1.573637


In [7]:
train_x_after_fe.head()

Unnamed: 0,clean_pressure11,clean_pressure12,clean_pressure21,clean_pressure22,clean_pressure23,clean_pressure31,clean_pressure33,clean_pressure41,clean_pressure42,clean_pressure51,...,painting_g8_act_hvv_group,painting_g9_act_a_air_group,painting_g9_act_t_air_group,painting_g9_act_hvv_group,painting_g10_act_hvc_group,painting_g11_act_hvc_group,env_rpi05_temp_group,env_rpi07_pm25_group,env_rpi15_lux_group,env_rpi15_pm25_group
0,0.994151,-1.519511,-1.852233,-2.289004,-2.390387,-0.840732,-2.244639,2.291633,-2.615687,2.118289,...,0,0,1,4,2,1,1,0,3,2
1,0.992637,-1.518782,-1.851409,-2.289088,-2.390161,-0.841353,-2.244137,2.291633,-2.616245,2.127222,...,0,2,1,0,2,1,1,0,3,2
2,0.992637,-1.518782,-1.851409,-2.289088,-2.390161,-0.841353,-2.244137,2.291633,-2.616245,2.127222,...,0,2,1,0,2,1,1,0,3,2
3,0.992637,-1.518782,-1.851409,-2.289088,-2.390161,-0.841353,-2.244137,2.291633,-2.616245,2.127222,...,0,2,1,0,2,1,1,0,3,2
4,1.003244,-1.514706,-1.853113,-2.289004,-2.39265,-0.851163,-2.26573,2.291633,-2.617757,2.122753,...,0,0,1,0,2,1,1,3,3,2


In [8]:
categorical_features_indices = np.where(train_x_after_fe.dtypes != float)[0]

## 4. Single Output Regressor

Training model for each target variable.

In [9]:
single_output_model_dic = {}
for col_y in train_y_after_fe.columns:

    params = {
        'loss_function': 'RMSE',
        'random_seed': random_state,
        'early_stopping_rounds':100,
        'task_type': 'CPU',
        'verbose':False,
        'has_time':True,
    }
    model = CatBoostRegressor(**params)

    grid = {
        'learning_rate': [0.05, 0.07, 0.09, 0.3],
        'iterations': [50, 60, 70, 80, 90, 100, 120, ],
        'depth': [3, 4, 5, 6, 7],
        'l2_leaf_reg': [1, 3, 5, 7, 9],
        'grow_policy': ['SymmetricTree', 'Depthwise', 'Lossguide']
    }
    randomized_search_result = model.randomized_search(
        grid,
        X=train_x_after_fe,
        y=train_y_after_fe[col_y],
        cv=3,
        plot=False,
        verbose=False
    )
    
    single_output_model_dic[col_y] = {'model': model, 'randomized_search_result': randomized_search_result}


bestTest = 0.7294917258
bestIteration = 99


bestTest = 0.7233121349
bestIteration = 86


bestTest = 0.7275003991
bestIteration = 23


bestTest = 0.7160457586
bestIteration = 99


bestTest = 0.7193648323
bestIteration = 58


bestTest = 0.7261004241
bestIteration = 74


bestTest = 0.7183132827
bestIteration = 85


bestTest = 0.7277654338
bestIteration = 40

Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.7118642158
bestIteration = 7


bestTest = 0.7017700621
bestIteration = 66

Training on fold [0/3]

bestTest = 0.5835435841
bestIteration = 69

Training on fold [1/3]

bestTest = 0.6713180503
bestIteration = 68

Training on fold [2/3]

bestTest = 0.7100423469
bestIteration = 69


bestTest = 0.5243754874
bestIteration = 99


bestTest = 0.5147284586
bestIteration = 99


bestTest = 0.5130468966
bestIteration = 89


bestTest = 0.5248503192
bestIteration = 99


bestTest = 0.5044945
bestIteration = 89


bestTest = 0.520301174
bestIteration = 79


bestTest = 0.5100793242
b

### Save single output model

In [10]:
time = current_time()
with open(f'../Model/catboost_single_{time}.pickle', 'wb' ) as f:
    pickle.dump(single_output_model_dic, f)

```python
model_dic[col_y]['model'].predict(train_x_after_fe)  # get predictions
model_dic[col_y]['randomized_search_result']['params']  # get parameters
model_dic[col_y]['randomized_search_result']['cv_results']['test-RMSE-mean']  # get metric result
```

* ### Save the list of target variable name

In [11]:
col_y_li = train_y_after_fe.columns.tolist()
with open(f'../Model/target_col_list.pickle', 'wb' ) as f:
    pickle.dump(col_y_li, f)

* ### Predict 6 target variable

In [12]:
array_li= [ single_output_model_dic[col_y]['model'].predict(train_x_after_fe).reshape(-1,1) for col_y in col_y_li ]
all_res = np.concatenate(array_li, axis=1)

In [13]:
# Load y-scaler model
with open(f'../Model/standard_scaler_y.pickle', 'rb') as f:
    scaler_y = pickle.load(f)

In [14]:
predictions = scaler_y.inverse_transform(all_res)
targets = non_scaled_train_y
print(rmse_score(predictions, targets))

sensor_point5_i_value      6.953221
sensor_point6_i_value      9.152518
sensor_point7_i_value     13.277984
sensor_point8_i_value      7.970957
sensor_point9_i_value     10.062283
sensor_point10_i_value     9.341136
dtype: float64


## 5. Muti-output Regressor
Training one model for Muti-Output result.

In [16]:
params = {
    'loss_function': 'MultiRMSE',
    'random_seed': random_state,
    'early_stopping_rounds':100,
    'task_type': 'CPU',
    'verbose':False,
    'has_time':True,
}
model = CatBoostRegressor(**params)

grid = {
    'learning_rate': [0.05, 0.07, 0.09, 0.3],
    'iterations': [50, 60, 70, 80, 90, 100],
    'depth': [3, 4, 5, 6, 7],
    'l2_leaf_reg': [1, 3, 5, 7, 9],
    'grow_policy': ['SymmetricTree', 'Depthwise', 'Lossguide']
}

randomized_search_result = model.randomized_search(
    grid,
    X=train_x_after_fe,
    y=train_y_after_fe,
    cv=3,
    partition_random_seed=random_state,
    plot=False,
    verbose=False
)
model_muti_output_dic = {'model': model, 'randomized_search_result': randomized_search_result}


bestTest = 1.785861137
bestIteration = 79


bestTest = 1.618116546
bestIteration = 52


bestTest = 1.696677867
bestIteration = 69


bestTest = 1.705093289
bestIteration = 59


bestTest = 1.603070753
bestIteration = 64


bestTest = 1.601719072
bestIteration = 93


bestTest = 1.709542634
bestIteration = 99


bestTest = 1.622062129
bestIteration = 79


bestTest = 1.71811735
bestIteration = 99


bestTest = 1.688963583
bestIteration = 79

Training on fold [0/3]

bestTest = 1.502597412
bestIteration = 99

Training on fold [1/3]

bestTest = 1.597835858
bestIteration = 93

Training on fold [2/3]

bestTest = 1.568156467
bestIteration = 95



In [17]:
predictions = scaler_y.inverse_transform( model_muti_output_dic['model'].predict(train_x_after_fe) )
targets = non_scaled_train_y
print(rmse_score(predictions, targets))

sensor_point5_i_value      6.824877
sensor_point6_i_value     10.320584
sensor_point7_i_value     13.081080
sensor_point8_i_value     10.730843
sensor_point9_i_value     10.667331
sensor_point10_i_value     9.339279
dtype: float64


In [18]:
with open(f'../Model/catboost_all_{time}.pickle', 'wb' ) as f:
    pickle.dump(model_muti_output_dic['model'], f)

In [19]:
# from sklearn.multioutput import MultiOutputRegressor
# from sklearn.model_selection import RepeatedKFold
# from sklearn.model_selection import cross_val_score

In [20]:
# params = {
#     'loss_function': 'MultiRMSE',
#     'iterations': 150,
#     'random_seed': random_state,
#     'learning_rate': 0.5,
#     # 'use_best_model': True
# }

# cv_data = cv(
#     params = params,
#     pool = Pool(train_x, label=train_y, cat_features=categorical_features_indices),
#     fold_count=5,
#     type = 'Classical',  # The method to split the dataset into folds.
#     shuffle=True,
#     partition_random_seed=random_state,
#     plot=True,
#     stratified=False,
#     verbose=False
# )

In [21]:
# best_value = np.argmin(cv_data['test-MultiRMSE-mean'])
# best_iter = np.argmin(cv_data['test-MultiRMSE-mean'])

# print('Best validation MultiRMSE score: {:.4f}±{:.4f} on step {}'.format(
#     best_value,
#     cv_data['test-MultiRMSE-std'][best_iter],
#     best_iter)
# )

In [22]:
# params['iterations'] = best_iter

In [23]:
# model = CatBoostRegressor(
#     **params,
#     eval_metric='MultiRMSE',
#     verbose=False
# )

# model.fit(
#     train_x, train_y,
#     cat_features=categorical_features_indices,
#     eval_set=(val_x, val_y),
#     # logging_level='Verbose',
#     plot=False
# )

In [24]:
# model.get_evals_result()

In [25]:
# predict_y = model.predict(val_x)

In [26]:
# def rmse_score(predictions, targets):
#     return np.sqrt(np.mean((predictions-targets)**2, axis=0))

In [27]:
# rmse_score(predict_y, val_y)