# Model training
---
Description:

In this notebook, you can generate two solutions.
One is using 6 `single output regressor` to predict 6 target variable respectively, and the other is `muti-output regressor` that can output 6 predictions of target variable. It depends on `model_train_config.py` to lead the data flow.

---

## 1. Import Libraries

In [1]:
# basic
import pandas as pd
import numpy as np
import pickle
import model_train_config
# utils
from utils.time_utils import current_time
from utils.metric_utils import rmse_score
# model
from catboost import CatBoostRegressor
from catboost import cv, Pool

import feature_engineering_config as fe_config  
from feature_engineering import Standardization

## 2. Config

In [2]:
random_state = model_train_config.random_state

## 3. Load Data

In [3]:
with open(f'../Data/meta/train_x_after_feature_engineering.pickle', 'rb') as f:
    train_x_after_fe = pickle.load(f)

In [4]:
with open(f'../Data/meta/train_y_after_feature_engineering.pickle', 'rb') as f:
    train_y_after_fe = pickle.load(f)

In [5]:
# You need to use original values of target variable to evaluate RMSE
with open(f'../Data/meta/non_scaled_train_y_after_feature_engineering.pickle', 'rb') as f:
    non_scaled_train_y = pickle.load(f)

In [6]:
train_y_after_fe.head()

Unnamed: 0,sensor_point5_i_value,sensor_point6_i_value,sensor_point7_i_value,sensor_point8_i_value,sensor_point9_i_value,sensor_point10_i_value
0,-2.875155,-1.366112,-1.107743,-1.662985,-0.752201,-2.795963
1,-3.26159,-1.215837,-2.053786,-1.706093,-1.202112,-2.974941
4,-1.715849,-0.539601,-1.896112,-1.231903,-1.202112,-1.602776
5,-2.875155,-0.765013,-1.028906,-1.102578,-1.602033,-1.662436
6,-2.411432,-0.765013,-0.713558,-1.662985,-0.602231,-2.616985


In [7]:
train_x_after_fe.head()

Unnamed: 0,clean_pressure31,clean_pressure33,clean_pressure41,clean_pressure51,clean_pressure52,clean_pressure61,clean_pressure71,clean_pressure72,clean_pressure81,clean_pressure91,...,clean_pressure82_group,oven_pb1_group,oven_pb2_group,oven_a1_group,oven_b1_group,oven_b3_group,painting_g3_act_hvv_group,painting_g8_act_f_air_group,painting_g9_act_hvc_group,env_rpi05_temp_group
0,-0.845269,-2.274242,2.327716,2.163639,-2.96591,1.903804,0.434963,2.902725,1.936083,-0.427078,...,5,5,3,2,2,1,3,0,4,1
1,-0.845888,-2.273734,2.327716,2.172853,-2.96591,1.903804,0.42707,2.906072,1.950879,-0.471062,...,5,5,3,2,2,1,3,0,4,1
4,-0.855656,-2.29559,2.327716,2.168243,-2.964846,1.903804,0.380329,2.902725,1.807915,-0.562123,...,5,5,3,2,2,1,3,0,1,1
5,-0.855656,-2.29559,2.327716,2.168243,-2.964846,1.903804,0.380329,2.902725,1.807915,-0.562123,...,5,5,3,2,2,1,3,0,1,1
6,-0.855656,-2.29559,2.327716,2.168243,-2.964846,1.903804,0.380329,2.902725,1.807915,-0.562123,...,5,5,3,2,2,1,3,0,1,1


In [8]:
categorical_features_indices = np.where(train_x_after_fe.dtypes != float)[0]

## 4. Single Output Regressor

Training model for each target variable.

In [9]:
single_output_model_dic = {}
for col_y in train_y_after_fe.columns:

    params = {
        'loss_function': 'RMSE',
        'random_seed': random_state,
        'early_stopping_rounds':100,
        'task_type': 'CPU',
        'verbose':False,
        'has_time':True,
    }
    model = CatBoostRegressor(**params)

    grid = {
        'learning_rate': [0.05, 0.07, 0.09, 0.3],
        'iterations': [50, 60, 70, 80, 90, 100, 120, ],
        'depth': [3, 4, 5, 6, 7],
        'l2_leaf_reg': [1, 3, 5, 7, 9],
        'grow_policy': ['SymmetricTree', 'Depthwise', 'Lossguide']
    }
    randomized_search_result = model.randomized_search(
        grid,
        X=train_x_after_fe,
        y=train_y_after_fe[col_y],
        cv=3,
        plot=False,
        verbose=False
    )
    
    single_output_model_dic[col_y] = {'model': model, 'randomized_search_result': randomized_search_result}


bestTest = 0.6109975691
bestIteration = 34


bestTest = 0.6321284684
bestIteration = 79


bestTest = 0.6019160312
bestIteration = 89


bestTest = 0.6100136827
bestIteration = 68


bestTest = 0.5960791858
bestIteration = 94


bestTest = 0.6028758507
bestIteration = 119


bestTest = 0.6048874831
bestIteration = 69


bestTest = 0.6152167909
bestIteration = 79


bestTest = 0.6237244652
bestIteration = 29


bestTest = 0.5999834183
bestIteration = 69

Training on fold [0/3]

bestTest = 0.572216132
bestIteration = 98

Training on fold [1/3]

bestTest = 0.7431476648
bestIteration = 90

Training on fold [2/3]

bestTest = 0.6467753362
bestIteration = 89


bestTest = 0.5012956617
bestIteration = 59


bestTest = 0.5182713867
bestIteration = 79


bestTest = 0.4996115642
bestIteration = 89


bestTest = 0.4663398266
bestIteration = 89


bestTest = 0.4595414079
bestIteration = 99


bestTest = 0.4915816021
bestIteration = 119


bestTest = 0.4882979502
bestIteration = 69


bestTest = 0.4919977242
bestI

### Save single output model

In [10]:
time = current_time()
with open(f'../Model/catboost_single_{time}.pickle', 'wb' ) as f:
    pickle.dump(single_output_model_dic, f)

```python
model_dic[col_y]['model'].predict(train_x_after_fe)  # get predictions
model_dic[col_y]['randomized_search_result']['params']  # get parameters
model_dic[col_y]['randomized_search_result']['cv_results']['test-RMSE-mean']  # get metric result
```

* ### Save the list of target variable name

In [11]:
col_y_li = train_y_after_fe.columns.tolist()
with open(f'../Model/target_col_list.pickle', 'wb' ) as f:
    pickle.dump(col_y_li, f)

* ### Predict 6 target variable

In [12]:
array_li= [ single_output_model_dic[col_y]['model'].predict(train_x_after_fe).reshape(-1,1) for col_y in col_y_li ]
all_res = np.concatenate(array_li, axis=1)

In [13]:
# Load y-scaler model
with open(f'../Model/standard_scaler_y.pickle', 'rb') as f:
    scaler_y = pickle.load(f)

In [14]:
predictions = scaler_y.inverse_transform(all_res)
targets = non_scaled_train_y
print(rmse_score(predictions, targets))

sensor_point5_i_value      6.663888
sensor_point6_i_value      9.495002
sensor_point7_i_value     12.598776
sensor_point8_i_value     10.701720
sensor_point9_i_value     10.518761
sensor_point10_i_value     9.158507
dtype: float64


## 5. Muti-output Regressor
Training one model for Muti-Output result.

In [15]:
params = {
    'loss_function': 'MultiRMSE',
    'random_seed': random_state,
    'early_stopping_rounds':100,
    'task_type': 'CPU',
    'verbose':False,
    'has_time':True,
}
model = CatBoostRegressor(**params)

grid = {
    'learning_rate': [0.05, 0.07, 0.09, 0.3],
    'iterations': [50, 60, 70, 80, 90, 100],
    'depth': [3, 4, 5, 6, 7],
    'l2_leaf_reg': [1, 3, 5, 7, 9],
    'grow_policy': ['SymmetricTree', 'Depthwise', 'Lossguide']
}

randomized_search_result = model.randomized_search(
    grid,
    X=train_x_after_fe,
    y=train_y_after_fe,
    cv=3,
    partition_random_seed=random_state,
    plot=False,
    verbose=False
)
model_muti_output_dic = {'model': model, 'randomized_search_result': randomized_search_result}


bestTest = 1.728735673
bestIteration = 69


bestTest = 1.697672077
bestIteration = 79


bestTest = 1.603510882
bestIteration = 49


bestTest = 1.625161475
bestIteration = 49


bestTest = 1.89781322
bestIteration = 49


bestTest = 1.743687381
bestIteration = 89


bestTest = 1.596771665
bestIteration = 69


bestTest = 1.533173417
bestIteration = 82


bestTest = 1.557608104
bestIteration = 79


bestTest = 1.666167591
bestIteration = 59

Training on fold [0/3]

bestTest = 1.520832039
bestIteration = 74

Training on fold [1/3]

bestTest = 1.588909351
bestIteration = 76

Training on fold [2/3]

bestTest = 1.576418112
bestIteration = 82



In [16]:
predictions = scaler_y.inverse_transform( model_muti_output_dic['model'].predict(train_x_after_fe) )
targets = non_scaled_train_y
print(rmse_score(predictions, targets))

sensor_point5_i_value      6.525038
sensor_point6_i_value      9.863718
sensor_point7_i_value     12.481714
sensor_point8_i_value     10.718974
sensor_point9_i_value     10.216413
sensor_point10_i_value     9.001918
dtype: float64


In [17]:
with open(f'../Model/catboost_all_{time}.pickle', 'wb' ) as f:
    pickle.dump(model_muti_output_dic['model'], f)