# Model training
---
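Describe: Trains one CatBoost regressor per target column (randomized hyper-parameter search with 3-fold CV), reports the per-target RMSE against the ground-truth training targets, and pickles the resulting models. Section 5 repeats the experiment with a single multi-output (`MultiRMSE`) model.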

---

## 1. Import Libraries

In [1]:
import pandas as pd
import numpy as np
import pickle
import model_config
from utils.time_utils import current_time

from catboost import CatBoostRegressor
# from bayes_opt import BayesianOptimization


from sklearn.model_selection import train_test_split
from catboost import cv, Pool

import feature_engineering_config as fe_config  
from feature_engineering import Standardization

## 2. Config

In [2]:
# Seed taken from model_config; passed to CatBoost as 'random_seed' in the training cells.
random_state = model_config.random_state

## 3. Load Data

In [3]:
# Load the feature matrix from its pickle file.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# this assumes the artifact was written by this project's own pipeline.
with open('../Data/meta/train_x_after_feature_engineering.pickle', 'rb') as f:
    train_x_after_fe = pickle.load(f)

In [4]:
# Load the target frame from its pickle file.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# this assumes the artifact was written by this project's own pipeline.
with open('../Data/meta/train_y_after_feature_engineering.pickle', 'rb') as f:
    train_y_after_fe = pickle.load(f)

In [5]:
# Preview the first rows of the target frame to sanity-check the load.
train_y_after_fe.head()

Unnamed: 0,sensor_point5_i_value,sensor_point6_i_value,sensor_point7_i_value,sensor_point8_i_value,sensor_point9_i_value,sensor_point10_i_value
0,-2.850704,-1.349299,-1.108874,-1.644919,-0.749104,-2.754174
1,-3.234429,-1.201861,-2.066065,-1.687718,-1.199583,-2.930475
2,-3.387919,-0.980703,-1.906533,-1.730517,-0.098411,-2.754174
4,-1.699529,-0.538388,-1.906533,-1.216932,-1.199583,-1.578837
5,-2.850704,-0.759546,-1.029109,-1.088536,-1.60001,-1.637604


In [6]:
# Preview the first rows of the feature frame to sanity-check the load.
train_x_after_fe.head()

Unnamed: 0,clean_pressure31,clean_pressure41,clean_pressure61,clean_pressure72,clean_pressure81,clean_pressure91,clean_pressure102,oven_pa1,oven_pa2,oven_pb1,...,oven_a1_group,oven_a2_group,oven_b1_group,painting_g4_act_hvc_group,painting_g9_act_a_air_group,painting_g9_act_hvc_group,painting_g10_act_hvc_group,env_rpi05_temp_group,env_rpi15_pm1_group,env_rpi15_pm25_group
0,-0.841315,1.883944,1.896626,2.853088,1.927227,-0.409044,0.82925,1.849228,1.356757,3.752397,...,3,2,1,4,2,3,4,1,1,2
1,-0.841937,1.883944,1.896626,2.866816,1.941897,-0.452476,0.829058,1.883055,1.346359,3.73189,...,3,2,1,4,0,3,4,1,1,2
2,-0.841937,1.883944,1.896626,2.866816,1.941897,-0.452476,0.829058,1.883055,1.346359,3.73189,...,3,2,1,4,0,3,4,1,1,2
4,-0.851756,1.883944,1.896626,2.853088,1.800151,-0.542397,0.827615,1.928491,1.305015,3.16622,...,3,2,1,4,2,1,4,1,1,2
5,-0.851756,1.883944,1.896626,2.853088,1.800151,-0.542397,0.827615,1.928491,1.305015,3.16622,...,3,2,1,4,2,1,4,1,1,2


In [7]:
# Positional indices of every feature column whose dtype is not `float`.
# NOTE(review): the active training cells below never pass these indices to
# CatBoost -- confirm whether they are still needed.
non_float_column_mask = train_x_after_fe.dtypes != float
categorical_features_indices = np.flatnonzero(non_float_column_mask)

## 4. Training model on each target variable

In [8]:
# Base CatBoost settings shared by every per-target model. They do not depend
# on the target column, so they are built once rather than on every iteration.
params = {
    'loss_function': 'RMSE',
    'random_seed': random_state,
    'early_stopping_rounds': 100,
    'task_type': 'CPU',
    'verbose': False,
    'has_time': True,
}

# Hyper-parameter search space handed to randomized_search (also loop-invariant).
grid = {
    'learning_rate': [0.05, 0.07, 0.09, 0.3],
    'iterations': [50, 60, 70, 80, 90, 100, 120],
    'depth': [3, 4, 5, 6, 7],
    'l2_leaf_reg': [1, 3, 5, 7, 9],
    'grow_policy': ['SymmetricTree', 'Depthwise', 'Lossguide'],
}

# NOTE(review): no cat_features are passed to the search, so the integer
# "*_group" columns are presumably treated as numeric -- confirm this is intended.

# Maps each target column name to its model and the report returned by the search.
model_dic = {}
for col_y in train_y_after_fe.columns:
    # A separate estimator per target column, searched against that column only.
    model = CatBoostRegressor(**params)
    randomized_search_result = model.randomized_search(
        grid,
        X=train_x_after_fe,
        y=train_y_after_fe[col_y],
        cv=3,
        plot=False
    )

    model_dic[col_y] = {'model': model, 'randomized_search_result': randomized_search_result}


bestTest = 0.7432550606
bestIteration = 52

0:	loss: 0.7432551	best: 0.7432551 (0)	total: 363ms	remaining: 3.27s

bestTest = 0.7456749243
bestIteration = 78

1:	loss: 0.7456749	best: 0.7432551 (0)	total: 556ms	remaining: 2.22s

bestTest = 0.7485511858
bestIteration = 89

2:	loss: 0.7485512	best: 0.7432551 (0)	total: 1.76s	remaining: 4.12s

bestTest = 0.7385212813
bestIteration = 72

3:	loss: 0.7385213	best: 0.7385213 (3)	total: 2.31s	remaining: 3.46s

bestTest = 0.7475849396
bestIteration = 80

4:	loss: 0.7475849	best: 0.7385213 (3)	total: 3.22s	remaining: 3.22s

bestTest = 0.7409289743
bestIteration = 115

5:	loss: 0.7409290	best: 0.7385213 (3)	total: 4.37s	remaining: 2.91s

bestTest = 0.7393812839
bestIteration = 55

6:	loss: 0.7393813	best: 0.7385213 (3)	total: 5.28s	remaining: 2.26s

bestTest = 0.7381470347
bestIteration = 77

7:	loss: 0.7381470	best: 0.7381470 (7)	total: 6.34s	remaining: 1.58s

bestTest = 0.7396507095
bestIteration = 17

8:	loss: 0.7396507	best: 0.7381470 (7)	tot

In [9]:
# model_dic[col_y]['model'].predict(train_x_after_fe)

In [10]:
# model_dic[col_y]['randomized_search_result']['params']

In [11]:
# model_dic[col_y]['randomized_search_result']['cv_results']['test-RMSE-mean']

In [12]:
def rmse_score(predictions, targets):
    """Return the root-mean-square error between predictions and targets.

    The squared differences are averaged along axis 0, so two-dimensional
    inputs yield one RMSE value per column.
    """
    residuals = predictions - targets
    mean_squared_error = np.mean(residuals ** 2, axis=0)
    return np.sqrt(mean_squared_error)

In [13]:
# Predict every target with its own model and collect one column vector per target.
array_li = []
for col_y in train_y_after_fe.columns:
    target_prediction = model_dic[col_y]['model'].predict(train_x_after_fe)
    array_li.append(target_prediction.reshape(-1, 1))

# Place the per-target columns side by side, in the order of train_y_after_fe.columns.
all_res = np.concatenate(array_li, axis=1)

In [14]:
# Load the pickled target scaler; its inverse_transform is applied to predictions below.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# this assumes the artifact was written by this project's own pipeline.
with open('../Model/standard_scaler_y.pickle', 'rb') as f:
    scaler_y = pickle.load(f)

### Use ground truth to evaluate RMSE

In [15]:
# Load the pickled ground-truth targets that the RMSE below is computed against.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# this assumes the artifact was written by this project's own pipeline.
with open('../Data/meta/ground_true_train_y_after_feature_engineering.pickle', 'rb') as f:
    ground_true_train_y = pickle.load(f)

In [None]:
# Undo the target scaling on the stacked per-target predictions, then report
# one RMSE per target column against the ground-truth values.
# NOTE(review): all_res comes from predicting on train_x_after_fe, so this is
# presumably an in-sample score rather than a validation score -- confirm.
targets = ground_true_train_y
predictions = scaler_y.inverse_transform(all_res)
per_target_rmse = rmse_score(predictions, targets)
print(per_target_rmse)

In [None]:
# One value from current_time() is shared by every file name written in this run.
time = current_time()

# Pickle each per-target model to its own file under ../Model/.
for col_y in train_y_after_fe.columns:
    model_path = f'../Model/catboost_{col_y}_{time}.pickle'
    with open(model_path, 'wb') as f:
        pickle.dump(model_dic[col_y]['model'], f)

## 5. CatBoost for Multi-Output

In [None]:
# Hyper-parameter search space for the single multi-output model.
grid = dict(
    learning_rate=[0.05, 0.07, 0.09, 0.3],
    iterations=[50, 60, 70, 80, 90, 100],
    depth=[3, 4, 5, 6, 7],
    l2_leaf_reg=[1, 3, 5, 7, 9],
    grow_policy=['SymmetricTree', 'Depthwise', 'Lossguide'],
)

# Fixed CatBoost settings; MultiRMSE is used because all target columns are
# passed to one model at once.
params = dict(
    loss_function='MultiRMSE',
    random_seed=random_state,
    early_stopping_rounds=100,
    task_type='CPU',
    verbose=False,
    has_time=True,
)
model = CatBoostRegressor(**params)

# Randomized search over the grid with 3-fold CV, using the whole target frame as y.
randomized_search_result = model.randomized_search(
    grid, X=train_x_after_fe, y=train_y_after_fe, cv=3, plot=False
)

# Keep the model together with the report returned by the search.
model_muti_output_dic = dict(
    model=model,
    randomized_search_result=randomized_search_result,
)

In [None]:
# Predict all targets with the multi-output model, undo the target scaling,
# and report one RMSE per target column against the ground-truth values.
scaled_predictions = model_muti_output_dic['model'].predict(train_x_after_fe)
predictions = scaler_y.inverse_transform(scaled_predictions)
targets = ground_true_train_y
print(rmse_score(predictions, targets))

In [None]:
# from sklearn.multioutput import MultiOutputRegressor
# from sklearn.model_selection import RepeatedKFold
# from sklearn.model_selection import cross_val_score

In [None]:
# params = {
#     'loss_function': 'MultiRMSE',
#     'iterations': 150,
#     'random_seed': random_state,
#     'learning_rate': 0.5,
#     # 'use_best_model': True
# }

# cv_data = cv(
#     params = params,
#     pool = Pool(train_x, label=train_y, cat_features=categorical_features_indices),
#     fold_count=5,
#     type = 'Classical',  # The method to split the dataset into folds.
#     shuffle=True,
#     partition_random_seed=random_state,
#     plot=True,
#     stratified=False,
#     verbose=False
# )

In [None]:
# best_value = np.min(cv_data['test-MultiRMSE-mean'])
# best_iter = np.argmin(cv_data['test-MultiRMSE-mean'])

# print('Best validation MultiRMSE score: {:.4f}±{:.4f} on step {}'.format(
#     best_value,
#     cv_data['test-MultiRMSE-std'][best_iter],
#     best_iter)
# )

In [None]:
# params['iterations'] = best_iter

In [None]:
# model = CatBoostRegressor(
#     **params,
#     eval_metric='MultiRMSE',
#     verbose=False
# )

# model.fit(
#     train_x, train_y,
#     cat_features=categorical_features_indices,
#     eval_set=(val_x, val_y),
#     # logging_level='Verbose',
#     plot=False
# )

In [None]:
# model.get_evals_result()

In [None]:
# predict_y = model.predict(val_x)

In [None]:
# def rmse_score(predictions, targets):
#     return np.sqrt(np.mean((predictions-targets)**2, axis=0))

In [None]:
# rmse_score(predict_y, val_y)