## Tuning of XGBoost estimator

In [10]:
from pinard import nirs_set as n_set
from sklearn.model_selection import train_test_split
import numpy as np

# Init basic random
rd_seed = 42
np.random.seed(rd_seed)

# Create a set named data
n = n_set.NIRS_Set('data')

# Load csv data and split into train and test
X, y = n.load('Xcal.csv', 'Ycal.csv', x_hdr=0, y_hdr=0, y_cols=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = rd_seed)

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

from pinard import preprocessor as pp
from sklearn.pipeline import Pipeline

### Declare preprocessing pipeline components
preprocessing = [   ('id', pp.IdentityTransformer()),
                    ('savgol', pp.SavitzkyGolay()),
                    ('gaussian1', pp.Gaussian(order = 1, sigma = 2)),
                    ('gaussian2', pp.Gaussian(order = 2, sigma = 1)),
                    ('haar', pp.Wavelet('haar')),
                    ('savgol*savgol', Pipeline([('_sg1',pp.SavitzkyGolay()),('_sg2',pp.SavitzkyGolay())])),
                    ('gaussian1*savgol', Pipeline([('_g1',pp.Gaussian(order = 1, sigma = 2)),('_sg3',pp.SavitzkyGolay())])),
                    ('gaussian2*savgol', Pipeline([('_g2',pp.Gaussian(order = 1, sigma = 2)),('_sg4',pp.SavitzkyGolay())])),
                    ('haar*savgol', Pipeline([('_haar2',pp.Wavelet('haar')),('_sg5',pp.SavitzkyGolay())]))
                ]



(288, 2151) (288,) (73, 2151) (73,)


## XGBoost model + params

In [3]:
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import TransformedTargetRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error, r2_score
from xgboost import XGBRegressor

x_pipeline = Pipeline([
    ('scaler', MinMaxScaler()), 
    ('preprocessing', FeatureUnion(preprocessing)), 
    ('XGB', XGBRegressor())
])

x_estimator = TransformedTargetRegressor(regressor = x_pipeline, transformer = MinMaxScaler())

print(x_estimator.get_params())

{'check_inverse': True, 'func': None, 'inverse_func': None, 'regressor__memory': None, 'regressor__steps': [('scaler', MinMaxScaler()), ('preprocessing', FeatureUnion(transformer_list=[('id', FunctionTransformer()),
                               ('savgol', SavitzkyGolay()),
                               ('gaussian1', Gaussian(sigma=2)),
                               ('gaussian2', Gaussian(order=2)),
                               ('haar', Wavelet()),
                               ('savgol*savgol',
                                Pipeline(steps=[('_sg1', SavitzkyGolay()),
                                                ('_sg2', SavitzkyGolay())])),
                               ('gaussian1*savgol',
                                Pipeline(steps=[('_g1', Gaussian(sigma=2)),
                                                ('_sg3', SavitzkyGolay())])),
                               ('gaussian2*savgol',
                                Pipeline(steps=[('_g2', Gaussian(sigma=2)),
      

## GridSearch on param grid

In [7]:
from sklearn.model_selection import GridSearchCV

# Parameters of pipelines can be set using ‘__’ separated parameter names:
param_grid = {
    "regressor__XGB__max_depth": [5, 6, 7],
    "regressor__XGB__eta": [0.2, 0.3],
    "regressor__XGB__gamma": [0, 0.3],
}

search = GridSearchCV(x_estimator, 
                      param_grid, 
                      cv=3, 
                      n_jobs=2, 
                      verbose=3, 
                      scoring='neg_mean_squared_error')

search.fit(X_train, y_train)
print(search.best_params_)

Y_preds = search.predict(X_test)
print("MAE", mean_absolute_error(y_test, Y_preds))
print("MSE", mean_squared_error(y_test, Y_preds))
print("MAPE", mean_absolute_percentage_error(y_test, Y_preds))
print("R²", r2_score(y_test, Y_preds))

Fitting 3 folds for each of 12 candidates, totalling 36 fits
{'regressor__XGB__eta': 0.3, 'regressor__XGB__gamma': 0, 'regressor__XGB__max_depth': 5}
MAE 1.3079682178359107
MSE 3.4625930671667016
MAPE 0.028900282120511277
R² 0.6718040922482609


## Same with KerasRegressor

In [19]:
from pinard.nirs_pipelines import FeatureAugmentation

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, SpatialDropout1D,BatchNormalization,Flatten, Dropout, Input

from scikeras.wrappers import KerasRegressor

from typing import Dict, Iterable, Any

# Create the keras model in the scikeras wrapper format (meta arg)
def keras_model(meta, sp_dropout, filter_1, filter_2, filter_3):
    print("META", meta)
    input_shape = meta["X_shape_"][1:]
    model = Sequential()
    model.add(Input(shape=input_shape))
    model.add(SpatialDropout1D(sp_dropout))
    model.add(Conv1D (filters=filter_1, kernel_size=15, strides=5, activation='selu'))
    model.add(Dropout(0.2))
    model.add(Conv1D (filters=filter_2, kernel_size=21, strides=3, activation='relu'))
    model.add(BatchNormalization())
    model.add(Conv1D (filters=filter_3, kernel_size=5, strides=3, activation='elu'))
    model.add(BatchNormalization())
    model.add(Flatten())
    model.add(Dense(16, activation='sigmoid'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss = 'mean_squared_error', optimizer = 'adam')
    # model.summary()
    return model

# Create the KerasRegressor
k_regressor = KerasRegressor(model = keras_model,
                            epochs=400, 
                            sp_dropout=0.08,
                            filter_1=8,
                            filter_2=64,
                            filter_3=32,
                            fit__batch_size=50,
                            fit__validation_split=0.2,
                            verbose = 0)

# Declare the pipeline with a FeatureAugmentation (2D)
k_pipeline = Pipeline([
    ('scaler', MinMaxScaler()), 
    ('preprocessing', FeatureAugmentation(preprocessing)),
    ('KerasNN', k_regressor)
])

# Train and predict same as usual
k_estimator = TransformedTargetRegressor(regressor = k_pipeline, transformer = MinMaxScaler())
 
# Parameters of pipelines can be set using ‘__’ separated parameter names:
param_grid = {
    "regressor__KerasNN__model__sp_dropout": [0.05,0.25],
    "regressor__KerasNN__model__filter_1": [8,32,128],
    "regressor__KerasNN__model__filter_2": [8,32,128],
    "regressor__KerasNN__model__filter_3": [8,32,128]
}

search = GridSearchCV(k_estimator, 
                      param_grid, 
                      cv=3, 
                      verbose=2, 
                      scoring='neg_mean_squared_error')

search.fit(X_train, y_train)
print(search.best_params_)

Y_preds = search.predict(X_test)

print("MAE", mean_absolute_error(y_test, Y_preds))
print("MSE", mean_squared_error(y_test, Y_preds))
print("MAPE", mean_absolute_percentage_error(y_test, Y_preds))
print("R²", r2_score(y_test, Y_preds))

Fitting 3 folds for each of 54 candidates, totalling 162 fits
META {'target_type_': 'continuous', 'y_dtype_': dtype('float64'), 'y_ndim_': 1, 'X_dtype_': dtype('float32'), 'X_shape_': (192, 2151, 9), 'n_features_in_': 2151, 'target_encoder_': RegressorTargetEncoder(), 'n_outputs_': 1, 'n_outputs_expected_': 1, 'feature_encoder_': FunctionTransformer()}
TRANSFORM
ahahah (96, 2151, 9)
[CV] END regressor__KerasNN__model__filter_1=8, regressor__KerasNN__model__filter_2=8, regressor__KerasNN__model__filter_3=8, regressor__KerasNN__model__sp_dropout=0.05; total time=  54.1s
META {'target_type_': 'continuous', 'y_dtype_': dtype('float64'), 'y_ndim_': 1, 'X_dtype_': dtype('float32'), 'X_shape_': (192, 2151, 9), 'n_features_in_': 2151, 'target_encoder_': RegressorTargetEncoder(), 'n_outputs_': 1, 'n_outputs_expected_': 1, 'feature_encoder_': FunctionTransformer()}
TRANSFORM
ahahah (96, 2151, 9)
[CV] END regressor__KerasNN__model__filter_1=8, regressor__KerasNN__model__filter_2=8, regressor__Ker