In [1]:
!pip install pinard
!pip install scikeras

Collecting pinard
Successfully installed pinard



[notice] A new release of pip available: 22.3 -> 22.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


## Load data

- initialize random variables
- create a data set to train the pipelines

In [2]:
from pinard import utils
from pinard.model_selection import train_test_split_idx
from sklearn.model_selection import train_test_split
import numpy as np

# Init basic random
rd_seed = 42
np.random.seed(rd_seed)

xcal_csv = "https://raw.githubusercontent.com/GBeurier/pinard/main/examples/Xcal.csv"
ycal_csv = "https://raw.githubusercontent.com/GBeurier/pinard/main/examples/Ycal.csv"

# Create a set named data
x, y = utils.load_csv(xcal_csv, ycal_csv, x_hdr=0, y_hdr=0, autoremove_na=True)
train_index, test_index = train_test_split_idx(x, y=y, method="random", test_size=0.25, random_state=rd_seed)
X_train, y_train, X_test, y_test = x[train_index], y[train_index], x[test_index], y[test_index]
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(270, 2151) (270,) (91, 2151) (91,)


## Declare preprocessing operators

Here we declare the list of preprocessings that will be applied either in FeatureUnion or FeatureAugmentation.

In [3]:
from pinard import preprocessing as pp
from sklearn.pipeline import Pipeline

### Declare preprocessing pipeline components
preprocessing = [   ('id', pp.IdentityTransformer()),
                    ('savgol', pp.SavitzkyGolay()),
                    ('gaussian1', pp.Gaussian(order = 1, sigma = 2)),
                ]

## Simple PLS regression

Here we create a pipeline with a FeatureUnion preprocessing.
Then we train the pipeline and display results

In [4]:
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import TransformedTargetRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error, r2_score
from sklearn.cross_decomposition import PLSRegression

# Simple PLS pipeline declaration
pipeline = Pipeline([
    ('scaler', MinMaxScaler()), 
    ('preprocessing', FeatureUnion(preprocessing)), 
    ('pls', PLSRegression(n_components=10))
])

# TransformedTargetRegressor is used to apply scaling to Y within the pipeline
estimator = TransformedTargetRegressor(regressor = pipeline, transformer = MinMaxScaler())

# # Train the model
estimator.fit(X_train, y_train)

# # Compute metrics on the test set
Y_preds = estimator.predict(X_test)

print("MAE", mean_absolute_error(y_test, Y_preds))
print("MSE", mean_squared_error(y_test, Y_preds))
print("MAPE", mean_absolute_percentage_error(y_test, Y_preds))
print("R²", r2_score(y_test, Y_preds))
# print(estimator.get_params())

MAE 1.1567468539928398
MSE 2.5117852372966643
MAPE 0.0254993046121325
R² 0.7367788435855969


## Same pipeline with a XGBoost estimator

In [5]:
from xgboost import XGBRegressor

x_pipeline = Pipeline([
    ('scaler', MinMaxScaler()), 
    ('preprocessing', FeatureUnion(preprocessing)), 
    ('XGB', XGBRegressor(n_estimators=50, max_depth=10))
])

x_estimator = TransformedTargetRegressor(regressor = x_pipeline, transformer = MinMaxScaler())

x_estimator.fit(X_train, y_train)

Y_preds = x_estimator.predict(X_test)

print("MAE", mean_absolute_error(y_test, Y_preds))
print("MSE", mean_squared_error(y_test, Y_preds))
print("MAPE", mean_absolute_percentage_error(y_test, Y_preds))
print("R²", r2_score(y_test, Y_preds))

# print(x_estimator.get_params())

MAE 1.2912766
MSE 3.8444414
MAPE 0.028730346
R² 0.5971238611860624


## Same pipeline with simple KerasRegressor
*A more detailed and complete example is provided in keras_regressor.ipynb.*

In [6]:
from pinard.sklearn import FeatureAugmentation

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, SpatialDropout1D,BatchNormalization,Flatten, Dropout, Input

from scikeras.wrappers import KerasRegressor

from typing import Dict, Iterable, Any

# Create the keras model in the scikeras wrapper format (meta arg)
def keras_model(meta: Dict[str, Any]):
    input_shape = meta["X_shape_"][1:]
    model = Sequential()
    model.add(Input(shape=input_shape))
    model.add(SpatialDropout1D(0.08))
    model.add(Conv1D (filters=8, kernel_size=15, strides=5, activation='selu'))
    model.add(Dropout(0.2))
    model.add(Conv1D (filters=64, kernel_size=21, strides=3, activation='relu'))
    model.add(BatchNormalization())
    model.add(Conv1D (filters=32, kernel_size=5, strides=3, activation='elu'))
    model.add(BatchNormalization())
    model.add(Flatten())
    model.add(Dense(16, activation='sigmoid'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss = 'mean_squared_error', optimizer = 'adam')
    model.summary()
    return model

# Create the KerasRegressor
k_regressor = KerasRegressor(model = keras_model,
                            epochs=400, 
                            fit__batch_size=50,
                            fit__validation_split=0.2,
                            verbose = 2)

# Declare the pipeline with a FeatureAugmentation (2D)
k_pipeline = Pipeline([
    ('scaler', MinMaxScaler()), 
    ('preprocessing', FeatureAugmentation(preprocessing)),
    ('KerasNN', k_regressor)
])

# Train and predict same as usual
k_estimator = TransformedTargetRegressor(regressor = k_pipeline, transformer = MinMaxScaler())
 
k_estimator.fit(X_train, y_train)
# print(k_estimator.regressor_[2].history_)

Y_preds = k_estimator.predict(X_test)

print("MAE", mean_absolute_error(y_test, Y_preds))
print("MSE", mean_squared_error(y_test, Y_preds))
print("MAPE", mean_absolute_percentage_error(y_test, Y_preds))
print("R²", r2_score(y_test, Y_preds))

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 spatial_dropout1d (SpatialD  (None, 2151, 3)          0         
 ropout1D)                                                       
                                                                 
 conv1d (Conv1D)             (None, 428, 8)            368       
                                                                 
 dropout (Dropout)           (None, 428, 8)            0         
                                                                 
 conv1d_1 (Conv1D)           (None, 136, 64)           10816     
                                                                 
 batch_normalization (BatchN  (None, 136, 64)          256       
 ormalization)                                                   
                                                                 
 conv1d_2 (Conv1D)           (None, 44, 32)            1

## Save and load pipeline

There is two ways to save a pipeline using either pickle or joblib. If a KerasRegressor is used only the pickle method works.

In [7]:
import joblib
import pickle

# save xgb estimator
Y_preds = estimator.predict(X_test)
print("R²", r2_score(y_test, Y_preds))
joblib.dump(estimator, 'xgb_estimator.pkl')

# load xgb estimator
loaded_estimator = joblib.load('xgb_estimator.pkl')
Y_preds = loaded_estimator.predict(X_test)
print("R²", r2_score(y_test, Y_preds))

# save keras estimator
Y_preds = k_estimator.predict(X_test)
print("R²", r2_score(y_test, Y_preds))
with open('keras_estimator.pickle', 'wb') as handle:
    pickle.dump(k_estimator, handle, protocol=pickle.HIGHEST_PROTOCOL)
# byte_model = pickle.dumps(estimator, 'xgb_estimator.pkl')

# load keras estimator
with open('keras_estimator.pickle', 'rb') as handle:
    loaded_estimator = pickle.load(handle)
# loaded_estimator = pickle.loads(bytes_model)

Y_preds = loaded_estimator.predict(X_test)
print("R²", r2_score(y_test, Y_preds))

R² 0.7367788435855969
R² 0.7367788435855969
3/3 - 0s - 60ms/epoch - 20ms/step
R² 0.5825535320021759




INFO:tensorflow:Assets written to: C:\Users\grego\AppData\Local\Temp\tmp53x6_gz7\assets


INFO:tensorflow:Assets written to: C:\Users\grego\AppData\Local\Temp\tmp53x6_gz7\assets


3/3 - 0s - 254ms/epoch - 85ms/step
R² 0.5825535320021759


## Simple cross validation with sklearn

In [8]:
from sklearn.model_selection import cross_val_score, cross_val_predict, cross_validate

print("CV_scores", cross_val_score(estimator, x, y, cv=3))
print("-- CV predict --")
Y_preds = cross_val_predict(estimator, x, y, cv=3)
print("MAE", mean_absolute_error(y, Y_preds))
print("MSE", mean_squared_error(y, Y_preds))
print("MAPE", mean_absolute_percentage_error(y, Y_preds))
print("R²", r2_score(y, Y_preds))

print("-- Cross Validate --")
cv_results = cross_validate(estimator, x, y, cv=3, return_train_score=True, n_jobs=3)
for key in cv_results.keys():
    print(key, cv_results[key])


NameError: name 'X' is not defined

## Same with Keras Regressor
* n_jobs parameter seems to deactivate gpu in tensorflow *
The CV do not take the best model but the last model. A better way would be
to handle cv set by hand and to score the best model per fold. (see keras_regressor)

In [7]:
print("-- Cross Validate --")
cv_results = cross_validate(k_estimator, x, y, cv=3, return_train_score=True)
for key in cv_results.keys():
    print(key, cv_results[key])

-- Cross Validate --
Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 spatial_dropout1d_1 (Spatia  (None, 2151, 9)          0         
 lDropout1D)                                                     
                                                                 
 conv1d_3 (Conv1D)           (None, 428, 8)            1088      
                                                                 
 dropout_1 (Dropout)         (None, 428, 8)            0         
                                                                 
 conv1d_4 (Conv1D)           (None, 136, 64)           10816     
                                                                 
 batch_normalization_2 (Batc  (None, 136, 64)          256       
 hNormalization)                                                 
                                                                 
 conv1d_5 (Conv1D)           (Non