In [53]:
%load_ext autoreload
%autoreload 2

import sys
import os
sys.path.append(os.path.abspath("") + "/../src/pynirs")

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.compose import TransformedTargetRegressor
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold

import tensorflow
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, SpatialDropout1D,BatchNormalization,Flatten

from nirs_set import NIRS_Set

from pipeline_y import Pipeline as Pipeline_y
from pipeline_y import Augmentation
import splitter as sp
import augmenter as aug
from pipeline_tools import FeatureUnionNewAxis

from sklearn.metrics import *
import numpy as np

# Single source of truth for reproducibility: the same value seeds NumPy's
# global RNG and TensorFlow's global RNG, so changing it here changes both.
SEED = 12345
np.random.seed(SEED)
tensorflow.random.set_seed(SEED)
    

#### Data augmenters

# augmenter1: a single augmentation step wrapped in the project's own
# pipeline class (imported above as Pipeline_y).
augmenter1 = Pipeline_y(
    [
        ('test2', aug.Augmenter(1)),
    ]
)

# augmenter2: several augmentation branches grouped in an Augmentation.
# Per the original note, the argument-less Augmenter() "uses all functions".
# The 'rt2' branch chains two augmenters through sklearn's Pipeline; every
# other branch is a single `aug` object.
# NOTE(review): the recorded output of this cell ends in a ValueError
# (concatenation shape mismatch) once augmenter2 is applied. 'rt2' is the
# only branch built with sklearn's Pipeline instead of Pipeline_y; worth
# checking whether that is the cause. TODO confirm.
augmenter2 = Augmentation(
    [
        # ('test', aug.Augmenter(1)),
        ('test2', aug.Augmenter()),
        (
            'rt2',
            Pipeline(
                [
                    ('rt', aug.Rotate_Translate(3)),
                    ('spl_add', aug.Random_Spline_Addition(3)),
                ]
            ),
        ),
        ('spl_add', aug.Random_Spline_Addition(2)),
        ('y_shift', aug.Random_Y_Shift(2)),
        ('spl_shift', aug.Random_X_Spline_Shift(2)),
    ]
)


def _show_shapes(features, targets):
    """Print the shapes of a feature/target pair on a single line."""
    print(features.shape, targets.shape)


# Load the calibration features and targets from the 'data' set.
n = NIRS_Set('data')
X, y = n.load('Xcal.csv', 'Ycal.csv', x_hdr=0, y_hdr=0)
_show_shapes(X, y)

# First augmenter: in the recorded run the sample count goes from 361 to 722.
X1, y1 = augmenter1.transform(X, y)
_show_shapes(X1, y1)

# Re-print the input shapes before the second augmenter to show that the
# first transform left X and y themselves unchanged in shape.
_show_shapes(X, y)
# NOTE(review): in the recorded run the ValueError shown in the cell output
# is raised after the print above, i.e. apparently by this transform call.
X2, y2 = augmenter2.transform(X, y)
_show_shapes(X2, y2)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
(361, 2151) (361, 1)
(722, 2151) (722, 1)
(361, 2151) (361, 1)


ValueError: all the input array dimensions for the concatenation axis must match exactly, but along dimension 1, the array at index 0 has size 1 and the array at index 1 has size 40

In [52]:
import preprocessor as pp
# from sklearn.preprocessing import MinMaxScaler

#### Pipeline transformers
#### MODEL creation
def create_model(optimizer = 'adam',
                 kernel_initializer = 'glorot_uniform',
                 dropout = 0.3):
    """Build and compile the 1D-convolutional regression network.

    The stack is: Conv1D -> SpatialDropout1D -> Conv1D -> BatchNormalization
    -> SpatialDropout1D -> Flatten -> Dense(32) -> Dense(1). No input shape
    is declared in this function.

    Parameters
    ----------
    optimizer : str or optimizer instance
        Forwarded to ``model.compile``.
    kernel_initializer : str or initializer instance
        Weight initializer applied to every Conv1D and Dense layer.
        Previously this argument was accepted but ignored.
    dropout : float
        Rate of the first SpatialDropout1D layer. Previously this argument
        was accepted but ignored and the rate was hard-coded to 0.3 (which
        is still the default). The second dropout layer keeps its fixed
        rate of 0.15.

    Returns
    -------
    The compiled ``Sequential`` model (loss: mean squared error; metrics:
    'mae' and 'mse').
    """
    model = Sequential()

    # First convolutional stage; its dropout rate is the tunable one.
    model.add(Conv1D(filters=64, kernel_size=11, strides=3, activation='selu',
                     kernel_initializer=kernel_initializer))
    model.add(SpatialDropout1D(dropout))

    # Second convolutional stage with batch normalisation and a fixed,
    # lighter dropout.
    model.add(Conv1D(filters=64, kernel_size=7, strides=1, activation='relu',
                     kernel_initializer=kernel_initializer))
    model.add(BatchNormalization())
    model.add(SpatialDropout1D(0.15))
    # model.add(Conv1D (filters=64, kernel_size=3, strides=1, activation='relu'))
    # model.add(SpatialDropout1D(0.15))

    # Regression head: flatten, one hidden dense layer, single linear output.
    model.add(Flatten())
    model.add(Dense(32, activation='sigmoid',
                    kernel_initializer=kernel_initializer))
    model.add(Dense(1, activation='linear',
                    kernel_initializer=kernel_initializer))

    model.compile(loss = 'mean_squared_error', optimizer = optimizer, metrics = ['mae', 'mse'])

    return model


def _make_filter_steps():
    """Return a fresh list of (name, transformer) preprocessing branches.

    Every call builds new transformer instances, so the two unions below do
    not share any transformer objects. Step names are kept exactly as they
    were because hyper-parameter grids address them by name
    (e.g. 'regressor__filters__haar__wavelet').

    The 'gaussian2*savgol' branch now uses the same Gaussian settings as the
    'gaussian2' branch (order=2, sigma=1). It previously used order=1,
    sigma=2, which made it an exact duplicate of 'gaussian1*savgol'.
    """
    return [
        # Single-transformer branches.
        ('id', pp.IdentityTransformer()),
        ('savgol', pp.SavitzkyGolay()),
        ('derivate', pp.Derivate()),
        ('gaussian1', pp.Gaussian(order = 1, sigma = 2)),
        ('gaussian2', pp.Gaussian(order = 2, sigma = 1)),
        ('haar', pp.Wavelet('haar')),
        # Composite branches: a first transformer followed by Savitzky-Golay.
        ('savgol*savgol', Pipeline([('_sg1', pp.SavitzkyGolay()), ('_sg2', pp.SavitzkyGolay())])),
        ('gaussian1*savgol', Pipeline([('_g1', pp.Gaussian(order = 1, sigma = 2)), ('_sg3', pp.SavitzkyGolay())])),
        ('gaussian2*savgol', Pipeline([('_g2', pp.Gaussian(order = 2, sigma = 1)), ('_sg4', pp.SavitzkyGolay())])),
        ('haar*savgol', Pipeline([('_haar2', pp.Wavelet('haar')), ('_sg5', pp.SavitzkyGolay())])),
    ]


## After FeatureUnionNewAxis standard sklearn preprocessors become incompatible
# Same branches combined with the project's FeatureUnionNewAxis (used by the
# Keras pipeline).
filters = FeatureUnionNewAxis(_make_filter_steps())

# Same branches combined with sklearn's FeatureUnion (used by the pipeline
# that ends in a regular regressor).
filters2 = FeatureUnion(_make_filter_steps())
                

# Two interchangeable regression pipelines. Previously both were assigned to
# the same name `pipeline`, so the first one was built and then immediately
# discarded; they now have distinct names and the active one is picked
# explicitly below.

# The pipeline with the keras regressor. Can be replaced by any sklearn
# regressor if filters preprocessing is done in 1D (FeatureUnion instead of
# FeatureUnionNewAxis).
pipeline_nn = Pipeline([
    ('scaler', MinMaxScaler()),
    ('filters', filters),
    ('nn', KerasRegressor(build_fn = create_model, epochs=10, batch_size=500, verbose = 0))
])

# The pipeline with the PLS regressor, fed by the FeatureUnion-based filters.
# NOTE(review): `PLS` is not imported in the cells visible here; confirm
# where it comes from so this cell runs on a fresh kernel.
pipeline_pls = Pipeline([
    ('scaler', MinMaxScaler()),
    ('filters', filters2),
    ('pls', PLS())
])

# Active pipeline: same choice as before (the PLS variant was the one that
# ended up bound to `pipeline`). Switch to `pipeline_nn` to use the network.
pipeline = pipeline_pls

# ## TransformedTargetRegressor enables the transformation of Y before and after
estimator = TransformedTargetRegressor(regressor = pipeline, transformer = MinMaxScaler())


#########

# Load the calibration features and targets from the 'data' set.
n = NIRS_Set('data')
X, y = n.load('Xcal.csv', 'Ycal.csv', x_hdr=0, y_hdr=0)
print(X.shape, y.shape)


### EXAMPLE 1 > simple fit
# Hold out a test split using the project's splitter helper.
X_train, X_test, y_train, y_test = sp.sk_train_test_split(X, y)
# AUGMENTATION HERE

# dedicated operators parameters are written opname__var = value
# (e.g. nn__validation_data = (X_test, y_test) could be passed to fit)
estimator.fit(X_train, y_train)
Y_preds = estimator.predict(X_test)

# Show the first ten reference values next to their predictions.
truth_vs_pred = np.hstack((y_test, Y_preds))
print(truth_vs_pred[:10])

# Error metrics on the held-out split, computed and printed one at a time.
mae = mean_absolute_error(y_test, Y_preds)
print("MAE", mae)
mse = mean_squared_error(y_test, Y_preds)
print("MSE", mse)
mape = mean_absolute_percentage_error(y_test, Y_preds)
print("MAPE", mape)



### EXAMPLE 2 > Cross Validation no augmentation
## kf = sp.KFold(method='kennard_stone', n_splits=5)
# kf = KFold(n_splits=2)
# r = cross_validate(estimator, X, y, return_train_score = True, return_estimator = True, cv = kf)
# print(r['test_score'], r['train_score'])
# for estimator in r['estimator']:
#     Y_preds = estimator.predict(X)
#     print("MAE", mean_absolute_error(y, Y_preds))
#     print("MSE", mean_squared_error(y, Y_preds))
#     print("MAPE", mean_absolute_percentage_error(y, Y_preds))
    

# # ### EXAMPLE 3 > Hyperparams
# print(estimator.get_params().keys())
# param_grid = {
#     'regressor__filters__haar__wavelet':['haar', 'bior1.3'], 
#     # 'nn__optimizer':['rmsprop', 'adam', 'adagrad']
# }

# grid = GridSearchCV(estimator, cv = 3, param_grid = param_grid)
# X_train, X_test, y_train, y_test = sp.sk_train_test_split(X, y)
# grid.fit(X_train, y_train)
# print("Best: %f using %s" % (grid.best_score_, grid.best_params_))
# means = grid.cv_results_['mean_test_score']
# stds = grid.cv_results_['std_test_score']
# params = grid.cv_results_['params']
# for mean, stdev, param in zip(means, stds, params):
#     print("%f (%f) with: %r" % (mean, stdev, param))

(361, 2151) (361, 1)


  return array(a, dtype, copy=False, order=order)


AxisError: axis2: axis 1 is out of bounds for array of dimension 1