In [1]:
%load_ext autoreload
%autoreload 2

import sys
import os
sys.path.append(os.path.abspath("") + "/../src/pynirs")

import numpy as np

import tensorflow
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, SpatialDropout1D,BatchNormalization,Flatten


from nirs_set import NIRS_Set
import preprocessor as pp
import augmenter as aug


from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import MinMaxScaler
from sklearn.cross_decomposition import PLSRegression
from sklearn.compose import TransformedTargetRegressor
from sklearn.model_selection import train_test_split

from sklearn.metrics import *


### init random seeds
# A single seed drives both NumPy's and TensorFlow's global RNGs; `rd_seed` is
# also passed as `random_state` to train_test_split in later cells.
rd_seed = 13246
np.random.seed(rd_seed)
tensorflow.random.set_seed(rd_seed)

### Load data
# X / y are read from Xcal.csv / Ycal.csv through NIRS_Set('data'); the shapes
# are printed as a quick sanity check.
n = NIRS_Set('data')
X, y = n.load('Xcal.csv', 'Ycal.csv', x_hdr = 0, y_hdr = 0)
print(X.shape, y.shape)

### Declare pipeline components
# List of (name, transformer) pairs. Later cells hand this list to
# FeatureAugmentation(...) and FeatureUnion(...), so each entry is one
# preprocessing branch applied to the scaled input.
# The SavitzkyGolay-based branches are commented out.
# NOTE(review): the branch labelled 'gaussian1*savgol' (step '_sg3') chains
# pp.Derivate(), not pp.SavitzkyGolay() - confirm the label is still intended.
preprocessing = [   ('id', pp.IdentityTransformer()),
                    # ('savgol', pp.SavitzkyGolay()),
                    ('derivate', pp.Derivate()), 
                    ('gaussian1', pp.Gaussian(order = 1, sigma = 2)),
                    ('gaussian2', pp.Gaussian(order = 2, sigma = 1)),
                    ('haar', pp.Wavelet('haar')),
                    # ('savgol*savgol', Pipeline([('_sg1',pp.SavitzkyGolay()),('_sg2',pp.SavitzkyGolay())])),
                    ('gaussian1*savgol', Pipeline([('_g1',pp.Gaussian(order = 1, sigma = 2)),('_sg3',pp.Derivate())])),
                    # ('gaussian2*savgol', Pipeline([('_g2',pp.Gaussian(order = 1, sigma = 2)),('_sg4',pp.SavitzkyGolay())])),
                    # ('haar*savgol', Pipeline([('_haar2',pp.Wavelet('haar')),('_sg5',pp.SavitzkyGolay())]))
                ]

(361, 2151) (361, 1, 1)


In [2]:
from xgboost import XGBRegressor
from datetime import datetime
from nirs_pipelines import FeatureAugmentation

def keras_model(optimizer = 'adam'):
    """Build and compile the 1D convolutional regressor.

    Used as the ``build_fn`` of ``KerasRegressor``. No input shape is declared
    here.

    Parameters
    ----------
    optimizer : str or optimizer instance
        Forwarded unchanged to ``compile``.

    Returns
    -------
    The compiled ``Sequential`` model (MSE loss; MAE and MSE tracked as metrics).
    """
    # Layers in application order: dropout, three Conv1D + BatchNormalization
    # stages, then a small dense head ending in a single sigmoid unit.
    stack = [
        SpatialDropout1D(0.08),
        Conv1D(filters=8, kernel_size=15, strides=5, activation='selu'),
        BatchNormalization(),
        Conv1D(filters=64, kernel_size=21, strides=3, activation='relu'),
        BatchNormalization(),
        Conv1D(filters=32, kernel_size=5, strides=3, activation='elu'),
        BatchNormalization(),
        Flatten(),
        Dense(16, activation='sigmoid'),
        Dense(1, activation='sigmoid'),
    ]
    network = Sequential(stack)
    network.compile(
        loss='mean_squared_error',
        optimizer=optimizer,
        metrics=['mae', 'mse'],
    )
    return network

names = ['ArabifCSR_C', 'ArabifCSR_R', 'ArabifCSR_S', 'Arabifdelta13C','Arabifdelta15N', 'ArabifLDMC_mg_g', 'ArabifLNC_perc', 'ArabifSLA_mm2_mg']


def _report_scores(name, tag, y_true, y_pred, path='scores.txt'):
    """Append the scores of one run to ``path`` and echo them to stdout.

    Parameters
    ----------
    name : str
        Dataset prefix, written on the timestamp line.
    tag : str
        Model label written on the starred header line (e.g. ``'NN'``).
    y_true, y_pred : array-like
        Reference and predicted targets handed to the sklearn metric functions.
    path : str
        File opened in append mode; defaults to ``'scores.txt'``.
    """
    # Each metric is evaluated once and reused for the file and the console.
    # Entries are (label in file, label on console, value).
    scores = [
        ('MAE', 'MAE', mean_absolute_error(y_true, y_pred)),
        ('MSE', 'MSE', mean_squared_error(y_true, y_pred)),
        ('MAPE', 'MAPE', mean_absolute_percentage_error(y_true, y_pred)),
        ('R2', 'R²', r2_score(y_true, y_pred)),
    ]

    # The context manager closes the file even if a write fails.
    with open(path, 'a') as file_object:
        file_object.write('*'*20 + ' ' + tag + ' \n')
        file_object.write(name + ' ' + datetime.now().strftime("%m/%d/%Y, %H:%M:%S")+ '\n')
        for file_label, _, value in scores:
            file_object.write(file_label + ' ' + str(value) + '\n')

    for _, console_label, value in scores:
        print(console_label, value)


# --- Keras CNN on every CEFE trait -------------------------------------------
# For each trait: load the calibration / validation CSV pairs, fit the pipeline
# (target scaled by TransformedTargetRegressor), save predictions, log scores.
for name in names:
    n = NIRS_Set('data')
    X_train, y_train = n.load('CEFE/' + name + '_Xcal.csv', 'CEFE/' + name + '_Ycal.csv', y_cols = 1)
    X_test, y_test = n.load('CEFE/' + name + '_Xval.csv', 'CEFE/' + name + '_Yval.csv', y_cols = 1)
    print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

    pipeline = Pipeline([
        ('scaler', MinMaxScaler()),
        ('preprocessing', FeatureAugmentation(preprocessing)),
        ('KerasNN',  KerasRegressor(build_fn = keras_model, epochs=1000, batch_size=500, verbose = 0))
    ])

    estimator = TransformedTargetRegressor(regressor = pipeline, transformer = MinMaxScaler())

    estimator.fit(X_train, y_train)
    print(X_train.shape, y_train.shape)
    Y_preds = estimator.predict(X_test)
    np.savetxt(name + "nn_y_pred.csv", Y_preds, delimiter=";")

    _report_scores(name, 'NN', y_test, Y_preds)


# --- XGBoost on every CEFE trait ---------------------------------------------
# Same protocol, with the preprocessing branches combined by FeatureUnion.
# `xgb` and `estimator` stay bound to the last trait's objects after the loop.
for name in names:
    n = NIRS_Set('data')
    X_train, y_train = n.load('CEFE/' + name + '_Xcal.csv', 'CEFE/' + name + '_Ycal.csv', y_cols = 1)
    X_test, y_test = n.load('CEFE/' + name + '_Xval.csv', 'CEFE/' + name + '_Yval.csv', y_cols = 1)
    print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

    xgb =  XGBRegressor()
    pipeline = Pipeline([
        ('scaler', MinMaxScaler()),
        ('preprocessing', FeatureUnion(preprocessing)),
        # This step was previously labelled 'KerasNN' (copy-paste from the loop above).
        ('XGB', xgb)
    ])

    estimator = TransformedTargetRegressor(regressor = pipeline, transformer = MinMaxScaler())

    estimator.fit(X_train, y_train)
    print(X_train.shape, y_train.shape)
    Y_preds = estimator.predict(X_test)
    np.savetxt(name + "xgb_y_pred.csv", Y_preds, delimiter=";")

    _report_scores(name, 'XGB', y_test, Y_preds)

(3045, 2150) (3045,) (1015, 2150) (1015,)


KeyboardInterrupt: 

In [None]:
import shap

from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.inspection import permutation_importance
from matplotlib import pyplot as plt
import seaborn as sns # for correlation heatmap

# perm_importance = permutation_importance(xgb, X_test, y_test)
# sorted_idx = perm_importance.importances_mean.argsort()
# plt.barh(np.arrange(0,len(X_test[0],1))[sorted_idx], perm_importance.importances_mean[sorted_idx])
# plt.xlabel("Permutation Importance")
# `estimator` is a TransformedTargetRegressor wrapping a Pipeline.
# shap.TreeExplainer needs the fitted tree model itself, and the features in the
# form that model was trained on (after scaling and preprocessing).
# NOTE(review): assumes `estimator` is the fitted XGBoost estimator left over
# from the benchmark cell - confirm before running.
fitted_pipeline = estimator.regressor_             # fitted clone of the Pipeline
tree_model = fitted_pipeline[-1]                   # final step: the tree model
X_test_features = fitted_pipeline[:-1].transform(X_test)

# The pipeline was fit on the MinMax-scaled target, so SHAP values are
# expressed in that scaled target space.
explainer = shap.TreeExplainer(tree_model)
shap_values = explainer.shap_values(X_test_features)
shap.summary_plot(shap_values, X_test_features, plot_type="bar")


In [None]:
from nirs_pipelines import FeatureAugmentation

# ##################################################################################################
### Example 2 - simple NN with 2D preprocessing

print("*"*5, "EXAMPLE 2 - simple NN with 2D preprocessing", "*"*5)

# Scale the inputs, expand them with the preprocessing branches, then regress
# with the Keras CNN.
pipeline = Pipeline(steps=[
    ('scaler', MinMaxScaler()),
    ('preprocessing', FeatureAugmentation(preprocessing)),
    ('KerasNN', KerasRegressor(build_fn=keras_model, epochs=1000, batch_size=500, verbose=0)),
])

# The target is MinMax-scaled before fitting and mapped back at predict time.
estimator = TransformedTargetRegressor(
    regressor=pipeline,
    transformer=MinMaxScaler(),
)

# Hold-out split, seeded with the notebook-wide seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rd_seed)
estimator.fit(X_train, y_train)
print(X_train.shape, y_train.shape)
Y_preds = estimator.predict(X_test)

# Report each hold-out metric in turn.
for _label, _metric in (
    ("MAE", mean_absolute_error),
    ("MSE", mean_squared_error),
    ("MAPE", mean_absolute_percentage_error),
    ("R²", r2_score),
):
    print(_label, _metric(y_test, Y_preds))



In [None]:
##################################################################################################
### Example 3 - Augmentation + estimator ex2
from nirs_pipelines import SampleAugmentation


print("\n","*"*5, "EXAMPLE 3 - Augmentation + estimator ex2", "*"*5)

# Same seeded hold-out split as Example 2.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = rd_seed)

# NOTE(review): `augmentations` is not defined in any cell visible in this
# notebook - it must be declared before this cell can run on a fresh kernel.
# The augmenter gets its own name: binding it to `aug` would overwrite the
# `augmenter` module alias (`import augmenter as aug`) used by other cells.
sample_augmenter = SampleAugmentation(augmentations)
X_train, y_train = sample_augmenter.transform(X_train, y_train)
print(X_train.shape, y_train.shape)

# Reuses the `estimator` built in the Example 2 cell; only the training set is augmented.
estimator.fit(X_train, y_train)
print(X_train.shape, y_train.shape)
Y_preds = estimator.predict(X_test)

print("MAE", mean_absolute_error(y_test, Y_preds))
print("MSE", mean_squared_error(y_test, Y_preds))
print("MAPE", mean_absolute_percentage_error(y_test, Y_preds))
print("R²", r2_score(y_test, Y_preds))

In [None]:
##################################################################################################
### Example 4 - PPB
print("\n","*"*5, "EXAMPLE 4 - PPB", "*"*5)

# Train / test sets each come from a single CSV (target taken with y_cols=0),
# loaded through the NIRS_Set instance `n` created in an earlier cell.
X_train, y_train = n.load('PPB_train.csv', y_cols=0)
X_test, y_test = n.load('PPB_test.csv', y_cols=0)

# Refit the estimator defined in an earlier cell on the PPB data.
estimator.fit(X_train, y_train)
Y_preds = estimator.predict(X_test)
print(Y_preds)

# Report each test metric in turn.
for _label, _metric in (
    ("MAE", mean_absolute_error),
    ("MSE", mean_squared_error),
    ("MAPE", mean_absolute_percentage_error),
    ("R²", r2_score),
):
    print(_label, _metric(y_test, Y_preds))

In [None]:
%load_ext autoreload
%autoreload 2

import sys
import os
sys.path.append(os.path.abspath("") + "/../src/pynirs")

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.compose import TransformedTargetRegressor
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold

import tensorflow
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, SpatialDropout1D,BatchNormalization,Flatten

from nirs_set import NIRS_Set

from pipeline_y import Pipeline as Pipeline_y
from pipeline_y import Augmentation
import splitter as sp
import augmenter as aug
from pipeline_tools import FeatureUnionNewAxis

from sklearn.metrics import *
import numpy as np

# Seed NumPy's and TensorFlow's global RNGs before any augmenter is used.
np.random.seed(12345)
tensorflow.random.set_seed(12345)
    

#### Data augmenters

# augmentation definition (default uses all functions)
# augmenter1: a Pipeline_y with a single Augmenter step.
# NOTE(review): the meaning of the positional argument `1` is not visible here.
augmenter1 = Pipeline_y ([
    ('test2', aug.Augmenter(1)),
])

# augmenter2: an Augmentation built from several named augmenters; the
# commented-out entries are disabled alternatives.
augmenter2 = Augmentation ([
    # ('test', aug.Augmenter(1)),
    ('test2', aug.Augmenter()),
    # ('rt2', Pipeline([('rt', aug.Rotate_Translate()), ('spl_add', aug.Random_Spline_Addition(3))]),
    ('spl_add', aug.Random_Spline_Addition(2)),
    ('y_shift', aug.Random_Y_Shift(2)),
    ('spl_shift', aug.Random_X_Spline_Shift(2))
    ])


# Load X / y from Xcal.csv / Ycal.csv and print their shapes as a reference.
n = NIRS_Set('data')
X, y = n.load('Xcal.csv', 'Ycal.csv', x_hdr = 0, y_hdr = 0)
print(X.shape, y.shape)

# Apply each augmenter to (X, y); the printed shapes show the effect on sample count.
X1, y1 = augmenter1.transform(X, y)
print(X1.shape, y1.shape)

# Shapes of the original arrays are printed again before the second augmenter runs.
print(X.shape, y.shape)
X2, y2 = augmenter2.transform(X, y)
print(X2.shape, y2.shape)

In [None]:
import preprocessor as pp
# from sklearn.preprocessing import MinMaxScaler

#### Pipeline transformers
#### MODEL creation
def create_model(optimizer = 'adam', 
                 kernel_initializer = 'glorot_uniform', 
                 dropout = 0.3):
    """Build and compile the 1D-CNN regressor used as ``KerasRegressor`` build_fn.

    Parameters
    ----------
    optimizer : str or optimizer instance
        Forwarded to ``model.compile``.
    kernel_initializer : str or initializer instance
        Weight initializer applied to every Conv1D and Dense layer. The default
        ``'glorot_uniform'`` is also Keras' own default for these layers, so
        calling with defaults builds the same network as before this parameter
        was wired in.
    dropout : float
        Rate of the first ``SpatialDropout1D`` layer (it was hard-coded to 0.3,
        the default here). The second dropout layer keeps its fixed 0.15 rate.

    Returns
    -------
    The compiled ``Sequential`` model (MSE loss; MAE and MSE tracked as metrics).
    """
    model = Sequential()
    model.add(Conv1D(filters=64, kernel_size=11, strides=3, activation='selu',
                     kernel_initializer=kernel_initializer))
    model.add(SpatialDropout1D(dropout))
    model.add(Conv1D(filters=64, kernel_size=7, strides=1, activation='relu',
                     kernel_initializer=kernel_initializer))
    model.add(BatchNormalization())
    model.add(SpatialDropout1D(0.15))
    # Optional third convolution stage, currently disabled:
    # model.add(Conv1D (filters=64, kernel_size=3, strides=1, activation='relu'))
    # model.add(SpatialDropout1D(0.15))
    model.add(Flatten())
    model.add(Dense(32, activation='sigmoid', kernel_initializer=kernel_initializer))
    # Linear output unit: the network predicts an unbounded real value.
    model.add(Dense(1, activation='linear', kernel_initializer=kernel_initializer))

    model.compile(loss = 'mean_squared_error', optimizer = optimizer, metrics = ['mae', 'mse'])

    return model


## After FeatureUnionNewAxis standard sklearn preprocessors become incompatible
# Each (name, transformer) entry is one preprocessing branch of the
# FeatureUnionNewAxis; the '*' names are two-step chains.
filters = FeatureUnionNewAxis([
    ('id', pp.IdentityTransformer()),
    ('savgol', pp.SavitzkyGolay()),
    ('derivate', pp.Derivate()), 
    ('gaussian1', pp.Gaussian(order = 1, sigma = 2)),
    ('gaussian2', pp.Gaussian(order = 2, sigma = 1)),
    ('haar', pp.Wavelet('haar')),
    ('savgol*savgol', Pipeline([('_sg1',pp.SavitzkyGolay()),('_sg2',pp.SavitzkyGolay())])),
    ('gaussian1*savgol', Pipeline([('_g1',pp.Gaussian(order = 1, sigma = 2)),('_sg3',pp.SavitzkyGolay())])),
    # Uses the 'gaussian2' settings (order = 2, sigma = 1). It previously repeated
    # the 'gaussian1' settings, making this branch an exact duplicate of the one above.
    ('gaussian2*savgol', Pipeline([('_g2',pp.Gaussian(order = 2, sigma = 1)),('_sg4',pp.SavitzkyGolay())])),
    ('haar*savgol', Pipeline([('_haar2',pp.Wavelet('haar')),('_sg5',pp.SavitzkyGolay())]))
    ])

# Reduced preprocessing set built with sklearn's FeatureUnion (the "1D" variant,
# as opposed to FeatureUnionNewAxis). Only the identity and derivative branches
# are active; the commented-out entries are the remaining candidate branches.
filters2 = FeatureUnion([
    ('id', pp.IdentityTransformer()),
    # ('savgol', pp.SavitzkyGolay()),
    ('derivate', pp.Derivate()), 
    # ('gaussian1', pp.Gaussian(order = 1, sigma = 2)),
    # ('gaussian2', pp.Gaussian(order = 2, sigma = 1)),
    # ('haar', pp.Wavelet('haar')),
    # ('savgol*savgol', Pipeline([('_sg1',pp.SavitzkyGolay()),('_sg2',pp.SavitzkyGolay())])),
    # ('gaussian1*savgol', Pipeline([('_g1',pp.Gaussian(order = 1, sigma = 2)),('_sg3',pp.SavitzkyGolay())])),
    # ('gaussian2*savgol', Pipeline([('_g2',pp.Gaussian(order = 1, sigma = 2)),('_sg4',pp.SavitzkyGolay())])),
    # ('haar*savgol', Pipeline([('_haar2',pp.Wavelet('haar')),('_sg5',pp.SavitzkyGolay())]))
    ])
                

# `PLS` was never defined or imported in this notebook; the PLS regressor
# imported in the first cell is sklearn's PLSRegression, so that class is used.
from sklearn.cross_decomposition import PLSRegression

# Two interchangeable pipelines:
#  - nn_pipeline : Keras regressor fed by the FeatureUnionNewAxis filters.
#  - pls_pipeline: sklearn regressor fed by the 1D FeatureUnion filters. Any
#    sklearn regressor can be used when preprocessing is done in 1D
#    (FeatureUnion instead of FeatureUnionNewAxis).
nn_pipeline = Pipeline([
    ('scaler', MinMaxScaler()), 
    ('filters', filters), 
    ('nn', KerasRegressor(build_fn = create_model, epochs=10, batch_size=500, verbose = 0))
])

pls_pipeline = Pipeline([
    ('scaler', MinMaxScaler()), 
    ('filters', filters2), 
    ('pls', PLSRegression())
])

# Active pipeline. The PLS variant is the one that was effectively in use (it
# overwrote the Keras one); switch to `nn_pipeline` to train the CNN instead.
pipeline = pls_pipeline

# ## TransformedTargetRegressor enables the transformation of Y before and after
estimator = TransformedTargetRegressor(regressor = pipeline, transformer = MinMaxScaler())


#########

# Reload X / y from Xcal.csv / Ycal.csv.
n = NIRS_Set('data')
X, y = n.load('Xcal.csv', 'Ycal.csv', x_hdr=0, y_hdr=0)
print(X.shape, y.shape)


### EXAMPLE 1 > simple fit
# Split with the project splitter (sp.sk_train_test_split).
X_train, X_test, y_train, y_test = sp.sk_train_test_split(X, y)
# AUGMENTATION HERE

# Parameters for a specific pipeline step are written opname__var = value,
# e.g. nn__validation_data = (X_test, y_test).
estimator.fit(X_train, y_train)
Y_preds = estimator.predict(X_test)

# First ten rows of reference values next to predictions.
print(np.hstack((y_test, Y_preds))[:10])

# Report each hold-out metric in turn.
for _label, _metric in (
    ("MAE", mean_absolute_error),
    ("MSE", mean_squared_error),
    ("MAPE", mean_absolute_percentage_error),
):
    print(_label, _metric(y_test, Y_preds))



### EXAMPLE 2 > Cross Validation no augmentation
## kf = sp.KFold(method='kennard_stone', n_splits=5)
# kf = KFold(n_splits=2)
# r = cross_validate(estimator, X, y, return_train_score = True, return_estimator = True, cv = kf)
# print(r['test_score'], r['train_score'])
# for estimator in r['estimator']:
#     Y_preds = estimator.predict(X)
#     print("MAE", mean_absolute_error(y, Y_preds))
#     print("MSE", mean_squared_error(y, Y_preds))
#     print("MAPE", mean_absolute_percentage_error(y, Y_preds))
    

# # ### EXAMPLE 3 > Hyperparams
# print(estimator.get_params().keys())
# param_grid = {
#     'regressor__filters__haar__wavelet':['haar', 'bior1.3'], 
#     # 'nn__optimizer':['rmsprop', 'adam', 'adagrad']
# }

# grid = GridSearchCV(estimator, cv = 3, param_grid = param_grid)
# X_train, X_test, y_train, y_test = sp.sk_train_test_split(X, y)
# grid.fit(X_train, y_train)
# print("Best: %f using %s" % (grid.best_score_, grid.best_params_))
# means = grid.cv_results_['mean_test_score']
# stds = grid.cv_results_['std_test_score']
# params = grid.cv_results_['params']
# for mean, stdev, param in zip(means, stds, params):
#     print("%f (%f) with: %r" % (mean, stdev, param))