In [3]:
#Installing packages

#Needed for step 1 - data generation

#Installing PROSAIL
#!pip install prosail

#latin hypercube stuff
#lets try to do a LHS
#!pip install lhsmdu

#this package as a number of functions to deal with hyperspectral data
#!pip install pysptools


In [4]:
#General purpose: 
import matplotlib.pyplot as plt
import numpy
import numpy as np
from mpl_toolkits.mplot3d import Axes3D

import pandas as pd

#PROSPECT+SAIL Radiative transfer mode package
import prosail

#Sampling design package
import lhsmdu

#package to for operations on spectral data
import pysptools as sptool 
from pysptools import distance
#machine learning packages are imported later, nearer to the model


First we create 3 datasets of 500, 1500 and 3000 samples

Varying parameters: Cab, Cw, Cm and LAI

In [5]:
#number of samples
train_n3000 = 3000
train_n1500 = 1500
train_n0500 = 500


n_traits=4 #I will test on 4 varying traits: cab, car, cw,cm,lai

#generating a LHS hypercube (it uses a 0 to 1 interval that can be used as a multiplier against the different traits)
np.random.seed(0)
LHS_train3000 = lhsmdu.createRandomStandardUniformMatrix(n_traits,train_n3000 ) #the package has a more advanced method but it is too slow to process
LHS_train1500 = lhsmdu.createRandomStandardUniformMatrix(n_traits,train_n1500 )
LHS_train0500 = lhsmdu.createRandomStandardUniformMatrix(n_traits,train_n0500 )

#max_n=1 #this value should go from 1 to 2, so i make it change from 0 to 1 here and then add 1 later
max_cab=79. #add 1
#max_car=44. #add 1
#max_cbrown= 9.99 #add 0.01
max_cw=0.008 #add 0.001 
max_cm=0.008 #0.001
max_lai = 9.9 #add 0.1



The next 2 snippets are functions needed for generating the data at sentinel resolution

First a function for better control of the prosail call

In [25]:
#in here I create a custom call for prosail, this allows me to more easily control the default values
def custom_prosail(cab,cw,cm,lai):
    import prosail
    #default parameters
    n= 1.2
    car=25.
    cbrown=0.01
    typelidf=1
    lidfa = -0.35 #leaf angle distribution parameter a and b
    lidfb=-0.15
    hspot= 0.01 #hotspot parameters
    #sun and viewing angle
    tts=30. #observation and solar position parameters
    tto=10. 
    psi=0.
    rho_out = prosail.run_prosail(n,
                                 cab,
                                 car,
                                 cbrown,
                                 cw,
                                 cm,
                                 lai,
                                 lidfa,hspot,tts,tto,psi,
                                 typelidf, #lidfb=-0.15,
                                 factor='DHR', rsoil=1., psoil=1.)
    return(rho_out)

  

Then a function to convert the input hyperspectral data to Sentinel 2A data using a weighted mean approach



In [26]:
def Prosail2S2(path2csv,spectra_input):
    #importing pandas
    import pandas as pd
    import numpy
    import numpy as np
    #upload a S2_Response.csv from https://earth.esa.int/web/sentinel/user-guides/sentinel-2-msi/document-library/-/asset_publisher/Wk0TKajiISaR/content/sentinel-2a-spectral-responses

    s2_table = pd.read_csv(path2csv,sep=";",decimal=",") #check if this is proper, regarding the sep and dec

    s2_table_sel = s2_table[s2_table['SR_WL'].between(400,2500)] #selects all values between 400 and 2500
    spectra_input_df = pd.DataFrame(data=spectra_input,columns=["rho"],index=s2_table_sel.index) #transforms the input array into a pandas df with the column name rho and row.index = to the original input table

  
    rho_s2 = s2_table_sel.multiply(spectra_input_df['rho'],axis="index") #calculates the numerator
    w_band_sum = s2_table_sel.sum(axis=0,skipna = True) #calculates the denominator

    output = (rho_s2.sum(axis=0)/w_band_sum).rename_axis("ID").values #runs the weighted mean and converts the output to a numpy array

    return output[1:] #removes the first value because it represents the wavelength column

#please LOAD THTE FILE NOW
filepath="/Users/BSibiya/Desktop/Sandberg Fynbos Reserve/S2_response.csv"


Now we create a function that generates the data given the n input samples




In [27]:
#function expects as input a PD dataframe with the columns properly named
#notice if you change any defaults on the custom_prosail function then you have to go back and
#change that
#this function also transforms the hyperspectral data to sentinel data
def Gen_spectra_data(traits):
    k = 1
    #pd_train_traits=traits
    #print(range(len(traits)))
    for i in range(len(traits)):
        #n_t = pd_train_traits["n"][i]
        cab_t = traits["cab"][i]
        #car_t = pd_train_traits["car"][i]
        #cbrown_t = pd_train_traits["cbrown"][i]
        cw_t = traits["cw"][i]
        cm_t = traits["cm"][i]
        lai_t = traits["lai"][i]

        if k == 1:
            tr_rho_s = custom_prosail(cab_t,cw_t,cm_t,lai_t)
            tr_rho_s = Prosail2S2(filepath,tr_rho_s)
            #plt.plot ( x, tr_rho_s, ':', label="Training prosail")
            #plt.legend(loc='best')
      
        if k > 1:
            tr_rho_t = custom_prosail(cab_t,cw_t,cm_t,lai_t)
            tr_rho_t = Prosail2S2(filepath,tr_rho_t)
            tr_rho_s = np.vstack((tr_rho_s,tr_rho_t))
            #plt.plot ( x, tr_rho_t, ':')

        k = k+1


    rho_samples=tr_rho_s


    return rho_samples


Now we can get the datasets

In [28]:

#preparing function inputs

pd_traits0500 = pd.DataFrame.transpose(pd.DataFrame(LHS_train0500))
pd_traits1500 = pd.DataFrame.transpose(pd.DataFrame(LHS_train1500))
pd_traits3000 = pd.DataFrame.transpose(pd.DataFrame(LHS_train3000))

pd_traits0500.columns = ["cab","cw","cm","lai"]
pd_traits1500.columns = ["cab","cw","cm","lai"]
pd_traits3000.columns = ["cab","cw","cm","lai"]

#pd_traits0500["car"]=pd_traits0500["car"]*max_car+1 
pd_traits0500["cab"]=pd_traits0500["cab"]*max_cab+1.
pd_traits0500["cw"] =pd_traits0500["cw"] *max_cw+.001
pd_traits0500["cm"] =pd_traits0500["cm"] *max_cm+.001
pd_traits0500["lai"]=pd_traits0500["lai"]*max_lai+.25

pd_traits1500["cab"]=pd_traits1500["cab"]*max_cab+1.
pd_traits1500["cw"] =pd_traits1500["cw"] *max_cw+.001
pd_traits1500["cm"] =pd_traits1500["cm"] *max_cm+.001
pd_traits1500["lai"]=pd_traits1500["lai"]*max_lai+.25

pd_traits3000["cab"]=pd_traits3000["cab"]*max_cab+1.
pd_traits3000["cw"] =pd_traits3000["cw"] *max_cw+.001
pd_traits3000["cm"] =pd_traits3000["cm"] *max_cm+.001
pd_traits3000["lai"]=pd_traits3000["lai"]*max_lai+.25


#pd_train_traits["n"]=pd_t
np_spectra0500 = Gen_spectra_data(pd_traits0500)
np_spectra1500 = Gen_spectra_data(pd_traits1500)
np_spectra3000 = Gen_spectra_data(pd_traits3000)

print(np_spectra0500.shape)
print(np_spectra1500.shape)
print(np_spectra3000.shape)

(500, 13)
(1500, 13)
(3000, 13)


In [29]:
#lets also create a numpy object for the tratis
np_traits0500 = pd_traits0500.iloc[:,:].values
np_traits1500 = pd_traits1500.iloc[:,:].values
np_traits3000 = pd_traits3000.iloc[:,:].values

Now we need to K-fold the data so we can do the LOOCV - Leave one out cross validation

In [30]:
from sklearn.model_selection import KFold # import KFol

#this command is enough to set u the k-fold
kf = KFold(n_splits=5) # Define the split 

#test spot
#X = np_spectra0500
#Y = np.arange(len(np_spectra0500)) #this is simply a place holder

#The kfold of sklearn doesn't actually randomize the folding but that is ok because
#the samples were generated randomly anyway. 
#k = 1
#for train_index, test_index in kf.split(X):
  #print("TRAIN:", train_index, "TEST:", test_index)
  #X_train, X_test = X[train_index], X[test_index]
  #y_train, y_test = Y[train_index], Y[test_index]
  #print(k)
  #print(train_index.shape)
  #print(test_index.shape)
  #k=k+1
#uncomment above to see how it works


Now we set up all the machine learning models


In [31]:
#machine learning stuff

#NEURAL NETWORK - Keras will be updated soon so this colab will also have to be changed
from sklearn.neural_network import MLPRegressor as ANN_reg #this is a simpler neural network package
from keras.models import Sequential
from keras.layers import Dense
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
#ignore the warning for now

#Random FOREST
# Fitting Random Forest Regression to the dataset 
# import the regressor 
from sklearn.ensemble import RandomForestRegressor 

#Gaussian processes
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import ConstantKernel, Matern

#initializing the ANN
ann_ml = Sequential()
#ann_ml.add(Dense(9, input_dim=9, activation='linear'))
ann_ml.add(Dense(10, input_dim=9, activation='tanh'))
ann_ml.add(Dense(6, activation='relu'))
ann_ml.add(Dense(4)) #indeed this ha to be added in this case without any activ function, the R script added this on its own
#model.add(Dense(1, activation='sigmoid'))

# compile the keras model
ann_ml.compile(loss='mean_squared_error', optimizer='adam', metrics=['accuracy'])

#initializing the random forest
rfr_ml = RandomForestRegressor(n_estimators=1000,random_state=0,
                              min_samples_leaf=5,min_samples_split=5,verbose=1)
#initializing the gaussian process
gpr_ml = GaussianProcessRegressor(n_restarts_optimizer=50,
                                        normalize_y=True,
                                        random_state=0)




In [32]:
#print(np_traits0500)
#print(pd_traits0500)

Creating an empty pandas dataframe to store the output of the models


In [33]:
column_names=["Model",
              "NSamples",
              "Variable",
              "Fold_nr",
              "ExplVar",
              "Max_err",
              "Mean_abs_Err",
              "Mean_sqr_err",
              #"Mean_sqr_lg_err",
              "Median_abs_err",
              "r2",
              "MAPE"]
              #"Mean_poiss_dev",
              #"Mean_gamma_dev"]
              #"Mean_tweed_dev"]

#mape is not existant in the package so we have to create it:
#https://stats.stackexchange.com/questions/58391/mean-absolute-percentage-error-mape-in-scikit-learn
#from sklearn.utils import check_array
def mean_absolute_percentage_error(y_true, y_pred): 

    ## Note: does not handle mix 1d representation
    #if _is_1d(y_true): 
    #    y_true, y_pred = _check_1d_array(y_true, y_pred)

    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100


In [34]:
#creating a df to receive the data
df_metrics = pd.DataFrame(columns=column_names)

#we pick only the bands at 20m resolution - i reckon it is actually peaceful to use everything.. 

#first we subset the bands to the 20m resolution only
#S2A_SR_AV_B1	S2A_SR_AV_B2	S2A_SR_AV_B3	
#S2A_SR_AV_B4	S2A_SR_AV_B5	S2A_SR_AV_B6	
#S2A_SR_AV_B7	S2A_SR_AV_B8	S2A_SR_AV_B8A	
#S2A_SR_AV_B9	S2A_SR_AV_B10	S2A_SR_AV_B11	
#S2A_SR_AV_B12

train_df_0500 = np_spectra0500[:,[1,2,3,4,5,6,8,11,12]]
train_df_1500 = np_spectra1500[:,[1,2,3,4,5,6,8,11,12]]
train_df_3000 = np_spectra3000[:,[1,2,3,4,5,6,8,11,12]]


#importing metric functions
from sklearn import metrics

#the ANN requires that we transform the variables
from sklearn.preprocessing import MinMaxScaler 
scaler_0500 = MinMaxScaler()

In [35]:
pd.DataFrame(train_df_0500)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,0.018533,-0.000281,0.017074,-0.005364,0.144110,0.268710,0.278151,0.170602,0.007394
1,0.019765,0.002389,0.018920,-0.001685,0.137651,0.238109,0.247967,0.133313,-0.005383
2,0.017040,-0.008581,0.010169,-0.005372,0.198349,0.330979,0.336115,0.053206,-0.040283
3,0.170749,0.201338,0.232814,0.271576,0.329139,0.352863,0.385049,0.446938,0.398903
4,0.018742,-0.004196,0.013868,0.001424,0.206181,0.311808,0.325989,0.255646,0.088763
...,...,...,...,...,...,...,...,...,...
495,0.017523,-0.005340,0.014340,-0.008286,0.175090,0.347994,0.355710,0.113587,-0.021109
496,0.022150,0.002596,0.018330,0.014669,0.206740,0.282672,0.297651,0.125662,0.008557
497,0.019009,0.003083,0.018121,-0.003555,0.126260,0.248023,0.256627,0.110215,-0.018591
498,0.018588,0.009430,0.003458,0.045665,0.239646,0.282202,0.295556,0.219398,0.053269


Measuring the accuraccy the of the regression

In [36]:
#with 500 samples
k=1
for train_index, test_index in kf.split(train_df_0500):

    #subsetting for ith k-fold
    X_train, X_test = train_df_0500[train_index], train_df_0500[test_index]
    Y_train, Y_test = np_traits0500[train_index], np_traits0500[test_index]
    label_names = ["cab","cw","cm","lai"]

    #ANN - Training 
    scaler_0500.fit(Y_train)
    Y_train_norm = scaler_0500.transform(Y_train)
    ann_ml.fit(X_train,Y_train_norm,epochs=1500,verbose=0)

    #RF - Training n
    rfr_ml.fit(X_train,Y_train)

    #GPR - Training 
    gpr_ml.fit(X_train,Y_train)

    #Prediction
    y_ann_0500 = scaler_0500.inverse_transform(ann_ml.predict(X_test))
    y_rfr_0500 = rfr_ml.predict(X_test)
    y_gpr_0500 = gpr_ml.predict(X_test)
    


    for i in range(n_traits):

        #creating the temp list
        ann_temp_list = {"Model":"ANN",
                         "NSamples":500,
                         "Variable":label_names[i],
                         "Fold_nr":k,
                         "ExplVar": metrics.explained_variance_score(Y_test[:,i], y_ann_0500[:,i]),
                         "Max_err": metrics.max_error(Y_test[:,i], y_ann_0500[:,i]),
                         "Mean_abs_Err": metrics.mean_absolute_error(Y_test[:,i], y_ann_0500[:,i]),
                         "Mean_sqr_err": metrics.mean_squared_error(Y_test[:,i], y_ann_0500[:,i]),
                         #"Mean_sqr_lg_err": metrics.mean_squared_log_error(Y_test[:,i], y_ann_0500[:,i]),
                         "Median_abs_err" : metrics.median_absolute_error(Y_test[:,i], y_ann_0500[:,i]),
                         "r2": metrics.r2_score(Y_test[:,i], y_ann_0500[:,i]),
                         #"Mean_poiss_dev" : metrics.mean_poisson_deviance(Y_test[:,i], y_ann_0500[:,i]),
                         #"Mean_gamma_dev" : metrics.mean_gamma_deviance(Y_test[:,i], y_ann_0500[:,i])}
                         #"Mean_tweed_dev" : metrics.mean_tweedie_deviance(Y_test[:,i], y_ann_0500[:,i])}
                         "MAPE": mean_absolute_percentage_error(Y_test[:,i], y_ann_0500[:,i])}

        rfr_temp_list = {"Model":"RFr",
                         "NSamples":500,
                         "Variable":label_names[i],
                         "Fold_nr":k,
                         "ExplVar": metrics.explained_variance_score(Y_test[:,i], y_rfr_0500[:,i]),
                         "Max_err": metrics.max_error(Y_test[:,i], y_rfr_0500[:,i]),
                         "Mean_abs_Err": metrics.mean_absolute_error(Y_test[:,i], y_rfr_0500[:,i]),
                         "Mean_sqr_err": metrics.mean_squared_error(Y_test[:,i], y_rfr_0500[:,i]),
                         #"Mean_sqr_lg_err": metrics.mean_squared_log_error(Y_test[:,i], y_rfr_0500[:,i]),
                         "Median_abs_err" : metrics.median_absolute_error(Y_test[:,i], y_rfr_0500[:,i]),
                         "r2": metrics.r2_score(Y_test[:,i], y_rfr_0500[:,i]),
                         #"Mean_poiss_dev" : metrics.mean_poisson_deviance(Y_test[:,i], y_rfr_0500[:,i]),
                         #"Mean_gamma_dev" : metrics.mean_gamma_deviance(Y_test[:,i], y_rfr_0500[:,i])}
                         #"Mean_tweed_dev" : metrics.mean_tweedie_deviance(Y_test[:,i], y_rfr_0500[:,i])}
                         "MAPE": mean_absolute_percentage_error(Y_test[:,i], y_rfr_0500[:,i])}

        gpr_temp_list = {"Model":"GPR",
                         "NSamples":500,
                         "Variable":label_names[i],
                         "Fold_nr":k,
                         "ExplVar": metrics.explained_variance_score(Y_test[:,i], y_gpr_0500[:,i]),
                         "Max_err": metrics.max_error(Y_test[:,i], y_gpr_0500[:,i]),
                         "Mean_abs_Err": metrics.mean_absolute_error(Y_test[:,i], y_gpr_0500[:,i]),
                         "Mean_sqr_err": metrics.mean_squared_error(Y_test[:,i], y_gpr_0500[:,i]),
                         #"Mean_sqr_lg_err": metrics.mean_squared_log_error(Y_test[:,i], y_gpr_0500[:,i]),
                         "Median_abs_err" : metrics.median_absolute_error(Y_test[:,i], y_gpr_0500[:,i]),
                         "r2": metrics.r2_score(Y_test[:,i], y_gpr_0500[:,i]),
                         #"Mean_poiss_dev" : metrics.mean_poisson_deviance(Y_test[:,i], y_gpr_0500[:,i]),
                         #"Mean_gamma_dev" : metrics.mean_gamma_deviance(Y_test[:,i], y_gpr_0500[:,i])}
                         #"Mean_tweed_dev" : metrics.mean_tweedie_deviance(Y_test[:,i], y_gpr_0500[:,i])}
                         "MAPE": mean_absolute_percentage_error(Y_test[:,i], y_gpr_0500[:,i])}

        #appending to the dataframe
        df_metrics = df_metrics.append(ann_temp_list,ignore_index=True)
        df_metrics = df_metrics.append(rfr_temp_list,ignore_index=True)
        df_metrics = df_metrics.append(gpr_temp_list,ignore_index=True)
    k = k+1

df_metrics.to_csv("run0500.csv",sep=";",decimal=",")


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed:    3.1s finished
  updates=self.state_updates,
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed:    0.0s finished
  df_metrics = df_metrics.append(ann_temp_list,ignore_index=True)
  df_metrics = df_metrics.append(rfr_temp_list,ignore_index=True)
  df_metrics = df_metrics.append(gpr_temp_list,ignore_index=True)
  df_metrics = df_metrics.append(ann_temp_list,ignore_index=True)
  df_metrics = df_metrics.append(rfr_temp_list,ignore_index=True)
  df_metrics = df_metrics.append(gpr_temp_list,ignore_index=True)
  df_metrics = df_metrics.append(ann_temp_list,ignore_index=True)
  df_metrics = df_metrics.append(rfr_temp_list,ignore_index=True)
  df_metrics = df_metrics.append(gpr_temp_list,ignore_index=True)
  df_metrics = df_metrics.append(ann_temp_list,ignore_index=True)
  df

In [37]:
#the ANN requires that we transform the variables
from sklearn.preprocessing import MinMaxScaler 
scaler_1500 = MinMaxScaler()

#with 500 samples
k=1
for train_index, test_index in kf.split(train_df_1500):
    #subsetting for ith k-fold
    X_train, X_test = train_df_1500[train_index], train_df_1500[test_index]
    Y_train, Y_test = np_traits1500[train_index], np_traits1500[test_index]
    label_names = ["cab","cw","cm","lai"]

    #ANN - Training 
    scaler_1500.fit(Y_train)
    Y_train_norm = scaler_1500.transform(Y_train)
    ann_ml.fit(X_train,Y_train_norm,epochs=1500,verbose=0)

    #RF - Training n
    rfr_ml.fit(X_train,Y_train)

    #GPR - Training 
    gpr_ml.fit(X_train,Y_train)

    #Prediction
    y_ann_1500 = scaler_0500.inverse_transform(ann_ml.predict(X_test))
    y_rfr_1500 = rfr_ml.predict(X_test)
    y_gpr_1500 = gpr_ml.predict(X_test)


    for i in range(n_traits):
        #creating the temp list
        ann_temp_list = {"Model":"ANN",
                         "NSamples":1500,
                         "Variable":label_names[i],
                         "Fold_nr":k,
                         "ExplVar": metrics.explained_variance_score(Y_test[:,i], y_ann_1500[:,i]),
                         "Max_err": metrics.max_error(Y_test[:,i], y_ann_1500[:,i]),
                         "Mean_abs_Err": metrics.mean_absolute_error(Y_test[:,i], y_ann_1500[:,i]),
                         "Mean_sqr_err": metrics.mean_squared_error(Y_test[:,i], y_ann_1500[:,i]),
                         #"Mean_sqr_lg_err": metrics.mean_squared_log_error(Y_test[:,i], y_ann_1500[:,i]),
                         "Median_abs_err" : metrics.median_absolute_error(Y_test[:,i], y_ann_1500[:,i]),
                         "r2": metrics.r2_score(Y_test[:,i], y_ann_1500[:,i]),
                         #"Mean_poiss_dev" : metrics.mean_poisson_deviance(Y_test[:,i], y_ann_1500[:,i]),
                         #"Mean_gamma_dev" : metrics.mean_gamma_deviance(Y_test[:,i], y_ann_1500[:,i])}
                         #"Mean_tweed_dev" : metrics.mean_tweedie_deviance(Y_test[:,i], y_ann_1500[:,i])}
                         "MAPE": mean_absolute_percentage_error(Y_test[:,i], y_ann_1500[:,i])}

        rfr_temp_list = {"Model":"RFr",
                         "NSamples":1500,
                         "Variable":label_names[i],
                         "Fold_nr":k,
                         "ExplVar": metrics.explained_variance_score(Y_test[:,i], y_rfr_1500[:,i]),
                         "Max_err": metrics.max_error(Y_test[:,i], y_rfr_1500[:,i]),
                         "Mean_abs_Err": metrics.mean_absolute_error(Y_test[:,i], y_rfr_1500[:,i]),
                         "Mean_sqr_err": metrics.mean_squared_error(Y_test[:,i], y_rfr_1500[:,i]),
                         #"Mean_sqr_lg_err": metrics.mean_squared_log_error(Y_test[:,i], y_rfr_1500[:,i]),
                         "Median_abs_err" : metrics.median_absolute_error(Y_test[:,i], y_rfr_1500[:,i]),
                         "r2": metrics.r2_score(Y_test[:,i], y_rfr_1500[:,i]),
                         #"Mean_poiss_dev" : metrics.mean_poisson_deviance(Y_test[:,i], y_rfr_1500[:,i]),
                         #"Mean_gamma_dev" : metrics.mean_gamma_deviance(Y_test[:,i], y_rfr_1500[:,i])}
                         #"Mean_tweed_dev" : metrics.mean_tweedie_deviance(Y_test[:,i], y_rfr_1500[:,i])}
                         "MAPE": mean_absolute_percentage_error(Y_test[:,i], y_rfr_1500[:,i])}

        gpr_temp_list = {"Model":"GPR",
                         "NSamples":1500,
                         "Variable":label_names[i],
                         "Fold_nr":k,
                         "ExplVar": metrics.explained_variance_score(Y_test[:,i], y_gpr_1500[:,i]),
                         "Max_err": metrics.max_error(Y_test[:,i], y_gpr_1500[:,i]),
                         "Mean_abs_Err": metrics.mean_absolute_error(Y_test[:,i], y_gpr_1500[:,i]),
                         "Mean_sqr_err": metrics.mean_squared_error(Y_test[:,i], y_gpr_1500[:,i]),
                         #"Mean_sqr_lg_err": metrics.mean_squared_log_error(Y_test[:,i], y_gpr_1500[:,i]),
                         "Median_abs_err" : metrics.median_absolute_error(Y_test[:,i], y_gpr_1500[:,i]),
                         "r2": metrics.r2_score(Y_test[:,i], y_gpr_1500[:,i]),
                         #"Mean_poiss_dev" : metrics.mean_poisson_deviance(Y_test[:,i], y_gpr_1500[:,i]),
                         #"Mean_gamma_dev" : metrics.mean_gamma_deviance(Y_test[:,i], y_gpr_1500[:,i])}
                         #"Mean_tweed_dev" : metrics.mean_tweedie_deviance(Y_test[:,i], y_gpr_1500[:,i])}
                         "MAPE": mean_absolute_percentage_error(Y_test[:,i], y_gpr_1500[:,i])}

      #appending to the dataframe
        df_metrics = df_metrics.append(ann_temp_list,ignore_index=True)
        df_metrics = df_metrics.append(rfr_temp_list,ignore_index=True)
        df_metrics = df_metrics.append(gpr_temp_list,ignore_index=True)
    k = k+1

df_metrics.to_csv("run1500.csv",sep=";",decimal=",")

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed:    8.8s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed:    0.0s finished
  df_metrics = df_metrics.append(ann_temp_list,ignore_index=True)
  df_metrics = df_metrics.append(rfr_temp_list,ignore_index=True)
  df_metrics = df_metrics.append(gpr_temp_list,ignore_index=True)
  df_metrics = df_metrics.append(ann_temp_list,ignore_index=True)
  df_metrics = df_metrics.append(rfr_temp_list,ignore_index=True)
  df_metrics = df_metrics.append(gpr_temp_list,ignore_index=True)
  df_metrics = df_metrics.append(ann_temp_list,ignore_index=True)
  df_metrics = df_metrics.append(rfr_temp_list,ignore_index=True)
  df_metrics = df_metrics.append(gpr_temp_list,ignore_index=True)
  df_metrics = df_metrics.append(ann_temp_list,ignore_index=True)
  df_metrics = df_metrics.append(r

In [38]:
#the ANN requires that we transform the variables
from sklearn.preprocessing import MinMaxScaler 
scaler_3000 = MinMaxScaler()

#with 500 samples
k=1
for train_index, test_index in kf.split(train_df_3000):
    #subsetting for ith k-fold
    X_train, X_test = train_df_3000[train_index], train_df_3000[test_index]
    Y_train, Y_test = np_traits3000[train_index], np_traits3000[test_index]
    label_names = ["cab","cw","cm","lai"]

    #ANN - Training 
    scaler_3000.fit(Y_train)
    Y_train_norm = scaler_3000.transform(Y_train)
    ann_ml.fit(X_train,Y_train_norm,epochs=3000,verbose=0)

    #RF - Training n
    rfr_ml.fit(X_train,Y_train)

    #GPR - Training 
    gpr_ml.fit(X_train,Y_train)

    #Prediction
    y_ann_3000 = scaler_0500.inverse_transform(ann_ml.predict(X_test))
    y_rfr_3000 = rfr_ml.predict(X_test)
    y_gpr_3000 = gpr_ml.predict(X_test)


    for i in range(n_traits):
        #creating the temp list
        ann_temp_list = {"Model":"ANN",
                         "NSamples":3000,
                         "Variable":label_names[i],
                         "Fold_nr":k,
                         "ExplVar": metrics.explained_variance_score(Y_test[:,i], y_ann_3000[:,i]),
                         "Max_err": metrics.max_error(Y_test[:,i], y_ann_3000[:,i]),
                         "Mean_abs_Err": metrics.mean_absolute_error(Y_test[:,i], y_ann_3000[:,i]),
                         "Mean_sqr_err": metrics.mean_squared_error(Y_test[:,i], y_ann_3000[:,i]),
                         #"Mean_sqr_lg_err": metrics.mean_squared_log_error(Y_test[:,i], y_ann_3000[:,i]),
                         "Median_abs_err" : metrics.median_absolute_error(Y_test[:,i], y_ann_3000[:,i]),
                         "r2": metrics.r2_score(Y_test[:,i], y_ann_3000[:,i]),
                         #"Mean_poiss_dev" : metrics.mean_poisson_deviance(Y_test[:,i], y_ann_3000[:,i]),
                         #"Mean_gamma_dev" : metrics.mean_gamma_deviance(Y_test[:,i], y_ann_3000[:,i])}
                         #"Mean_tweed_dev" : metrics.mean_tweedie_deviance(Y_test[:,i], y_ann_3000[:,i])}
                         "MAPE": mean_absolute_percentage_error(Y_test[:,i], y_ann_3000[:,i])}

        rfr_temp_list = {"Model":"RFr",
                         "NSamples":3000,
                         "Variable":label_names[i],
                         "Fold_nr":k,
                         "ExplVar": metrics.explained_variance_score(Y_test[:,i], y_rfr_3000[:,i]),
                         "Max_err": metrics.max_error(Y_test[:,i], y_rfr_3000[:,i]),
                         "Mean_abs_Err": metrics.mean_absolute_error(Y_test[:,i], y_rfr_3000[:,i]),
                         "Mean_sqr_err": metrics.mean_squared_error(Y_test[:,i], y_rfr_3000[:,i]),
                         #"Mean_sqr_lg_err": metrics.mean_squared_log_error(Y_test[:,i], y_rfr_3000[:,i]),
                         "Median_abs_err" : metrics.median_absolute_error(Y_test[:,i], y_rfr_3000[:,i]),
                         "r2": metrics.r2_score(Y_test[:,i], y_rfr_3000[:,i]),
                         #"Mean_poiss_dev" : metrics.mean_poisson_deviance(Y_test[:,i], y_rfr_3000[:,i]),
                         #"Mean_gamma_dev" : metrics.mean_gamma_deviance(Y_test[:,i], y_rfr_3000[:,i])}
                         #"Mean_tweed_dev" : metrics.mean_tweedie_deviance(Y_test[:,i], y_rfr_3000[:,i])}
                         "MAPE": mean_absolute_percentage_error(Y_test[:,i], y_rfr_3000[:,i])}

        gpr_temp_list = {"Model":"GPR",
                         "NSamples":3000,
                         "Variable":label_names[i],
                         "Fold_nr":k,
                         "ExplVar": metrics.explained_variance_score(Y_test[:,i], y_gpr_3000[:,i]),
                         "Max_err": metrics.max_error(Y_test[:,i], y_gpr_3000[:,i]),
                         "Mean_abs_Err": metrics.mean_absolute_error(Y_test[:,i], y_gpr_3000[:,i]),
                         "Mean_sqr_err": metrics.mean_squared_error(Y_test[:,i], y_gpr_3000[:,i]),
                         #"Mean_sqr_lg_err": metrics.mean_squared_log_error(Y_test[:,i], y_gpr_3000[:,i]),
                         "Median_abs_err" : metrics.median_absolute_error(Y_test[:,i], y_gpr_3000[:,i]),
                         "r2": metrics.r2_score(Y_test[:,i], y_gpr_3000[:,i]),
                         #"Mean_poiss_dev" : metrics.mean_poisson_deviance(Y_test[:,i], y_gpr_3000[:,i]),
                         #"Mean_gamma_dev" : metrics.mean_gamma_deviance(Y_test[:,i], y_gpr_3000[:,i])}
                         #"Mean_tweed_dev" : metrics.mean_tweedie_deviance(Y_test[:,i], y_gpr_3000[:,i])}
                         "MAPE": mean_absolute_percentage_error(Y_test[:,i], y_gpr_3000[:,i])}
        

      #appending to the dataframe
        df_metrics = df_metrics.append(ann_temp_list,ignore_index=True)
        df_metrics = df_metrics.append(rfr_temp_list,ignore_index=True)
        df_metrics = df_metrics.append(gpr_temp_list,ignore_index=True)
    k = k+1

df_metrics.to_csv("run3000.csv",sep=";",decimal=",")


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed:   17.6s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed:    0.1s finished
  df_metrics = df_metrics.append(ann_temp_list,ignore_index=True)
  df_metrics = df_metrics.append(rfr_temp_list,ignore_index=True)
  df_metrics = df_metrics.append(gpr_temp_list,ignore_index=True)
  df_metrics = df_metrics.append(ann_temp_list,ignore_index=True)
  df_metrics = df_metrics.append(rfr_temp_list,ignore_index=True)
  df_metrics = df_metrics.append(gpr_temp_list,ignore_index=True)
  df_metrics = df_metrics.append(ann_temp_list,ignore_index=True)
  df_metrics = df_metrics.append(rfr_temp_list,ignore_index=True)
  df_metrics = df_metrics.append(gpr_temp_list,ignore_index=True)
  df_metrics = df_metrics.append(ann_temp_list,ignore_index=True)
  df_metrics = df_metrics.append(r

Finished for now, data is saved, better to make the plots in R 

In [39]:
df_metrics

Unnamed: 0,Model,NSamples,Variable,Fold_nr,ExplVar,Max_err,Mean_abs_Err,Mean_sqr_err,Median_abs_err,r2,MAPE
0,ANN,500,cab,1,0.852601,31.874573,7.529774,8.513208e+01,6.897156,0.851994,80.021962
1,RFr,500,cab,1,0.908765,32.163474,4.439896,5.560893e+01,1.628559,0.903321,50.854898
2,GPR,500,cab,1,0.993992,18.190601,0.374867,3.506375e+00,0.064763,0.993904,9.972425
3,ANN,500,cw,1,0.333914,0.004569,0.001797,4.595325e-06,0.001599,0.329368,56.632810
4,RFr,500,cw,1,-0.005121,0.004470,0.002361,6.887331e-06,0.002488,-0.005123,73.213570
...,...,...,...,...,...,...,...,...,...,...,...
175,RFr,3000,cm,5,0.487738,0.005082,0.001297,2.767794e-06,0.000996,0.486524,39.185767
176,GPR,3000,cm,5,0.999957,0.000223,0.000007,2.331652e-10,0.000003,0.999957,0.204230
177,ANN,3000,lai,5,0.985639,1.758575,0.256394,1.160956e-01,0.202924,0.985322,8.885046
178,RFr,3000,lai,5,0.972955,2.872562,0.296064,2.145135e-01,0.157901,0.972879,5.183799
