Testing ANN under different FS approaches
This notebook tests the best estimators of ANN and RF for predicting potato yield under the Control treatment, using different feature selection (FS) approaches. Three feature selection approaches were considered: Univariate, Chi2, and Permutation Importance. The best accuracy was achieved with Permutation Importance (R2=0.78), followed by Univariate (R2=0.72) and Chi2 (R2=0.65), when potato yield was tested on the Control treatment in Rwanda.
This contribution is mapped as a contribution from the Egypt Use Case team for the TRANSFORM WP under EiA

Dataset source: potato yield in Rwanda combined with soil, topography and climate datasets, creating 69 features.

First we will train and test the best models of RF and ANN using all features and without applying feature selection approaches to explore the primary accuracy. Then, we will apply three feature selection approaches to the best estimator (ANN) to explore the best FS approach.

In [None]:
#Import the required libraries
import os   #to interact with the underlying operating system
import random   #for generating random integers in Python
import warnings   #alerts of unexpected conditions detected when running a code
import numpy as np  #to provide an array object that is up to 50x faster than traditional Python lists
import pandas as pd  #for working with data sets, as it has functions for analyzing, cleaning, exploring, and manipulating data
from scipy import stats   #SciPy is a collection of mathematical algorithms and convenience functions built on the NumPy extension of Python
import matplotlib.pyplot as plt #a cross-platform, data visualization and graphical plotting library (histograms, scatter plots, bar charts, etc) for Python and its numerical extension NumPy
from sklearn import ensemble,neural_network,neighbors,svm,model_selection,inspection #neighbors,svm if we want to use Knearest and SVM models
warnings.simplefilter(action='ignore')

In [None]:
#Load the dataset (the Rwanda potato dataset is used here)
df = pd.read_csv('D:/ICARDA/TT2/TestingByControl_1/Final merged file/Cleaned2.csv')
df

Define X and y features

In [None]:
# Start with every candidate predictor so that both models (ANN and RF) can be
# trained and validated on the full set; the results are used later to pick the
# important features and the best estimator.
# The order of the names is kept because it defines the column order of the
# importance tables written by the training cells.
xFeatures = [
    'B030', 'Cu030', 'K030', 'N030', 'Na030', 'P030', 'Ptot030',
    'altop', 'albottom', 'bdr', 'ctottop', 'ctotbottom', 'catop', 'cabottom',
    'claytotpsatop', 'claytotpsabottom', 'dbodtop', 'dbodbottom', 'ececftop', 'ececfbottom',
    'fetop', 'febottom', 'ktop', 'kbottom', 'mgtop', 'mgbottom',
    'ntotncstop', 'ntotncsbottom', 'octop', 'ocbottom', 'ptop', 'pbottom',
    'phh2otop', 'phh2obottom', 'stop', 'sbottom',
    'sandtotpsatop', 'sandtotpsabottom', 'silttotpsatop', 'silttotpsabottom',
    'wpg2top', 'wpg2bottom', 'zntop', 'znbottom',
    'SOMtop', 'SOMbottom', 'PWPtop', 'PWPbottom', 'FCtop', 'FCbottom', 'SWStop', 'SWSbottom',
    'treat2', 'DEM', 'slope', 'TPI', 'TRI', 'tr', 'di', 'nrd', 'tmean', 'tmin', 'tmax',
    'Nrate', 'Prate', 'Krate',
]
# Name of the dependent (target) column
yFeatur = 'yield'

Define the work space

In [None]:
#Specify the workspace: input location and output folder for the training results
# NOTE(review): inFile holds a folder path and is not referenced by any later cell in this notebook.
inFile = 'D:/ICARDA/TT2/TestingByControl_1/Final merged file'
# Folder that receives the plots and Excel files written by the training cell
outFolder = 'D:/ICARDA/TT2/TestingByControl_1/Final merged file/Kheir_Codes and results for building ML to predict potato in Rwanda/ANN_ICARDA_Training'

Data pre-processing: replace infinite values with NaN and drop the rows that contain NaN

In [None]:
# Treat +/- infinity as missing, then drop every row that has a missing value.
df = df.replace([np.inf, -np.inf], np.nan)
df = df.dropna()
# drop=True discards the old row labels; without it they are inserted into the
# frame as an extra 'index' column (the later cells already use drop=True).
df = df.reset_index(drop=True)

Required functions for the statistics, visualization and hyperparameter tuning

In [None]:
#Statistics, visualization and hyperparameter functions
def performanceStatistics(x, y):  #Statistics function
  """Return agreement statistics between reference values x and estimates y.

  Parameters
  ----------
  x : array-like
      Reference (original/observed) values.
  y : array-like
      Values compared against x (predictions). Must hold as many values as x.

  Returns
  -------
  dict
      'R2'   squared Pearson correlation from a linear regression of y on x,
      'RB'   relative bias in percent, (mean(y) - mean(x)) / mean(x) * 100,
      'MAE'  mean absolute error,
      'RMSE' root mean square error,
      'n'    number of value pairs.
      All statistics except 'n' are rounded to two decimals.

  Raises
  ------
  ValueError
      If x and y do not contain the same number of values (scipy's linregress
      also raises ValueError for inputs it cannot fit, e.g. empty input).
  """
  # Accept lists, Series and column vectors alike. Flattening also prevents an
  # (n, 1) array and an (n,) array from silently broadcasting to an n x n error matrix.
  x = np.asarray(x, dtype=float).ravel()
  y = np.asarray(y, dtype=float).ravel()
  if x.size != y.size:
    raise ValueError('x and y must contain the same number of values '
                     '({} != {})'.format(x.size, y.size))

  E = y - x                       # Error
  AE = np.abs(E)                  # Absolute Error
  MAE = np.mean(AE)               # Mean Absolute Error
  SE = np.power(E, 2)             # Square Error
  MSE = np.mean(SE)               # Mean Square Error
  RMSE = np.sqrt(MSE)             # Root Mean Square Error
  RB = ((np.mean(y) - np.mean(x)) / np.mean(x)) * 100   # Relative Bias (%); undefined when mean(x) is 0
  slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
  R2 = np.power(r_value, 2)       # coefficient of determination of the linear fit
  stat = {'R2':round(R2, 2),
          'RB':round(RB, 2),
          'MAE':round(MAE, 2),
          'RMSE':round(RMSE, 2),
          'n':len(E)}
  return stat

def plotOriginalPredicted(Original, Predicted, outFile, label=''): #visualization function
  """Save a scatter plot of predicted against original yield with a 1:1 line.

  Parameters
  ----------
  Original : array-like
      Original yield values, drawn on the x axis.
  Predicted : array-like
      Predicted yield values, drawn on the y axis.
  outFile : str
      Path of the image file to write (saved at 300 dpi).
  label : str, optional
      Legend label of the scatter series.

  The statistics returned by performanceStatistics(Original, Predicted) are
  printed inside the axes. Nothing is returned; the figure is closed after saving.
  """
  # Both axes share one range that spans the two series. The upper bound must be
  # the LARGER of the two maxima, otherwise points above it fall outside the axes.
  minXY = min(np.min(Original), np.min(Predicted))
  maxXY = max(np.max(Original), np.max(Predicted))
  fs = 26   # font size used for ticks, labels, statistics text and legend
  plt.figure(figsize=(15, 15))
  plt.scatter(Original, Predicted, color='blue', label=label)
  plt.plot(np.linspace(minXY, maxXY, 50), np.linspace(minXY, maxXY, 50), color='red', linestyle='-', linewidth=1, markersize=5, label='1:1 Line')
  plt.xlim([minXY, maxXY])
  plt.ylim([minXY, maxXY])
  plt.xticks(size = fs)
  plt.yticks(size = fs)
  plt.xlabel('Original yield (t/ha)', fontsize=fs)
  plt.ylabel('Predicted yield (t/ha)', fontsize=fs)

  stat = performanceStatistics(Original, Predicted)
  digits = 2
  n = round(stat['n'], digits)
  r2 = round(stat['R2'], digits)
  RB = round(stat['RB'], digits)
  MAE = round(stat['MAE'], digits)
  RMSE = round(stat['RMSE'], digits)
  s = 'n={} \n$R^2$={}\nRB={} (%)\nMAE={} (t/ha)\nRMSE={} (t/ha)'.format(n, r2, RB, MAE, RMSE)
  # Statistics box: one data unit inside the top-left corner of the axes
  plt.text(x=(minXY + 1), y=(maxXY - 1), s=s, horizontalalignment='left', verticalalignment='top', color='black', fontsize=fs)
  plt.legend(loc= 9, fontsize=fs)   # loc=9 is 'upper center'
  plt.savefig(outFile, dpi=300, bbox_inches='tight')
  plt.clf()
  plt.close()

def hyperparameters(n):    #Hyperparameters function using the grid search
  """Return the estimators and their grid-search parameter grids.

  Parameters
  ----------
  n : int
      Number of input features; the second hidden layer of the ANN gets
      (2 * n) + 1 neurons.

  Returns
  -------
  dict
      Maps a model label ('RFR', 'ANN') to a two-item list
      [unfitted estimator, parameter grid for GridSearchCV].
  """
  # Random forest grid. 1.0 (= use all features) replaces the former 'auto'
  # option, which meant the same thing for regressors and is no longer accepted
  # by recent scikit-learn releases.
  RFR_param_grid = dict(n_estimators = [i for i in range(100, 2050, 50)], #RF hyperparameter tuning
                        max_features = [1.0, 'sqrt', 'log2'],
                        )

  # ANN architecture: each tuple in hidden_layer_sizes gives the number of
  # neurons per hidden layer, so (25, 2n+1) is a network with TWO hidden layers.
  firstLayer  = [25]              # neurons in the first hidden layer
  secondLayer = [(n * 2) + 1]     # neurons in the second hidden layer
  hls = [(a, b) for a in firstLayer for b in secondLayer]
  ANN_param_grid = dict(hidden_layer_sizes = hls ,
                        activation = ['identity', 'logistic', 'tanh', 'relu'], # 'identity', 'logistic', 'tanh', 'relu'
                        solver = ['lbfgs', 'sgd', 'adam'], # 'lbfgs', 'sgd', 'adam'
                        max_iter = [1000000],
                        )

  # Optional models, kept disabled:
  #KNN_param_grid = dict(n_neighbors = [i for i in range(10, 55, 5)],
                        #)

  #SVR_param_grid = dict(gamma = [0.0625,0.25,0.5,1,2,4],
                        #C = [10, 20, 30, 40],
                        #)

  MLA = {
      # https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html
      'RFR':[ensemble.RandomForestRegressor(random_state=0, verbose=False), RFR_param_grid],
      # https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPRegressor.html
      'ANN':[neural_network.MLPRegressor(random_state=0, verbose=False), ANN_param_grid],
      # https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsRegressor.html
      #'KNN':[neighbors.KNeighborsRegressor(n_jobs=-1), KNN_param_grid],
      # https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVR.html
      #'SVR':[svm.SVR(verbose=0), SVR_param_grid],
      }

  return MLA

Training and testing the models using all features and without feature selection

In [None]:
#Training and validating the models by using 75 % of the dataset for training and 25 % for validation
# The file is re-read so the cell can be re-run on its own; the inf -> NaN
# clean-up is repeated here because re-reading would otherwise discard it.
df = pd.read_csv('D:/ICARDA/TT2/TestingByControl_1/Final merged file/Cleaned2.csv')
df = df.replace([np.inf, -np.inf], np.nan)
Xdf = df[xFeatures]
ydf = df[[yFeatur]]

# Mean and sample standard deviation (ddof=1) of the raw yield, kept as plain
# floats to convert standardised yields back to t/ha further down.
yMean = float(ydf[yFeatur].mean())
yStd  = float(ydf[yFeatur].std())

# z-score standardisation of predictors and target, then drop incomplete rows
X = (Xdf - Xdf.mean()) / Xdf.std()
y = (ydf - ydf.mean()) / ydf.std()
df = pd.concat([X, y], axis = 1)
df = df.dropna()
X = df[xFeatures]
y = df[[yFeatur]]
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.25, random_state=0)

n = len(xFeatures)
MLA = hyperparameters(n)
for mlaL, (estimator, param_grid) in MLA.items():
  # 5-fold grid search scored by R2; refit=True retrains the best setting on all training rows
  GSCV = model_selection.GridSearchCV(estimator, param_grid, scoring='r2', cv=5, refit=True, verbose=False, n_jobs=-1)
  GSCV.fit(X_train, y_train.values.ravel())   # 1-D target, as the regressors expect
  bestEstimator = GSCV.best_estimator_

  # Hold-out predictions, converted back from z-scores to yield units
  OriginalYield_test  = (yStd * y_test.values.flatten()) + yMean
  PredictedYield_test = (yStd * bestEstimator.predict(X_test).flatten()) + yMean

  outFile     = os.path.join(outFolder, '{}_plot_DefaultML_{}.png'.format(mlaL, n))
  Stats       = os.path.join(outFolder, '{}_Stats_DefaultML_{}.xlsx'.format(mlaL, n))
  Importances = os.path.join(outFolder, '{}_Importances_DefaultML_{}.xlsx'.format(mlaL, n))

  plotOriginalPredicted(OriginalYield_test, PredictedYield_test, outFile, label='Predicted yield of {}'.format(mlaL))
  df_stats = performanceStatistics(OriginalYield_test, PredictedYield_test)
  print(mlaL, df_stats)

  # Append the cross-validation record of the best-ranked parameter set to the statistics
  df_GSCV = pd.DataFrame(GSCV.cv_results_)
  df_GSCV = df_GSCV.sort_values(by='rank_test_score', ascending=True).head(1)
  for col in df_GSCV.columns:
    df_stats[col] = df_GSCV[col].values.tolist()[0]
  df_stats = pd.DataFrame.from_dict(df_stats, orient='index').T
  df_stats.to_excel(Stats, index=False)

  # Permutation importance on the training rows, expressed as a percentage of the total
  pi = inspection.permutation_importance(bestEstimator, X_train, y_train.values.ravel(), n_jobs=-1, random_state=0).importances_mean
  pi = (pi / pi.sum()) * 100
  dfImportances = pd.DataFrame(data=[pi], columns=xFeatures).round(2)
  dfImportances.to_excel(Importances, index=False)


Testing the best estimator by Control Treatment using all features without applying feature selection 

In [None]:
#Testing the best estimator by Control Treatment using all features
##OutFolder2 where testing outputs will be placed
outFolder = 'D:/ICARDA/TT2/TestingByControl_1/Final merged file/Kheir_Codes and results for building ML to predict potato in Rwanda/ANN_ICARDA_Testing'
group = 'ML'   # label used in the output file names
print(group)

# One pass only: the data are read fresh so that the yield statistics below are
# in original units. xFeatures (the list the model in GSCV was trained on) is
# used directly; xFeaturesML is not defined until the feature-selection cells.
df = pd.read_csv('D:/ICARDA/TT2/TestingByControl_1/Final merged file/Cleaned2.csv')
df = df.replace([np.inf, -np.inf], np.nan)
Xdf = df[xFeatures]
ydf = df[[yFeatur]]
yMean = float(ydf[yFeatur].mean())
yStd  = float(ydf[yFeatur].std())

# Standardise exactly as the training cell does (pandas mean / sample std,
# ddof=1). Using the same scale for the inputs and for the back-transform
# means the observed yields are recovered exactly.
usedCols = xFeatures + [yFeatur]
df[usedCols] = (df[usedCols] - df[usedCols].mean()) / df[usedCols].std()
df = df.dropna(subset=usedCols)

testtreat = ['Control'] # random.sample(list(df.treat.unique()), 1)
print(testtreat)

test  = df[df['treat'].isin(testtreat)]
X_test  = test[xFeatures]
y_test  = test[[yFeatur]]

#Test by treat(Control) and best estimator
# GSCV and mlaL are the grid search and model label left by the last
# iteration of the training cell.
n = len(xFeatures)
bestEstimator = GSCV.best_estimator_

OriginalYield_test  = (yStd * y_test.values.flatten()) + yMean
PredictedYield_test = (yStd * bestEstimator.predict(X_test).flatten()) + yMean

yieldFile   = os.path.join(outFolder, '{}_yield_{}_{}.xlsx'.format(mlaL, group, n))
plot        = os.path.join(outFolder, '{}_plot_{}_{}.png'.format(mlaL, group, n))
Stats       = os.path.join(outFolder, '{}_Stats_{}_{}.xlsx'.format(mlaL, group, n))
Importances = os.path.join(outFolder, '{}_Importances_{}_{}.xlsx'.format(mlaL, group, n))

dfYield = pd.DataFrame({'Original Yield':OriginalYield_test.tolist(), 'Predicted Yield':PredictedYield_test.tolist()})
dfYield.to_excel(yieldFile, index=False)

plotOriginalPredicted(OriginalYield_test, PredictedYield_test, plot, label='Predicted yield of {}'.format(mlaL))
df_stats = performanceStatistics(OriginalYield_test, PredictedYield_test)
print(mlaL, df_stats)

# Append the cross-validation record of the best-ranked parameter set. Only
# this copy step belongs inside the loop; the export and the permutation
# importance run once afterwards.
df_GSCV = pd.DataFrame(GSCV.cv_results_)
df_GSCV = df_GSCV.sort_values(by='rank_test_score', ascending=True).head(1)
for col in df_GSCV.columns:
  df_stats[col] = df_GSCV[col].values.tolist()[0]
df_stats = pd.DataFrame.from_dict(df_stats, orient='index').T
df_stats.to_excel(Stats, index=False)

# Permutation importance on the Control rows, as a percentage of the total
pi = inspection.permutation_importance(bestEstimator, X_test, y_test.values.ravel(), n_jobs=-1, random_state=0).importances_mean
pi = (pi / pi.sum()) * 100
dfImportances = pd.DataFrame(data=[pi], columns=xFeatures).round(2)
dfImportances.to_excel(Importances, index=False)

We found that the best estimator is ANN, and the accuracy is 'R2': 0.52  using all features and without feature selection.
- Let's try **different feature selection approaches**, such as ***Univariate*** and ***Chi2***, along with ***permutation importance*** from ANN, to find which approach achieves the highest accuracy.
1-Univariate approach
#Statistical tests can be used to select those features that have the strongest relationship with the output variable.

In [None]:
#scikit-learn's SelectKBest class keeps the k features that score highest under a chosen statistical test.

#The cells below score the features with the chi-squared (chi2) test, which requires non-negative feature values.
#Load the required libraries
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
df = pd.read_csv("D:/ICARDA/TT2/TestingByControl_1/Final merged file/Kheir_Codes and results for building ML to predict potato in Rwanda/Feature selection techniques_basedRFE/Cleaned1.csv")
df

In [None]:
# Preprocessing to get rid of NaN and negative values

In [None]:
# Force every column to a numeric dtype; entries that cannot be parsed become NaN
df = df.apply(pd.to_numeric, errors='coerce')
print(df)

# Overwrite negative entries with zero, then remove incomplete rows and
# renumber the remaining rows from 0
df[df < 0] = 0
df = df.dropna().reset_index(drop=True)
print(df)

In [None]:
#Specify X and y to explore the FS based univariate approach
# df is the cleaned frame from the cell above (the name df2 used here before is never defined).
# NOTE(review): if the frame has exactly 70 columns, the 1:70 slice also
# contains the last column, which is used as the target below - confirm the layout.
X = df.iloc[:,1:70]  #independent variables (columns at positions 1 to 69)
y = df.iloc[:,-1]    #target variable: the last column of the frame
y=y.astype('int') #Convert float to int
y

In [None]:
#Score every column of X against y with SelectKBest (chi2 score function, k=20)
# NOTE(review): this is the same chi2 scoring that the later "Chi-square Test"
# section applies - confirm that a different score function was not intended here.
BestFeatures = SelectKBest(score_func=chi2, k=20)
fit = BestFeatures.fit(X,y)

# Two-column table: feature name ('Specs') next to its score ('Score')
df_columns = pd.DataFrame(X.columns)
df_scores = pd.DataFrame(fit.scores_)
f_Scores = pd.concat([df_columns,df_scores],axis=1)
f_Scores.columns = ['Specs','Score']

# Show the 20 highest-scoring features, best first
print(f_Scores.nlargest(20,'Score'))

In [None]:
#Features retained by the univariate approach; the ANN is re-tested with them
#and its accuracy compared with the all-features run.
#The list below holds 19 names.
xFeatures = [
    'ntotncstop', 'ntotncsbottom', 'catop', 'cabottom', 'Ptot030', 'P030', 'K030',
    'Nrate', 'N030', 'mgbottom', 'ktop', 'kbottom', 'Krate', 'Prate',
    'fetop', 'DEM', 'Cu030', 'albottom', 'TRI',
]
#Independent copy of the same names, used by the testing cells
xFeaturesML = xFeatures.copy()
yFeatur = 'yield'

#Training and validating the model using the new features extracted by univariate approach

In [None]:
#Training and validating the model using the new features
# NOTE(review): the outputs go to whatever outFolder an earlier cell set last;
# no training folder is assigned for this feature set - confirm the destination.
df = pd.read_csv('D:/ICARDA/TT2/TestingByControl_1/Final merged file/Kheir_Codes and results for building ML to predict potato in Rwanda/Feature selection techniques_basedRFE/Cleaned3.csv')

Xdf = df[xFeatures]
ydf = df[[yFeatur]]

# Mean and sample standard deviation (ddof=1) of the raw yield, kept as plain
# floats to convert standardised yields back to t/ha further down.
yMean = float(ydf[yFeatur].mean())
yStd  = float(ydf[yFeatur].std())

# z-score standardisation of predictors and target, then drop incomplete rows
X = (Xdf - Xdf.mean()) / Xdf.std()
y = (ydf - ydf.mean()) / ydf.std()
df = pd.concat([X, y], axis = 1)
df = df.dropna()
X = df[xFeatures]
y = df[[yFeatur]]
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.25, random_state=0)

n = len(xFeatures)
MLA = hyperparameters(n)
for mlaL, (estimator, param_grid) in MLA.items():
  # 5-fold grid search scored by R2; refit=True retrains the best setting on all training rows
  GSCV = model_selection.GridSearchCV(estimator, param_grid, scoring='r2', cv=5, refit=True, verbose=False, n_jobs=-1)
  GSCV.fit(X_train, y_train.values.ravel())   # 1-D target, as the regressors expect
  bestEstimator = GSCV.best_estimator_

  # Hold-out predictions, converted back from z-scores to yield units
  OriginalYield_test  = (yStd * y_test.values.flatten()) + yMean
  PredictedYield_test = (yStd * bestEstimator.predict(X_test).flatten()) + yMean

  outFile     = os.path.join(outFolder, '{}_plot_DefaultML_{}.png'.format(mlaL, n))
  Stats       = os.path.join(outFolder, '{}_Stats_DefaultML_{}.xlsx'.format(mlaL, n))
  Importances = os.path.join(outFolder, '{}_Importances_DefaultML_{}.xlsx'.format(mlaL, n))

  plotOriginalPredicted(OriginalYield_test, PredictedYield_test, outFile, label='Predicted yield of {}'.format(mlaL))
  df_stats = performanceStatistics(OriginalYield_test, PredictedYield_test)
  print(mlaL, df_stats)

  # Append the cross-validation record of the best-ranked parameter set to the statistics
  df_GSCV = pd.DataFrame(GSCV.cv_results_)
  df_GSCV = df_GSCV.sort_values(by='rank_test_score', ascending=True).head(1)
  for col in df_GSCV.columns:
    df_stats[col] = df_GSCV[col].values.tolist()[0]
  df_stats = pd.DataFrame.from_dict(df_stats, orient='index').T
  df_stats.to_excel(Stats, index=False)

  # Permutation importance on the training rows, expressed as a percentage of the total
  pi = inspection.permutation_importance(bestEstimator, X_train, y_train.values.ravel(), n_jobs=-1, random_state=0).importances_mean
  pi = (pi / pi.sum()) * 100
  dfImportances = pd.DataFrame(data=[pi], columns=xFeatures).round(2)
  dfImportances.to_excel(Importances, index=False)

In [None]:
# Reload the raw table, keep only complete rows and renumber them from 0
cleaned3Csv = 'D:/ICARDA/TT2/TestingByControl_1/Final merged file/Kheir_Codes and results for building ML to predict potato in Rwanda/Feature selection techniques_basedRFE/Cleaned3.csv'
df = pd.read_csv(cleaned3Csv).dropna().reset_index(drop=True)

print(df)

Test by control treatment after using the important features extracted by univariate approach

In [None]:
#Test by Control treatment with the features selected by the univariate approach
###OutFolder2 for test by Control
outFolder = 'D:/ICARDA/TT2/TestingByControl_1/Final merged file/Kheir_Codes and results for building ML to predict potato in Rwanda/UnivariateFS_Test'
group = 'ML'   # label used in the output file names
print(group)

# One pass only, on freshly read data. Standardising an already-standardised
# frame again would make ydf hold z-scores, and the back-transform below would
# then use a mean of ~0 and a std of ~1 instead of the real yield statistics.
df = pd.read_csv('D:/ICARDA/TT2/TestingByControl_1/Final merged file/Kheir_Codes and results for building ML to predict potato in Rwanda/Feature selection techniques_basedRFE/Cleaned3.csv')
df = df.dropna()
df = df.reset_index(drop=True)

xFeatures = xFeaturesML
Xdf = df[xFeatures]
ydf = df[[yFeatur]]
yMean = float(ydf[yFeatur].mean())
yStd  = float(ydf[yFeatur].std())

# Standardise with pandas mean / sample std (ddof=1), the convention of the
# training cell. Using the same scale for the inputs and for the back-transform
# means the observed yields are recovered exactly.
usedCols = xFeatures + [yFeatur]
df[usedCols] = (df[usedCols] - df[usedCols].mean()) / df[usedCols].std()

testtreat = ['Control'] # random.sample(list(df.treat.unique()), 1)
print(testtreat)

test  = df[df['treat'].isin(testtreat)]
X_test  = test[xFeatures]
y_test  = test[[yFeatur]]

#Test by treat(Control) and best estimator
# GSCV and mlaL are the grid search and model label left by the last
# iteration of the training cell.
n = len(xFeatures)
bestEstimator = GSCV.best_estimator_

OriginalYield_test  = (yStd * y_test.values.flatten()) + yMean
PredictedYield_test = (yStd * bestEstimator.predict(X_test).flatten()) + yMean

yieldFile   = os.path.join(outFolder, '{}_yield_{}_{}.xlsx'.format(mlaL, group, n))
plot        = os.path.join(outFolder, '{}_plot_{}_{}.png'.format(mlaL, group, n))
Stats       = os.path.join(outFolder, '{}_Stats_{}_{}.xlsx'.format(mlaL, group, n))
Importances = os.path.join(outFolder, '{}_Importances_{}_{}.xlsx'.format(mlaL, group, n))

dfYield = pd.DataFrame({'Original Yield':OriginalYield_test.tolist(), 'Predicted Yield':PredictedYield_test.tolist()})
dfYield.to_excel(yieldFile, index=False)

plotOriginalPredicted(OriginalYield_test, PredictedYield_test, plot, label='Predicted yield of {}'.format(mlaL))
df_stats = performanceStatistics(OriginalYield_test, PredictedYield_test)
print(mlaL, df_stats)

# Append the cross-validation record of the best-ranked parameter set. Only
# this copy step belongs inside the loop; the export and the permutation
# importance run once afterwards.
df_GSCV = pd.DataFrame(GSCV.cv_results_)
df_GSCV = df_GSCV.sort_values(by='rank_test_score', ascending=True).head(1)
for col in df_GSCV.columns:
  df_stats[col] = df_GSCV[col].values.tolist()[0]
df_stats = pd.DataFrame.from_dict(df_stats, orient='index').T
df_stats.to_excel(Stats, index=False)

# Permutation importance on the Control rows, as a percentage of the total
pi = inspection.permutation_importance(bestEstimator, X_test, y_test.values.ravel(), n_jobs=-1, random_state=0).importances_mean
pi = (pi / pi.sum()) * 100
dfImportances = pd.DataFrame(data=[pi], columns=xFeatures).round(2)
dfImportances.to_excel(Importances, index=False)

R2 of testing ANN using Univariate approach = 0.72, lets try Chi2 approach and compare.
2- Chi-square Test

The Chi-square test is used for categorical features in a dataset. We calculate Chi-square between each feature and the target and select the desired number of features with the best Chi-square scores. In order to correctly apply the chi-squared to test the relation between various features in the dataset and the target variable, the following conditions have to be met: the variables have to be categorical, sampled independently, and values should have an expected frequency greater than 5.

In [None]:
from sklearn.feature_selection import SelectKBest, chi2 #Import the required library

df = pd.read_csv("D:/ICARDA/TT2/TestingByControl_1/Final merged file/Kheir_Codes and results for building ML to predict potato in Rwanda/Feature selection techniques_basedRFE/Cleaned1.csv")
#Overwrite negative values with zero, then remove the rows that contain NaN
#and renumber the remaining rows from 0
df[df < 0] = 0
df = df.dropna().reset_index(drop=True)

print(df)

In [None]:
#Split the cleaned frame into candidate predictors and target
X = df.iloc[:, 1:70]                  #predictors: columns at positions 1 to 69
y = df.iloc[:, -1].astype('int')      #target: last column, cast to integer
y

In [None]:
#Cast every predictor to integer before scoring
X_cat = X.astype(int)
#Keep the 20 features with the highest chi2 score
chi2_features = SelectKBest(score_func=chi2, k=20)
X_kbest_features = chi2_features.fit(X_cat, y).transform(X_cat)

#Report the number of features before and after the selection
for caption, matrix in (('Original feature number:', X_cat), ('Reduced feature number:', X_kbest_features)):
  print(caption, matrix.shape[1])

In [None]:
print(X_kbest_features.shape)
print(X_kbest_features)
#Convert the array to a nested list so that all selected values are printed in full.
#The result is bound to its own name: assigning it to `list` would replace the
#builtin and break every later list(...) call in the notebook.
selectedValues = np.array(X_kbest_features).tolist()
print(f'List: {selectedValues}')
#The value matrix carries no column names; get_support() marks which columns
#of X_cat were kept by the selector.
selectedNames = X_cat.columns[chi2_features.get_support()].tolist()
print(f'Selected features: {selectedNames}')

The features selected by the Chi2 approach (19 are listed here) are ['Cu030','K030','P030','Ptot030','albottom','catop','cabottom','fetop','ktop','kbottom','mgtop','mgbottom','ntotncstop','ntotncsbottom','DEM','TRI','Nrate','Prate','Krate']
Following the same procedure used above for the Univariate approach, the accuracy obtained with the Chi2 features when testing on the Control treatment was: ANN {'R2': 0.65, 'RB': 4.75, 'MAE': 1.79, 'RMSE': 2.27, 'n': 100}. Comparing both approaches with the initial run (without feature selection) suggests that there is still room to improve the feature selection. Let's try permutation feature importance and compare its accuracy with the Univariate and Chi2 approaches.

Accuracy of ANN (the best estimator) using all features, before applying permutation importance: R2=0.56 (see the cell near the beginning of the notebook). Let's drop the following features,
chosen on the basis of the permutation importance (PI), and check the accuracy with the remaining features: ['ntotncsbottom', 'SOMbottom', 'FCbottom', 'octop', 'znbottom', 'ececfbottom', 'mgbottom']
* ANN important features: 59 (the features retained after RFE using permutation importance)

In [None]:
#Predictors kept after removing the features that the ANN permutation
#importance marked as non-important, plus the target column name.
#The list below holds 57 names; their order defines the column order of the
#importance tables written later.
xFeatures = [
    'B030', 'Cu030', 'K030', 'Mn030', 'N030', 'Na030', 'P030', 'Ptot030',
    'altop', 'albottom', 'bdr', 'ctottop', 'ctotbottom', 'cabottom',
    'claytotpsatop', 'dbodtop', 'dbodbottom', 'ececftop',
    'fetop', 'febottom', 'ktop', 'kbottom', 'mgtop', 'ntotncstop',
    'ocbottom', 'ptop', 'pbottom', 'phh2otop', 'phh2obottom',
    'stop', 'sbottom', 'sandtotpsatop', 'sandtotpsabottom',
    'silttotpsatop', 'silttotpsabottom', 'wpg2top', 'wpg2bottom', 'zntop',
    'PWPtop', 'PWPbottom', 'FCtop', 'SWStop', 'SWSbottom',
    'treat2', 'DEM', 'slope', 'TPI', 'TRI', 'tr', 'di', 'nrd',
    'tmean', 'tmin', 'tmax', 'Nrate', 'Prate', 'Krate',
]

#Independent copy of the same names, used by the testing cell
xFeaturesML = xFeatures.copy()

yFeatur = 'yield'

Work space

In [None]:
# Workspace for the permutation-importance run.
# NOTE(review): inFile holds a folder path and is not referenced by any later cell in this notebook.
inFile = 'D:/ICARDA/TT2/TestingByControl_1/Final merged file/Kheir_Codes and results for building ML to predict potato in Rwanda/Feature selection techniques_basedRFE'
# Folder that receives the plots and Excel files written by the training cell below
outFolder = 'D:/ICARDA/TT2/TestingByControl_1/Final merged file/Kheir_Codes and results for building ML to predict potato in Rwanda/Feature selection techniques_basedRFE/ANNFPITraining'

In [None]:
###Default training (75 % of the dataset for training, 25 % for testing) after dropping the non-important features based on permutation importance
df = pd.read_csv('D:/ICARDA/TT2/TestingByControl_1/Final merged file/Kheir_Codes and results for building ML to predict potato in Rwanda/Feature selection techniques_basedRFE/Cleaned3.csv')
df = df.dropna()
df = df.reset_index(drop=True)

Xdf = df[xFeatures]
ydf = df[[yFeatur]]

# Mean and sample standard deviation (ddof=1) of the raw yield, kept as plain
# floats to convert standardised yields back to t/ha further down.
yMean = float(ydf[yFeatur].mean())
yStd  = float(ydf[yFeatur].std())

# z-score standardisation of predictors and target, then drop incomplete rows
X = (Xdf - Xdf.mean()) / Xdf.std()
y = (ydf - ydf.mean()) / ydf.std()
df = pd.concat([X, y], axis = 1)
df = df.dropna()
X = df[xFeatures]
y = df[[yFeatur]]
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.25, random_state=0)

n = len(xFeatures)
MLA = hyperparameters(n)
for mlaL, (estimator, param_grid) in MLA.items():
  # 5-fold grid search scored by R2; refit=True retrains the best setting on all training rows
  GSCV = model_selection.GridSearchCV(estimator, param_grid, scoring='r2', cv=5, refit=True, verbose=False, n_jobs=-1)
  GSCV.fit(X_train, y_train.values.ravel())   # 1-D target, as the regressors expect
  bestEstimator = GSCV.best_estimator_

  # Hold-out predictions, converted back from z-scores to yield units
  OriginalYield_test  = (yStd * y_test.values.flatten()) + yMean
  PredictedYield_test = (yStd * bestEstimator.predict(X_test).flatten()) + yMean

  outFile     = os.path.join(outFolder, '{}_plot_DefaultML_{}.png'.format(mlaL, n))
  Stats       = os.path.join(outFolder, '{}_Stats_DefaultML_{}.xlsx'.format(mlaL, n))
  Importances = os.path.join(outFolder, '{}_Importances_DefaultML_{}.xlsx'.format(mlaL, n))

  plotOriginalPredicted(OriginalYield_test, PredictedYield_test, outFile, label='Predicted yield of {}'.format(mlaL))
  df_stats = performanceStatistics(OriginalYield_test, PredictedYield_test)
  print(mlaL, df_stats)

  # Append the cross-validation record of the best-ranked parameter set to the
  # statistics. The columns are iterated directly, without a list(...) call.
  df_GSCV = pd.DataFrame(GSCV.cv_results_)
  df_GSCV = df_GSCV.sort_values(by='rank_test_score', ascending=True).head(1)
  for col in df_GSCV.columns:
    df_stats[col] = df_GSCV[col].values.tolist()[0]
  df_stats = pd.DataFrame.from_dict(df_stats, orient='index').T
  df_stats.to_excel(Stats, index=False)

  # Permutation importance on the training rows, expressed as a percentage of the total
  pi = inspection.permutation_importance(bestEstimator, X_train, y_train.values.ravel(), n_jobs=-1, random_state=0).importances_mean
  pi = (pi / pi.sum()) * 100
  dfImportances = pd.DataFrame(data=[pi], columns=xFeatures).round(2)
  dfImportances.to_excel(Importances, index=False)

R2 = 0.85, lets test by Control treatment


In [None]:
##OutFolder2: separate folder for the outputs of testing
outFolder = 'D:/ICARDA/TT2/TestingByControl_1/Final merged file/Kheir_Codes and results for building ML to predict potato in Rwanda/Feature selection techniques_basedRFE/ANNFPITesting'
#Test by Control treatment
group = 'ML'   # label used in the output file names
print(group)

# One pass only, on freshly read data (the earlier loop ran the identical
# preparation twice).
df = pd.read_csv('D:/ICARDA/TT2/TestingByControl_1/Final merged file/Kheir_Codes and results for building ML to predict potato in Rwanda/Feature selection techniques_basedRFE/Cleaned3.csv')
df = df.dropna()
df = df.reset_index(drop=True)

xFeatures = xFeaturesML
Xdf = df[xFeatures]
ydf = df[[yFeatur]]
yMean = float(ydf[yFeatur].mean())
yStd  = float(ydf[yFeatur].std())

# Standardise with pandas mean / sample std (ddof=1), the convention of the
# training cell. Using the same scale for the inputs and for the back-transform
# means the observed yields are recovered exactly.
usedCols = xFeatures + [yFeatur]
df[usedCols] = (df[usedCols] - df[usedCols].mean()) / df[usedCols].std()

testtreat = ['Control'] # random.sample(list(df.treat.unique()), 1)
print(testtreat)

test  = df[df['treat'].isin(testtreat)]
X_test  = test[xFeatures]
y_test  = test[[yFeatur]]

#Test by treat(Control) and best estimator
# GSCV and mlaL are the grid search and model label left by the last
# iteration of the training cell.
n = len(xFeatures)
bestEstimator = GSCV.best_estimator_

OriginalYield_test  = (yStd * y_test.values.flatten()) + yMean
PredictedYield_test = (yStd * bestEstimator.predict(X_test).flatten()) + yMean

yieldFile   = os.path.join(outFolder, '{}_yield_{}_{}.xlsx'.format(mlaL, group, n))
plot        = os.path.join(outFolder, '{}_plot_{}_{}.png'.format(mlaL, group, n))
Stats       = os.path.join(outFolder, '{}_Stats_{}_{}.xlsx'.format(mlaL, group, n))
Importances = os.path.join(outFolder, '{}_Importances_{}_{}.xlsx'.format(mlaL, group, n))

dfYield = pd.DataFrame({'Original Yield':OriginalYield_test.tolist(), 'Predicted Yield':PredictedYield_test.tolist()})
dfYield.to_excel(yieldFile, index=False)

plotOriginalPredicted(OriginalYield_test, PredictedYield_test, plot, label='Predicted yield of {}'.format(mlaL))
df_stats = performanceStatistics(OriginalYield_test, PredictedYield_test)
print(mlaL, df_stats)

# Append the cross-validation record of the best-ranked parameter set. Only
# this copy step belongs inside the loop; the export and the permutation
# importance run once afterwards.
df_GSCV = pd.DataFrame(GSCV.cv_results_)
df_GSCV = df_GSCV.sort_values(by='rank_test_score', ascending=True).head(1)
for col in df_GSCV.columns:
  df_stats[col] = df_GSCV[col].values.tolist()[0]
df_stats = pd.DataFrame.from_dict(df_stats, orient='index').T
df_stats.to_excel(Stats, index=False)

# Permutation importance on the Control rows, as a percentage of the total
pi = inspection.permutation_importance(bestEstimator, X_test, y_test.values.ravel(), n_jobs=-1, random_state=0).importances_mean
pi = (pi / pi.sum()) * 100
dfImportances = pd.DataFrame(data=[pi], columns=xFeatures).round(2)
dfImportances.to_excel(Importances, index=False)

Permutation importance from ANN gave the best accuracy when testing on the Control treatment (R2=0.78), compared with Univariate (R2=0.72) and Chi2 (R2=0.65). Therefore, for the current dataset and region conditions, the best estimator of yield is ANN and the best feature selection approach is permutation importance from ANN. Let's use this script in the response function of spatial N, P, and K.