# **Loading Dataset**

In [1]:
import pandas as pd
# Read the fingerprint table: PubchemFP* columns plus the pIC50 target
b1 = pd.read_csv(
    filepath_or_buffer='Beta_Secretase_06_bioactivity_data_3class_pIC50_pubchem_fp.csv',
)
b1

Unnamed: 0,PubchemFP0,PubchemFP1,PubchemFP2,PubchemFP3,PubchemFP4,PubchemFP5,PubchemFP6,PubchemFP7,PubchemFP8,PubchemFP9,...,PubchemFP872,PubchemFP873,PubchemFP874,PubchemFP875,PubchemFP876,PubchemFP877,PubchemFP878,PubchemFP879,PubchemFP880,pIC50
0,1,1,1,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,6.384050
1,1,1,1,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,8.698970
2,1,1,1,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,6.337242
3,1,1,1,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,5.045757
4,1,1,1,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,5.251812
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7057,1,1,1,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,4.480000
7058,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,4.070000
7059,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,4.540000
7060,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,4.260000


In [2]:
# Feature matrix = everything except the target.
# The CSV also carries an 'Unnamed: 0' column, which appears to be the row
# index saved by an earlier to_csv(). It is a row counter, not a molecular
# descriptor; leaving it in lets it survive the variance filter and dominate
# distance/kernel-based models, so it is removed here.
# errors='ignore' keeps this cell working if the file is later loaded with
# index_col=0 and the column no longer exists.
X = b1.drop(columns='pIC50').drop(columns='Unnamed: 0', errors='ignore')
X

Unnamed: 0,PubchemFP0,PubchemFP1,PubchemFP2,PubchemFP3,PubchemFP4,PubchemFP5,PubchemFP6,PubchemFP7,PubchemFP8,PubchemFP9,...,PubchemFP871,PubchemFP872,PubchemFP873,PubchemFP874,PubchemFP875,PubchemFP876,PubchemFP877,PubchemFP878,PubchemFP879,PubchemFP880
0,1,1,1,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,1,1,1,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,1,1,1,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,1,1,1,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,1,1,1,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7057,1,1,1,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
7058,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
7059,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
7060,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# Target vector: an independent copy of the pIC50 column
Y = b1.loc[:, 'pIC50'].copy()
Y

0       6.384050
1       8.698970
2       6.337242
3       5.045757
4       5.251812
          ...   
7057    4.480000
7058    4.070000
7059    4.540000
7060    4.260000
7061    4.510000
Name: pIC50, Length: 7062, dtype: float64

In [4]:
# Remove low variance features
from sklearn.feature_selection import VarianceThreshold

# Fit the variance filter (threshold 0.1) on X, then replace X with the
# reduced array that the selector returns
selection = VarianceThreshold(threshold=0.1)
selection.fit(X)
X = selection.transform(X)
X.shape

(7062, 218)

# **Building Regression models**

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import explained_variance_score
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

# **Random forest**

In [10]:
from sklearn.ensemble import RandomForestRegressor

# Seeds for the repeated 80/20 splits. Each seed is also handed to the forest
# so that re-running this cell reproduces the same metrics.
list_seed_number = [42,43,44,45,46,47,48,49,50,51]

# Per-split metric accumulators (one value appended per seed)
list_rf_train_explained_variance_score = []
list_rf_test_explained_variance_score = []
list_rf_train_r2 = []
list_rf_test_r2 = []
list_rf_train_mse = []
list_rf_test_mse = []

for i in list_seed_number:
  # Data split
  X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=i)

  # Define regressor. random_state fixes the forest's internal randomness
  # (bootstrap resampling), which was previously left unseeded.
  rf = RandomForestRegressor(n_estimators=10, random_state=i)

  # Train model
  rf.fit(X_train, Y_train)

  # Make predictions
  Y_train_pred = rf.predict(X_train)
  Y_test_pred = rf.predict(X_test)

  # Training set performance
  rf_train_explained_variance_score = explained_variance_score(Y_train, Y_train_pred) # Explained variance score
  rf_train_r2 = r2_score(Y_train, Y_train_pred) # R2
  rf_train_mse = mean_squared_error(Y_train, Y_train_pred) # Mean squared error (MSE)

  # Test set performance
  rf_test_explained_variance_score = explained_variance_score(Y_test, Y_test_pred) # Explained variance score
  rf_test_r2 = r2_score(Y_test, Y_test_pred) # R2
  rf_test_mse = mean_squared_error(Y_test, Y_test_pred) # Mean squared error (MSE)

  # Saving results to list
  list_rf_train_explained_variance_score.append(rf_train_explained_variance_score)
  list_rf_test_explained_variance_score.append(rf_test_explained_variance_score)
  list_rf_train_r2.append(rf_train_r2)
  list_rf_test_r2.append(rf_test_r2)
  list_rf_train_mse.append(rf_train_mse)
  list_rf_test_mse.append(rf_test_mse)

In [87]:
# Rebind the seed list and each metric list to a pandas Series in one step
(list_seed_number,
 list_rf_train_explained_variance_score,
 list_rf_test_explained_variance_score,
 list_rf_train_r2,
 list_rf_test_r2,
 list_rf_train_mse,
 list_rf_test_mse) = map(pd.Series, (
    list_seed_number,
    list_rf_train_explained_variance_score,
    list_rf_test_explained_variance_score,
    list_rf_train_r2,
    list_rf_test_r2,
    list_rf_train_mse,
    list_rf_test_mse,
))

# Place the Series side by side; `keys` supplies the column labels
df_rf = pd.concat(
    [list_seed_number,
     list_rf_train_explained_variance_score,
     list_rf_test_explained_variance_score,
     list_rf_train_r2,
     list_rf_test_r2,
     list_rf_train_mse,
     list_rf_test_mse],
    axis=1,
    keys=['Seed No.','Train EVS','Test EVS','Train R2','Test R2','Train MSE','Test MSE'],
)
df_rf.index.name = "RF Model Performance"

# Persist the per-split table
df_rf.to_csv('RF_10-iterations_results.csv')

In [75]:
# Summarise the per-split table: mean and standard deviation of every column
list_column_mean = pd.Series([df_rf[col].mean() for col in df_rf.columns])
list_column_SD = pd.Series([df_rf[col].std() for col in df_rf.columns])

# Two-column frame: column 0 holds the means, column 1 the standard deviations
df_rf_mean_sd = pd.concat([list_column_mean, list_column_SD], axis=1)

# Row 0 summarises 'Seed No.', which is not a metric, so discard it
df_rf_mean_sd_results = df_rf_mean_sd.drop(index=0)

# Label the statistics and the metric rows
df_rf_mean_sd_results.columns = ['Mean', 'SD']
df_rf_mean_sd_results.index = pd.Index(
    ['Train EVS','Test EVS','Train R2','Test R2','Train MSE','Test MSE'],
    name="RF Model Performance",
)

# Show metrics as columns and Mean/SD as rows
df_rf_mean_sd_results.T

RF Model Performance,Train EVS,Test EVS,Train R2,Test R2,Train MSE,Test MSE
Mean,0.822891,0.357298,0.822875,0.35697,0.342557,1.233654
SD,0.003522,0.028141,0.003521,0.028092,0.007299,0.039734


# **K nearest neighbors**

In [14]:
from sklearn.neighbors import KNeighborsRegressor

# Seeds for the ten repeated 80/20 train/test splits
list_seed_number = [42,43,44,45,46,47,48,49,50,51]

# One accumulator per metric; each receives one value per seed
list_knn_train_explained_variance_score = []
list_knn_test_explained_variance_score = []
list_knn_train_r2 = []
list_knn_test_r2 = []
list_knn_train_mse = []
list_knn_test_mse = []

for seed in list_seed_number:
  # Split the data with this iteration's seed
  X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=seed)

  # Build and fit a 3-nearest-neighbour regressor
  knn = KNeighborsRegressor(n_neighbors=3)
  knn.fit(X_train, Y_train)

  # Predict on both partitions
  Y_train_pred = knn.predict(X_train)
  Y_test_pred = knn.predict(X_test)

  # Explained variance score
  list_knn_train_explained_variance_score.append(explained_variance_score(Y_train, Y_train_pred))
  list_knn_test_explained_variance_score.append(explained_variance_score(Y_test, Y_test_pred))

  # R2
  list_knn_train_r2.append(r2_score(Y_train, Y_train_pred))
  list_knn_test_r2.append(r2_score(Y_test, Y_test_pred))

  # Mean squared error (MSE)
  list_knn_train_mse.append(mean_squared_error(Y_train, Y_train_pred))
  list_knn_test_mse.append(mean_squared_error(Y_test, Y_test_pred))

In [86]:
# Rebind the seed list and each metric list to a pandas Series in one step
(list_seed_number,
 list_knn_train_explained_variance_score,
 list_knn_test_explained_variance_score,
 list_knn_train_r2,
 list_knn_test_r2,
 list_knn_train_mse,
 list_knn_test_mse) = map(pd.Series, (
    list_seed_number,
    list_knn_train_explained_variance_score,
    list_knn_test_explained_variance_score,
    list_knn_train_r2,
    list_knn_test_r2,
    list_knn_train_mse,
    list_knn_test_mse,
))

# Place the Series side by side; `keys` supplies the column labels
df_knn = pd.concat(
    [list_seed_number,
     list_knn_train_explained_variance_score,
     list_knn_test_explained_variance_score,
     list_knn_train_r2,
     list_knn_test_r2,
     list_knn_train_mse,
     list_knn_test_mse],
    axis=1,
    keys=['Seed No.','Train EVS','Test EVS','Train R2','Test R2','Train MSE','Test MSE'],
)
df_knn.index.name = "KNN Model Performance"

# Persist the per-split table
df_knn.to_csv('KNN_10-iterations_results.csv')

In [74]:
# Summarise the per-split table: mean and standard deviation of every column
list_column_mean = pd.Series([df_knn[col].mean() for col in df_knn.columns])
list_column_SD = pd.Series([df_knn[col].std() for col in df_knn.columns])

# Two-column frame: column 0 holds the means, column 1 the standard deviations
df_knn_mean_sd = pd.concat([list_column_mean, list_column_SD], axis=1)

# Row 0 summarises 'Seed No.', which is not a metric, so discard it
df_knn_mean_sd_results = df_knn_mean_sd.drop(index=0)

# Label the statistics and the metric rows
df_knn_mean_sd_results.columns = ['Mean', 'SD']
df_knn_mean_sd_results.index = pd.Index(
    ['Train EVS','Test EVS','Train R2','Test R2','Train MSE','Test MSE'],
    name="KNN Model Performance",
)

# Show metrics as columns and Mean/SD as rows
df_knn_mean_sd_results.T

KNN Model Performance,Train EVS,Test EVS,Train R2,Test R2,Train MSE,Test MSE
Mean,0.669631,0.337279,0.669543,0.336797,0.639085,1.272405
SD,0.006232,0.029187,0.006247,0.029111,0.012128,0.042959


# **Support vector machine (Radial basis function kernel)**

In [19]:
from sklearn.svm import SVR

# Seeds for the ten repeated 80/20 train/test splits
list_seed_number = [42,43,44,45,46,47,48,49,50,51]

# One accumulator per metric; each receives one value per seed
list_svm_rbf_train_explained_variance_score = []
list_svm_rbf_test_explained_variance_score = []
list_svm_rbf_train_r2 = []
list_svm_rbf_test_r2 = []
list_svm_rbf_train_mse = []
list_svm_rbf_test_mse = []

for seed in list_seed_number:
  # Split the data with this iteration's seed
  X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=seed)

  # Build and fit the support vector regressor
  svm_rbf = SVR(C=1, gamma=2)
  svm_rbf.fit(X_train, Y_train)

  # Predict on both partitions
  Y_train_pred = svm_rbf.predict(X_train)
  Y_test_pred = svm_rbf.predict(X_test)

  # Explained variance score
  list_svm_rbf_train_explained_variance_score.append(explained_variance_score(Y_train, Y_train_pred))
  list_svm_rbf_test_explained_variance_score.append(explained_variance_score(Y_test, Y_test_pred))

  # R2
  list_svm_rbf_train_r2.append(r2_score(Y_train, Y_train_pred))
  list_svm_rbf_test_r2.append(r2_score(Y_test, Y_test_pred))

  # Mean squared error (MSE)
  list_svm_rbf_train_mse.append(mean_squared_error(Y_train, Y_train_pred))
  list_svm_rbf_test_mse.append(mean_squared_error(Y_test, Y_test_pred))

In [85]:
# Rebind the seed list and each metric list to a pandas Series in one step
(list_seed_number,
 list_svm_rbf_train_explained_variance_score,
 list_svm_rbf_test_explained_variance_score,
 list_svm_rbf_train_r2,
 list_svm_rbf_test_r2,
 list_svm_rbf_train_mse,
 list_svm_rbf_test_mse) = map(pd.Series, (
    list_seed_number,
    list_svm_rbf_train_explained_variance_score,
    list_svm_rbf_test_explained_variance_score,
    list_svm_rbf_train_r2,
    list_svm_rbf_test_r2,
    list_svm_rbf_train_mse,
    list_svm_rbf_test_mse,
))

# Place the Series side by side; `keys` supplies the column labels
df_svm_rbf = pd.concat(
    [list_seed_number,
     list_svm_rbf_train_explained_variance_score,
     list_svm_rbf_test_explained_variance_score,
     list_svm_rbf_train_r2,
     list_svm_rbf_test_r2,
     list_svm_rbf_train_mse,
     list_svm_rbf_test_mse],
    axis=1,
    keys=['Seed No.','Train EVS','Test EVS','Train R2','Test R2','Train MSE','Test MSE'],
)
df_svm_rbf.index.name = "SVM RBF Model Performance"

# Persist the per-split table
df_svm_rbf.to_csv('SVM_RBF_10-iterations_results.csv')

In [73]:
# Finalizing Results: mean and standard deviation of every column of the
# SVM RBF per-split table.
# Iterate over df_svm_rbf's own columns (previously df_knn.columns), so this
# cell no longer fails or silently depends on the KNN section having run.
list_column_mean = pd.Series([df_svm_rbf[col].mean() for col in df_svm_rbf.columns])
list_column_SD = pd.Series([df_svm_rbf[col].std() for col in df_svm_rbf.columns])

# Two-column frame: column 0 holds the means, column 1 the standard deviations
df_svm_rbf_mean_sd = pd.concat([list_column_mean,list_column_SD], axis=1)

# Deletes the first row (corresponding to the Seed No.)
df_svm_rbf_mean_sd_results = df_svm_rbf_mean_sd.drop(0, axis=0)

# Update the column and row names
df_svm_rbf_mean_sd_results.columns = ['Mean', 'SD']
df_svm_rbf_mean_sd_results.index = ['Train EVS','Test EVS','Train R2','Test R2','Train MSE','Test MSE']
df_svm_rbf_mean_sd_results.index.name = "SVM RBF Model Performance"

# Transposing the data frame
df_svm_rbf_mean_sd_results.T

SVM RBF Model Performance,Train EVS,Test EVS,Train R2,Test R2,Train MSE,Test MSE
Mean,0.714474,0.139323,0.713074,0.137582,0.554924,1.656089
SD,0.004155,0.016336,0.004309,0.015534,0.010074,0.063723


# **Decision tree**

In [22]:
from sklearn.tree import DecisionTreeRegressor

# Seeds for the repeated 80/20 splits. Each seed is also handed to the tree
# so that re-running this cell reproduces the same metrics.
list_seed_number = [42,43,44,45,46,47,48,49,50,51]

# Per-split metric accumulators (one value appended per seed)
list_dt_train_explained_variance_score = []
list_dt_test_explained_variance_score = []
list_dt_train_r2 = []
list_dt_test_r2 = []
list_dt_train_mse = []
list_dt_test_mse = []

for i in list_seed_number:
  # Data split
  X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=i)

  # Define regressor. random_state makes tie-breaking between equally good
  # splits deterministic; it was previously left unseeded.
  dt = DecisionTreeRegressor(max_depth=5, random_state=i)

  # Train model
  dt.fit(X_train, Y_train)

  # Make predictions
  Y_train_pred = dt.predict(X_train)
  Y_test_pred = dt.predict(X_test)

  # Training set performance
  dt_train_explained_variance_score = explained_variance_score(Y_train, Y_train_pred) # Explained variance score
  dt_train_r2 = r2_score(Y_train, Y_train_pred) # R2
  dt_train_mse = mean_squared_error(Y_train, Y_train_pred) # Mean squared error (MSE)

  # Test set performance
  dt_test_explained_variance_score = explained_variance_score(Y_test, Y_test_pred) # Explained variance score
  dt_test_r2 = r2_score(Y_test, Y_test_pred) # R2
  dt_test_mse = mean_squared_error(Y_test, Y_test_pred) # Mean squared error (MSE)

  # Saving results to list
  list_dt_train_explained_variance_score.append(dt_train_explained_variance_score)
  list_dt_test_explained_variance_score.append(dt_test_explained_variance_score)
  list_dt_train_r2.append(dt_train_r2)
  list_dt_test_r2.append(dt_test_r2)
  list_dt_train_mse.append(dt_train_mse)
  list_dt_test_mse.append(dt_test_mse)

In [84]:
# Rebind the seed list and each metric list to a pandas Series in one step
(list_seed_number,
 list_dt_train_explained_variance_score,
 list_dt_test_explained_variance_score,
 list_dt_train_r2,
 list_dt_test_r2,
 list_dt_train_mse,
 list_dt_test_mse) = map(pd.Series, (
    list_seed_number,
    list_dt_train_explained_variance_score,
    list_dt_test_explained_variance_score,
    list_dt_train_r2,
    list_dt_test_r2,
    list_dt_train_mse,
    list_dt_test_mse,
))

# Place the Series side by side; `keys` supplies the column labels
df_dt = pd.concat(
    [list_seed_number,
     list_dt_train_explained_variance_score,
     list_dt_test_explained_variance_score,
     list_dt_train_r2,
     list_dt_test_r2,
     list_dt_train_mse,
     list_dt_test_mse],
    axis=1,
    keys=['Seed No.','Train EVS','Test EVS','Train R2','Test R2','Train MSE','Test MSE'],
)
df_dt.index.name = "DT Model Performance"

# Persist the per-split table
df_dt.to_csv('DT_10-iterations_results.csv')

In [72]:
# Finalizing Results: mean and standard deviation of every column of the
# decision-tree per-split table.
# Iterate over df_dt's own columns (previously df_rf.columns), so this cell
# no longer fails or silently depends on the random-forest section having run.
list_column_mean = pd.Series([df_dt[col].mean() for col in df_dt.columns])
list_column_SD = pd.Series([df_dt[col].std() for col in df_dt.columns])

# Two-column frame: column 0 holds the means, column 1 the standard deviations
df_dt_mean_sd = pd.concat([list_column_mean,list_column_SD], axis=1)

# Deletes the first row (corresponding to the Seed No.)
df_dt_mean_sd_results = df_dt_mean_sd.drop(0, axis=0)

# Update the column and row names
df_dt_mean_sd_results.columns = ['Mean', 'SD']
df_dt_mean_sd_results.index = ['Train EVS','Test EVS','Train R2','Test R2','Train MSE','Test MSE']
df_dt_mean_sd_results.index.name = "DT Model Performance"

# Transposing the data frame
df_dt_mean_sd_results.T

DT Model Performance,Train EVS,Test EVS,Train R2,Test R2,Train MSE,Test MSE
Mean,0.319745,0.259379,0.319745,0.258618,1.315569,1.422395
SD,0.004858,0.026387,0.004858,0.027182,0.009059,0.032522


# **Neural network**

In [25]:
from sklearn.neural_network import MLPRegressor

# Seeds for the repeated 80/20 splits. Each seed is also handed to the network
# so that re-running this cell reproduces the same metrics.
list_seed_number = [42,43,44,45,46,47,48,49,50,51]

# Per-split metric accumulators (one value appended per seed)
list_mlp_train_explained_variance_score = []
list_mlp_test_explained_variance_score = []
list_mlp_train_r2 = []
list_mlp_test_r2 = []
list_mlp_train_mse = []
list_mlp_test_mse = []

for i in list_seed_number:
  # Data split
  X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=i)

  # Define regressor. random_state fixes weight initialisation and batch
  # sampling, which were previously left unseeded.
  mlp = MLPRegressor(alpha=1, max_iter=1000, random_state=i)

  # Train model
  mlp.fit(X_train, Y_train)

  # Make predictions
  Y_train_pred = mlp.predict(X_train)
  Y_test_pred = mlp.predict(X_test)

  # Training set performance
  mlp_train_explained_variance_score = explained_variance_score(Y_train, Y_train_pred) # Explained variance score
  mlp_train_r2 = r2_score(Y_train, Y_train_pred) # R2
  mlp_train_mse = mean_squared_error(Y_train, Y_train_pred) # Mean squared error (MSE)

  # Test set performance
  mlp_test_explained_variance_score = explained_variance_score(Y_test, Y_test_pred) # Explained variance score
  mlp_test_r2 = r2_score(Y_test, Y_test_pred) # R2
  mlp_test_mse = mean_squared_error(Y_test, Y_test_pred) # Mean squared error (MSE)

  # Saving results to list
  list_mlp_train_explained_variance_score.append(mlp_train_explained_variance_score)
  list_mlp_test_explained_variance_score.append(mlp_test_explained_variance_score)
  list_mlp_train_r2.append(mlp_train_r2)
  list_mlp_test_r2.append(mlp_test_r2)
  list_mlp_train_mse.append(mlp_train_mse)
  list_mlp_test_mse.append(mlp_test_mse)

In [88]:
# Rebind the seed list and each metric list to a pandas Series in one step
(list_seed_number,
 list_mlp_train_explained_variance_score,
 list_mlp_test_explained_variance_score,
 list_mlp_train_r2,
 list_mlp_test_r2,
 list_mlp_train_mse,
 list_mlp_test_mse) = map(pd.Series, (
    list_seed_number,
    list_mlp_train_explained_variance_score,
    list_mlp_test_explained_variance_score,
    list_mlp_train_r2,
    list_mlp_test_r2,
    list_mlp_train_mse,
    list_mlp_test_mse,
))

# Place the Series side by side; `keys` supplies the column labels
df_mlp = pd.concat(
    [list_seed_number,
     list_mlp_train_explained_variance_score,
     list_mlp_test_explained_variance_score,
     list_mlp_train_r2,
     list_mlp_test_r2,
     list_mlp_train_mse,
     list_mlp_test_mse],
    axis=1,
    keys=['Seed No.','Train EVS','Test EVS','Train R2','Test R2','Train MSE','Test MSE'],
)
df_mlp.index.name = "MLP Model Performance"

# Persist the per-split table
df_mlp.to_csv('MLP_10-iterations_results.csv')

In [71]:
# Finalizing Results: mean and standard deviation of every column of the
# MLP per-split table.
# Iterate over df_mlp's own columns (previously df_knn.columns), so this cell
# no longer fails or silently depends on the KNN section having run.
list_column_mean = pd.Series([df_mlp[col].mean() for col in df_mlp.columns])
list_column_SD = pd.Series([df_mlp[col].std() for col in df_mlp.columns])

# Two-column frame: column 0 holds the means, column 1 the standard deviations
df_mlp_mean_sd = pd.concat([list_column_mean,list_column_SD], axis=1)

# Deletes the first row (corresponding to the Seed No.)
df_mlp_mean_sd_results = df_mlp_mean_sd.drop(0, axis=0)

# Update the column and row names
df_mlp_mean_sd_results.columns = ['Mean', 'SD']
df_mlp_mean_sd_results.index = ['Train EVS','Test EVS','Train R2','Test R2','Train MSE','Test MSE']
df_mlp_mean_sd_results.index.name = "MLP Model Performance"

# Transposing the data frame
df_mlp_mean_sd_results.T

MLP Model Performance,Train EVS,Test EVS,Train R2,Test R2,Train MSE,Test MSE
Mean,0.627364,0.40921,0.623378,0.406902,0.728396,1.137988
SD,0.016733,0.021896,0.019973,0.023741,0.039432,0.03476


# **Stacked model**

In [28]:
# Define estimators
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression
# The base learners are now constructed in this cell, so their classes are
# imported here as well (matching the notebook's per-cell import style).
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

# Create a list of seed number to use for the iteration
list_seed_number = [42,43,44,45,46,47,48,49,50,51]

# Create empty list for later adding results from the multiple data splits
list_stack_train_explained_variance_score = []
list_stack_test_explained_variance_score = []
list_stack_train_r2 = []
list_stack_test_r2 = []
list_stack_train_mse = []
list_stack_test_mse = []

for i in list_seed_number:
  # Data split
  X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=i)

  # Fresh base learners with the same hyperparameters as the individual model
  # sections. Building them here, instead of reusing whatever objects the
  # earlier cells left bound to rf/knn/svm_rbf/dt/mlp, removes the dependence
  # on kernel state, and seeding the stochastic ones makes the run repeatable.
  estimator_list = [
    ('rf', RandomForestRegressor(n_estimators=10, random_state=i)),
    ('knn', KNeighborsRegressor(3)),
    ('svm_rbf', SVR(gamma=2, C=1)),
    ('dt', DecisionTreeRegressor(max_depth=5, random_state=i)),
    ('mlp', MLPRegressor(alpha=1, max_iter=1000, random_state=i)) ]

  # Build stack model: base-learner predictions feed a linear regression
  stack_model = StackingRegressor(
  estimators=estimator_list, final_estimator=LinearRegression()
  )

  # Train stacked model
  stack_model.fit(X_train, Y_train)

  # Make predictions
  Y_train_pred = stack_model.predict(X_train)
  Y_test_pred = stack_model.predict(X_test)

  # Training set performance
  stack_model_train_explained_variance_score = explained_variance_score(Y_train, Y_train_pred) # Calculate Explained Variance Score
  stack_model_train_r2 = r2_score(Y_train, Y_train_pred) # Calculate R^2
  stack_model_train_mse = mean_squared_error(Y_train, Y_train_pred) # Calculate MSE Score

  # Test set performance
  stack_model_test_explained_variance_score = explained_variance_score(Y_test, Y_test_pred) # Calculate Explained Variance Score
  stack_model_test_r2 = r2_score(Y_test, Y_test_pred) # Calculate R^2
  stack_model_test_mse = mean_squared_error(Y_test, Y_test_pred) # Calculate MSE Score

  # Saving results to list
  list_stack_train_explained_variance_score.append(stack_model_train_explained_variance_score)
  list_stack_test_explained_variance_score.append(stack_model_test_explained_variance_score)
  list_stack_train_r2.append(stack_model_train_r2)
  list_stack_test_r2.append(stack_model_test_r2)
  list_stack_train_mse.append(stack_model_train_mse)
  list_stack_test_mse.append(stack_model_test_mse)

In [91]:
# Rebind the seed list and each metric list to a pandas Series in one step
(list_seed_number,
 list_stack_train_explained_variance_score,
 list_stack_test_explained_variance_score,
 list_stack_train_r2,
 list_stack_test_r2,
 list_stack_train_mse,
 list_stack_test_mse) = map(pd.Series, (
    list_seed_number,
    list_stack_train_explained_variance_score,
    list_stack_test_explained_variance_score,
    list_stack_train_r2,
    list_stack_test_r2,
    list_stack_train_mse,
    list_stack_test_mse,
))

# Place the Series side by side; `keys` supplies the column labels
df_stack = pd.concat(
    [list_seed_number,
     list_stack_train_explained_variance_score,
     list_stack_test_explained_variance_score,
     list_stack_train_r2,
     list_stack_test_r2,
     list_stack_train_mse,
     list_stack_test_mse],
    axis=1,
    keys=['Seed No.','Train EVS','Test EVS','Train R2','Test R2','Train MSE','Test MSE'],
)
df_stack.index.name = "Stacked Models Performance"

# Persist the per-split table
df_stack.to_csv('Stacked_Models_10-iterations_results.csv')

In [111]:
# Finalizing Results: mean and standard deviation of every column of the
# stacked-model per-split table.
# Iterate over df_stack's own columns (previously df_knn.columns), so this
# cell no longer fails or silently depends on the KNN section having run.
list_column_mean = pd.Series([df_stack[col].mean() for col in df_stack.columns])
list_column_SD = pd.Series([df_stack[col].std() for col in df_stack.columns])

# Two-column frame: column 0 holds the means, column 1 the standard deviations
df_stack_mean_sd = pd.concat([list_column_mean,list_column_SD], axis=1)

# Deletes the first row (corresponding to the Seed No.)
df_stack_mean_sd_results = df_stack_mean_sd.drop(0, axis=0)

# Update the column and row names
df_stack_mean_sd_results.columns = ['Mean', 'SD']
df_stack_mean_sd_results.index = ['Train EVS','Test EVS','Train R2','Test R2','Train MSE','Test MSE']
df_stack_mean_sd_results.index.name = "Stacked Models Performance"

# Transposing the data frame
df_stack_mean_sd_results.T

Stacked Models Performance,Train EVS,Test EVS,Train R2,Test R2,Train MSE,Test MSE
Mean,0.68981,0.439069,0.68848,0.437689,0.602404,1.07913
SD,0.012224,0.017432,0.012229,0.017412,0.021957,0.030026
