# **Financial Inclusion in Africa**

The objective of this notebook is to create a machine learning model to predict which individuals are most likely to have or use a bank account. The models and solutions developed can provide an indication of the state of financial inclusion in Kenya, Rwanda, Tanzania and Uganda.

# **BLUF (Bottom Line Up Front)**

After running the various classifier models, the LightGBM model had the highest AUC (80.9%), followed closely by the XGBoost model with an AUC of 80.7%. Our models were compared with the output of a Driverless AI model developed by [H2O.ai](https://h2o.ai/). The comparison can be found in our written report.

# **Libraries**

In [None]:
# Basic libraries
import pandas as pd
import seaborn as sns
import numpy as np
from numpy import unique
from numpy import where
from numpy import mean
from numpy import std
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from matplotlib import pyplot
import scipy.stats as ss
import math
from scipy.stats.mstats import winsorize

In [None]:
# sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn import metrics
from sklearn import linear_model
from sklearn.linear_model import Lasso, Ridge, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, StackingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from sklearn.cluster import KMeans
from sklearn.model_selection import KFold

In [None]:
# sklearn
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, log_loss, balanced_accuracy_score, f1_score
from sklearn.model_selection import cross_val_score
from sklearn.inspection import partial_dependence
from sklearn.inspection import PartialDependenceDisplay
from sklearn.inspection import permutation_importance

In [None]:
# mblearn library
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as imbpipeline
from sklearn.pipeline import Pipeline
from sklearn.datasets import make_classification

In [None]:
!pip install shapash

Collecting shapash
  Downloading shapash-2.0.1-py2.py3-none-any.whl (899 kB)
[K     |████████████████████████████████| 899 kB 5.5 MB/s 
[?25hCollecting dash-daq>=0.5.0
  Downloading dash_daq-0.5.0.tar.gz (642 kB)
[K     |████████████████████████████████| 642 kB 9.2 MB/s 
[?25hCollecting shap>=0.38.1
  Downloading shap-0.40.0-cp37-cp37m-manylinux2010_x86_64.whl (564 kB)
[K     |████████████████████████████████| 564 kB 36.2 MB/s 
[?25hCollecting dash-html-components>=2.0.0
  Downloading dash_html_components-2.0.0-py3-none-any.whl (4.1 kB)
Collecting numba>=0.53.1
  Downloading numba-0.55.1-1-cp37-cp37m-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 34.7 MB/s 
Collecting dash-table>=5.0.0
  Downloading dash_table-5.0.0-py3-none-any.whl (3.9 kB)
Collecting dash>=2.3.1
  Downloading dash-2.4.1-py3-none-any.whl (9.8 MB)
[K     |████████████████████████████████| 9.8 MB 17.8 MB/s 
[?25hCollecting dash-bootstrap-components>=1.1.

In [None]:
!pip install lime
import lime
import lime.lime_tabular

Collecting lime
  Downloading lime-0.2.0.1.tar.gz (275 kB)
[?25l[K     |█▏                              | 10 kB 13.7 MB/s eta 0:00:01[K     |██▍                             | 20 kB 11.2 MB/s eta 0:00:01[K     |███▋                            | 30 kB 9.0 MB/s eta 0:00:01[K     |████▊                           | 40 kB 7.8 MB/s eta 0:00:01[K     |██████                          | 51 kB 4.3 MB/s eta 0:00:01[K     |███████▏                        | 61 kB 5.0 MB/s eta 0:00:01[K     |████████▎                       | 71 kB 5.4 MB/s eta 0:00:01[K     |█████████▌                      | 81 kB 4.3 MB/s eta 0:00:01[K     |██████████▊                     | 92 kB 4.7 MB/s eta 0:00:01[K     |███████████▉                    | 102 kB 5.1 MB/s eta 0:00:01[K     |█████████████                   | 112 kB 5.1 MB/s eta 0:00:01[K     |██████████████▎                 | 122 kB 5.1 MB/s eta 0:00:01[K     |███████████████▌                | 133 kB 5.1 MB/s eta 0:00:01[K     |█████████

In [None]:
!pip install pdpbox
from pdpbox import pdp, get_dataset, info_plots

Collecting pdpbox
  Downloading PDPbox-0.2.1.tar.gz (34.0 MB)
[K     |████████████████████████████████| 34.0 MB 184 kB/s 
Collecting matplotlib==3.1.1
  Downloading matplotlib-3.1.1-cp37-cp37m-manylinux1_x86_64.whl (13.1 MB)
[K     |████████████████████████████████| 13.1 MB 22.2 MB/s 
Building wheels for collected packages: pdpbox
  Building wheel for pdpbox (setup.py) ... [?25l[?25hdone
  Created wheel for pdpbox: filename=PDPbox-0.2.1-py3-none-any.whl size=35758224 sha256=766bf1510e06815f465d81214523ef4ddb5ebb9deaa5defe72973650483ca163
  Stored in directory: /root/.cache/pip/wheels/f4/d0/1a/b80035625c53131f52906a6fc4dd690d8efd2bf8af6a4015eb
Successfully built pdpbox
Installing collected packages: matplotlib, pdpbox
  Attempting uninstall: matplotlib
    Found existing installation: matplotlib 3.2.2
    Uninstalling matplotlib-3.2.2:
      Successfully uninstalled matplotlib-3.2.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are 

In [None]:
!pip install xgboost



In [None]:
from xgboost import XGBRegressor

In [None]:
# Keras 
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.metrics import mean_squared_error

In [None]:
# LightGBM Library
!pip install lightgbm
from lightgbm import LGBMClassifier



# **Data**

In [None]:
# Update openpyxl
!pip install openpyxl==3.0.5


Collecting openpyxl==3.0.5
  Downloading openpyxl-3.0.5-py2.py3-none-any.whl (242 kB)
[?25l[K     |█▍                              | 10 kB 19.6 MB/s eta 0:00:01[K     |██▊                             | 20 kB 11.4 MB/s eta 0:00:01[K     |████                            | 30 kB 8.6 MB/s eta 0:00:01[K     |█████▍                          | 40 kB 7.6 MB/s eta 0:00:01[K     |██████▊                         | 51 kB 3.5 MB/s eta 0:00:01[K     |████████                        | 61 kB 4.2 MB/s eta 0:00:01[K     |█████████▌                      | 71 kB 4.7 MB/s eta 0:00:01[K     |██████████▉                     | 81 kB 4.5 MB/s eta 0:00:01[K     |████████████▏                   | 92 kB 4.3 MB/s eta 0:00:01[K     |█████████████▌                  | 102 kB 4.7 MB/s eta 0:00:01[K     |██████████████▉                 | 112 kB 4.7 MB/s eta 0:00:01[K     |████████████████▏               | 122 kB 4.7 MB/s eta 0:00:01[K     |█████████████████▌              | 133 kB 4.7 MB/s eta

In [None]:
# Update openpyxl
!pip install openpyxl==3.0.5




In [None]:
# Mount Google Drive so the training CSV can be read from this runtime.
# NOTE(review): google.colab is only importable inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

# Load the training data into `df`.
# The path is relative, so it only resolves when the working directory is the
# parent of the mount point (presumably /content, the Colab default - confirm).
file_ = "drive/My Drive/Colab Notebooks/AfricaTrainData.csv"   # adapt this as needed to the file structure on your Google drive
df = pd.read_csv(file_) # read in csv file

Mounted at /content/drive


# **EDA**

In [None]:
# Dimensions of the raw data as (rows, columns)
print("Shape", df.shape)

# Column dtypes and non-null counts.
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() appends a stray "None" line to the output.
df.info()

Shape (23524, 13)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23524 entries, 0 to 23523
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   country                 23524 non-null  object
 1   year                    23524 non-null  int64 
 2   uniqueid                23524 non-null  object
 3   bank_account            23524 non-null  object
 4   location_type           23524 non-null  object
 5   cellphone_access        23524 non-null  object
 6   household_size          23524 non-null  int64 
 7   age_of_respondent       23524 non-null  int64 
 8   gender_of_respondent    23524 non-null  object
 9   relationship_with_head  23524 non-null  object
 10  marital_status          23524 non-null  object
 11  education_level         23524 non-null  object
 12  job_type                23524 non-null  object
dtypes: int64(3), object(10)
memory usage: 2.3+ MB
None


# **Preprocessing**

In [None]:
# Remove exact duplicate rows, keeping the first occurrence of each.
# keep=False would discard *every* copy of a duplicated row, which loses the
# respondent entirely instead of de-duplicating it.
# The index is rebuilt so it stays contiguous after any rows are removed.
df = df.drop_duplicates(keep='first').reset_index(drop=True)

# Sanity check: no duplicated rows should remain
duplicates_count = df.duplicated().sum()
print('No. of dups:',duplicates_count)

No. of dups: 0


In [None]:
# Integer-encode the multi-level categorical predictors with sklearn's LabelEncoder
from sklearn import preprocessing

# One encoder object is re-fitted for each column in turn, so after this cell it
# only remembers the classes of the last column processed ('job_type').
label_encoder = preprocessing.LabelEncoder()

for cat_col in ['relationship_with_head', 'marital_status', 'education_level', 'job_type']:
    df[cat_col] = label_encoder.fit_transform(df[cat_col])


In [None]:
# One-hot encode the remaining categorical predictors.
# drop_first=True drops the first level of each variable, leaving one indicator
# column per variable here (location_type_Urban, cellphone_access_Yes,
# gender_of_respondent_Male).
df = pd.get_dummies(
    data=df,
    columns=['location_type', 'cellphone_access', 'gender_of_respondent'],
    drop_first=True,
)


In [None]:
# Encode the target: 'Yes' -> 1 (has a bank account), 'No' -> 0.
# The result is assigned back to the column explicitly. Calling
# replace(..., inplace=True) on the `df.bank_account` accessor mutates an
# intermediate Series, which is not guaranteed to write through to `df`
# (and never does under pandas copy-on-write).
# replace() leaves already-encoded values alone, so re-running the cell is safe;
# astype(int) guarantees an integer dtype for the classifiers.
df['bank_account'] = df['bank_account'].replace({'Yes': 1, 'No': 0}).astype(int)

In [None]:
df['bank_account'].value_counts()

0    20212
1     3312
Name: bank_account, dtype: int64

In [None]:
# pick numerical variables and set them as X
X = df[['household_size','age_of_respondent']]

In [None]:
# Code for skewness correction (see source below)
# Depending upon the characteritics of a feature (column), a log, Box-Cox or power transform is applied to normalize the distribution 

# -*- coding: utf-8 -*-
"""
Created on Sat Feb 23 14:42:46 2019
@author: DATAmadness
"""

##################################################
# A function that will accept a pandas dataframe
# and auto-transforms columns that exceeds threshold value
#  -  Offers choice between boxcox or log / exponential transformation
#  -  Automatically handles negative values
#  -  Auto recognizes positive /negative skewness

# Further documentation available here:
# https://datamadness.github.io/Skewness_Auto_Transform

def skew_autotransform(DF, include = None, exclude = None, plot = False, threshold = 1, exp = False):
    """Reduce the skewness of DataFrame columns in place and return the frame.

    For every selected column the sample skewness (``Series.skew()``) is
    computed. When its absolute value exceeds ``threshold`` the column is
    shifted to be strictly positive (only if its minimum is <= 0) and then
    transformed:

    * ``exp=False`` - Box-Cox (``scipy.stats.boxcox``) for either skew direction.
    * ``exp=True``  - natural log for positive skew, 10th power for negative skew.

    Columns whose skewness does not exceed the threshold are left untouched.

    Parameters
    ----------
    DF : pandas.DataFrame
        Frame to transform. It is modified in place; pass a copy to keep the
        original values.
    include : list, optional
        Column labels to process. Takes precedence over ``exclude``.
    exclude : list, optional
        Column labels to skip; all other columns are processed. Ignored when
        ``include`` is given.
    plot : bool
        When True, print a per-column summary and show before/after density
        plots (uses seaborn and matplotlib).
    threshold : float
        Absolute skewness above which a column is transformed.
    exp : bool
        Selects the log / power transforms instead of Box-Cox (see above).

    Returns
    -------
    pandas.DataFrame
        The same ``DF`` object that was passed in.
    """

    # Select the columns to process; `include` wins when both lists are supplied.
    if include is not None:
        colnames = include
    elif exclude is not None:
        colnames = [item for item in list(DF.columns.values) if item not in exclude]
    else:
        colnames = DF.columns.values

    def make_positive(series):
        """Shift the series so all values are > 0 (no-op if the minimum already is)."""
        minimum = np.amin(series)
        if minimum <= 0:
            # Offset by |min| plus a small constant so the smallest value becomes 0.01
            series = series + abs(minimum) + 0.01
        return series

    for col in colnames:
        skew = DF[col].skew()

        # Classify the column; None means "within threshold, leave as is"
        if abs(skew) > threshold and skew > 0:
            skewType = 'positive'
        elif abs(skew) > threshold and skew < 0:
            skewType = 'negative'
        else:
            skewType = None
        transformed = skewType is not None

        if plot:
            # Left panel: distribution before any transformation
            sns.set_style("darkgrid")
            sns.set_palette("Blues_r")
            fig, axes = plt.subplots(1, 2, figsize=(10, 5))
            ax1 = sns.histplot(DF[col], ax=axes[0], color="blue", kde=True, stat="density", linewidth=0)
            ax1.set(xlabel='Original ' + str(col))

        if transformed:
            # Box-Cox, log and the power transform all need strictly positive input
            DF[col] = make_positive(DF[col])
            if not exp:
                DF[col] = ss.boxcox(DF[col])[0]
            elif skewType == 'positive':
                DF[col] = DF[col].apply(math.log)
            else:
                # Negative skew: raise to the 10th power to stretch the upper tail
                DF[col] = DF[col].pow(10)
            skew_new = DF[col].skew()
        else:
            skew_new = skew

        if plot:
            # Right panel: distribution after the (possible) transformation
            print('\n ------------------------------------------------------')
            if transformed:
                print('\n %r had %r skewness of %2.2f' %(col, skewType, skew))
                print('\n Transformation yielded skewness of %2.2f' %(skew_new))
                sns.set_palette("Paired")
                ax2 = sns.histplot(DF[col], ax=axes[1], color="red", kde=True, stat="density", linewidth=0)
                ax2.set(xlabel='Transformed ' + str(col))
            else:
                print('\n NO TRANSFORMATION APPLIED FOR %r . Skewness = %2.2f' %(col, skew))
                ax2 = sns.histplot(DF[col], ax=axes[1], color="blue", kde=True, stat="density", linewidth=0)
                ax2.set(xlabel='NO TRANSFORM ' + str(col))
            plt.show()

    return DF

In [None]:
# Correct skewness with skew_autotransform, adapted from
# https://github.com/datamadness/Automatic-skewness-transformation-for-Pandas-DataFrame
# X holds only the numeric predictors at this point, so every column is checked.
# A deep copy is passed because the function writes into the frame it receives.
X = skew_autotransform(
    X.copy(deep=True),
    threshold = 1,
    exp = False,
    plot = False,
)

In [None]:
# Tukey Rule outliers
# As an alternative, you could use z-scores greater than 3 or less than -3.

cols = X.columns
#Tukey's method
def tukey_rule(data, col):
    """Return the positions of outlying values in ``data[col]``.

    The fences are centred on the median and placed 2 * IQR away from it
    (a variant of Tukey's rule, which uses Q1 - 1.5*IQR and Q3 + 1.5*IQR).
    A value is flagged when it is strictly below the lower fence or at/above
    the upper fence.

    Parameters
    ----------
    data : pandas.DataFrame
    col : column label in ``data``

    Returns
    -------
    list of int
        Zero-based positions (not index labels) of the flagged values.
    """
    values = data[col]
    q1, median, q3 = (values.quantile(q) for q in (0.25, 0.5, 0.75))
    half_width = 2 * (q3 - q1)
    upper_lim = median + half_width
    lower_lim = median - half_width
    return [pos for pos, x in enumerate(values) if x < lower_lim or x >= upper_lim]

# Flag outliers and winsorize each numeric feature.
# NOTE: outliers_Tukey is overwritten on every pass, so after the loop it only
# holds the positions for the last column; the winsorizing does not use it.
# winsorize clips the lowest 5% and highest 5% of each column to the nearest
# remaining value. Results go into a copy so X itself is left unchanged.
X_winsorized = X.copy(deep=True)
for i in cols:
  outliers_Tukey = tukey_rule(X,i)
  X_winsorized[i] = winsorize(X[i], limits=(0.05, 0.05))

In [None]:
# Copy the winsorized numeric features back into the main frame (aligned on the index)
winsorized_cols = ['household_size', 'age_of_respondent']
df[winsorized_cols] = X_winsorized[winsorized_cols]

In [None]:
# Remove the "year", "uniqueid" and "country" columns from the frame
df = df.drop(columns=["year", "uniqueid", "country"])

In [None]:
# Confirm that the columsn match teh dictionary!
df.columns

Index(['bank_account', 'household_size', 'age_of_respondent',
       'relationship_with_head', 'marital_status', 'education_level',
       'job_type', 'location_type_Urban', 'cellphone_access_Yes',
       'gender_of_respondent_Male'],
      dtype='object')

# **Split Predictors & Target**

In [None]:
# Predictors: every column except the target
X = df.drop(columns=['bank_account'])
# Target: the 0/1 encoded bank_account column
y = df['bank_account']

# **Create Holdout Sample**

In [None]:
# Create the holdout sample with sklearn's train_test_split.
#   test_size    - fraction of the rows that goes into the test set (20%)
#   stratify=y   - keeps the proportion of positive and negative target values
#                  the same in the training and test sets
#   random_state - fixes the seed of the random number generator, so the split
#                  is the same every time
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=54321,
    stratify=y,
)

# **SMOTE (Oversampling)**

In [None]:
# SMOTE (oversampling) comes from the imblearn package; sklearn does not feature it
import imblearn
from imblearn.over_sampling import SMOTE

# Oversample the training split only (the holdout set is not resampled here)
sm = SMOTE(random_state=12346)
X_train_SMOTE, y_train_SMOTE = sm.fit_resample(X_train, y_train)

# Verify that the shape has changed and that the classes are now balanced
# (a target mean of 0.5 means equally many 0s and 1s)
print("Shape before SMOTE: ", X_train.shape, y_train.shape)
print("Shape after SMOTE: ", X_train_SMOTE.shape, y_train_SMOTE.shape)
print("Mean of target: ",mean(y_train_SMOTE))

Shape before SMOTE:  (18819, 9) (18819,)
Shape after SMOTE:  (32338, 9) (32338,)
Mean of target:  0.5


In [None]:
# Predictor column names, in the column order of X
Xcols= X.columns
col_names=X.columns
# NOTE: `cols` is taken from df, which still contains the target column
# 'bank_account' (first in the column listing printed earlier in this notebook),
# so it has one more entry than Xcols and is offset by one relative to the
# predictors. Use Xcols, not cols, to label per-feature results.
cols=df.columns

# **Standardize**

In [None]:
# Standardize the predictors with StandardScaler (zero mean, unit variance).
#  - fit_transform computes the mean and standard deviation of the SMOTE
#    training data and uses those statistics to scale it.
#  - transform applies the same training-set statistics to the test set, so
#    both datasets are scaled alike and no information from the test set leaks
#    back into training.
mmsc = StandardScaler()
X_train_SMOTE_std = mmsc.fit_transform(X_train_SMOTE)
X_test_std = mmsc.transform(X_test)

# **Support Vector Machine Classifier (SVC)**

In [None]:
# Build a Support Vector Machine classifier (RBF kernel) on the standardized,
# SMOTE-balanced training data and evaluate it on the holdout set
from sklearn.metrics import confusion_matrix                                # Import confusion_matrix method from sklearn
from sklearn import metrics                                                 # Import metrics from sklearn

model = SVC(kernel='rbf')
model.fit(X_train_SMOTE_std,y_train_SMOTE)

# Hard class predictions for the holdout predictors (reused by later cells)
y_pred = model.predict(X_test_std)
print('Confusion matrix: \n',confusion_matrix(y_test, y_pred))

# The ROC curve needs a continuous score per sample. Feeding it the 0/1 class
# predictions yields a single operating point, so the resulting "AUC" is just
# the balanced accuracy at that one threshold. decision_function returns the
# signed distance to the decision boundary, which ranks the samples properly.
y_score = model.decision_function(X_test_std)
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_score, pos_label=1)
print('AUC: ', metrics.auc(fpr, tpr))


Confusion matrix: 
 [[2901 1142]
 [ 155  507]]
AUC:  0.7416987550000635


In [None]:
from sklearn.model_selection import learning_curve

# 10-fold cross-validated scores of `model` for five training-set sizes
# between 50% and 100% of the SMOTE training data
train_sizes, train_scores, test_scores = learning_curve(
    estimator=model,
    X=X_train_SMOTE_std,
    y=y_train_SMOTE,
    train_sizes=np.linspace(0.5, 1.0, 5),
    cv=10,
    n_jobs=1,
)

# Mean and standard deviation across the folds, one value per training size
train_mean, train_std = np.mean(train_scores, axis=1), np.std(train_scores, axis=1)
test_mean, test_std = np.mean(test_scores, axis=1), np.std(test_scores, axis=1)

print('Test Mean: ',test_mean)

# Draw each curve as its mean line plus a +/- one standard deviation band
for curve_mean, curve_std, line_style in (
    (train_mean, train_std,
     dict(color='blue', marker='o', markersize=5, label='training accuracy')),
    (test_mean, test_std,
     dict(color='green', linestyle='--', marker='s', markersize=5, label='validation accuracy')),
):
    plt.plot(train_sizes, curve_mean, **line_style)
    plt.fill_between(train_sizes,
                     curve_mean + curve_std,
                     curve_mean - curve_std,
                     alpha=0.15, color=line_style['color'])

plt.grid()
plt.xlabel('Number of training samples')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
plt.ylim([0.3, 1.00])
plt.tight_layout()
plt.title('Learning Curve')
# plt.savefig('./figures/learning_curve.png', dpi=300)
plt.show()

# **Permutation Importance**

In [None]:
# Permutation importance of each feature for the fitted `model`, measured on
# the holdout set (i.e. which features matter for predictions in the wild)
plt.rcParams["figure.figsize"] = (10,8)
perm_importance = permutation_importance(model, X_test_std, y_test)

# Order the bars from least to most important
sorted_idx = perm_importance.importances_mean.argsort()
plt.barh(
    Xcols[sorted_idx],
    perm_importance.importances_mean[sorted_idx],
)
plt.xlabel("Permutation Importance")

# **Feature Importance Using XGBoost Model**

In [None]:
# Fit an XGBoost classifier on the SMOTE training data and plot its built-in
# feature importances, ordered from least to most important
xgb = XGBClassifier(n_estimators=100)
xgb.fit(X_train_SMOTE, np.ravel(y_train_SMOTE))
sorted_idx = xgb.feature_importances_.argsort()
# Label the bars with the predictor names (Xcols). `cols` comes from df.columns
# and also contains the target 'bank_account', so indexing it with sorted_idx
# shifts every label by one feature.
plt.barh(Xcols[sorted_idx], xgb.feature_importances_[sorted_idx])
plt.xlabel("XGBoost Model Feature Importance")

# **Feature Importance Using Random Forest Model**

In [None]:
# Random Forest feature importances: which features mattered while the model
# was being built (fitted on the standardized SMOTE training data)
rf = RandomForestClassifier()
rf.fit(X_train_SMOTE_std, np.ravel(y_train_SMOTE))

# Order the bars from least to most important
sorted_idx = rf.feature_importances_.argsort()
plt.barh(
    Xcols[sorted_idx],
    rf.feature_importances_[sorted_idx],
)
plt.xlabel("Random Forest Model Feature Importance")

# **KDE Plots**

In [None]:
# Generate kde plots to compare, feature by feature, the density of holdout
# respondents with a bank account against those without one
from matplotlib.pyplot import figure

y_test = pd.DataFrame(y_test)
X_test_std = pd.DataFrame(X_test_std, columns=Xcols)

# y_test keeps the row labels of the original frame while X_test_std is
# numbered 0..n-1. Both are in the same row order, so the labels are dropped
# before concatenating; otherwise pandas aligns on the index and pairs targets
# with the wrong rows (filling the rest with NaN).
df = pd.concat([y_test.reset_index(drop=True), X_test_std.reset_index(drop=True)], axis=1)

for i in Xcols:
    # A fresh figure per feature: plt.show() consumes the current figure, so a
    # single figure created before the loop would only size the first plot
    figure(figsize=(6, 6), dpi=80)
    sns.kdeplot(df.loc[(df['bank_account']==1),i], color='red', fill=True, label='bank_account')
    sns.kdeplot(df.loc[(df['bank_account']==0),i], color='blue', fill=True, label='not')
    plt.xlabel('Feature: '+str(i))
    plt.ylabel('Proportion')
    plt.legend(loc='upper right')
    plt.show()

# **Partial Dependence Plots**

In [None]:
# Univariate Partial Dependence Plot: one plot per predictor, built with pdpbox
# for the fitted `model` on the holdout predictors.
# NOTE(review): pdp_isolate / pdp_plot are the PDPbox 0.2.x API (0.2.1 is what
# the install cell above pulled in); confirm before upgrading pdpbox.
# NOTE(review): relies on X_test_std having been converted to a DataFrame with
# Xcols as column names by the KDE cell above - presumably pdpbox needs named
# columns; confirm.
for i in Xcols:    
    pdp_ = pdp.pdp_isolate(model = model, dataset = X_test_std, model_features=Xcols, feature = i)
    pdp.pdp_plot(pdp_, str(i))
    plt.show()

In [None]:
Xcols

In [None]:
# Bivariate PDP
# Same idea as the univariate plots, but pdp_interact / pdp_interact_plot are used
# (instead of pdp_isolate / pdp_plot) to show two features jointly as a contour plot.
# The guard skips the plot when either feature is missing from Xcols.
# NOTE(review): PDPbox 0.2.x API; confirm before upgrading pdpbox.
if 'location_type_Urban' in Xcols and 'marital_status' in Xcols:
  features_to_plot = ['location_type_Urban', 'marital_status']
  inter1  =  pdp.pdp_interact(model=model, dataset=X_test_std, model_features=Xcols, features=features_to_plot)
  pdp.pdp_interact_plot(pdp_interact_out=inter1, feature_names=features_to_plot, plot_type='contour')
  plt.show()

In [None]:
df

# **Surrogate Models (Global)**

In [None]:
# Global surrogate: a shallow decision tree trained to reproduce y_pred, the
# black-box model's predictions on data from the wild (i.e. the test set).
# The tree is fitted on the unscaled X_test, so its split thresholds are not in
# standardized units.
from sklearn import tree
import graphviz

# max_depth controls the depth of the proxy tree
proxy = DecisionTreeClassifier(max_depth=2, random_state=20850)
proxy.fit(X_test, y_pred)

# Render the fitted tree inline
tree_graph = tree.export_graphviz(proxy, feature_names=Xcols, out_file=None)
graphviz.Source(tree_graph)

In [None]:
# How good is this surrogate decision tree model?
# Measure it as the correlation between the predictions of the original model
# (y_pred, produced by the SVC stored in `model`) and those of the surrogate tree.
y_proxy = proxy.predict(X_test) # Use the decision tree to make predictions
# Both prediction vectors are wrapped in single-column DataFrames so corrwith
# can pair them column by column
y_proxy = pd.DataFrame(y_proxy)
y_pred=pd.DataFrame(y_pred)
# The message names the SVC: y_pred does not come from the Random Forest
print('Correlation coefficient of SVC predictions and Surrogate Model predictions: ',y_pred.corrwith(y_proxy,axis=0))

In [None]:
# Try other classifiers: each one is wrapped in the same
# StandardScaler -> PCA(5 components) -> classifier pipeline and scored by
# 10-fold cross-validated ROC AUC on the (non-SMOTE) training split

names = [
    "Decision Tree",
    "k Nearest Neighbors",
    "SVM",
    "MLP",
    "Random Forest",
    "XGBoost",
    "Light GBM",
]
classifiers = [
    DecisionTreeClassifier(max_depth=5),
    KNeighborsClassifier(3),
    SVC(),
    MLPClassifier(hidden_layer_sizes=(20,20), alpha=1, max_iter=500),
    RandomForestClassifier(random_state=0, n_jobs=-1, n_estimators=100, max_depth=3),
    XGBClassifier(random_state=0, n_jobs=-1, learning_rate=0.1,
                  n_estimators=100, max_depth=3),
    LGBMClassifier(boosting_type='gbdt', objective='binary', num_leaves=50,
                   learning_rate=0.1, bagging_fraction=0.9, feature_fraction=0.9,
                   reg_lambda=0.2),
]

for name, clf in zip(names, classifiers):
  # Scaling and PCA sit inside the pipeline, so they are re-fitted on the
  # training part of every fold
  pipe_many = make_pipeline(StandardScaler(), PCA(n_components=5), clf)
  scores = cross_val_score(
      estimator=pipe_many,
      X=X_train,
      y=np.ravel(y_train),
      cv=10,               # ten folds
      n_jobs=1,
      scoring='roc_auc',
  )
  print("Classifier: ",name)
  print('CV AUC scores: {}'.format(scores))
  print('CV AUC mean:{} and std:{}'.format(np.mean(scores), np.std(scores)))
  print("\n\n")

The leaderboard is now as follows:
Classifier performance results are ordered by AUC.

|**Classifier** | **AUC** |
|:----------------------|:------------:|
|LightGBM | 0.809 |
|XGBoost | 0.807 |
|MLP | 0.795 |
|Decision Tree | 0.758 |
|Support Vector Machine | 0.746 |
|SVC | 0.743 |
|kNN | 0.711 |
