In [13]:
# Load the data from the Apporto machine to the Colab environment.
#
# Only prompt for an upload when the file is not already in the working
# directory. Uploading a second time makes Colab save the copy under a new
# name (e.g. 'PersonalLoan (1).csv'), while the rest of the notebook keeps
# reading 'PersonalLoan.csv' - so a re-run of this cell would silently leave
# the analysis on the old file.

import os
from google.colab import files

if os.path.exists('PersonalLoan.csv'):
  # Nothing to upload; keep `uploaded` defined so later code can rely on it
  uploaded = {}
else:
  uploaded = files.upload()

Saving PersonalLoan.csv to PersonalLoan (1).csv


In [1]:
# Pandas is the Python package for data frames

import pandas as pd

In [3]:
# Part 1 Data Acquisition

# Load the CSV file into a pandas data frame
df = pd.read_csv('PersonalLoan.csv')

# Print three quick views of the raw data, in this order:
#   1. the first ten rows, to examine if it is an individual-level data set
#   2. the variable (column) list
#   3. the shape, i.e. (number of rows, number of columns), to confirm the portrait shape
for data_view in (df.head(10), df.columns.values, df.shape):
  print(data_view)

   Age  Experience  Income  ZIPCode  Family  CCAvg  Education  Mortgage  \
0   25           1      49    91107       4    1.6          1         0   
1   45          19      34    90089       3    1.5          1         0   
2   39          15      11    94720       1    1.0          1         0   
3   35           9     100    94112       1    2.7          2         0   
4   35           8      45    91330       4    1.0          2         0   
5   37          13      29    92121       4    0.4          2       155   
6   53          27      72    91711       2    1.5          2         0   
7   50          24      22    93943       1    0.3          3         0   
8   35          10      81    90089       3    0.6          2       104   
9   34           9     180    93023       1    8.9          3         0   

  SecuritiesAccount CDAccount Online CreditCard PersonalLoan  
0               Yes        No     No         No           No  
1               Yes        No     No         No 

In [4]:
# Part 3 Missing Value Imputation

# Show the number of missing values for each variable in the data frame.
# print() is required here: a notebook cell only auto-displays the value of
# its LAST expression, so a bare expression in the middle of the cell shows nothing.
print(df.isnull().sum())

# Drop ZIP Code for now
rvar_list = ['ZIPCode']
df_sample1 = df.drop(columns=rvar_list)

# Separate all the variables into two lists for future column indexing
# One for numerical, the other for categorical
cvar_list = ['Education', 'SecuritiesAccount', 'CDAccount', 'Online', 'CreditCard', 'PersonalLoan']
nvar_list = ['Age', 'Experience', 'Income', 'Family', 'CCAvg', 'Mortgage']

# Check if there is any missing value left
# (last expression of the cell, so it is displayed as the cell output)
df_sample1.isnull().sum()


Age                  0
Experience           0
Income               0
Family               0
CCAvg                0
Education            0
Mortgage             0
SecuritiesAccount    0
CDAccount            0
Online               0
CreditCard           0
PersonalLoan         0
dtype: int64

In [5]:
# Part 4 Variable Transformation

# Standardize the numerical variables into z-scores: (value - mean) / standard deviation.
# (Unlike the classification-tree workflow, this step is NOT skipped here.)
# NOTE(review): the mean and std are computed on the full data set, before the
# test partition is split off in Part 5, so test rows influence the scaling.
df_sample2 = df_sample1.copy()
df_sample2[nvar_list] = (df_sample1[nvar_list] - df_sample1[nvar_list].mean())/df_sample1[nvar_list].std()

# Set the datatype for cvar_list to be categorical in Python
# Set the datatype for nvar_list to be numerical in Python
df_sample3 = df_sample2.copy()
df_sample3[cvar_list] = df_sample2[cvar_list].astype('category')
df_sample3[nvar_list] = df_sample2[nvar_list].astype('float64')

# Convert the categorical variables into dummies (Step 1 of dummy coding)
# prefix_sep is the symbol used to create the dummy variable names.
# dtype is pinned to uint8 so the dummies are 0/1 integers regardless of the
# installed pandas version (newer pandas versions default to True/False).
df_sample4 = pd.get_dummies(df_sample3, prefix_sep='_', dtype='uint8')

# Remove the redundant dummies (Step 2 of dummy coding)
# Placeholder variable: rdummies
rdummies = ['Education_1', 'SecuritiesAccount_Yes', 'CDAccount_Yes', 'Online_Yes', 'CreditCard_Yes', 'PersonalLoan_No']
# Alternative kept for reference: drop only the redundant DV dummy
#rdummies = ['PersonalLoan_No']
df_sample5 = df_sample4.drop(columns=rdummies)

# Get the remaining variable list after the variable transformation
print(df_sample5.columns.values)

# Display the milestone dataframe. Compare it with the original dataframe.
print(df_sample5)
print(df)

['Age' 'Experience' 'Income' 'Family' 'CCAvg' 'Mortgage' 'Education_2'
 'Education_3' 'SecuritiesAccount_No' 'CDAccount_No' 'Online_No'
 'CreditCard_No' 'PersonalLoan_Yes']
           Age  Experience    Income    Family     CCAvg  Mortgage  \
0    -1.774239   -1.665912 -0.538175  1.397274 -0.193366 -0.555468   
1    -0.029521   -0.096321 -0.864023  0.525938 -0.250586 -0.555468   
2    -0.552936   -0.445119 -1.363657 -1.216733 -0.536683 -0.555468   
3    -0.901880   -0.968316  0.569708 -1.216733  0.436047 -0.555468   
4    -0.901880   -1.055515 -0.625068  1.397274 -0.536683 -0.555468   
...        ...         ...       ...       ...       ...       ...   
4995 -1.425296   -1.491513 -0.733684 -1.216733 -0.021708 -0.555468   
4996 -1.338060   -1.404313 -1.276764  1.397274 -0.879999  0.280210   
4997  1.540726    1.647670 -1.081255 -0.345398 -0.937218 -0.555468   
4998  1.715198    1.734869 -0.538175  0.525938 -0.822780 -0.555468   
4999 -1.512532   -1.404313  0.200414  0.525938 -0.651121 

In [7]:
# Part 5 Data Partition

# train_test_split lives in scikit-learn (Python package name: sklearn),
# inside the model_selection subpackage
from sklearn.model_selection import train_test_split

# Placeholder variables:
#   df4partition  - the data frame to be partitioned
#   testpart_size - the fraction of rows assigned to the test partition
df4partition = df_sample5
testpart_size = 0.2

# random_state is the seed for the random number generator.
# random_state = 1 unless otherwise noted
df_nontestData, df_testData = train_test_split(
    df4partition,
    test_size=testpart_size,
    random_state=1,
)

print(df_nontestData)

           Age  Experience    Income    Family     CCAvg  Mortgage  \
1233  0.668367    0.775675 -1.124701 -0.345398 -0.879999 -0.555468   
1056 -0.814644   -1.229914 -1.059532 -1.216733 -0.725507 -0.555468   
1686  1.453490    1.560470 -0.755407  1.397274  0.149950 -0.555468   
187   0.057715    0.078078  1.851377  0.525938 -0.021708  2.541456   
3840  0.930075    0.950074 -0.842300  0.525938 -1.051657  0.565323   
...        ...         ...       ...       ...       ...       ...   
2895  1.279018    1.386071 -0.755407  1.397274 -0.365024  0.820943   
2763  0.842839    0.950074 -1.320210  1.397274 -0.708341 -0.555468   
905   0.057715    0.165278 -0.994362 -1.216733 -0.536683  0.270378   
3980  0.057715    0.165278  0.330753  1.397274 -0.307805 -0.555468   
235  -0.640172   -1.055515 -0.060265  1.397274 -0.078927 -0.555468   

      Education_2  Education_3  SecuritiesAccount_No  CDAccount_No  Online_No  \
1233            0            0                     1             1          0 

In [8]:
# Part 6 Neural Network

# MLPClassifier lives in scikit-learn (Python package name: sklearn),
# inside the neural_network subpackage
from sklearn.neural_network import MLPClassifier

# Placeholder variable: DV (the name of the dependent-variable column)
DV = 'PersonalLoan_Yes'

# X holds the predictor values, y holds the DV values of the non-test partition
X = df_nontestData.drop(columns=[DV])
y = df_nontestData[DV]

# alpha = pre-specified penalty level
# hls   = pre-specified number of nodes on the hidden layer
alpha = 0.1
hls = 3

# Build the network first, then fit it (fit returns the same model object).
# Increase max_iter from 2000 to 4000 if a warning msg shows that the model does not converge
clf = MLPClassifier(
    solver='lbfgs',
    alpha=alpha,
    hidden_layer_sizes=hls,
    max_iter=2000,
    random_state=1,
)
clf.fit(X, y)

# A user-defined function summary_nn to display a neural network model
######
def summary_nn(model_object):
  """Print the penalty level, weights and biases of a fitted network.

  The function reads model_object.alpha, model_object.coefs_ and
  model_object.intercepts_. It reports coefs_[0] as the input -> hidden
  weights, intercepts_[0] as the hidden-node biases, the first column of
  coefs_[1] as the hidden -> output weights and intercepts_[1][0] as the
  output bias, i.e. it describes one hidden layer and one output node.

  Nodes are numbered from 1 in the printout. Nothing is returned.
  """
  layer_shapes = [weights.shape for weights in model_object.coefs_]

  # Shape of the first weight matrix is (input nodes, hidden nodes)
  n_input_nodes, n_hidden_nodes = layer_shapes[0][0], layer_shapes[0][1]
  input_ids = range(1, n_input_nodes + 1)
  hidden_ids = range(1, n_hidden_nodes + 1)

  print('Neural Network Model Summary \n')

  print('Section 0. Penalty level alpha:', model_object.alpha)

  print('\nSection 1. Input (I) -> Hidden (H) - Weight (W): \n')
  for input_id in input_ids:
    for hidden_id in hidden_ids:
      weight = model_object.coefs_[0][input_id - 1][hidden_id - 1]
      print('I:', input_id, '-> H:', hidden_id, '- W:', weight)

  print('\nSection 2. Hidden (H) - Node Bias (B): \n')
  for hidden_id in hidden_ids:
    print('H:', hidden_id, '- B:', model_object.intercepts_[0][hidden_id - 1])

  print('\nSection 3. Hidden (H) -> Output (O) - Weight (W): \n')
  for hidden_id in hidden_ids:
    print('H:', hidden_id, '-> O - W:', model_object.coefs_[1][hidden_id - 1][0])

  print('\nSection 4. Output (O) - Node Bias (B): \n')
  print('O - B:', model_object.intercepts_[1][0])

######
# Print the weights and biases of clf, the network fitted earlier in this cell
summary_nn(clf)

Neural Network Model Summary 

Section 0. Penalty level alpha: 0.1

Section 1. Input (I) -> Hidden (H) - Weight (W): 

I: 1 -> H: 1 - W: 1.5012237186144177
I: 1 -> H: 2 - W: -0.717628806855946
I: 1 -> H: 3 - W: -0.012218768277804623
I: 2 -> H: 1 - W: -1.4630803253672742
I: 2 -> H: 2 - W: 0.7070014709670208
I: 2 -> H: 3 - W: -0.027130850730918206
I: 3 -> H: 1 - W: -0.7209175674282938
I: 3 -> H: 2 - W: 1.1833502344630618
I: 3 -> H: 3 - W: -0.05773324936765754
I: 4 -> H: 1 - W: 0.39664297039623536
I: 4 -> H: 2 - W: 0.009101988163266956
I: 4 -> H: 3 - W: 0.009867873033092063
I: 5 -> H: 1 - W: -0.288508904166782
I: 5 -> H: 2 - W: 0.3564089407828527
I: 5 -> H: 3 - W: 0.017533097828970653
I: 6 -> H: 1 - W: 0.044539056619701825
I: 6 -> H: 2 - W: -0.022595823235073503
I: 6 -> H: 3 - W: -0.030249936916256118
I: 7 -> H: 1 - W: -4.1366740133094195
I: 7 -> H: 2 - W: 3.841675409047689
I: 7 -> H: 3 - W: -0.01382097126982844
I: 8 -> H: 1 - W: -4.346332404043962
I: 8 -> H: 2 - W: 3.868436162546622
I: 8

In [9]:
# Run Neural Network with k-fold cross validation with k=5
# Placeholder variable: kfolds
kfolds = 5

# specify within which range of the number of nodes on the hidden layer we will search for the best network model
# (both min_hls and max_hls are part of the search)
min_hls = 1
max_hls = 10
# specify within which range of penalty level we will search for the best network model
# (both min_alpha and max_alpha are part of the search)
min_alpha = 0.0001
max_alpha = 10
n_alpha = 10 # number of evenly spaced alpha values tried for each hidden-layer size

# parameter grid for search
import numpy as np
# np.arange excludes its stop value, so the stop is max_hls + 1 to make
# max_hls itself a candidate - consistent with np.linspace, which includes max_alpha.
param_grid = {'hidden_layer_sizes': np.arange(min_hls, max_hls + 1), 'alpha': list(np.linspace(min_alpha, max_alpha, num=n_alpha))}

# Create a user-defined function, profit_calculation, that calculates the average net profit per observation over a dataset
# for which we know the actual class (e.g., acceptance/rejection upon receiving an offer) from y_value
# and the predictor values (e.g., consumers' age, experience, income) from x_value,
# using the probability prediction model carried by the model object model.

def profit_calculation(model, x_value, y_value):
  """Return the average net profit per observation of a SEND / NOT SEND rule.

  model   : object whose predict_proba(x_value) result is indexed with [:,1]
            to obtain one predicted probability per observation
  x_value : predictor values passed to model.predict_proba
  y_value : actual classes, 1 (Accept) or 0 (Reject), one per observation

  An offer is SENT when the predicted probability is strictly above the
  decision cut-off. A sent offer earns 10 if the actual class is 1 and
  costs 1 if the actual class is 0; everything else contributes 0.
  """
  # Decision cut-off: with payoffs +10 / -1, sending has a positive expected
  # profit exactly when the probability exceeds 1/(10+1)
  d_cutoff = 1/11

  # True means SEND, False means NOT SEND
  decision = list(model.predict_proba(x_value)[:,1] > d_cutoff)

  # Actual classes as a plain Python list
  actual = list(y_value)

  def net_profit(send, outcome):
    # Profit of a single observation given the decision and the actual class
    if send and outcome == 1:
      return 10
    if send and outcome == 0:
      return -1
    return 0

  total_profit = sum(net_profit(decision[i], outcome) for i, outcome in enumerate(actual))

  # Average over the number of observations
  return total_profit / len(actual)

# GridSearchCV tries every combination in param_grid with k-fold cross validation
# and keeps the model that maximizes whatever the scoring function returns -
# for this case roc_auc.
# n_jobs = -1 runs the search on all CPU cores.

from sklearn.model_selection import GridSearchCV

base_network = MLPClassifier(solver='lbfgs', max_iter=2000, random_state=1)
gridsearch = GridSearchCV(
    estimator=base_network,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=kfolds,
    n_jobs=-1,
)
gridsearch.fit(X, y)

# The best network model found by the search
clf_NN = gridsearch.best_estimator_

# Display the resulting best network model
summary_nn(clf_NN)



Neural Network Model Summary 

Section 0. Penalty level alpha: 1.1112

Section 1. Input (I) -> Hidden (H) - Weight (W): 

I: 1 -> H: 1 - W: -0.00022764541313404147
I: 1 -> H: 2 - W: -0.45387599028168146
I: 1 -> H: 3 - W: -0.6600843392933732
I: 1 -> H: 4 - W: -0.035349357692608094
I: 1 -> H: 5 - W: 0.5775119464589253
I: 1 -> H: 6 - W: -0.04502518195218731
I: 1 -> H: 7 - W: -0.6543807184695876
I: 2 -> H: 1 - W: 0.000990149299291467
I: 2 -> H: 2 - W: 0.058782671463109254
I: 2 -> H: 3 - W: -0.4928024089690553
I: 2 -> H: 4 - W: 0.19611889983289935
I: 2 -> H: 5 - W: -0.4817839401346252
I: 2 -> H: 6 - W: -0.11232495384216191
I: 2 -> H: 7 - W: 0.5069269755645485
I: 3 -> H: 1 - W: 0.006229925678854181
I: 3 -> H: 2 - W: 1.1117887328594422
I: 3 -> H: 3 - W: -0.5277822881217992
I: 3 -> H: 4 - W: -1.0909450668087162
I: 3 -> H: 5 - W: -1.0221817992515563
I: 3 -> H: 6 - W: 0.3215398244711942
I: 3 -> H: 7 - W: 0.9210317094077599
I: 4 -> H: 1 - W: -0.004433061489210019
I: 4 -> H: 2 - W: 1.4017586068565

In [21]:
# Evaluate the best neural network model (clf_NN) on the test partition

# y_test_actual is the actual values of the DV in the test partition
y_test_actual = df_testData[DV]

# X_test is the predictor values in the test partition
X_test = df_testData.drop(columns=[DV])

# Predicted probabilities for the test partition: second column of predict_proba.
# Computed once and reused for both printouts below.
test_predicted_proba = clf_NN.predict_proba(X_test)[:,1]

# Get the AUC of the best neural network model on the test partition
from sklearn.metrics import roc_auc_score
print(roc_auc_score(y_test_actual, test_predicted_proba))

# Show the predicted probabilities themselves
print(test_predicted_proba)

0.9831111111111112
[1.56921551e-02 6.81233949e-08 5.77604688e-08 2.06218057e-05
 7.90371426e-04 1.85798569e-09 2.95300450e-05 4.68471578e-03
 9.52543816e-01 6.70153257e-01 3.19979849e-06 1.10156894e-08
 7.02949058e-10 2.05778543e-07 8.96183347e-08 5.98182603e-05
 9.97378742e-01 9.62827095e-06 1.88978684e-03 2.26959720e-05
 1.29557337e-07 2.09774250e-09 7.65623616e-07 3.99541434e-07
 6.12225499e-04 3.33769416e-04 5.24638080e-05 9.56121721e-01
 2.19675980e-04 2.34539200e-04 3.54056738e-05 2.59625199e-05
 4.31405764e-05 8.06952584e-04 9.99595852e-01 2.98313867e-01
 4.97592592e-07 3.54822247e-06 7.71060453e-04 1.26900800e-04
 4.69657886e-02 1.98934489e-08 9.01362227e-02 1.88072515e-08
 9.52256698e-01 1.49656072e-08 1.51555525e-06 3.07707906e-07
 7.70705594e-01 6.71716229e-09 3.17166265e-04 1.36833189e-01
 6.80168297e-03 1.71489066e-04 1.71459661e-10 3.40450040e-04
 6.36274898e-05 1.98880406e-06 7.57061423e-06 5.60710929e-06
 2.75671628e-02 7.79877362e-06 6.24027364e-12 3.32793983e-09
 1.68