In [1]:
import pandas as pd 
import numpy as np

## Loading Dataset

In [2]:
dataset = pd.read_excel("Bank_Personal_Loan_Modelling.xlsx", sheet_name=1)

In [3]:
dataset.columns

Index(['ID', 'Age', 'Experience', 'Income', 'ZIP Code', 'Family', 'CCAvg',
       'Education', 'Mortgage', 'Personal Loan', 'Securities Account',
       'CD Account', 'Online', 'CreditCard'],
      dtype='object')

In [4]:
dataset1 = dataset.drop(["ID", "ZIP Code"], axis=1)

In [5]:
dataset1.head()

Unnamed: 0,Age,Experience,Income,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,25,1,49,4,1.6,1,0,0,1,0,0,0
1,45,19,34,3,1.5,1,0,0,1,0,0,0
2,39,15,11,1,1.0,1,0,0,0,0,0,0
3,35,9,100,1,2.7,2,0,0,0,0,0,0
4,35,8,45,4,1.0,2,0,0,0,0,0,1


In [6]:
dataset2 = dataset1.dropna()

In [7]:
dataset2 = dataset2.drop_duplicates()

## Random Forest

In [8]:
from sklearn.ensemble import RandomForestClassifier

In [9]:
# making CCAvg to int
dataset2["CCAvg"] = np.round(dataset2["CCAvg"])

In [10]:
dataset2.head()

Unnamed: 0,Age,Experience,Income,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,25,1,49,4,2.0,1,0,0,1,0,0,0
1,45,19,34,3,2.0,1,0,0,1,0,0,0
2,39,15,11,1,1.0,1,0,0,0,0,0,0
3,35,9,100,1,3.0,2,0,0,0,0,0,0
4,35,8,45,4,1.0,2,0,0,0,0,0,1


In [11]:
rf_model = RandomForestClassifier(n_estimators=1000, max_features=2, oob_score=True)

In [12]:
# IDVs
features = ['Age', 'Experience', 'Income', 'Family', 'CCAvg',
       'Education', 'Mortgage', 'Securities Account',
       'CD Account', 'Online', 'CreditCard']

In [13]:
# data is not in dataframe therefore no transpose
rf_model.fit(X=dataset2[features], y=dataset2["Personal Loan"])

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features=2,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=True, random_state=None,
                       verbose=0, warm_start=False)

In [14]:
print("OOB Accuracy : ")
print(rf_model.oob_score_)

OOB Accuracy : 
0.986164026468819


In [15]:
for feature, imp in zip(features, rf_model.feature_importances_):
    print(feature,imp)

Age 0.050059862042360806
Experience 0.05010324263914928
Income 0.36569313427531336
Family 0.1030818586746899
CCAvg 0.13381881050004374
Education 0.1687825750898747
Mortgage 0.045934995635407504
Securities Account 0.00608423244662116
CD Account 0.05581257349400275
Online 0.00947162701298098
CreditCard 0.011157088189555658


- Income, CCAvg, Education are some imp features

## Decision Tree

In [16]:
from sklearn import tree

In [17]:
tree_model = tree.DecisionTreeClassifier()

In [18]:
tree_model = tree.DecisionTreeClassifier(max_depth=6, max_leaf_nodes=10)

In [19]:
# Without transpose the df is row wise
predictors = pd.DataFrame([dataset2["Education"], dataset2["CCAvg"], dataset2["Income"]]).T

In [20]:
predictors.head()

Unnamed: 0,Education,CCAvg,Income
0,1.0,2.0,49.0
1,1.0,2.0,34.0
2,1.0,1.0,11.0
3,2.0,3.0,100.0
4,2.0,1.0,45.0


In [21]:
tree_model.fit(X=predictors, y=dataset2["Personal Loan"])

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=6, max_features=None, max_leaf_nodes=10,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [22]:
with open("Dtree.dot", 'w') as f:
    f = tree.export_graphviz(tree_model, feature_names=["Education", "CCAvg", "Income"], out_file=f)

<img src="dtree.png" width=600 height=650 />

In [24]:
tree_model.score(X=predictors, y=dataset2["Personal Loan"])

0.970723882093443