### Import required libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
import matplotlib.pyplot as plt
import seaborn as sn
from sklearn.externals.six import StringIO  
from IPython.display import Image  
from sklearn.tree import export_graphviz
import pydotplus
import warnings
warnings.filterwarnings('ignore')

# Read CSV file


In [2]:
ds=pd.read_csv('bank.csv')

In [3]:
features_list=list(ds.columns)

In [4]:
features_list

['age',
 'job',
 'marital',
 'education',
 'default',
 'balance',
 'housing',
 'loan',
 'contact',
 'day',
 'month',
 'duration',
 'campaign',
 'pdays',
 'previous',
 'poutcome',
 'deposit']

In [5]:
ds.columns

Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'deposit'],
      dtype='object')

In [6]:
ds.head(3)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,deposit
0,59,admin.,married,secondary,no,2343,yes,no,unknown,5,may,1042,1,-1,0,unknown,yes
1,56,admin.,married,secondary,no,45,no,no,unknown,5,may,1467,1,-1,0,unknown,yes
2,41,technician,married,secondary,no,1270,yes,no,unknown,5,may,1389,1,-1,0,unknown,yes


In [7]:
ds.describe()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
count,11162.0,11162.0,11162.0,11162.0,11162.0,11162.0,11162.0
mean,41.231948,1528.538524,15.658036,371.993818,2.508421,51.330407,0.832557
std,11.913369,3225.413326,8.42074,347.128386,2.722077,108.758282,2.292007
min,18.0,-6847.0,1.0,2.0,1.0,-1.0,0.0
25%,32.0,122.0,8.0,138.0,1.0,-1.0,0.0
50%,39.0,550.0,15.0,255.0,2.0,-1.0,0.0
75%,49.0,1708.0,22.0,496.0,3.0,20.75,1.0
max,95.0,81204.0,31.0,3881.0,63.0,854.0,58.0


### Select all features except the target feature

In [8]:
X=ds.loc[:,ds.columns !='default']

In [9]:
X.head(5)

Unnamed: 0,age,job,marital,education,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,deposit
0,59,admin.,married,secondary,2343,yes,no,unknown,5,may,1042,1,-1,0,unknown,yes
1,56,admin.,married,secondary,45,no,no,unknown,5,may,1467,1,-1,0,unknown,yes
2,41,technician,married,secondary,1270,yes,no,unknown,5,may,1389,1,-1,0,unknown,yes
3,55,services,married,secondary,2476,yes,no,unknown,5,may,579,1,-1,0,unknown,yes
4,54,admin.,married,tertiary,184,no,no,unknown,5,may,673,2,-1,0,unknown,yes


### Select Only the target feature

In [10]:
y=ds.loc[:,ds.columns=='default']

In [11]:
y.head(3)

Unnamed: 0,default
0,no
1,no
2,no


### Select Only Contineous Values from X 

In [12]:
numerical_feature=X.iloc[:,[4,8,10,11,12,13]]

In [13]:
numerical_feature.head(5)

Unnamed: 0,balance,day,duration,campaign,pdays,previous
0,2343,5,1042,1,-1,0
1,45,5,1467,1,-1,0
2,1270,5,1389,1,-1,0
3,2476,5,579,1,-1,0
4,184,5,673,2,-1,0


### Select Only Categorical Values from X and y groups to apply LabelEncoder and OneHotEncoder

In [14]:
X_categorical_mask=X.dtypes==object

In [15]:
X_categorical_cols=X.columns[X_categorical_mask].tolist()

In [16]:
X[X_categorical_cols].head()

Unnamed: 0,job,marital,education,housing,loan,contact,month,poutcome,deposit
0,admin.,married,secondary,yes,no,unknown,may,unknown,yes
1,admin.,married,secondary,no,no,unknown,may,unknown,yes
2,technician,married,secondary,yes,no,unknown,may,unknown,yes
3,services,married,secondary,yes,no,unknown,may,unknown,yes
4,admin.,married,tertiary,no,no,unknown,may,unknown,yes


### Create Object of LabelEncoder

In [17]:
le=LabelEncoder()

In [18]:
#Convert all the categorical columns to numeric values
X[X_categorical_cols]=X[X_categorical_cols].apply(lambda loc:le.fit_transform(loc))

In [19]:
X[X_categorical_cols].head()

Unnamed: 0,job,marital,education,housing,loan,contact,month,poutcome,deposit
0,0,1,1,1,0,2,8,3,1
1,0,1,1,0,0,2,8,3,1
2,9,1,1,1,0,2,8,3,1
3,7,1,1,1,0,2,8,3,1
4,0,1,2,0,0,2,8,3,1


In [20]:
X1=X[X_categorical_cols]

In [21]:
y=le.fit_transform(y)

In [22]:
y

array([0, 0, 0, ..., 0, 0, 0])

In [23]:
y=pd.DataFrame({'default':y})

In [24]:
type(y)

pandas.core.frame.DataFrame

In [25]:
y.head(2)

Unnamed: 0,default
0,0
1,0


In [26]:
le_num=LabelEncoder()

In [27]:
numerical_feature.head(2)

Unnamed: 0,balance,day,duration,campaign,pdays,previous
0,2343,5,1042,1,-1,0
1,45,5,1467,1,-1,0


In [28]:
numerical_feature=numerical_feature.apply(lambda loc :le_num.fit_transform(loc))

In [29]:
numerical_feature.head()

Unnamed: 0,balance,day,duration,campaign,pdays,previous
0,2288,4,1021,0,0,0
1,469,4,1283,0,0,0
2,1618,4,1246,0,0,0
3,2356,4,577,0,0,0
4,608,4,671,1,0,0


### Use Of One Hot Encoder

In [30]:
ohe=OneHotEncoder(categorical_features=X_categorical_mask,sparse=False)

In [31]:
Categ_cols=ohe.fit_transform(X)

In [32]:
Categ_cols.shape

(11162, 51)

In [33]:
Categ_cols=pd.DataFrame(data=Categ_cols)

In [34]:
Categ_cols.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,41,42,43,44,45,46,47,48,49,50
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,59.0,2343.0,5.0,1042.0,1.0,-1.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,56.0,45.0,5.0,1467.0,1.0,-1.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,1.0,41.0,1270.0,5.0,1389.0,1.0,-1.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,1.0,55.0,2476.0,5.0,579.0,1.0,-1.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,54.0,184.0,5.0,673.0,2.0,-1.0,0.0
5,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,42.0,0.0,5.0,562.0,2.0,-1.0,0.0
6,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,56.0,830.0,6.0,1201.0,1.0,-1.0,0.0
7,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,60.0,545.0,6.0,1030.0,1.0,-1.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,1.0,37.0,1.0,6.0,608.0,1.0,-1.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,1.0,28.0,5090.0,6.0,1297.0,3.0,-1.0,0.0


In [35]:
ohe2=OneHotEncoder(sparse=False)

In [36]:
num_cols=ohe2.fit_transform(numerical_feature)

In [37]:
num_cols=pd.DataFrame(data=num_cols)

In [38]:
num_cols.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5796,5797,5798,5799,5800,5801,5802,5803,5804,5805
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [39]:
np.unique(num_cols)

array([0., 1.])

In [40]:
type(num_cols)

pandas.core.frame.DataFrame

In [41]:
type(Categ_cols)

pandas.core.frame.DataFrame

### Concat Numerical(Contineous) and Categorical dataframes

In [42]:
X_Final=pd.concat([num_cols,Categ_cols],axis=1)

In [43]:
feature_l=list(X_Final.columns)

In [44]:
X_Final.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,41,42,43,44,45,46,47,48,49,50
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,59.0,2343.0,5.0,1042.0,1.0,-1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,56.0,45.0,5.0,1467.0,1.0,-1.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,41.0,1270.0,5.0,1389.0,1.0,-1.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,55.0,2476.0,5.0,579.0,1.0,-1.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,54.0,184.0,5.0,673.0,2.0,-1.0,0.0


In [45]:
type(y)

pandas.core.frame.DataFrame

In [46]:
y.head(5)

Unnamed: 0,default
0,0
1,0
2,0
3,0
4,0


### Train_Test_Split

In [47]:
X_train,X_test,y_train,y_test=train_test_split(X_Final,y,test_size=0.25,random_state=22)

### Random Forest Classifier

In [48]:
rfc=RandomForestClassifier(bootstrap=True,criterion='gini',n_estimators=100,max_features='auto',max_leaf_nodes=50)


In [49]:
y_train=y_train.values.ravel()

In [50]:
rfc=rfc.fit(X_train,y_train)

In [51]:
y_pred=rfc.predict(X_test)

In [52]:
len(y_pred)

2791

In [53]:
len(y_test)

2791

### Accuracy of the Model

In [54]:
accur=accuracy_score(y_test,y_pred)

In [55]:
accur

0.9860265137943389

### Confusion Matrix

In [56]:
matr=confusion_matrix(y_test,y_pred)
matr

array([[2752,    0],
       [  39,    0]], dtype=int64)

In [57]:
[matr[0][1]]

[0]

In [58]:
precision=matr[0][0]/(np.multiply(matr[0][0],matr[0][1]))

In [59]:
np.multiply(matr[0][0],matr[0][1])

0

### Classification Report

In [60]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      2752
           1       0.00      0.00      0.00        39

   micro avg       0.99      0.99      0.99      2791
   macro avg       0.49      0.50      0.50      2791
weighted avg       0.97      0.99      0.98      2791



### Visualizing a Single Decision Tree

In [61]:
#Create for one decision tree
tree=rfc.estimators_[5]

In [62]:
export_graphviz(tree,out_file='graph.dot',feature_names=feature_l,rounded=True,precision=1)

In [63]:
graph=pydotplus.graph_from_dot_file('graph.dot')

In [64]:
graph.write_png('tree.png')

True