# Project 03 is about building a Decision Tree basis "Bank Loan Dataset" as an input to classify whether getting a Loan is dependent on what factors from the given dataset. 
# The Independent Factors is obtained through Random Forest Methodology.

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier

In [2]:
loan_dataset = pd.read_excel("Bank_Personal_Loan_Modelling.xlsx", sheet_name=1)

In [3]:
loan_dataset.head(2)

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,1,25,1,49,91107,4,1.6,1,0,0,1,0,0,0
1,2,45,19,34,90089,3,1.5,1,0,0,1,0,0,0


In [4]:
loan_dataset.columns

Index(['ID', 'Age', 'Experience', 'Income', 'ZIP Code', 'Family', 'CCAvg',
       'Education', 'Mortgage', 'Personal Loan', 'Securities Account',
       'CD Account', 'Online', 'CreditCard'],
      dtype='object')

In [6]:
loan_dataset.isna().sum()

ID                    0
Age                   0
Experience            0
Income                0
ZIP Code              0
Family                0
CCAvg                 0
Education             0
Mortgage              0
Personal Loan         0
Securities Account    0
CD Account            0
Online                0
CreditCard            0
dtype: int64

In [7]:
# checking for duplicated rows
loan_dataset = loan_dataset.drop_duplicates()

In [8]:
loan_dataset.dtypes

ID                      int64
Age                     int64
Experience              int64
Income                  int64
ZIP Code                int64
Family                  int64
CCAvg                 float64
Education               int64
Mortgage                int64
Personal Loan           int64
Securities Account      int64
CD Account              int64
Online                  int64
CreditCard              int64
dtype: object

In [9]:
# ZIP_Code does not seem to have any significance so we drop it
loan_dataset.drop(['ZIP Code'], inplace = True, axis = 1)

In [10]:
loan_dataset.columns

Index(['ID', 'Age', 'Experience', 'Income', 'Family', 'CCAvg', 'Education',
       'Mortgage', 'Personal Loan', 'Securities Account', 'CD Account',
       'Online', 'CreditCard'],
      dtype='object')

# Random Forest implementation for finding Independent Variables having primal effect over the Dependent Variable(Personal Loan) for DecisionTree Modeling - 

In [11]:
rf_model = RandomForestClassifier(n_estimators = 1000, max_features = 2, oob_score = True)

In [12]:
predictors=pd.DataFrame([loan_dataset["ID"],loan_dataset["Age"],loan_dataset["Experience"],loan_dataset["Income"],loan_dataset["Family"],loan_dataset["CCAvg"],loan_dataset["Education"],loan_dataset["Mortgage"],loan_dataset["Securities Account"],loan_dataset["CD Account"],loan_dataset["Online"],loan_dataset["CreditCard"]]).T

In [13]:
rf_model.fit(X=predictors,y=loan_dataset["Personal Loan"])

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features=2,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=True, random_state=None,
                       verbose=0, warm_start=False)

In [15]:
print("OOB Score:")
print("The model has given % of accuracy:", (rf_model.oob_score_ *100))

OOB Score:
The model has given % of accuracy: 98.7


In [16]:
for feature,imp in zip(predictors,rf_model.feature_importances_) :
    print(feature,imp)

ID 0.046198786933147405
Age 0.04128697425371627
Experience 0.039945363975572634
Income 0.3305246195313435
Family 0.09149202507663826
CCAvg 0.1778314959075467
Education 0.1539908605108505
Mortgage 0.04269410178327637
Securities Account 0.005427433526776677
CD Account 0.05308944240053037
Online 0.008047903656169284
CreditCard 0.009470992444432058


# From above result we can colnclude

1. Income has high significance in Attrition determination
2. CCAvg, Education, Family has next significance after Income
3. CD Account, Mortage and Experience, Age has next significance
4. Securities Account, Online, CreditCard has least significance


# As Below mentioned Data attributes has comparatively highest RF values. Thus, ['Income', 'CCAvg', 'Education', 'Family', 'CD Account', 'Mortgage', 'Experience', 'Age'] will only be considered as IDV's for respective Decision Tree Modeling.

# Decision Tree Analysis for Personal Loan(DV) - 

In [27]:
tree_model1 = tree.DecisionTreeClassifier(max_depth=16)

In [28]:
predictors1=pd.DataFrame([loan_dataset["Income"],loan_dataset["Family"],loan_dataset["CCAvg"],loan_dataset["Education"],loan_dataset["Mortgage"],loan_dataset["CD Account"],loan_dataset["Age"],loan_dataset["Experience"]]).T

In [29]:
tree_model1.fit(X=predictors1,y=loan_dataset["Personal Loan"])

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=16, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [30]:
with open("PersonalLoanRFDTree.dot",'w') as f :
    f = tree.export_graphviz(tree_model1,feature_names=['Income', 'CCAvg', 'Education', 'Family', 'CD Account', 'Mortgage', 'Experience', 'Age'],out_file=f)

# Inference(Rules) derived from the developed Decision Tree - 

1. If Education <= 1.5 and Family <= 3.5 and Income >= 106.5 and CCAvg <= 2.5 then returns True.
2. If Income <= 106.5 and CCAvg <= 2.5 then returns true.
3. If Family <= 2.5 and Education >= 1.5 then returns False.
4. If Age >= 60.5 and Income <= 116.5 and Education <= 1.5 then returns False.