In [1]:
from AdvancedAnalytics import ReplaceImputeEncode, DecisionTree
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from pydotplus.graphviz import graph_from_dot_data
import graphviz
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
import pandas as pd
import numpy as np

# Load the credit-history workbook into a DataFrame.
# The path is relative, so the file is looked up in the current working directory.
df = pd.read_excel(io="CreditHistory_Clean.xlsx")

# Data map: attribute name -> [type code, value tuple].
# Type codes: "I" = interval, "B" = binary, "N" = nominal.
# For "B"/"N" attributes the tuple lists the permitted levels; for "I"
# attributes it is presumably the (low, high) bound pair used for the outlier
# check -- confirm against the ReplaceImputeEncode documentation.
attribute_map = {
    "age":      ["I", (19, 120)],
    "amount":   ["I", (0, 20000)],
    "checking": ["N", (1, 2, 3, 4)],
    "coapp":    ["N", (1, 2, 3)],
    "depends":  ["B", (1, 2)],
    "duration": ["I", (1, 72)],
    "employed": ["N", (1, 2, 3, 4, 5)],
    "existcr":  ["N", (1, 2, 3, 4)],
    "foreign":  ["B", (1, 2)],
    "good_bad": ["B", ("bad", "good")],  # used as the target in later cells
    "history":  ["N", (0, 1, 2, 3, 4)],
    "housing":  ["N", (1, 2, 3)],
    "installp": ["N", (1, 2, 3, 4)],
    "job":      ["N", (1, 2, 3, 4)],
    "marital":  ["N", (1, 2, 3, 4)],
    "other":    ["N", (1, 2, 3)],
    "property": ["N", (1, 2, 3, 4)],
    # purpose levels are strings, not integers
    "purpose":  ["N", ("0", "1", "2", "3", "4", "5", "6", "8", "9", "X")],
    "resident": ["N", (1, 2, 3, 4)],
    "savings":  ["N", (1, 2, 3, 4, 5)],
    "telephon": ["B", (1, 2)],
}

In [2]:
# Data preprocessing
# Configure the encoder from the data map: nominal attributes are one-hot
# encoded, interval attributes are left unscaled, and the summary report is
# printed (display=True).
rie = ReplaceImputeEncode(
    data_map=attribute_map,
    nominal_encoding='one-hot',
    interval_scale=None,
    drop=False,
    display=True,
)
# Apply the configured preprocessing to the raw frame.
encoded_df = rie.fit_transform(df)


********** Data Preprocessing ***********
Features Dictionary Contains:
3 Interval, 
4 Binary, 
14 Nominal, and 
0 Excluded Attribute(s).

Data contains 1000 observations & 21 columns.


Attribute Counts
............... Missing  Outliers
age.......         0         0
amount....         0         0
checking..         0         0
coapp.....         0         0
depends...         0         0
duration..         0         0
employed..         0         0
existcr...         0         0
foreign...         0         0
good_bad..         0         0
history...         0         0
housing...         0         0
installp..         0         0
job.......         0         0
marital...         0         0
other.....         0         0
property..         0         0
purpose...         0         0
resident..         0         0
savings...         0         0
telephon..         0         0


In [3]:
# Features and the target
y = encoded_df['good_bad']  # The target was encoded to 0 and 1
X = encoded_df.drop('good_bad', axis=1)
# Build the feature-name list as a NEW list. Calling rie.col.remove(...)
# mutated the encoder's own list in place, so running this cell a second time
# raised ValueError because 'good_bad' was already gone.
col = [name for name in rie.col if name != 'good_bad']

# Cross Validation
# Compare candidate maximum tree depths with 10-fold cross-validation and
# print the mean and standard deviation of each metric across the folds.
depth_list = [5, 6, 7, 8, 10, 12, 15, 20, 25]
score_list = ['accuracy', 'recall', 'precision', 'f1']
for d in depth_list:
    print("\nMaximum Tree Depth: ", d)
    # random_state is fixed so tie-breaking between equally good splits is
    # deterministic and the reported scores are reproducible.
    dtc = DecisionTreeClassifier(max_depth=d, min_samples_leaf=5,
                                 min_samples_split=5, random_state=12345)
    # No dtc.fit(X, y) here: cross_validate fits its own clone of the
    # estimator on every fold, so a prior fit on the full data was wasted work.
    scores = cross_validate(dtc, X, y, scoring=score_list,
                            return_train_score=False, cv=10)
    print("{:.<13s}{:>6s}{:>13s}".format("Metric", "Mean", "Std. Dev."))
    for s in score_list:
        # cross_validate stores each metric's per-fold scores under "test_<metric>"
        var = "test_" + s
        mean = scores[var].mean()
        std = scores[var].std()
        print("{:.<13s}{:>7.4f}{:>10.4f}".format(s, mean, std))


Maximum Tree Depth:  5
Metric.......  Mean    Std. Dev.
accuracy..... 0.7200    0.0297
recall....... 0.8700    0.0358
precision.... 0.7665    0.0400
f1........... 0.8134    0.0145

Maximum Tree Depth:  6
Metric.......  Mean    Std. Dev.
accuracy..... 0.7100    0.0214
recall....... 0.8400    0.0473
precision.... 0.7719    0.0416
f1........... 0.8022    0.0087

Maximum Tree Depth:  7
Metric.......  Mean    Std. Dev.
accuracy..... 0.7010    0.0348
recall....... 0.8329    0.0456
precision.... 0.7658    0.0425
f1........... 0.7960    0.0204

Maximum Tree Depth:  8
Metric.......  Mean    Std. Dev.
accuracy..... 0.7000    0.0310
recall....... 0.8043    0.0474
precision.... 0.7784    0.0377
f1........... 0.7894    0.0212

Maximum Tree Depth:  10
Metric.......  Mean    Std. Dev.
accuracy..... 0.7040    0.0415
recall....... 0.7957    0.0557
precision.... 0.7880    0.0448
f1........... 0.7897    0.0297

Maximum Tree Depth:  12
Metric.......  Mean    Std. Dev.
accuracy..... 0.6990    0.0367
recal

In [4]:

# Evaluate the model
# Hold out 30% of the rows for validation and fit the tree on the other 70%.
X_train, X_validate, y_train, y_validate = train_test_split(
    X, y, test_size=0.3, random_state=12345)
# max_depth=5 had the best mean accuracy and F1 among the depths whose
# cross-validation output is shown. random_state is fixed (matching the split
# seed) so the fitted tree, and therefore the importances and metrics printed
# below, are the same on every run.
dtc = DecisionTreeClassifier(max_depth=5, min_samples_leaf=5,
                             min_samples_split=5, random_state=12345)
dtc = dtc.fit(X_train, y_train)
print("\nTraining Data\nRandom Selection of 70% of Original Data")
# Print the feature importances, then the training-vs-validation metrics.
DecisionTree.display_importance(dtc, col)
DecisionTree.display_binary_split_metrics(dtc, X_train, y_train, X_validate, y_validate)


Training Data
Random Selection of 70% of Original Data

FEATURE..... IMPORTANCE
checking4...   0.3177
duration....   0.1767
amount......   0.1543
age.........   0.0528
history0....   0.0383
history2....   0.0378
other3......   0.0354
purpose0:0..   0.0352
checking3...   0.0329
savings1....   0.0294
purpose6:6..   0.0266
resident2...   0.0201
purpose8:9..   0.0187
employed3...   0.0114
property1...   0.0071
history4....   0.0055
depends.....   0.0000
foreign.....   0.0000
telephon....   0.0000
checking1...   0.0000
checking2...   0.0000
coapp1......   0.0000
coapp2......   0.0000
coapp3......   0.0000
employed1...   0.0000
employed2...   0.0000
employed4...   0.0000
employed5...   0.0000
existcr1....   0.0000
existcr2....   0.0000
existcr3....   0.0000
existcr4....   0.0000
history1....   0.0000
history3....   0.0000
housing1....   0.0000
housing2....   0.0000
housing3....   0.0000
installp1...   0.0000
installp2...   0.0000
installp3...   0.0000
installp4...   0.0000
job1........   0.