In [1]:
# Importing the required libraries
import pandas as pd, numpy as np
import matplotlib.pyplot as plt, seaborn as sns
%matplotlib inline

In [2]:
# Reading the csv file and putting it into 'df' object.
df = pd.read_csv('heart_v2.csv')

In [3]:
df.columns

Index(['age', 'sex', 'BP', 'cholestrol', 'heart disease'], dtype='object')

In [4]:
df.head()

Unnamed: 0,age,sex,BP,cholestrol,heart disease
0,70,1,130,322,1
1,67,0,115,564,0
2,57,1,124,261,1
3,64,1,128,263,0
4,74,0,120,269,0


In [5]:
# Putting feature variable to X
X = df.drop('heart disease',axis=1)

# Putting response variable to y
y = df['heart disease']

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=42)
X_train.shape, X_test.shape

((189, 4), (81, 4))

Fitting the decision tree with default hyperparameters, apart from max_depth which is 3 so that we can plot and read the tree.

In [8]:
from sklearn.tree import DecisionTreeClassifier

In [9]:
dt = DecisionTreeClassifier(max_depth=3)
dt.fit(X_train, y_train)

In [10]:
# !pip install six

In [11]:
# # Importing required packaages for visualization
# from IPython.display import Image  
# from six import StringIO  
# from sklearn.tree import export_graphviz
# import pydotplus, graphviz

In [12]:
# # plotting tree with max_depth=3
# dot_data = StringIO()  

# export_graphviz(dt, out_file=dot_data, filled=True, rounded=True,
#                 feature_names=X.columns, 
#                 class_names=['No Disease', "Disease"])

# graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
# Image(graph.create_png())
# #Image(graph.create_png(),width=800,height=900)
# #graph.write_pdf("dt_heartdisease.pdf")

#### Evaluating model performance

In [13]:
y_train_pred = dt.predict(X_train)
y_test_pred = dt.predict(X_test)

In [14]:
from sklearn.metrics import confusion_matrix, accuracy_score

In [15]:
print(accuracy_score(y_train, y_train_pred))
confusion_matrix(y_train, y_train_pred)

0.7407407407407407


array([[82, 19],
       [30, 58]], dtype=int64)

In [16]:
print(accuracy_score(y_test, y_test_pred))
confusion_matrix(y_test, y_test_pred)

0.6049382716049383


array([[35, 14],
       [18, 14]], dtype=int64)

Creating helper functions to evaluate model performance and help plot the decision tree

In [17]:
# def get_dt_graph(dt_classifier):
#     dot_data = StringIO()
#     export_graphviz(dt_classifier, out_file=dot_data, filled=True,rounded=True,
#                     feature_names=X.columns, 
#                     class_names=['Disease', "No Disease"])
#     graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
#     return graph

In [28]:
def evaluate_model(dt_classifier):
    print("Train Accuracy :", accuracy_score(y_train, dt_classifier.predict(X_train)))
    print("Train Confusion Matrix:")
    print(confusion_matrix(y_train, dt_classifier.predict(X_train)))
    print("-"*50)
    print("Test Accuracy :", accuracy_score(y_test, dt_classifier.predict(X_test)))
    print("Test Confusion Matrix:")
    print(confusion_matrix(y_test, dt_classifier.predict(X_test)))

### Without setting any hyper-parameters

In [29]:
dt_default = DecisionTreeClassifier(random_state=42)
dt_default.fit(X_train, y_train)

In [30]:
# gph = get_dt_graph(dt_default)
# Image(gph.create_png())

In [31]:
evaluate_model(dt_default)

Train Accuracy : 1.0
Train Confusion Matrix:
[[101   0]
 [  0  88]]
--------------------------------------------------
Test Accuracy : 0.6296296296296297
Test Confusion Matrix:
[[31 18]
 [12 20]]


### Controlling the depth of the tree

In [32]:
# ?DecisionTreeClassifier

In [33]:
dt_depth = DecisionTreeClassifier(max_depth=3)
dt_depth.fit(X_train, y_train)

In [34]:
# gph = get_dt_graph(dt_depth) 
# Image(gph.create_png())

In [35]:
evaluate_model(dt_depth)

Train Accuracy : 0.7407407407407407
Train Confusion Matrix:
[[82 19]
 [30 58]]
--------------------------------------------------
Test Accuracy : 0.6049382716049383
Test Confusion Matrix:
[[35 14]
 [18 14]]


### Specifying minimum samples before split

In [36]:
dt_min_split = DecisionTreeClassifier(min_samples_split=20)
dt_min_split.fit(X_train, y_train)

In [37]:
# gph = get_dt_graph(dt_min_split) 
# Image(gph.create_png())

In [38]:
evaluate_model(dt_min_split)

Train Accuracy : 0.8359788359788359
Train Confusion Matrix:
[[85 16]
 [15 73]]
--------------------------------------------------
Test Accuracy : 0.6419753086419753
Test Confusion Matrix:
[[32 17]
 [12 20]]


### Specifying minimum samples in leaf node

In [39]:
dt_min_leaf = DecisionTreeClassifier(min_samples_leaf=20, random_state=42)
dt_min_leaf.fit(X_train, y_train)

In [40]:
# gph = get_dt_graph(dt_min_leaf)
# Image(gph.create_png())

In [41]:
evaluate_model(dt_min_leaf)

Train Accuracy : 0.7037037037037037
Train Confusion Matrix:
[[85 16]
 [40 48]]
--------------------------------------------------
Test Accuracy : 0.6419753086419753
Test Confusion Matrix:
[[38 11]
 [18 14]]


### Using Entropy instead of Gini

In [42]:
dt_min_leaf_entropy = DecisionTreeClassifier(min_samples_leaf=20, random_state=42, criterion="entropy")
dt_min_leaf_entropy.fit(X_train, y_train)

In [43]:
# gph = get_dt_graph(dt_min_leaf_entropy)
# Image(gph.create_png())

In [44]:
evaluate_model(dt_min_leaf_entropy)

Train Accuracy : 0.7037037037037037
Train Confusion Matrix:
[[85 16]
 [40 48]]
--------------------------------------------------
Test Accuracy : 0.6419753086419753
Test Confusion Matrix:
[[38 11]
 [18 14]]


### Hyper-parameter tuning

In [45]:
dt = DecisionTreeClassifier(random_state=42)

In [46]:
from sklearn.model_selection import GridSearchCV

In [47]:
# Create the parameter grid based on the results of random search 
params = {
    'max_depth': [2, 3, 5, 10, 20],
    'min_samples_leaf': [5, 10, 20, 50, 100],
    'criterion': ["gini", "entropy"]
}

In [48]:
# grid_search = GridSearchCV(estimator=dt, 
#                            param_grid=params, 
#                            cv=4, n_jobs=-1, verbose=1, scoring = "f1")

In [49]:
# Instantiate the grid search model
grid_search = GridSearchCV(estimator=dt, 
                           param_grid=params, 
                           cv=4, n_jobs=-1, verbose=1, scoring = "accuracy")

In [50]:
%%time
grid_search.fit(X_train, y_train)

Fitting 4 folds for each of 50 candidates, totalling 200 fits
CPU times: total: 609 ms
Wall time: 9.29 s


In [51]:
score_df = pd.DataFrame(grid_search.cv_results_)
score_df.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_min_samples_leaf,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score
0,0.010117,0.009092,0.002308,0.003998,gini,2,5,"{'criterion': 'gini', 'max_depth': 2, 'min_sam...",0.708333,0.744681,0.617021,0.531915,0.650488,0.082762,17
1,0.011889,0.006869,0.00998,0.006623,gini,2,10,"{'criterion': 'gini', 'max_depth': 2, 'min_sam...",0.708333,0.744681,0.617021,0.531915,0.650488,0.082762,17
2,0.011722,0.006767,0.0,0.0,gini,2,20,"{'criterion': 'gini', 'max_depth': 2, 'min_sam...",0.6875,0.744681,0.617021,0.531915,0.645279,0.079551,20
3,0.011718,0.006765,0.011718,0.006765,gini,2,50,"{'criterion': 'gini', 'max_depth': 2, 'min_sam...",0.666667,0.680851,0.574468,0.702128,0.656028,0.048751,3
4,0.009299,0.0077,0.012856,0.006687,gini,2,100,"{'criterion': 'gini', 'max_depth': 2, 'min_sam...",0.541667,0.531915,0.531915,0.531915,0.534353,0.004223,41


In [52]:
score_df.nlargest(5,"mean_test_score")

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_min_samples_leaf,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score
30,0.015809,0.001446,0.003909,0.006771,entropy,3,5,"{'criterion': 'entropy', 'max_depth': 3, 'min_...",0.604167,0.765957,0.617021,0.680851,0.666999,0.06409,1
5,0.010386,0.003739,0.006051,0.003494,gini,3,5,"{'criterion': 'gini', 'max_depth': 3, 'min_sam...",0.625,0.744681,0.595745,0.659574,0.65625,0.055831,2
3,0.011718,0.006765,0.011718,0.006765,gini,2,50,"{'criterion': 'gini', 'max_depth': 2, 'min_sam...",0.666667,0.680851,0.574468,0.702128,0.656028,0.048751,3
8,0.00781,0.00781,0.007813,0.007813,gini,3,50,"{'criterion': 'gini', 'max_depth': 3, 'min_sam...",0.666667,0.680851,0.574468,0.702128,0.656028,0.048751,3
13,0.003701,0.005963,0.01051,0.006068,gini,5,50,"{'criterion': 'gini', 'max_depth': 5, 'min_sam...",0.666667,0.680851,0.574468,0.702128,0.656028,0.048751,3


In [53]:
grid_search.best_estimator_

In [54]:
dt_best = grid_search.best_estimator_

In [55]:
evaluate_model(dt_best)

Train Accuracy : 0.7195767195767195
Train Confusion Matrix:
[[82 19]
 [34 54]]
--------------------------------------------------
Test Accuracy : 0.6172839506172839
Test Confusion Matrix:
[[36 13]
 [18 14]]


In [56]:
from sklearn.metrics import classification_report

In [57]:
print(classification_report(y_test, dt_best.predict(X_test)))

              precision    recall  f1-score   support

           0       0.67      0.73      0.70        49
           1       0.52      0.44      0.47        32

    accuracy                           0.62        81
   macro avg       0.59      0.59      0.59        81
weighted avg       0.61      0.62      0.61        81



In [58]:
# gph = get_dt_graph(dt_best)
# Image(gph.create_png())

## Random Forest

In [59]:
from sklearn.ensemble import RandomForestClassifier

In [60]:
rf = RandomForestClassifier(random_state=42, n_estimators=10, max_depth=3)

In [61]:
rf.fit(X_train, y_train)

In [62]:
rf.estimators_[0]

In [63]:
sample_tree = rf.estimators_[4]

In [64]:
# gph = get_dt_graph(sample_tree)
# Image(gph.create_png(), width=700, height=700)

In [66]:
# gph = get_dt_graph(sample_tree)
# # Image(gph.create_png(), width=700, height=700)gph = get_dt_graph(rf.estimators_[2])
# Image(gph.create_png(), width=700, height=700)

In [67]:
evaluate_model(rf)

Train Accuracy : 0.7407407407407407
Train Confusion Matrix:
[[80 21]
 [28 60]]
--------------------------------------------------
Test Accuracy : 0.6172839506172839
Test Confusion Matrix:
[[35 14]
 [17 15]]


#### Grid search for hyper-parameter tuning

In [68]:
classifier_rf = RandomForestClassifier(random_state=42, n_jobs=-1)

In [69]:
# Create the parameter grid based on the results of random search 
params = {
    'max_depth': [1, 2, 5, 10, 20],
    'min_samples_leaf': [5, 10, 20, 50, 100],
    'max_features': [2,3,4],
    'n_estimators': [10, 30, 50, 100, 200]
}

In [70]:
# Instantiate the grid search model
grid_search = GridSearchCV(estimator=classifier_rf, param_grid=params, 
                          cv=4, n_jobs=-1, verbose=1, scoring = "accuracy")

In [71]:
%%time
grid_search.fit(X,y)

Fitting 4 folds for each of 375 candidates, totalling 1500 fits
CPU times: total: 5.19 s
Wall time: 2min 28s


In [72]:
rf_best = grid_search.best_estimator_

In [73]:
rf_best

In [74]:
evaluate_model(rf_best)

Train Accuracy : 0.8042328042328042
Train Confusion Matrix:
[[87 14]
 [23 65]]
--------------------------------------------------
Test Accuracy : 0.8024691358024691
Test Confusion Matrix:
[[42  7]
 [ 9 23]]


In [75]:
sample_tree = rf_best.estimators_[0]

In [76]:
# gph = get_dt_graph(sample_tree)
# Image(gph.create_png())

In [77]:
# gph = get_dt_graph(rf_best.estimators_[0])
# Image(gph.create_png(), height=600, width=600)

In [78]:
# gph = get_dt_graph(rf_best.estimators_[10])
# Image(gph.create_png(), height=600, width=600)

### Variable importance in RandomForest and Decision trees

In [79]:
classifier_rf = RandomForestClassifier(random_state=42, n_jobs=-1, max_depth=5, n_estimators=100, oob_score=True)

In [80]:
classifier_rf.fit(X_train, y_train)

In [81]:
classifier_rf.feature_importances_

array([0.37539743, 0.13780788, 0.20834585, 0.27844883])

In [82]:
imp_df = pd.DataFrame({
    "Varname": X_train.columns,
    "Imp": classifier_rf.feature_importances_
})

In [83]:
imp_df.sort_values(by="Imp", ascending=False)

Unnamed: 0,Varname,Imp
0,age,0.375397
3,cholestrol,0.278449
2,BP,0.208346
1,sex,0.137808
