# Capstone 2: Heart Disease - Preprocessing, Training Data, and Modeling

# Imports 

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn.model_selection
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn import tree, metrics
from sklearn import preprocessing
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import roc_auc_score

# Load Data

In [2]:
heart = pd.read_csv('./Data/Heart.csv')
heart.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


# Split Training Data 

In [3]:
pd.unique(heart.target)

array([1, 0])

In [4]:
# Splitted data into a training and test set

Xlr, Xtestlr, ylr, ytestlr = train_test_split(heart[['age','chol']].values, 
                                              (heart.target).values,random_state=5)

## Penalty L1

In [5]:
Cs = [0,0.01,0.1,0.5,1,5,10]
param_grid = {'C': Cs}
clf = LogisticRegression(penalty = 'l1',solver = 'liblinear')


clf_cv = GridSearchCV(estimator= clf, param_grid=param_grid, scoring='roc_auc', cv=5, n_jobs=-1)

clf_cv.fit(Xlr, ylr)

print(clf_cv.best_params_)
print(clf_cv.best_score_)
print()

print(roc_auc_score(ytestlr, clf_cv.predict_proba(Xtestlr)[:, 1]))

{'C': 10}
0.6185076923076923

0.7008310249307479


 0.61850769]


## Penatly L2

In [6]:
Cs = [0,0.01,0.1,0.5,1,5,10]
param_grid = {'C': Cs}
clf = LogisticRegression(penalty = 'l2')


clf_cv = GridSearchCV(estimator= clf, param_grid=param_grid, scoring='roc_auc', cv=5, n_jobs=-1)

clf_cv.fit(Xlr, ylr)

print(clf_cv.best_params_)
print(clf_cv.best_score_)
print()

print(roc_auc_score(ytestlr, clf_cv.predict_proba(Xtestlr)[:, 1]))


{'C': 0.1}
0.6193076923076923

0.7008310249307479


 0.61930769]


# Decision Tree 

In [7]:
#max_depth=None, min_samples_split=2

max_depth = [None,2,4,8]
min_samples_split = [2,4,6,8]

param_grid = {'max_depth': max_depth, 'min_samples_split': min_samples_split}
clf = tree.DecisionTreeClassifier()


clf_cv = GridSearchCV(estimator= clf, param_grid=param_grid, scoring='roc_auc', cv=5, n_jobs=-1)

clf_cv.fit(Xlr, ylr)

print(clf_cv.best_params_)
print(clf_cv.best_score_)
print()

print(roc_auc_score(ytestlr, clf_cv.predict_proba(Xtestlr)[:, 1]))

{'max_depth': 2, 'min_samples_split': 2}
0.5812461538461537

0.5799861495844876


# Gradient Boosting

In [8]:
max_depth = [None,2,4,8]
min_samples_split = [2,4,6,8]
n_estimators = [10,100,400]


param_grid = {'max_depth': max_depth, 'min_samples_split': min_samples_split,'n_estimators':n_estimators}
clf = GradientBoostingClassifier()


clf_cv = GridSearchCV(estimator= clf, param_grid=param_grid, scoring='roc_auc', cv=5, n_jobs=-1)

clf_cv.fit(Xlr, ylr)

print(clf_cv.best_params_)
print(clf_cv.best_score_)
print()

print(roc_auc_score(ytestlr, clf_cv.predict_proba(Xtestlr)[:, 1]))

{'max_depth': 2, 'min_samples_split': 2, 'n_estimators': 10}
0.5964153846153846

0.6755540166204986


# Random Forest

In [9]:
max_depth = [None,2,4,8]
min_samples_split = [2,4,6,8]
n_estimators = [10,100,400]


param_grid = {'max_depth': max_depth, 'min_samples_split': min_samples_split,'n_estimators':n_estimators}
clf = RandomForestClassifier()


clf_cv = GridSearchCV(estimator= clf, param_grid=param_grid, scoring='roc_auc', cv=5, n_jobs=-1)

clf_cv.fit(Xlr, ylr)

print(clf_cv.best_params_)
print(clf_cv.best_score_)
print()

print(roc_auc_score(ytestlr, clf_cv.predict_proba(Xtestlr)[:, 1]))

{'max_depth': 2, 'min_samples_split': 6, 'n_estimators': 10}
0.632

0.6582409972299169


Our primary basis was to calculate the area under the curve, measuring the degree and the probability curve(ROC). Within all of our models, the best ROC_AUC Score we got was Penalty L1 with a score of 0.7008310249307479, being closest to 1. This helps us better understand that our benchmark prediction of our data was fairly high, in addition to understanding that the probabiltiy of heart rate disease was high as well to those with age. 
