### ___PACKAGE IMPORTS___ 

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, f1_score, accuracy_score, precision_score, recall_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb


In this notebook, I am going to be working on the heart disease dataset from [kaggle](https://www.kaggle.com/ronitf/heart-disease-uci).

I am going to explore the data to gain insights and also make a model to predict the presence of a heart disease.

In [3]:
data = pd.read_csv(r'C:\Users\owner\Desktop\New folder\heart.csv')
data

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


##### ___ATTRIBUTE INFORMATION___
*AGE : Age of patient in years*.

*SEX : Sex(1 = male; 0 = female)*.

*CP :  Chest pain type (0=Typical angina; 1=atypical angina; 2=non-anginal pain; 4=asyptomatic)*

*TRESTBPS : Resting blood pressure in mmHg*

*CHOL : Serum cholesterol in mg/dl*

*FBS : Fasting blood sugar > 120mg/dl (1 = yes; 0 = no)*

*RESTECG : Resting electrocardiographic results (0=normal; 1=having ST-T wave above normality; 2=Showing probable or definite left ventricular hypertrophy by Estes' criteria)*
           
*THALACH : Maximum heart rate achieved.*

*EXANG : Exercise induced angina (1=yes ; 0=no)*

*OLDPEAK : ST depression induced by exercise relative to rest.*

*SLOPE : The slope of the peak exercise ST segment (0=upsloping; 1=flat; 2=downsloping)*

*CA : Number of major vessels(0-3) coloured by flourosopy*

*THAL : Thalassemia; a blood disorder.(1=Normal; 2=fixed defect; 3=reversable defect)*

*TARGET : Diagnosis of heart disease. Angiographic disease status (0 = <50% diameter narrowing/No heart disease/no;  1 = >50% diameter narrowing/Presence of heart disease/yes.*

>SOURCE : [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/heart+disease)

#### ___DATA CLEANING___ 

In [4]:
df = data.copy()
df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalach   303 non-null    int64  
 8   exang     303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        303 non-null    int64  
 12  thal      303 non-null    int64  
 13  target    303 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 33.2 KB


In [6]:
# Checking for null values.
df.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

There are no null values, hence there will be no need to fill or drop rows or columns.

### ___DATA PREPROCESSING___

In [7]:
df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [8]:
# Getting our X and Y values from the dataframe.
x = df.iloc[:,:-1]
y = df.iloc[:, -1]

In [9]:
# Splitting our data into training and test sets.
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.3, random_state=42)

In [10]:
# Selecting important features from the data based on the decision tree classifier model. 
feat = SelectFromModel(DecisionTreeClassifier(max_depth=6).fit(xtrain, ytrain), prefit=True, max_features=5)
print(feat.get_support())
[df.columns[i] for i in feat.get_support(indices=True)]

[ True False  True False False False False  True False False False  True
  True]


['age', 'cp', 'thalach', 'ca', 'thal']

In [11]:
newx_train = feat.transform(xtrain)


X has feature names, but SelectFromModel was fitted without feature names



In [12]:
newx_train.shape

(212, 5)

In [13]:
# Scaling the features using standard scaler.
scaler = StandardScaler()
xtrain_std = scaler.fit_transform(newx_train)

### ___MODEL BUILDING___

In [14]:
kfinner = KFold(n_splits=5, shuffle=True)
kfouter = KFold(n_splits=10, shuffle=True)

##### _Logistic Regression_

In [15]:
pa = {'C':[0.001,0.01,0.1,0.5,1,2,5]}
lr = LogisticRegression()
lr_clf = GridSearchCV(lr, pa, cv=kfinner).fit(xtrain_std, ytrain)
lr_model = lr_clf.best_estimator_
lr_clf.best_params_, lr_clf.best_score_

({'C': 0.5}, 0.8162790697674419)

In [16]:
cross_val_score(lr_model, xtrain_std, ytrain, cv=kfouter).mean()

0.8164502164502163

In [17]:
lr_ypred = lr_model.predict(scaler.transform(feat.transform(xtest)))
lr_ypred


X has feature names, but SelectFromModel was fitted without feature names



array([0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0,
       0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0,
       0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 1], dtype=int64)

In [18]:
print("""
    Precision Score : {0}
    Accuracy Score : {1}
    Recall Score : {2}
    F1 Score : {3}
""".format(precision_score(ytest, lr_ypred), accuracy_score(ytest, lr_ypred), recall_score(ytest, lr_ypred), f1_score(ytest, lr_ypred)))


    Precision Score : 0.8367346938775511
    Accuracy Score : 0.8131868131868132
    Recall Score : 0.82
    F1 Score : 0.8282828282828283



##### _Support Vector Classifier Model_

In [19]:
param = {'kernel':['linear', 'rbf'], 'C':[0.001,0.01,0.1,0.5,1,2,5]}
svm = SVC()
clf =GridSearchCV(svm, param, cv=kfinner).fit(xtrain_std, ytrain)
model = clf.best_estimator_
clf.best_params_, clf.best_score_

({'C': 1, 'kernel': 'rbf'}, 0.8302325581395349)

In [20]:
cross_val_score(model, xtrain_std, ytrain, cv=kfouter).mean()

0.7926406926406926

In [21]:
ypred = model.predict(scaler.transform(feat.transform(xtest)))
ypred


X has feature names, but SelectFromModel was fitted without feature names



array([0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0,
       0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0,
       0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 1], dtype=int64)

In [22]:
print("""
    Precision Score : {0}
    Accuracy Score : {1}
    Recall Score : {2}
    F1 Score : {3}
""".format(precision_score(ytest, ypred), accuracy_score(ytest, ypred), recall_score(ytest, ypred), f1_score(ytest, ypred)))


    Precision Score : 0.8541666666666666
    Accuracy Score : 0.8241758241758241
    Recall Score : 0.82
    F1 Score : 0.836734693877551



##### _K-Nearest Neighbours Classifier_

In [23]:
param = {'n_neighbors':np.arange(1,20)}
knn = KNeighborsClassifier()
knn_clf = GridSearchCV(knn, param, cv=kfinner).fit(xtrain_std, ytrain)
knn_model = knn_clf.best_estimator_
knn_clf.best_params_, knn_clf.best_score_

({'n_neighbors': 4}, 0.8068660022148393)

In [24]:
cross_val_score(knn_model, xtrain_std, ytrain, cv=kfouter).mean()

0.7785714285714286

In [25]:
knn_ypred = knn_model.predict(scaler.transform(feat.transform(xtest)))
knn_ypred


X has feature names, but SelectFromModel was fitted without feature names



array([0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 1, 1], dtype=int64)

In [26]:
print("""
    Precision Score : {0}
    Accuracy Score : {1}
    Recall Score : {2}
    F1 Score : {3}
""".format(precision_score(ytest, knn_ypred), accuracy_score(ytest, knn_ypred), recall_score(ytest, knn_ypred), f1_score(ytest, knn_ypred)))


    Precision Score : 0.8536585365853658
    Accuracy Score : 0.7692307692307693
    Recall Score : 0.7
    F1 Score : 0.7692307692307692



##### _Decision Tree Classifier_

In [27]:
par = {'max_depth':np.arange(4,10)}
dt = DecisionTreeClassifier()
dt_clf = GridSearchCV(dt, par, cv=kfinner).fit(xtrain_std, ytrain)
dt_model = dt_clf.best_estimator_
dt_clf.best_params_, dt_clf.best_score_

({'max_depth': 5}, 0.7930232558139535)

In [28]:
cross_val_score(dt_model, xtrain_std, ytrain, cv=kfouter).mean()

0.7634199134199134

In [29]:
dt_ypred = dt_model.predict(scaler.transform(feat.transform(xtest)))
dt_ypred


X has feature names, but SelectFromModel was fitted without feature names



array([0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0,
       0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1,
       0, 0, 1], dtype=int64)

In [30]:
print("""
    Precision Score : {0}
    Accuracy Score : {1}
    Recall Score : {2}
    F1 Score : {3}
""".format(precision_score(ytest, dt_ypred), accuracy_score(ytest, dt_ypred), recall_score(ytest, dt_ypred), f1_score(ytest, dt_ypred)))


    Precision Score : 0.7755102040816326
    Accuracy Score : 0.7472527472527473
    Recall Score : 0.76
    F1 Score : 0.7676767676767676



##### _Random Forest Classifier_

In [31]:
para = {'n_estimators':np.arange(10,101,10), 'max_depth':np.arange(4,10)}
rf = RandomForestClassifier()
rf_clf = GridSearchCV(rf, para, cv=kfinner).fit(xtrain_std, ytrain)
rf_model = rf_clf.best_estimator_
rf_clf.best_params_, rf_clf.best_score_

({'max_depth': 4, 'n_estimators': 20}, 0.8399778516057586)

In [32]:
rf_model

RandomForestClassifier(max_depth=4, n_estimators=20)

In [33]:
cross_val_score(rf_model, xtrain_std, ytrain, cv=kfouter).mean()

0.8112554112554113

In [34]:
rf_ypred = rf_model.predict(scaler.transform(feat.transform(xtest)))
rf_ypred


X has feature names, but SelectFromModel was fitted without feature names



array([0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0,
       0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 1], dtype=int64)

In [35]:
print("""
    Precision Score : {0}
    Accuracy Score : {1}
    Recall Score : {2}
    F1 Score : {3}
""".format(precision_score(ytest, rf_ypred), accuracy_score(ytest, rf_ypred), recall_score(ytest, rf_ypred), f1_score(ytest, rf_ypred)))


    Precision Score : 0.8125
    Accuracy Score : 0.7802197802197802
    Recall Score : 0.78
    F1 Score : 0.7959183673469388



##### _Voting Classifier_

In [36]:
vt_clf = VotingClassifier(estimators=[
    ('LR', lr_clf),
    ('SVM', clf),
    ('KNN', knn_clf),
    ('RF', rf_clf),
    ('DT', dt_clf)
], voting='hard')
vt_clf.fit(xtrain_std, ytrain)

VotingClassifier(estimators=[('LR',
                              GridSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=True),
                                           estimator=LogisticRegression(),
                                           param_grid={'C': [0.001, 0.01, 0.1,
                                                             0.5, 1, 2, 5]})),
                             ('SVM',
                              GridSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=True),
                                           estimator=SVC(),
                                           param_grid={'C': [0.001, 0.01, 0.1,
                                                             0.5, 1, 2, 5],
                                                       'kernel': ['linear',
                                                                  'rbf']})),
                             ('KNN',
                              GridSe...
                              GridSearchCV(cv=KFold(n

In [37]:
cross_val_score(vt_clf, xtrain_std, ytrain, cv=kfouter).mean()

0.8158008658008657

In [38]:
vt_ypred = vt_clf.predict(scaler.transform(feat.transform(xtest)))
vt_ypred


X has feature names, but SelectFromModel was fitted without feature names



array([0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0,
       0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0,
       0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 1], dtype=int64)

In [39]:
print("""
    Precision Score : {0}
    Accuracy Score : {1}
    Recall Score : {2}
    F1 Score : {3}
""".format(precision_score(ytest, vt_ypred), accuracy_score(ytest, vt_ypred), recall_score(ytest, vt_ypred), f1_score(ytest, vt_ypred)))


    Precision Score : 0.8367346938775511
    Accuracy Score : 0.8131868131868132
    Recall Score : 0.82
    F1 Score : 0.8282828282828283



#### _Extreme Gradient Boosting_

In [40]:
xgb.XGBClassifier().get_params()

{'objective': 'binary:logistic',
 'use_label_encoder': True,
 'base_score': None,
 'booster': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': None,
 'enable_categorical': False,
 'gamma': None,
 'gpu_id': None,
 'importance_type': None,
 'interaction_constraints': None,
 'learning_rate': None,
 'max_delta_step': None,
 'max_depth': None,
 'min_child_weight': None,
 'missing': nan,
 'monotone_constraints': None,
 'n_estimators': 100,
 'n_jobs': None,
 'num_parallel_tree': None,
 'predictor': None,
 'random_state': None,
 'reg_alpha': None,
 'reg_lambda': None,
 'scale_pos_weight': None,
 'subsample': None,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None}

In [41]:
# boost = xgb.XGBClassifier(learning_rate=0.1,
#                             use_label_encoder=False,
#                             max_depth=5, 
#                             n_estimators=5000,
#                             subsample=0.5, 
#                             colsample_bytree=0.5, 
#                             eval_metric='auc',      
#                             verbosity=1)

# boost.fit(xtrain_std, ytrain,
#                         early_stopping_rounds=10,
#                         eval_set = [(scaler.transform(feat.transform(xtest)), ytest)],
#                         verbose = True)


In [42]:
params_dict = {'learning_rate':[0.001, 0.05, 0.1],
               'max_depth':np.arange(1,11,3),
               'n_estimators':[1000,3000,5000]}

boost_clf = GridSearchCV(xgb.XGBClassifier(use_label_encoder=False,
                                           subsample=0.5, 
                                           colsample_bytree=0.5, 
                                           eval_metric='auc',      
                                           verbosity=1),
                                           params_dict,
                                           cv=kfinner,
                                           return_train_score=True)

boost_clf.fit(xtrain_std, ytrain)

GridSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=True),
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=0.5,
                                     enable_categorical=False,
                                     eval_metric='auc', gamma=None, gpu_id=None,
                                     importance_type=None,
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=N...
                                     n_estimators=100, n_jobs=None,
                                     num_parallel_tree=None, predictor=None,
                                     random_state=None, reg_alpha=None,
                                     reg_lambda=None, scale_pos_wei

In [43]:
boost_model = boost_clf.best_estimator_

In [44]:
cross_val_score(boost_model, xtrain_std, ytrain, cv=kfouter).mean()

0.7924242424242425

In [45]:
boost_ypred = boost_model.predict(scaler.transform(feat.transform(xtest)))


X has feature names, but SelectFromModel was fitted without feature names



In [46]:
print("""
    Precision Score : {0}
    Accuracy Score : {1}
    Recall Score : {2}
    F1 Score : {3}
""".format(precision_score(ytest, boost_ypred), accuracy_score(ytest, boost_ypred), recall_score(ytest, boost_ypred), f1_score(ytest, boost_ypred)))


    Precision Score : 0.8571428571428571
    Accuracy Score : 0.8351648351648352
    Recall Score : 0.84
    F1 Score : 0.8484848484848485

