In [39]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import shapiro
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore')

In [40]:
df = pd.read_csv('heart.csv')

In [41]:
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [42]:
## Count of Nulls

In [43]:
df.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [44]:
## Feature Engineering

In [45]:
## Normality Test

In [46]:
stat, p = shapiro(df['trestbps'])
p>0.05

False

In [47]:
stat, p = shapiro(df['chol'])
p>0.05

False

In [48]:
stat, p = shapiro(df['thalach'])
p>0.05

False

In [49]:
stat, p = shapiro(df['oldpeak'])
p>0.05

False

In [50]:
## Verifying variables to encoding

In [51]:
print(df['cp'].value_counts())
print(df['restecg'].value_counts())
print(df['exang'].value_counts())
print(df['slope'].value_counts())
print(df['ca'].value_counts())
print(df['thal'].value_counts())

0    143
2     87
1     50
3     23
Name: cp, dtype: int64
1    152
0    147
2      4
Name: restecg, dtype: int64
0    204
1     99
Name: exang, dtype: int64
2    142
1    140
0     21
Name: slope, dtype: int64
0    175
1     65
2     38
3     20
4      5
Name: ca, dtype: int64
2    166
3    117
1     18
0      2
Name: thal, dtype: int64


In [52]:
## New df with dummies variables

In [53]:
df_adjusted = pd.get_dummies(df, columns =['cp','restecg','slope','ca','thal'] ,prefix =['cp','restecg','slope','ca','thal'])

In [54]:
df_adjusted.head()

Unnamed: 0,age,sex,trestbps,chol,fbs,thalach,exang,oldpeak,target,cp_0,...,slope_2,ca_0,ca_1,ca_2,ca_3,ca_4,thal_0,thal_1,thal_2,thal_3
0,63,1,145,233,1,150,0,2.3,1,0,...,0,1,0,0,0,0,0,1,0,0
1,37,1,130,250,0,187,0,3.5,1,0,...,0,1,0,0,0,0,0,0,1,0
2,41,0,130,204,0,172,0,1.4,1,0,...,1,1,0,0,0,0,0,0,1,0
3,56,1,120,236,0,178,0,0.8,1,0,...,1,1,0,0,0,0,0,0,1,0
4,57,0,120,354,0,163,1,0.6,1,1,...,1,1,0,0,0,0,0,0,1,0


In [55]:
## Adjusting age to >= 60 years and < 60 years

In [56]:
df_adjusted['age'] = df_adjusted['age'].apply(lambda x: 1 if x>=60 else 0)

In [57]:
## Separating variables from target 

In [58]:
X = df_adjusted.drop(columns=['target'])
y = df_adjusted[['target']]

In [59]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [60]:
### Normalizing data

In [61]:
scaler = MinMaxScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# To Dataframe
X_train = pd.DataFrame(X_train_scaled,columns = X_train.columns)
X_test = pd.DataFrame(X_test_scaled,columns = X_test.columns)


In [62]:
X_train.head()

Unnamed: 0,age,sex,trestbps,chol,fbs,thalach,exang,oldpeak,cp_0,cp_1,...,slope_2,ca_0,ca_1,ca_2,ca_3,ca_4,thal_0,thal_1,thal_2,thal_3
0,0.0,0.0,0.0,0.157044,0.0,0.798246,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,1.0,0.367347,0.168591,0.0,1.0,0.0,0.0,0.0,1.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,0.265306,0.203233,0.0,0.614035,0.0,0.285714,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,1.0,0.469388,0.249423,0.0,0.631579,0.0,0.214286,1.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,1.0,0.44898,0.101617,0.0,0.745614,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [63]:
## Starting the model classifier

In [64]:
## XGBoost

In [65]:
xgbcl = XGBClassifier()
xgbcl.fit(X_train,y_train)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=4, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [66]:
y_pred = xgbcl.predict(X_test)

In [67]:
f1_score(y_test, y_pred)

0.82

In [68]:
## Logistic Regression

In [69]:
from sklearn.linear_model import  LogisticRegression

# Create a based model
logreg = LogisticRegression()

# Fit the grid search to the data
logreg.fit(X_train, y_train)

LogisticRegression()

In [70]:
y_pred = logreg.predict(X_test)

In [71]:
f1_score(y_test, y_pred)

0.8599999999999999

In [72]:
## Random Forest

In [190]:
from sklearn.ensemble import RandomForestClassifier

rdmfo = RandomForestClassifier(random_state=78)

rdmfo.fit(X_train, y_train)

y_pred = rdmfo.predict(X_test)

f1_score(y_test, y_pred)

0.8737864077669903

In [74]:
## SVM

In [166]:
from sklearn.svm import SVC
svc = SVC()

svc.fit(X_train, y_train)

y_pred = svc.predict(X_test)

f1_score(y_test, y_pred)

0.8627450980392156

In [None]:
# We'll use random forest to tunning hyperparameters

In [191]:
rdmfo.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 78,
 'verbose': 0,
 'warm_start': False}

In [None]:
### Tunning Hyperparameters

In [192]:
params = {'bootstrap': [True, False],
 'max_depth': [5, 10, None],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [1, 2, 5],
 'n_estimators': [50, 100, 200]}

In [193]:
rdmfo_grid = GridSearchCV(estimator=rdmfo, 
                   param_grid=params,
                   scoring='f1', 
                   verbose=1)

In [194]:
rdmfo_grid.fit(X_train,y_train)

Fitting 5 folds for each of 162 candidates, totalling 810 fits


GridSearchCV(estimator=RandomForestClassifier(random_state=78),
             param_grid={'bootstrap': [True, False], 'max_depth': [5, 10, None],
                         'min_samples_leaf': [1, 2, 4],
                         'min_samples_split': [1, 2, 5],
                         'n_estimators': [50, 100, 200]},
             scoring='f1', verbose=1)

In [195]:
rdmfo_grid.best_params_

{'bootstrap': True,
 'max_depth': 5,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 100}

In [196]:
y_pred = rdmfo_grid.predict(X_test)

In [197]:
f1_score(y_test, y_pred)

0.8627450980392156