In [533]:
import numpy as np
import pandas as pd
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [534]:
df = pd.read_csv('heart_failure.csv')

In [535]:
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [536]:
df.isna().sum()

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64

In [537]:
df.shape

(918, 12)

In [538]:
min_thr1, max_thr1 = df.Cholesterol.mean() - 3 * df.Cholesterol.std(), df.Cholesterol.mean() + 3 * df.Cholesterol.std()
min_thr2, max_thr2 = df.RestingBP.mean() - 3 * df.Cholesterol.std(), df.RestingBP.mean() + 3 * df.RestingBP.std()
min_thr3, max_thr3 = df.MaxHR.mean() - 3 * df.Cholesterol.std(), df.MaxHR.mean() + 3 * df.MaxHR.std()

In [539]:
df1 = df[(df.Cholesterol < max_thr1) & (df.RestingBP < max_thr2) & (df.MaxHR < max_thr3)]
df2 = df1[(df1.Cholesterol > min_thr1) & (df1.RestingBP > min_thr2) & (df1.MaxHR > min_thr3)]
df2.shape

(908, 12)

In [540]:
df2.Sex.unique(), df.ExerciseAngina.unique()

(array(['M', 'F'], dtype=object), array(['N', 'Y'], dtype=object))

In [541]:
le = LabelEncoder()
df2.Sex = le.fit_transform(df2.Sex)
df2.ExerciseAngina = le.fit_transform(df2.ExerciseAngina)
df2.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,1,ATA,140,289,0,Normal,172,0,0.0,Up,0
1,49,0,NAP,160,180,0,Normal,156,0,1.0,Flat,1
2,37,1,ATA,130,283,0,ST,98,0,0.0,Up,0
3,48,0,ASY,138,214,0,Normal,108,1,1.5,Flat,1
4,54,1,NAP,150,195,0,Normal,122,0,0.0,Up,0


In [542]:
df2.ChestPainType.unique(), df2.RestingECG.unique(), df2.ST_Slope.unique()

(array(['ATA', 'NAP', 'ASY', 'TA'], dtype=object),
 array(['Normal', 'ST', 'LVH'], dtype=object),
 array(['Up', 'Flat', 'Down'], dtype=object))

In [543]:
df2 = pd.get_dummies(df2, drop_first=True)

In [544]:
X = df2.drop(columns=['HeartDisease'])
y = df2.HeartDisease
X.head(1)

Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_Normal,RestingECG_ST,ST_Slope_Flat,ST_Slope_Up
0,40,1,140,289,0,172,0,0.0,1,0,0,1,0,0,1


In [545]:
y[:5]

0    0
1    1
2    0
3    1
4    0
Name: HeartDisease, dtype: int64

In [546]:
standsc = StandardScaler()

In [547]:
X1 = standsc.fit_transform(X)
X1[:2]

array([[-1.43407402,  0.51441613,  0.45460383,  0.85066047, -0.5485067 ,
         1.38605198, -0.82175225, -0.83142756,  2.07603986, -0.53319777,
        -0.23100708,  0.81238134, -0.4920712 , -1.        ,  1.14474127],
       [-0.4782586 , -1.94395148,  1.58318969, -0.16483361, -0.5485067 ,
         0.75552332, -0.82175225,  0.10871718, -0.48168632,  1.87547672,
        -0.23100708,  0.81238134, -0.4920712 ,  1.        , -0.87355984]])

In [548]:
parameters = {'kernel':('linear', 'rbf'), 'C':[1, 5, 10]}

svm = SVC()

In [549]:
gs_cv = GridSearchCV(svm, parameters, cv=5)
gs_cv

GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': [1, 5, 10], 'kernel': ('linear', 'rbf')})

In [550]:
gs_cv.fit(X1, y)

GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': [1, 5, 10], 'kernel': ('linear', 'rbf')})

In [551]:
gs_cv.best_score_, gs_cv.best_params_

(0.8292574828486432, {'C': 1, 'kernel': 'rbf'})

In [552]:
parameters1 = {'max_depth': range(1, 5)}

tree = DecisionTreeClassifier()

In [553]:
gs_cv1 = GridSearchCV(tree, parameters1, cv=5)

In [554]:
gs_cv1.fit(X1, y)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),
             param_grid={'max_depth': range(1, 5)})

In [555]:
gs_cv1.best_score_, gs_cv1.best_params_

(0.8137150142675005, {'max_depth': 1})

In [556]:
clf = BaggingClassifier(base_estimator=SVC(C=1, kernel='rbf'),
                        n_estimators=10, random_state=0).fit(X1, y)

In [557]:
cross_val_score(clf, X1, y, cv=5).mean()

0.8237447635237689

In [558]:
random_forest = RandomForestClassifier()

In [559]:
random_forest.fit(X1, y)

RandomForestClassifier()

In [560]:
cross_val_score(random_forest, X1, y, cv=5).mean()

0.8215044623884402