In [414]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.decomposition import PCA

In [415]:
X = pd.read_csv('heart_failure.csv')

In [416]:
X.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [417]:
X.groupby('HeartDisease').mean()

Unnamed: 0_level_0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak
HeartDisease,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,50.55122,130.180488,227.121951,0.107317,148.15122,0.408049
1,55.899606,134.185039,175.940945,0.334646,127.655512,1.274213


In [418]:
le = LabelEncoder()
X.Sex = le.fit_transform(X.Sex)
X.ExerciseAngina = le.fit_transform(X.ExerciseAngina)

In [419]:
X.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,1,ATA,140,289,0,Normal,172,0,0.0,Up,0
1,49,0,NAP,160,180,0,Normal,156,0,1.0,Flat,1
2,37,1,ATA,130,283,0,ST,98,0,0.0,Up,0
3,48,0,ASY,138,214,0,Normal,108,1,1.5,Flat,1
4,54,1,NAP,150,195,0,Normal,122,0,0.0,Up,0


In [420]:
X.ChestPainType.unique(), X.RestingECG.unique(), X.ST_Slope.unique()

(array(['ATA', 'NAP', 'ASY', 'TA'], dtype=object),
 array(['Normal', 'ST', 'LVH'], dtype=object),
 array(['Up', 'Flat', 'Down'], dtype=object))

In [421]:
min_thr1, max_thr1 = X.RestingBP.quantile([0.01, 0.995])
min_thr2, max_thr2 = X.Cholesterol.quantile([0.01, 0.995])
min_thr3, max_thr3 = X.MaxHR.quantile([0.01, 0.995])

In [422]:
X.shape

(918, 12)

In [423]:
X1 = X[(X['RestingBP'] < max_thr1) & (X['RestingBP'] > min_thr1)]
X1.shape

(902, 12)

In [424]:
X1 = X1[(X1['Cholesterol'] < max_thr2) & (X1['Cholesterol'] > min_thr2)]
X1.shape

(735, 12)

In [425]:
X1 = X1[(X1['MaxHR'] < max_thr3) & (X1['MaxHR'] > min_thr3)]
X1.shape

(727, 12)

In [426]:
X1.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,1,ATA,140,289,0,Normal,172,0,0.0,Up,0
1,49,0,NAP,160,180,0,Normal,156,0,1.0,Flat,1
2,37,1,ATA,130,283,0,ST,98,0,0.0,Up,0
3,48,0,ASY,138,214,0,Normal,108,1,1.5,Flat,1
4,54,1,NAP,150,195,0,Normal,122,0,0.0,Up,0


In [427]:
X2 = pd.get_dummies(X1, drop_first=True)
X2.head()

Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,HeartDisease,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_Normal,RestingECG_ST,ST_Slope_Flat,ST_Slope_Up
0,40,1,140,289,0,172,0,0.0,0,1,0,0,1,0,0,1
1,49,0,160,180,0,156,0,1.0,1,0,1,0,1,0,1,0
2,37,1,130,283,0,98,0,0.0,0,1,0,0,0,1,0,1
3,48,0,138,214,0,108,1,1.5,1,0,0,0,1,0,1,0
4,54,1,150,195,0,122,0,0.0,0,0,1,0,1,0,0,1


In [428]:
X2.shape

(727, 16)

In [429]:
y = X2['HeartDisease']
X3 = X2.drop(columns=['HeartDisease'])

In [430]:
scaler = StandardScaler()
X4 = scaler.fit_transform(X2)
X4[:3]

array([[-1.38071969,  0.56729021,  0.42052653,  0.8646515 , -0.44684446,
         1.33971461, -0.7960577 , -0.84193047, -0.95296195,  1.86752626,
        -0.53971902, -0.24129684,  0.82871886, -0.45567664, -0.94771954,
         1.05808011],
       [-0.42544329, -1.76276619,  1.61943943, -1.17589823, -0.44684446,
         0.66896351, -0.7960577 ,  0.09112357,  1.04935984, -0.53546771,
         1.85281592, -0.24129684,  0.82871886, -0.45567664,  1.05516449,
        -0.94510802],
       [-1.69914516,  0.56729021, -0.17892992,  0.75232766, -0.44684446,
        -1.76250923, -0.7960577 , -0.84193047, -0.95296195,  1.86752626,
        -0.53971902, -0.24129684, -1.20668185,  2.19453868, -0.94771954,
         1.05808011]])

In [431]:
model_params = {'log_reg':
               {'model': LogisticRegression(max_iter=200),
                'params': {'C': [1, 5, 10, 15],
                           'solver': ['liblinear', 'lbfgs']}},
               
               'tree':
               {'model': DecisionTreeClassifier(),
                'params': {'max_depth': [1, 5, 10, 15],
                           'min_samples_split': [2, 5, 8]}},
               
               'svm': {'model': SVC(),
                       'params': {'kernel': ['linear', 'rbf'],
                                  'C': [1, 3, 5, 7]}}}

In [432]:
scores = []

for model_name, mp in model_params.items():
    clf = GridSearchCV(mp['model'], mp['params'], cv=5, return_train_score=False)
    clf.fit(X4, y)
    scores.append({'model': model_name,
                   'best_params': clf.best_params_,
                   'best_score': clf.best_score_})

In [433]:
aqwe = pd.DataFrame(scores, columns=['model', 'best_params', 'best_score'])
aqwe

Unnamed: 0,model,best_params,best_score
0,log_reg,"{'C': 1, 'solver': 'liblinear'}",1.0
1,tree,"{'max_depth': 1, 'min_samples_split': 2}",1.0
2,svm,"{'C': 1, 'kernel': 'linear'}",1.0


In [434]:
pca = PCA(0.90)
X_pca = pca.fit_transform(X4)

In [435]:
X_pca.shape

(727, 12)

In [436]:
scores1 = []

for model_name, mp in model_params.items():
    clf = GridSearchCV(mp['model'], mp['params'], cv=5, return_train_score=False)
    clf.fit(X_pca, y)
    scores1.append({'model': model_name,
                   'best_params': clf.best_params_,
                   'best_score': clf.best_score_})

In [437]:
aqwe_pca = pd.DataFrame(scores1, columns=['model', 'best_params', 'best_score'])
aqwe_pca

Unnamed: 0,model,best_params,best_score
0,log_reg,"{'C': 1, 'solver': 'liblinear'}",0.936656
1,tree,"{'max_depth': 5, 'min_samples_split': 8}",0.892603
2,svm,"{'C': 5, 'kernel': 'linear'}",0.938026
