In [78]:
import pandas as pd
import numpy as np
from scipy.stats import zscore
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

In [39]:
df = pd.read_csv('../heart.csv')
df

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat,1
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat,1
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat,1
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat,1


In [40]:
df.isna().sum()

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64

In [41]:
df = pd.get_dummies(df, columns = ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope'], drop_first=True, dtype = int)

In [42]:
df

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,Sex_M,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_Normal,RestingECG_ST,ExerciseAngina_Y,ST_Slope_Flat,ST_Slope_Up
0,40,140,289,0,172,0.0,0,1,1,0,0,1,0,0,0,1
1,49,160,180,0,156,1.0,1,0,0,1,0,1,0,0,1,0
2,37,130,283,0,98,0.0,0,1,1,0,0,0,1,0,0,1
3,48,138,214,0,108,1.5,1,0,0,0,0,1,0,1,1,0
4,54,150,195,0,122,0.0,0,1,0,1,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,110,264,0,132,1.2,1,1,0,0,1,1,0,0,1,0
914,68,144,193,1,141,3.4,1,1,0,0,0,1,0,0,1,0
915,57,130,131,0,115,1.2,1,1,0,0,0,1,0,1,1,0
916,57,130,236,0,174,0.0,1,0,1,0,0,0,0,0,1,0


In [59]:
columns_outliers = ['RestingBP', 'Cholesterol', 'MaxHR']
z = np.abs(zscore(df[columns_outliers]))
mask = (z < 3).all(axis = 1)
df = df[mask]

In [60]:
X = df.drop('HeartDisease', axis = 1)
y = df['HeartDisease']

In [64]:
scalar = StandardScaler()
X = scalar.fit_transform(X)

In [67]:
models = {
    'SVM': {
        'model': SVC(),
        'parameter': {
            'C': [1, 10, 30],
            'kernel': ['rbf', 'linear']
        }
    },
    'LogisticRegression': {
        'model': LogisticRegression(),
        'parameter': {
            'C': [0.1, 1, 10]
        }
    },
    'RandomForest': {
        'model': RandomForestClassifier(),
        'parameter': {
            'n_estimators': [5, 10, 20, 50]
        }
    }
    ,'GaussianNB': {
        'model': GaussianNB(),
        'parameter': {
            'var_smoothing': [1e-9, 1e-5, 1e-2, 1e-3]
        }
    }
}

In [72]:
best = []
for key in models.keys():
    mod = models.get(key)
    clf = GridSearchCV(mod['model'], mod['parameter'], cv = 5)
    clf.fit(X, y)
    best.append({'model': key, 'best_parameter': clf.best_params_, 'best_score': clf.best_score_})

In [74]:
best_df = pd.DataFrame(best)
best_df

Unnamed: 0,model,best_parameter,best_score
0,SVM,"{'C': 1, 'kernel': 'rbf'}",0.831786
1,LogisticRegression,{'C': 0.1},0.835101
2,RandomForest,{'n_estimators': 50},0.826237
3,GaussianNB,{'var_smoothing': 1e-09},0.837311


In [77]:
pca = PCA(n_components = 0.90)
X_pca = pca.fit_transform(X)

In [81]:
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, train_size = 0.7, random_state=2)
model = GaussianNB()
model.fit(X_train, y_train)

In [82]:
model.score(X_test, y_test)

0.8860294117647058

In [84]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.7, random_state=2)
modela = GaussianNB()
modela.fit(X_train, y_train)