In [25]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [6]:
hrt=pd.read_csv('heart.csv')
hrt.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [8]:
hrt.isnull().sum()

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64

In [10]:
hrt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


In [26]:
hrt['Cholesterol'].replace(0, np.nan, inplace=True)

In [29]:
hrt['RestingBP'].replace(0, np.nan, inplace=True)

In [42]:
hrt['Oldpeak']=hrt['Oldpeak'].astype(int)

In [43]:
hrt.columns

Index(['Age', 'Sex', 'ChestPainType', 'RestingBP', 'Cholesterol', 'FastingBS',
       'RestingECG', 'MaxHR', 'ExerciseAngina', 'Oldpeak', 'ST_Slope',
       'HeartDisease'],
      dtype='object')

In [44]:
X=hrt[['Age', 'Sex', 'ChestPainType', 'RestingBP', 'Cholesterol', 'FastingBS',
       'RestingECG', 'MaxHR', 'ExerciseAngina', 'Oldpeak', 'ST_Slope']]
X

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope
0,40,M,ATA,140.0,289.0,0,Normal,172,N,0,Up
1,49,F,NAP,160.0,180.0,0,Normal,156,N,1,Flat
2,37,M,ATA,130.0,283.0,0,ST,98,N,0,Up
3,48,F,ASY,138.0,214.0,0,Normal,108,Y,1,Flat
4,54,M,NAP,150.0,195.0,0,Normal,122,N,0,Up
...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110.0,264.0,0,Normal,132,N,1,Flat
914,68,M,ASY,144.0,193.0,1,Normal,141,N,3,Flat
915,57,M,ASY,130.0,131.0,0,Normal,115,Y,1,Flat
916,57,F,ATA,130.0,236.0,0,LVH,174,N,0,Flat


In [45]:
y=hrt['HeartDisease']

In [33]:
from sklearn.model_selection import train_test_split

In [46]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=25)

In [47]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler

In [52]:
hrt['ST_Slope'].unique()

array(['Up', 'Flat', 'Down'], dtype=object)

In [60]:
num_cols = ['RestingBP', 'Cholesterol']
ord_cols = ['Sex','ExerciseAngina']
oh_cols = ['ChestPainType', 'RestingECG','ST_Slope']

In [61]:
trf = ColumnTransformer([
    ('num', Pipeline([
        ('knn_imputer', KNNImputer(n_neighbors=3)),
        ('scaler', StandardScaler())
    ]), num_cols),

    ('ord', Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('ordinal_encoder', OrdinalEncoder())
    ]), ord_cols),

    ('onehot', Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot_encoder', OneHotEncoder(sparse_output=False, drop='first'))
    ]), oh_cols)
])

In [62]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [67]:
pipe = Pipeline(steps=[
    ('step1',trf),
    ('step2',LogisticRegression(solver='newton-cg'))
])

In [68]:
pipe.fit(X_train,y_train)

In [69]:
y_pred = pipe.predict(X_test)

In [74]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
accuracy_score(y_test,y_pred)

0.8532608695652174

In [71]:
pipe_logreg = Pipeline([
    ('preprocessor', trf),
    ('classifier', LogisticRegression(solver='newton-cg', max_iter=1000))
])

pipe_svm = Pipeline([
    ('preprocessor', trf),
    ('classifier', SVC(probability=True))
])

pipe_dt = Pipeline([
    ('preprocessor', trf),
    ('classifier', DecisionTreeClassifier(random_state=42))
])

pipe_rf = Pipeline([
    ('preprocessor', trf),
    ('classifier', RandomForestClassifier(random_state=42))
])

In [72]:
param_grid_logreg = {
    'classifier__C': [0.01, 0.1, 1, 10],
    'classifier__penalty': ['l2'],
    'classifier__solver': ['newton-cg']
}

param_grid_svm = {
    'classifier__C': [0.1, 1, 10],
    'classifier__kernel': ['linear', 'rbf'],
    'classifier__gamma': ['scale', 'auto']
}

param_grid_dt = {
    'classifier__max_depth': [None, 5, 10, 20],
    'classifier__min_samples_split': [2, 5, 10]
}

param_grid_rf = {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_depth': [None, 10, 20],
    'classifier__min_samples_split': [2, 5]
}


In [75]:
grid_logreg = GridSearchCV(pipe_logreg, param_grid_logreg, cv=5, scoring='accuracy', n_jobs=-1)
grid_svm = GridSearchCV(pipe_svm, param_grid_svm, cv=5, scoring='accuracy', n_jobs=-1)
grid_dt = GridSearchCV(pipe_dt, param_grid_dt, cv=5, scoring='accuracy', n_jobs=-1)
grid_rf = GridSearchCV(pipe_rf, param_grid_rf, cv=5, scoring='accuracy', n_jobs=-1)


In [76]:
grid_logreg.fit(X_train, y_train)
grid_svm.fit(X_train, y_train)
grid_dt.fit(X_train, y_train)
grid_rf.fit(X_train, y_train)

In [77]:
print("Logistic Regression best params:", grid_logreg.best_params_)
print("Logistic Regression best CV accuracy:", grid_logreg.best_score_)

print("SVM best params:", grid_svm.best_params_)
print("SVM best CV accuracy:", grid_svm.best_score_)

print("Decision Tree best params:", grid_dt.best_params_)
print("Decision Tree best CV accuracy:", grid_dt.best_score_)

print("Random Forest best params:", grid_rf.best_params_)
print("Random Forest best CV accuracy:", grid_rf.best_score_)

Logistic Regression best params: {'classifier__C': 1, 'classifier__penalty': 'l2', 'classifier__solver': 'newton-cg'}
Logistic Regression best CV accuracy: 0.851532941943901
SVM best params: {'classifier__C': 10, 'classifier__gamma': 'scale', 'classifier__kernel': 'linear'}
SVM best CV accuracy: 0.8488118535085267
Decision Tree best params: {'classifier__max_depth': 5, 'classifier__min_samples_split': 10}
Decision Tree best CV accuracy: 0.8106700214332309
Random Forest best params: {'classifier__max_depth': 10, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 200}
Random Forest best CV accuracy: 0.841953219644022


In [79]:
import pickle

with open('pipe.pkl', 'rb') as f:
    model = pickle.load(f)

print(model)

Pipeline(steps=[('step1',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('knn_imputer',
                                                                   KNNImputer(n_neighbors=3)),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['RestingBP', 'Cholesterol']),
                                                 ('ord',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('ordinal_encoder',
                                                                   OrdinalEncoder())]),
                                                  ['Sex', 'ExerciseAngina'])