In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation and data-quality warnings raised by
# pandas/scikit-learn; consider narrowing the filter once the notebook is stable.
warnings.filterwarnings('ignore')

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, OrdinalEncoder,LabelEncoder,StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
from sklearn.tree import DecisionTreeClassifier
from sklearn.compose import ColumnTransformer
from sklearn import set_config

In [3]:
# Make transformers return pandas DataFrames instead of NumPy arrays, so the
# later ColumnTransformer steps can keep selecting columns by name.
set_config(transform_output="pandas")

In [4]:
# Load the dataset from the notebook's working directory and preview the first rows.
df = pd.read_csv('heart_disease.csv')
df.head()

Unnamed: 0,Gender,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,HeartStroke
0,Male,39,postgraduate,0,0.0,0.0,no,0,0,195.0,106.0,70.0,26.97,80.0,77.0,No
1,Female,46,primaryschool,0,0.0,0.0,no,0,0,250.0,121.0,81.0,28.73,95.0,76.0,No
2,Male,48,uneducated,1,20.0,0.0,no,0,0,245.0,127.5,80.0,25.34,75.0,70.0,No
3,Female,61,graduate,1,30.0,0.0,no,1,0,225.0,150.0,95.0,28.58,65.0,103.0,yes
4,Female,46,graduate,1,23.0,0.0,no,0,0,285.0,130.0,84.0,23.1,85.0,85.0,No


# Let's Plan

In [5]:
# Canonicalise the target labels (string type, no surrounding whitespace,
# lower case) so that variants such as "No" and "yes" map consistently later on.
df['HeartStroke'] = (
    df['HeartStroke']
    .astype(str)
    .str.strip()
    .str.lower()
)

In [6]:
# Separate predictors from the target and encode the target as 0 ("no") / 1 ("yes").
features = df.drop(columns=['HeartStroke'])
target = df['HeartStroke'].map({"no": 0, "yes": 1})

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
# NOTE(review): the split is not stratified on the target - consider `stratify=`
# if the classes turn out to be imbalanced.
x_train, x_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Preview the training predictors (the target column has been removed).
x_train.head()

Unnamed: 0,Gender,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose
3252,Male,40,postgraduate,1,30.0,0.0,no,0,0,205.0,131.0,81.0,23.74,66.0,87.0
3946,Female,57,primaryschool,0,0.0,0.0,no,1,0,250.0,152.5,92.5,32.31,75.0,94.0
1261,Female,47,uneducated,0,0.0,0.0,no,0,0,230.0,123.0,71.0,26.98,83.0,73.0
2536,Male,41,primaryschool,1,30.0,0.0,no,0,0,228.0,113.0,82.5,25.67,67.0,70.0
4089,Female,64,uneducated,0,0.0,0.0,no,1,0,232.0,149.5,84.0,20.49,68.0,96.0


In [8]:
# Check the (rows, columns) size of the training and test partitions.
x_train.shape, x_test.shape

((3390, 15), (848, 15))

In [9]:
# Column dtypes and non-null counts for the full (unsplit) frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4238 entries, 0 to 4237
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Gender           4238 non-null   object 
 1   age              4238 non-null   int64  
 2   education        4133 non-null   object 
 3   currentSmoker    4238 non-null   int64  
 4   cigsPerDay       4209 non-null   float64
 5   BPMeds           4185 non-null   float64
 6   prevalentStroke  4238 non-null   object 
 7   prevalentHyp     4238 non-null   int64  
 8   diabetes         4238 non-null   int64  
 9   totChol          4188 non-null   float64
 10  sysBP            4238 non-null   float64
 11  diaBP            4238 non-null   float64
 12  BMI              4219 non-null   float64
 13  heartRate        4237 non-null   float64
 14  glucose          3850 non-null   float64
 15  HeartStroke      4238 non-null   object 
dtypes: float64(8), int64(4), object(4)
memory usage: 529.9+ KB


In [10]:
# Number of missing values per column in the full frame.
df.isnull().sum()

Gender               0
age                  0
education          105
currentSmoker        0
cigsPerDay          29
BPMeds              53
prevalentStroke      0
prevalentHyp         0
diabetes             0
totChol             50
sysBP                0
diaBP                0
BMI                 19
heartRate            1
glucose            388
HeartStroke          0
dtype: int64

In [11]:
# Frequency of each education level; used to decide the category order for the
# ordinal encoder and the fill strategy for the missing entries.
df['education'].value_counts()

education
uneducated       1720
primaryschool    1253
graduate          687
postgraduate      473
Name: count, dtype: int64

In [12]:
# Imputation transformer.
# - Numeric columns with gaps are filled with the median learned during fit.
# - The categorical 'education' column is filled with its most frequent level.
# 'heartRate' is numeric, so it is imputed with the other numeric columns rather
# than sharing the 'most_frequent' imputer with the string column 'education';
# mixing the two made the imputer work on a mixed-dtype frame and filled a
# continuous measurement with its mode.
# Remaining columns pass through untouched and column names keep no step prefix.
trf1 = ColumnTransformer([
    ('imputer', SimpleImputer(strategy='median'),
     ['cigsPerDay', 'BPMeds', 'totChol', 'BMI', 'heartRate', 'glucose']),
    ('impute_education', SimpleImputer(strategy='most_frequent'), ['education']),
], remainder='passthrough', verbose_feature_names_out=False)

In [13]:
# Ordinal encoding of 'education': the explicit category list fixes the order,
# so 'uneducated' maps to the lowest code and 'postgraduate' to the highest.
education_levels = ['uneducated', 'primaryschool', 'graduate', 'postgraduate']
trf2 = ColumnTransformer(
    transformers=[
        ('ord_edu', OrdinalEncoder(categories=[education_levels]), ['education']),
    ],
    remainder='passthrough',
    verbose_feature_names_out=False,
)

In [14]:
# One-hot encoding of the nominal columns. The first category of each column is
# dropped, output is dense, and categories unseen during fit are ignored.
nominal_cols = ['Gender', 'prevalentStroke']
nominal_encoder = OneHotEncoder(
    sparse_output=False,
    handle_unknown='ignore',
    drop='first',
)
trf3 = ColumnTransformer(
    transformers=[('ohe_gender_prevStroke', nominal_encoder, nominal_cols)],
    remainder='passthrough',
    verbose_feature_names_out=False,
)

In [15]:
# Columns handed to the scaler: the numeric predictors plus the ordinal-encoded
# 'education' column.
num_cols = [
    "age",
    "currentSmoker",
    "cigsPerDay",
    "BPMeds",
    "prevalentHyp",
    "diabetes",
    "totChol",
    "sysBP",
    "diaBP",
    "BMI",
    "heartRate",
    "glucose",
    "education",
]

In [16]:
# Standardise the columns listed in `num_cols`; every other column is passed
# through unchanged and keeps its original name.
standard_scaler = StandardScaler()
trf4 = ColumnTransformer(
    transformers=[('scale', standard_scaler, num_cols)],
    remainder='passthrough',
    verbose_feature_names_out=False,
)

In [17]:
from functools import partial

# Feature selection: keep the 5 features with the highest mutual information
# with the target.
# mutual_info_classif is a randomised estimator, so without a seed the selected
# features (and every downstream score) can change between runs. The seed is
# bound with functools.partial rather than a lambda so the fitted pipeline
# remains picklable.
trf5 = SelectKBest(
    score_func=partial(mutual_info_classif, random_state=42),
    k=5,
)

In [18]:
# Final estimator: a decision tree classifier.
# random_state is fixed (same seed as the train/test split) so that repeated
# fits give identical trees and scores. max_depth is left at its default here
# and tuned via the 'trf6__max_depth' grid-search parameter.
trf6 = DecisionTreeClassifier(random_state=42)

In [19]:
# Assemble the end-to-end pipeline. Order matters: impute -> ordinal-encode ->
# one-hot-encode -> scale -> select features -> classify.
pipeline_steps = [
    ('trf1', trf1),
    ('trf2', trf2),
    ('trf3', trf3),
    ('trf4', trf4),
    ('trf5', trf5),
    ('trf6', trf6),
]
pipe = Pipeline(steps=pipeline_steps)

In [20]:
# Fit all preprocessing steps and the classifier on the training partition only.
pipe.fit(x_train, y_train)

In [21]:
# Predict class labels (0 = "no", 1 = "yes") for the held-out test partition.
y_pred = pipe.predict(x_test)
y_pred

array([1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0,

In [22]:
# Accuracy: fraction of test rows whose predicted label equals the true label.
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.7594339622641509

# Cross Validation with Pipeline

In [23]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validated accuracy of the whole pipeline on the training
# partition; the mean over the folds is displayed.
cv_scores = cross_val_score(pipe, x_train, y_train, cv=5, scoring='accuracy')
cv_scores.mean()

np.float64(0.7424778761061946)

# GridSearch Cross Validation with Pipeline

In [24]:
# Search space for the grid search. The 'trf6__' prefix targets the pipeline
# step named 'trf6'; None means the tree depth is not limited.
params = {'trf6__max_depth': [1, 2, 3, 4, None]}

In [25]:
from sklearn.model_selection import GridSearchCV

In [26]:
# Exhaustive search over `params`, scoring each candidate by 5-fold accuracy.
grid = GridSearchCV(
    estimator=pipe,
    param_grid=params,
    cv=5,
    scoring='accuracy',
)

In [27]:
# Run the grid search on the training partition only.
grid.fit(x_train, y_train)

In [28]:
# Mean cross-validated accuracy of the best parameter combination.
grid.best_score_

np.float64(0.846607669616519)

In [29]:
# Parameter combination that achieved the best cross-validated score.
grid.best_params_

{'trf6__max_depth': 1}

# Creating Software Deployment File

In [30]:
import pickle

In [32]:
# Persist the fitted pipeline for the deployment app. The 'model/' directory
# must already exist.
# The context manager guarantees the file handle is flushed and closed (the
# previous bare open() call never closed it), even if pickling raises.
# NOTE(review): `pipe` is the pipeline fitted in the earlier `pipe.fit` cell with
# the tree's default max_depth; the grid search fits clones, so the tuned model
# is `grid.best_estimator_`. Confirm which of the two should be shipped.
with open('model/pipe.pkl', 'wb') as model_file:
    pickle.dump(pipe, model_file)