In [305]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier

In [306]:
# Allow up to 100 columns in rendered frames so wide tables are not elided,
# then read the dataset from the CSV file and display it.
pd.set_option("display.max_columns", 100)
df = pd.read_csv("heart.csv")
df

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat,1
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat,1
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat,1
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat,1


In [307]:
# List the column labels of the loaded frame.
print(df.columns.unique())
# For every column report its 1-based position, its name, how many distinct
# values it holds and its dtype.
for position, column in enumerate(df.columns, start=1):
    print("Column ", position, " is ", column, ".It has ", len(df[column].unique()), " unique values.It's data type is ", df[column].dtype)

# Separate the features from the 'HeartDisease' target, then hold out 20% of
# the rows as a test set.
# random_state pins the shuffle so that Restart & Run All reproduces the same
# split (and therefore the same downstream score); without it every run
# trains and evaluates on a different partition.
X = df.drop(['HeartDisease'], axis = 1)
y = df['HeartDisease']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, shuffle = True, random_state = 42)

Index(['Age', 'Sex', 'ChestPainType', 'RestingBP', 'Cholesterol', 'FastingBS',
       'RestingECG', 'MaxHR', 'ExerciseAngina', 'Oldpeak', 'ST_Slope',
       'HeartDisease'],
      dtype='object')
Column  1  is  Age .It has  50  unique values.It's data type is  int64
Column  2  is  Sex .It has  2  unique values.It's data type is  object
Column  3  is  ChestPainType .It has  4  unique values.It's data type is  object
Column  4  is  RestingBP .It has  67  unique values.It's data type is  int64
Column  5  is  Cholesterol .It has  222  unique values.It's data type is  int64
Column  6  is  FastingBS .It has  2  unique values.It's data type is  int64
Column  7  is  RestingECG .It has  3  unique values.It's data type is  object
Column  8  is  MaxHR .It has  119  unique values.It's data type is  int64
Column  9  is  ExerciseAngina .It has  2  unique values.It's data type is  object
Column  10  is  Oldpeak .It has  53  unique values.It's data type is  float64
Column  11  is  ST_Slope .It has  3 

In [308]:
# Build the two column-name lists that the preprocessing pipeline consumes.
target = 'HeartDisease'

# Object-typed feature columns with at most 10 distinct values.
categorical_cols = [
    name
    for name in X.columns
    if X[name].dtype == object and len(X[name].unique()) <= 10
]

# Feature columns whose dtype is int64 or float64.
numerical_cols = [
    name
    for name in X.columns
    if X[name].dtype in ['int64', 'float64']
]

print(categorical_cols)
print(numerical_cols)

['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']
['Age', 'RestingBP', 'Cholesterol', 'FastingBS', 'MaxHR', 'Oldpeak']


In [309]:
# Numerical features: fill missing entries with the column mean.
numerical_transformer = SimpleImputer(strategy='mean')

# Categorical features: fill missing entries with the most frequent value,
# then one-hot encode. handle_unknown='ignore' is passed so that categories
# not seen during fit do not raise at transform time.
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

In [310]:
# Route the numerical and categorical columns through their own transformers
# (defined above) so each group is preprocessed separately.
preproccessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols),
    ]
)

In [311]:
# Instantiate the XGBoost classifier with the chosen hyperparameters.
# NOTE(review): newer XGBoost releases may deprecate `use_label_encoder` --
# confirm against the installed version.
model = XGBClassifier(
    max_depth=10,
    learning_rate=0.25,
    n_jobs=-1,
    use_label_encoder=False,
)

In [312]:
# End-to-end pipeline: preprocessing first, then the classifier.
main_pipeline = Pipeline(
    steps=[
        ('preprocessor', preproccessor),
        ('model', model),
    ]
)


In [313]:
# Fit the preprocessing steps and the classifier on the training split.
main_pipeline.fit(X=X_train, y=y_train)



Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', SimpleImputer(),
                                                  ['Age', 'RestingBP',
                                                   'Cholesterol', 'FastingBS',
                                                   'MaxHR', 'Oldpeak']),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  ['Sex', 'ChestPainType',
                                                   'RestingECG',
                                                   'ExerciseAngina',
                              

In [319]:
# Predict labels for the held-out test split.
y_pred = main_pipeline.predict(X_test)

# Compare the predictions with the true labels and report the accuracy as a
# percentage rounded to two decimals.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("The accuracy of the model on test data is ",round(accuracy*100, 2), "%")

The accuracy of the model on test data is  86.41 %
