In [1]:
# import the library and tools and model
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score

# EDA


In [2]:
df = pd.read_csv('/kaggle/input/heart-failure-prediction/heart.csv')
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


# Preprocessing

### Define features X and Target y

In [4]:
x = df.drop('HeartDisease',axis =1)
x

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up
...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat


In [5]:
y = df['HeartDisease']
y

0      0
1      1
2      0
3      1
4      0
      ..
913    1
914    1
915    1
916    1
917    0
Name: HeartDisease, Length: 918, dtype: int64

### Encode string columns 

In [6]:
l_encode = LabelEncoder()
x['Sex'] = l_encode.fit_transform(x['Sex'])
x['ChestPainType'] = l_encode.fit_transform(x['ChestPainType'])
x['RestingECG'] = l_encode.fit_transform(x['RestingECG'])
x['ExerciseAngina'] = l_encode.fit_transform(x['ExerciseAngina'])
x['ST_Slope'] = l_encode.fit_transform(x['ST_Slope'])
x

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope
0,40,1,1,140,289,0,1,172,0,0.0,2
1,49,0,2,160,180,0,1,156,0,1.0,1
2,37,1,1,130,283,0,2,98,0,0.0,2
3,48,0,0,138,214,0,1,108,1,1.5,1
4,54,1,2,150,195,0,1,122,0,0.0,2
...,...,...,...,...,...,...,...,...,...,...,...
913,45,1,3,110,264,0,1,132,0,1.2,1
914,68,1,0,144,193,1,1,141,0,3.4,1
915,57,1,0,130,131,0,1,115,1,1.2,1
916,57,0,1,130,236,0,0,174,0,0.0,1


In [7]:
# Another way to encoding in all data in x
# x = x.apply(LabelEncoder().fit_transform)

### Data Scaling 

In [8]:
l_scal = StandardScaler()
x = l_scal.fit_transform(x)

### Split Data into train and test

In [9]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0, shuffle=True)

In [10]:
x_train

array([[ 1.74904758,  0.51595242, -0.81699495, ...,  1.21424608,
         1.04375945, -0.59607813],
       [-0.79670232,  0.51595242, -0.81699495, ...,  1.21424608,
         0.57471149, -0.59607813],
       [ 1.21868302, -1.93816322,  1.27505906, ..., -0.8235563 ,
        -0.08195566,  1.05211381],
       ...,
       [ 0.37009972, -1.93816322, -0.81699495, ..., -0.8235563 ,
        -0.83243239,  1.05211381],
       [ 1.11261011,  0.51595242, -0.81699495, ...,  1.21424608,
         2.91995129, -2.24427006],
       [-0.69062941,  0.51595242,  1.27505906, ..., -0.8235563 ,
        -0.83243239,  1.05211381]])

In [11]:
x_test

array([[ 0.15795389,  0.51595242, -0.81699495, ..., -0.8235563 ,
        -0.7386228 , -0.59607813],
       [ 1.32475593,  0.51595242, -0.81699495, ..., -0.8235563 ,
        -0.45719402, -0.59607813],
       [-0.26633776,  0.51595242, -0.81699495, ..., -0.8235563 ,
        -0.83243239, -0.59607813],
       ...,
       [-0.16026485,  0.51595242, -0.81699495, ..., -0.8235563 ,
        -0.83243239,  1.05211381],
       [-1.75135854,  0.51595242,  0.22903206, ..., -0.8235563 ,
        -0.83243239,  1.05211381],
       [-0.05419193,  0.51595242,  1.27505906, ...,  1.21424608,
        -0.83243239, -0.59607813]])

# Create Model

### define and fit model

In [12]:
model = XGBClassifier(n_estimators=10000, learning_rate=0.001)
model.fit(x_train, y_train, early_stopping_rounds=5, eval_set=[(x_test, y_test)])

[0]	validation_0-logloss:0.68177
[1]	validation_0-logloss:0.68122
[2]	validation_0-logloss:0.68066
[3]	validation_0-logloss:0.68011
[4]	validation_0-logloss:0.67957
[5]	validation_0-logloss:0.67901
[6]	validation_0-logloss:0.67847
[7]	validation_0-logloss:0.67792
[8]	validation_0-logloss:0.67737
[9]	validation_0-logloss:0.67683
[10]	validation_0-logloss:0.67629
[11]	validation_0-logloss:0.67574
[12]	validation_0-logloss:0.67520
[13]	validation_0-logloss:0.67465
[14]	validation_0-logloss:0.67410
[15]	validation_0-logloss:0.67356
[16]	validation_0-logloss:0.67301
[17]	validation_0-logloss:0.67247
[18]	validation_0-logloss:0.67193
[19]	validation_0-logloss:0.67138
[20]	validation_0-logloss:0.67084
[21]	validation_0-logloss:0.67030
[22]	validation_0-logloss:0.66978
[23]	validation_0-logloss:0.66923
[24]	validation_0-logloss:0.66869
[25]	validation_0-logloss:0.66817
[26]	validation_0-logloss:0.66762
[27]	validation_0-logloss:0.66710




[28]	validation_0-logloss:0.66658
[29]	validation_0-logloss:0.66604
[30]	validation_0-logloss:0.66551
[31]	validation_0-logloss:0.66499
[32]	validation_0-logloss:0.66448
[33]	validation_0-logloss:0.66396
[34]	validation_0-logloss:0.66342
[35]	validation_0-logloss:0.66291
[36]	validation_0-logloss:0.66239
[37]	validation_0-logloss:0.66186
[38]	validation_0-logloss:0.66135
[39]	validation_0-logloss:0.66084
[40]	validation_0-logloss:0.66032
[41]	validation_0-logloss:0.65981
[42]	validation_0-logloss:0.65929
[43]	validation_0-logloss:0.65878
[44]	validation_0-logloss:0.65828
[45]	validation_0-logloss:0.65777
[46]	validation_0-logloss:0.65726
[47]	validation_0-logloss:0.65675
[48]	validation_0-logloss:0.65624
[49]	validation_0-logloss:0.65574
[50]	validation_0-logloss:0.65524
[51]	validation_0-logloss:0.65472
[52]	validation_0-logloss:0.65422
[53]	validation_0-logloss:0.65372
[54]	validation_0-logloss:0.65323
[55]	validation_0-logloss:0.65272
[56]	validation_0-logloss:0.65221
[57]	validatio

### Evaluate Model

In [13]:
print(model.score(x_train, y_train))

0.9564032697547684


In [14]:
print(model.score(x_test, y_test))

0.8532608695652174
