In [1]:
import pandas as pd 
import numpy as np

import seaborn as sns

%matplotlib inline
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels
from sklearn.preprocessing import MinMaxScaler

In [2]:
df = pd.read_csv('heart_dataset.csv')
print(df.value_counts(['HeartDisease'])/len(df))
print(df.value_counts(['HeartDisease']))

df

HeartDisease
No              0.914405
Yes             0.085595
dtype: float64
HeartDisease
No              292422
Yes              27373
dtype: int64


Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.60,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
319790,Yes,27.41,Yes,No,No,7.0,0.0,Yes,Male,60-64,Hispanic,Yes,No,Fair,6.0,Yes,No,No
319791,No,29.84,Yes,No,No,0.0,0.0,No,Male,35-39,Hispanic,No,Yes,Very good,5.0,Yes,No,No
319792,No,24.24,No,No,No,0.0,0.0,No,Female,45-49,Hispanic,No,Yes,Good,6.0,No,No,No
319793,No,32.81,No,No,No,0.0,0.0,No,Female,25-29,Hispanic,No,No,Good,12.0,No,No,No


In [3]:
unique_labels(df['Asthma'])

array(['No', 'Yes'], dtype='<U3')

In [4]:
df.describe().T.style.set_properties(**{'background-color': 'grey','color': 'white','border-color': 'white'})

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
BMI,319795.0,28.325399,6.3561,12.02,24.03,27.34,31.42,94.85
PhysicalHealth,319795.0,3.37171,7.95085,0.0,0.0,0.0,2.0,30.0
MentalHealth,319795.0,3.898366,7.955235,0.0,0.0,0.0,3.0,30.0
SleepTime,319795.0,7.097075,1.436007,1.0,6.0,7.0,8.0,24.0


In [5]:
df

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.60,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
319790,Yes,27.41,Yes,No,No,7.0,0.0,Yes,Male,60-64,Hispanic,Yes,No,Fair,6.0,Yes,No,No
319791,No,29.84,Yes,No,No,0.0,0.0,No,Male,35-39,Hispanic,No,Yes,Very good,5.0,Yes,No,No
319792,No,24.24,No,No,No,0.0,0.0,No,Female,45-49,Hispanic,No,Yes,Good,6.0,No,No,No
319793,No,32.81,No,No,No,0.0,0.0,No,Female,25-29,Hispanic,No,No,Good,12.0,No,No,No


In [6]:
#sns.pairplot(df.select_dtypes(include=np.number))

In [7]:
df

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.60,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
319790,Yes,27.41,Yes,No,No,7.0,0.0,Yes,Male,60-64,Hispanic,Yes,No,Fair,6.0,Yes,No,No
319791,No,29.84,Yes,No,No,0.0,0.0,No,Male,35-39,Hispanic,No,Yes,Very good,5.0,Yes,No,No
319792,No,24.24,No,No,No,0.0,0.0,No,Female,45-49,Hispanic,No,Yes,Good,6.0,No,No,No
319793,No,32.81,No,No,No,0.0,0.0,No,Female,25-29,Hispanic,No,No,Good,12.0,No,No,No


In [8]:
unique_labels(df['Race'])

array(['American Indian/Alaskan Native', 'Asian', 'Black', 'Hispanic',
       'Other', 'White'], dtype='<U30')

In [9]:
dic = {'No': 0, 'Yes': 1}
df['HeartDisease'] = df['HeartDisease'].map(dic).astype('category')
df['Smoking'] = df['Smoking'].map(dic).astype('category')
df['AlcoholDrinking'] = df['AlcoholDrinking'].map(dic).astype('category')
df['Stroke'] = df['Stroke'].map(dic).astype('category')
df['DiffWalking'] = df['DiffWalking'].map(dic).astype('category')
df['PhysicalActivity'] = df['PhysicalActivity'].map(dic).astype('category')
df['Asthma'] = df['Asthma'].map(dic).astype('category')
df['KidneyDisease'] = df['KidneyDisease'].map(dic).astype('category')
df['SkinCancer'] = df['SkinCancer'].map(dic).astype('category')

dic = {'Male': 0, 'Female': 1}
df['Sex'] = df['Sex'].map(dic).astype('category')
 
df

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,0,16.60,1,0,0,3.0,30.0,0,1,55-59,White,Yes,1,Very good,5.0,1,0,1
1,0,20.34,0,0,1,0.0,0.0,0,1,80 or older,White,No,1,Very good,7.0,0,0,0
2,0,26.58,1,0,0,20.0,30.0,0,0,65-69,White,Yes,1,Fair,8.0,1,0,0
3,0,24.21,0,0,0,0.0,0.0,0,1,75-79,White,No,0,Good,6.0,0,0,1
4,0,23.71,0,0,0,28.0,0.0,1,1,40-44,White,No,1,Very good,8.0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
319790,1,27.41,1,0,0,7.0,0.0,1,0,60-64,Hispanic,Yes,0,Fair,6.0,1,0,0
319791,0,29.84,1,0,0,0.0,0.0,0,0,35-39,Hispanic,No,1,Very good,5.0,1,0,0
319792,0,24.24,0,0,0,0.0,0.0,0,1,45-49,Hispanic,No,1,Good,6.0,0,0,0
319793,0,32.81,0,0,0,0.0,0.0,0,1,25-29,Hispanic,No,0,Good,12.0,0,0,0


In [10]:
y = pd.get_dummies(df.Race, prefix='Race')
df = pd.concat([df, y], axis=1)

df

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,...,SleepTime,Asthma,KidneyDisease,SkinCancer,Race_American Indian/Alaskan Native,Race_Asian,Race_Black,Race_Hispanic,Race_Other,Race_White
0,0,16.60,1,0,0,3.0,30.0,0,1,55-59,...,5.0,1,0,1,0,0,0,0,0,1
1,0,20.34,0,0,1,0.0,0.0,0,1,80 or older,...,7.0,0,0,0,0,0,0,0,0,1
2,0,26.58,1,0,0,20.0,30.0,0,0,65-69,...,8.0,1,0,0,0,0,0,0,0,1
3,0,24.21,0,0,0,0.0,0.0,0,1,75-79,...,6.0,0,0,1,0,0,0,0,0,1
4,0,23.71,0,0,0,28.0,0.0,1,1,40-44,...,8.0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
319790,1,27.41,1,0,0,7.0,0.0,1,0,60-64,...,6.0,1,0,0,0,0,0,1,0,0
319791,0,29.84,1,0,0,0.0,0.0,0,0,35-39,...,5.0,1,0,0,0,0,0,1,0,0
319792,0,24.24,0,0,0,0.0,0.0,0,1,45-49,...,6.0,0,0,0,0,0,0,1,0,0
319793,0,32.81,0,0,0,0.0,0.0,0,1,25-29,...,12.0,0,0,0,0,0,0,1,0,0


In [11]:
df.drop('Race', inplace=True, axis=1)

df

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,...,SleepTime,Asthma,KidneyDisease,SkinCancer,Race_American Indian/Alaskan Native,Race_Asian,Race_Black,Race_Hispanic,Race_Other,Race_White
0,0,16.60,1,0,0,3.0,30.0,0,1,55-59,...,5.0,1,0,1,0,0,0,0,0,1
1,0,20.34,0,0,1,0.0,0.0,0,1,80 or older,...,7.0,0,0,0,0,0,0,0,0,1
2,0,26.58,1,0,0,20.0,30.0,0,0,65-69,...,8.0,1,0,0,0,0,0,0,0,1
3,0,24.21,0,0,0,0.0,0.0,0,1,75-79,...,6.0,0,0,1,0,0,0,0,0,1
4,0,23.71,0,0,0,28.0,0.0,1,1,40-44,...,8.0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
319790,1,27.41,1,0,0,7.0,0.0,1,0,60-64,...,6.0,1,0,0,0,0,0,1,0,0
319791,0,29.84,1,0,0,0.0,0.0,0,0,35-39,...,5.0,1,0,0,0,0,0,1,0,0
319792,0,24.24,0,0,0,0.0,0.0,0,1,45-49,...,6.0,0,0,0,0,0,0,1,0,0
319793,0,32.81,0,0,0,0.0,0.0,0,1,25-29,...,12.0,0,0,0,0,0,0,1,0,0


In [12]:
df['Race_American Indian/Alaskan Native'] = df['Race_American Indian/Alaskan Native'].astype('category')
df['Race_Asian'] = df['Race_Asian'].astype('category')
df['Race_Black'] = df['Race_Black'].astype('category')
df['Race_Hispanic'] = df['Race_Hispanic'].astype('category')
df['Race_Other'] = df['Race_Other'].astype('category')    
df['Race_White'] = df['Race_White'].astype('category')

In [13]:
#AgeCategory

dic = {}

j = 0
for i in np.arange(0, 80, 5):
    dic[str(i)+"-"+str(i+4)] = j
    j = j+1
    
dic["80 or older"] = j

dic

{'0-4': 0,
 '5-9': 1,
 '10-14': 2,
 '15-19': 3,
 '20-24': 4,
 '25-29': 5,
 '30-34': 6,
 '35-39': 7,
 '40-44': 8,
 '45-49': 9,
 '50-54': 10,
 '55-59': 11,
 '60-64': 12,
 '65-69': 13,
 '70-74': 14,
 '75-79': 15,
 '80 or older': 16}

In [14]:
df['AgeCategory'] = df['AgeCategory'].map(dic)

In [15]:
x = pd.get_dummies(df.Diabetic, prefix='Diabetic')
df = pd.concat([df, x], axis=1)
df.drop('Diabetic', inplace=True, axis=1)
df

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,...,Race_American Indian/Alaskan Native,Race_Asian,Race_Black,Race_Hispanic,Race_Other,Race_White,Diabetic_No,"Diabetic_No, borderline diabetes",Diabetic_Yes,Diabetic_Yes (during pregnancy)
0,0,16.60,1,0,0,3.0,30.0,0,1,11.0,...,0,0,0,0,0,1,0,0,1,0
1,0,20.34,0,0,1,0.0,0.0,0,1,16.0,...,0,0,0,0,0,1,1,0,0,0
2,0,26.58,1,0,0,20.0,30.0,0,0,13.0,...,0,0,0,0,0,1,0,0,1,0
3,0,24.21,0,0,0,0.0,0.0,0,1,15.0,...,0,0,0,0,0,1,1,0,0,0
4,0,23.71,0,0,0,28.0,0.0,1,1,8.0,...,0,0,0,0,0,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
319790,1,27.41,1,0,0,7.0,0.0,1,0,12.0,...,0,0,0,1,0,0,0,0,1,0
319791,0,29.84,1,0,0,0.0,0.0,0,0,7.0,...,0,0,0,1,0,0,1,0,0,0
319792,0,24.24,0,0,0,0.0,0.0,0,1,9.0,...,0,0,0,1,0,0,1,0,0,0
319793,0,32.81,0,0,0,0.0,0.0,0,1,5.0,...,0,0,0,1,0,0,1,0,0,0


In [16]:
df['Diabetic_No'] = df['Diabetic_No'].astype('category')
df['Diabetic_No, borderline diabetes'] = df['Diabetic_No, borderline diabetes'].astype('category')
df['Diabetic_Yes'] = df['Diabetic_Yes'].astype('category')
df['Diabetic_Yes (during pregnancy)'] = df['Diabetic_Yes (during pregnancy)'].astype('category')

In [17]:
unique_labels(df['GenHealth'])

array(['Excellent', 'Fair', 'Good', 'Poor', 'Very good'], dtype='<U9')

In [18]:
dic = {'Poor': 0, 'Fair': 1, 'Good': 2, 'Very good': 3, 'Excellent': 4}

df['GenHealth'] = df['GenHealth'].map(dic)

In [21]:
df.describe()

Unnamed: 0,BMI,PhysicalHealth,MentalHealth,AgeCategory,GenHealth,SleepTime
count,319795.0,319795.0,319795.0,298731.0,319795.0,319795.0
mean,28.325399,3.37171,3.898366,10.973886,2.595028,7.097075
std,6.3561,7.95085,7.955235,3.224914,1.042918,1.436007
min,12.02,0.0,0.0,5.0,0.0,1.0
25%,24.03,0.0,0.0,8.0,2.0,6.0
50%,27.34,0.0,0.0,11.0,3.0,7.0
75%,31.42,2.0,3.0,14.0,3.0,8.0
max,94.85,30.0,30.0,16.0,4.0,24.0


In [27]:
scaler_x = MinMaxScaler()
data = df[['BMI', 'PhysicalHealth', 'MentalHealth', 'AgeCategory', 'GenHealth', 'SleepTime']]
data

Unnamed: 0,BMI,PhysicalHealth,MentalHealth,AgeCategory,GenHealth,SleepTime
0,16.60,3.0,30.0,11.0,3,5.0
1,20.34,0.0,0.0,16.0,3,7.0
2,26.58,20.0,30.0,13.0,1,8.0
3,24.21,0.0,0.0,15.0,2,6.0
4,23.71,28.0,0.0,8.0,3,8.0
...,...,...,...,...,...,...
319790,27.41,7.0,0.0,12.0,1,6.0
319791,29.84,0.0,0.0,7.0,3,5.0
319792,24.24,0.0,0.0,9.0,2,6.0
319793,32.81,0.0,0.0,5.0,2,12.0


In [28]:
#scaler_x = MinMaxScaler()
#data = df['BMI', 'PhysicalHealth', 'MentalHealth', 'AgeCategory', 'GenHealth', 'SleepTime']
scaler_x.fit(data)
data_scaled = scaler_x.transform(data)
df_scaled = pd.DataFrame(data_scaled, columns=data.columns)

In [30]:
df[['BMI', 'PhysicalHealth', 'MentalHealth', 'AgeCategory', 'GenHealth', 'SleepTime']] = df_scaled[['BMI', 'PhysicalHealth', 'MentalHealth', 'AgeCategory', 'GenHealth', 'SleepTime']]

In [31]:
df

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,...,Race_American Indian/Alaskan Native,Race_Asian,Race_Black,Race_Hispanic,Race_Other,Race_White,Diabetic_No,"Diabetic_No, borderline diabetes",Diabetic_Yes,Diabetic_Yes (during pregnancy)
0,0,0.055294,1,0,0,0.100000,1.0,0,1,0.545455,...,0,0,0,0,0,1,0,0,1,0
1,0,0.100447,0,0,1,0.000000,0.0,0,1,1.000000,...,0,0,0,0,0,1,1,0,0,0
2,0,0.175782,1,0,0,0.666667,1.0,0,0,0.727273,...,0,0,0,0,0,1,0,0,1,0
3,0,0.147169,0,0,0,0.000000,0.0,0,1,0.909091,...,0,0,0,0,0,1,1,0,0,0
4,0,0.141132,0,0,0,0.933333,0.0,1,1,0.272727,...,0,0,0,0,0,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
319790,1,0.185802,1,0,0,0.233333,0.0,1,0,0.636364,...,0,0,0,1,0,0,0,0,1,0
319791,0,0.215139,1,0,0,0.000000,0.0,0,0,0.181818,...,0,0,0,1,0,0,1,0,0,0
319792,0,0.147531,0,0,0,0.000000,0.0,0,1,0.363636,...,0,0,0,1,0,0,1,0,0,0
319793,0,0.250996,0,0,0,0.000000,0.0,0,1,0.000000,...,0,0,0,1,0,0,1,0,0,0


In [32]:
df.describe()

Unnamed: 0,BMI,PhysicalHealth,MentalHealth,AgeCategory,GenHealth,SleepTime
count,319795.0,319795.0,319795.0,298731.0,319795.0,319795.0
mean,0.196854,0.11239,0.129946,0.543081,0.648757,0.26509
std,0.076737,0.265028,0.265175,0.293174,0.26073,0.062435
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.144996,0.0,0.0,0.272727,0.5,0.217391
50%,0.184957,0.0,0.0,0.545455,0.75,0.26087
75%,0.234215,0.066667,0.1,0.818182,0.75,0.304348
max,1.0,1.0,1.0,1.0,1.0,1.0


In [None]:
dic = {0 : 'normal', 1 : 'enfermo', 2 : 'enfermo', 3 : 'enfermo', 4 : 'enfermo'}
df['target'] = df['target'].map(dic)

