In [20]:
# Import packages
import numpy as np
import joblib
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_curve, auc, confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from imblearn.over_sampling import SMOTE

In [21]:
# Read the cleaned 2020 heart-disease CSV (path is relative to the working directory)
data = pd.read_csv(filepath_or_buffer='./Data/2020/heart_2020_cleaned.csv')

In [22]:
# Preview the first five rows to sanity-check the load
data.head(n=5)

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.6,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No


In [23]:
# Inspect the raw column labels (the output shows several of them padded with trailing spaces)
data.columns

Index(['HeartDisease ', 'BMI   ', 'Smoking ', 'AlcoholDrinking ', 'Stroke ',
       'PhysicalHealth ', 'MentalHealth ', 'DiffWalking ', 'Sex    ',
       'AgeCategory ', 'Race                           ',
       'Diabetic                  ', 'PhysicalActivity ', 'GenHealth ',
       'SleepTime ', 'Asthma ', 'KidneyDisease ', 'SkinCancer'],
      dtype='object')

In [24]:
# Strip leading/trailing whitespace from every column label of the dataset
data.columns = [label.strip() for label in data.columns]

In [25]:
# Verify the result: the printed labels should no longer carry surrounding spaces
print(data.columns)

Index(['HeartDisease', 'BMI', 'Smoking', 'AlcoholDrinking', 'Stroke',
       'PhysicalHealth', 'MentalHealth', 'DiffWalking', 'Sex', 'AgeCategory',
       'Race', 'Diabetic', 'PhysicalActivity', 'GenHealth', 'SleepTime',
       'Asthma', 'KidneyDisease', 'SkinCancer'],
      dtype='object')


In [26]:
# Split the data into categorical (object-dtype) and numerical (every other dtype) frames.
# select_dtypes picks the columns directly, so each column keeps its own dtype and there
# is no need to build a row-wise DataFrame from a list of Series and transpose it, which
# is slow and memory-hungry on a frame of this size.
cat_data = data.select_dtypes(include=object)
num_data = data.select_dtypes(exclude=object)
cat_data

Unnamed: 0,HeartDisease,Smoking,AlcoholDrinking,Stroke,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,Asthma,KidneyDisease,SkinCancer
0,No,Yes,No,No,No,Female,55-59,White,Yes,Yes,Very good,Yes,No,Yes
1,No,No,No,Yes,No,Female,80 or older,White,No,Yes,Very good,No,No,No
2,No,Yes,No,No,No,Male,65-69,White,Yes,Yes,Fair,Yes,No,No
3,No,No,No,No,No,Female,75-79,White,No,No,Good,No,No,Yes
4,No,No,No,No,Yes,Female,40-44,White,No,Yes,Very good,No,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
319790,Yes,Yes,No,No,Yes,Male,60-64,Hispanic,Yes,No,Fair,Yes,No,No
319791,No,Yes,No,No,No,Male,35-39,Hispanic,No,Yes,Very good,Yes,No,No
319792,No,No,No,No,No,Female,45-49,Hispanic,No,Yes,Good,No,No,No
319793,No,No,No,No,No,Female,25-29,Hispanic,No,No,Good,No,No,No


In [None]:
# One-hot encode the selected categorical columns.
# NOTE(review): this cell has no execution count, and the label-encoded output further down
# still shows AgeCategory/GenHealth/Race as single integer columns, so it looks like it was
# not run in the saved session. Confirm whether it should run before the label-encoding
# cell or be removed, since a Restart & Run All would execute it.
onehot_columns = ['AgeCategory', 'GenHealth', 'Race']
cat_data = pd.get_dummies(cat_data, columns=onehot_columns)
cat_data.head()

In [27]:
# Label-encode each categorical column as integer codes.
# A separate LabelEncoder is fitted per column and kept in `label_encoders`, so every
# column's value -> code mapping stays available (e.g. for inverse_transform or for
# encoding new data). A single shared encoder refit on each column would only remember
# the mapping of the last column it saw.
le = LabelEncoder()  # keeps `le` defined even if cat_data has no columns
label_encoders = {}
encoded_columns = {}
for col in cat_data.columns:
    le = LabelEncoder()
    encoded_columns[col] = le.fit_transform(cat_data[col])
    label_encoders[col] = le
# After the loop `le` refers to the last column's encoder, as it did before this change.
cat_data = pd.DataFrame(encoded_columns, index=cat_data.index)
cat_data.head()

Unnamed: 0,HeartDisease,Smoking,AlcoholDrinking,Stroke,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,Asthma,KidneyDisease,SkinCancer
0,0,1,0,0,0,0,7,5,2,1,4,1,0,1
1,0,0,0,1,0,0,12,5,0,1,4,0,0,0
2,0,1,0,0,0,1,9,5,2,1,1,1,0,0
3,0,0,0,0,0,0,11,5,0,0,2,0,0,1
4,0,0,0,0,1,0,4,5,0,1,4,0,0,0
