# Heart attack prediction  

### Ce dataset kaggle répertorie 8763 individus à risque d'attaque cardiaque (35%) ou pas (65%).  
### Chaque individu est identifié par :  
-Son patient_ID  
-Son âge, genre, sexe  
-Des données physiologiques  
-Ses habitudes alimentaires  
-Ses habitudes sportives  
-Des données géographiques  

### On va mettre en place un algorithme de classification binaire (risque d'attaque : 1, pas de risque : 0)

# 1-Import

In [1]:
# General
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pycaret
import kaggle

# Data Visualization
import plotly.express as px
import plotly.graph_objs as go
import plotly.subplots as sp
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
from IPython.display import display
from plotly.offline import init_notebook_mode
init_notebook_mode(connected=True)

# Model training
import pycaret as pc
from pycaret.classification import *
from sklearn.preprocessing import QuantileTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, accuracy_score, recall_score, classification_report, confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay 
import mlflow

# 2-Data collection

In [2]:
# Authentification Kaggle
kaggle.api.authenticate()

# Télécharger le dataset depuis Kaggle
kaggle.api.dataset_download_files('iamsouravbanerjee/heart-attack-prediction-dataset', path='.', unzip=True)

In [3]:
data = pd.read_csv(r"C:\Users\John\Desktop\KA-CL-P3-Heart_attack_prediction\heart_attack_prediction_dataset.csv")
data.head()

Unnamed: 0,Patient ID,Age,Sex,Cholesterol,Blood Pressure,Heart Rate,Diabetes,Family History,Smoking,Obesity,...,Sedentary Hours Per Day,Income,BMI,Triglycerides,Physical Activity Days Per Week,Sleep Hours Per Day,Country,Continent,Hemisphere,Heart Attack Risk
0,BMW7812,67,Male,208,158/88,72,0,0,1,0,...,6.615001,261404,31.251233,286,0,6,Argentina,South America,Southern Hemisphere,0
1,CZE1114,21,Male,389,165/93,98,1,1,1,1,...,4.963459,285768,27.194973,235,1,7,Canada,North America,Northern Hemisphere,0
2,BNI9906,21,Female,324,174/99,72,1,0,0,0,...,9.463426,235282,28.176571,587,4,4,France,Europe,Northern Hemisphere,0
3,JLN3497,84,Male,383,163/100,73,1,1,1,0,...,7.648981,125640,36.464704,378,3,4,Canada,North America,Northern Hemisphere,0
4,GFO8847,66,Male,318,91/88,93,1,1,1,1,...,1.514821,160555,21.809144,231,1,5,Thailand,Asia,Northern Hemisphere,0


# 3 - Cleaning

### On va séparer la colonne 'Blood pressure'  en 'Systolic' et 'Diastolic'

In [4]:
# blood pressure type object => Systolic and Diastolic int
data['Systolic'] = list(map(lambda x : int((x).split('/')[0]), data['Blood Pressure']))
data['Diastolic'] = list(map(lambda x : int((x).split('/')[1]), data['Blood Pressure']))
data.drop(columns='Blood Pressure', inplace=True)

# 4-Training with pycaret

### Aucun autre préprocessing n'est nécessaire. Pycaret gère automatiquement le hot encoding, les valeurs manquantes, les outliers, les classes déséquilibrées...
### Pycaret va entraîner par cross validation plusieurs modèles.  
### Nous prenons comme indicateur de performance le rappel (recall) qui est le ratio du nombre de prédictions de risque d'attaque correct sur le nombre de risque d'attaque réel.  

In [5]:
# loading data for preprocessing
classification_setup = pc.classification.setup(data = data, target  = 'Heart Attack Risk')

Unnamed: 0,Description,Value
0,Session id,6808
1,Target,Heart Attack Risk
2,Target type,Binary
3,Original data shape,"(8763, 27)"
4,Transformed data shape,"(8763, 53)"
5,Transformed train set shape,"(6134, 53)"
6,Transformed test set shape,"(2629, 53)"
7,Ordinal features,2
8,Numeric features,20
9,Categorical features,6


In [6]:
# Compare and evaluate different models
best_model = compare_models(sort='Recall',fold = 5)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
svm,SVM - Linear Kernel,0.5287,0.0,0.4,0.1434,0.2112,0.0,0.0,0.298
knn,K Neighbors Classifier,0.5797,0.4996,0.2585,0.3747,0.3059,0.0188,0.0196,0.388
et,Extra Trees Classifier,0.642,0.4922,0.0009,0.1333,0.0018,0.0008,0.0064,0.466
lr,Logistic Regression,0.6418,0.4917,0.0,0.0,0.0,0.0,0.0,1.594
nb,Naive Bayes,0.6418,0.5092,0.0,0.0,0.0,0.0,0.0,0.2
dt,Decision Tree Classifier,0.6418,0.5,0.0,0.0,0.0,0.0,0.0,0.206
ridge,Ridge Classifier,0.6418,0.0,0.0,0.0,0.0,0.0,0.0,0.198
rf,Random Forest Classifier,0.6418,0.498,0.0,0.0,0.0,0.0,0.0,0.632
qda,Quadratic Discriminant Analysis,0.6418,0.5,0.0,0.0,0.0,0.0,0.0,0.22
ada,Ada Boost Classifier,0.6418,0.5,0.0,0.0,0.0,0.0,0.0,0.226


In [7]:
#Fine-tuning the best model (optimise les hyperparamètres et notamment le threshold)
tuned_best_model = tune_model(best_model)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.3583,0.0,1.0,0.3583,0.5276,0.0,0.0
1,0.6417,0.0,0.0,0.0,0.0,0.0,0.0
2,0.6417,0.0,0.0,0.0,0.0,0.0,0.0
3,0.6417,0.0,0.0,0.0,0.0,0.0,0.0
4,0.6427,0.0,0.0,0.0,0.0,0.0,0.0
5,0.3573,0.0,1.0,0.3573,0.5264,0.0,0.0
6,0.6427,0.0,0.0,0.0,0.0,0.0,0.0
7,0.6411,0.0,0.0,0.0,0.0,0.0,0.0
8,0.3589,0.0,1.0,0.3589,0.5282,0.0,0.0
9,0.6411,0.0,0.0,0.0,0.0,0.0,0.0


Fitting 10 folds for each of 10 candidates, totalling 100 fits


In [8]:
# print to view hyperparamaters
print(tuned_best_model)

SGDClassifier(alpha=1e-07, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.4, fit_intercept=False,
              l1_ratio=0.6500000001, learning_rate='optimal', loss='hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=-1,
              penalty='elasticnet', power_t=0.5, random_state=6808,
              shuffle=True, tol=0.001, validation_fraction=0.1, verbose=0,
              warm_start=False)


In [9]:
# fait comme tune_model mais cette fois sur l'ensemble des données
final_model = finalize_model(tuned_best_model)
evaluate_model(final_model)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

### Le meilleur modèle est le SGDClassifier avec un recall de 1 sur les données de test mais la classe 1 a aspiré toute les prédictions ! On peut certainement faire mieux en blendant les 5 meilleurs modèles. 

In [10]:
%%time
top5 = compare_models(n_select=5,sort='Recall')
tuned_top5 = [tune_model(i, optimize='Recall') for i in top5]
blend_model = blend_models(tuned_top5, optimize='Recall')

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
svm,SVM - Linear Kernel,0.4146,0.0,0.795,0.2859,0.4205,-0.0018,-0.0065,0.3
knn,K Neighbors Classifier,0.5796,0.5013,0.2531,0.3735,0.3014,0.0162,0.0171,0.272
et,Extra Trees Classifier,0.6415,0.496,0.0014,0.2,0.0027,0.0001,0.0018,0.435
lr,Logistic Regression,0.6418,0.4957,0.0,0.0,0.0,0.0,0.0,0.232
nb,Naive Bayes,0.6418,0.5098,0.0,0.0,0.0,0.0,0.0,0.187
dt,Decision Tree Classifier,0.6418,0.5,0.0,0.0,0.0,0.0,0.0,0.178
ridge,Ridge Classifier,0.6418,0.0,0.0,0.0,0.0,0.0,0.0,0.185
rf,Random Forest Classifier,0.6418,0.4933,0.0,0.0,0.0,0.0,0.0,0.616
qda,Quadratic Discriminant Analysis,0.6418,0.5,0.0,0.0,0.0,0.0,0.0,0.188
ada,Ada Boost Classifier,0.6418,0.5,0.0,0.0,0.0,0.0,0.0,0.176


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.3583,0.0,1.0,0.3583,0.5276,0.0,0.0
1,0.3583,0.0,1.0,0.3583,0.5276,0.0,0.0
2,0.3583,0.0,1.0,0.3583,0.5276,0.0,0.0
3,0.3583,0.0,1.0,0.3583,0.5276,0.0,0.0
4,0.6427,0.0,0.0,0.0,0.0,0.0,0.0
5,0.6427,0.0,0.0,0.0,0.0,0.0,0.0
6,0.6427,0.0,0.0,0.0,0.0,0.0,0.0
7,0.6411,0.0,0.0,0.0,0.0,0.0,0.0
8,0.3589,0.0,1.0,0.3589,0.5282,0.0,0.0
9,0.3589,0.0,1.0,0.3589,0.5282,0.0,0.0


Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.5505,0.4896,0.3409,0.3641,0.3521,0.0085,0.0086
1,0.5342,0.4833,0.3727,0.3565,0.3644,-0.0029,-0.0029
2,0.5554,0.4997,0.3727,0.3779,0.3753,0.0302,0.0302
3,0.5358,0.4795,0.3409,0.3488,0.3448,-0.0145,-0.0145
4,0.5253,0.4819,0.3516,0.3407,0.3461,-0.0264,-0.0264
5,0.5139,0.4791,0.347,0.329,0.3378,-0.0458,-0.0459
6,0.5416,0.5085,0.3607,0.3591,0.3599,0.0029,0.0029
7,0.5171,0.4765,0.3591,0.3376,0.348,-0.0348,-0.0349
8,0.553,0.5182,0.3636,0.3738,0.3687,0.0228,0.0228
9,0.5579,0.5283,0.3909,0.3857,0.3883,0.0422,0.0422


Fitting 10 folds for each of 10 candidates, totalling 100 fits


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.3583,0.5,1.0,0.3583,0.5276,0.0,0.0
1,0.3583,0.5,1.0,0.3583,0.5276,0.0,0.0
2,0.3583,0.5,1.0,0.3583,0.5276,0.0,0.0
3,0.3583,0.5,1.0,0.3583,0.5276,0.0,0.0
4,0.3573,0.5,1.0,0.3573,0.5264,0.0,0.0
5,0.3573,0.5,1.0,0.3573,0.5264,0.0,0.0
6,0.3573,0.5,1.0,0.3573,0.5264,0.0,0.0
7,0.3589,0.5,1.0,0.3589,0.5282,0.0,0.0
8,0.3589,0.5,1.0,0.3589,0.5282,0.0,0.0
9,0.3589,0.5,1.0,0.3589,0.5282,0.0,0.0


Fitting 10 folds for each of 10 candidates, totalling 100 fits


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.3583,0.4994,1.0,0.3583,0.5276,0.0,0.0
1,0.4495,0.4762,0.5909,0.3439,0.4348,-0.0333,-0.038
2,0.3583,0.4982,1.0,0.3583,0.5276,0.0,0.0
3,0.4609,0.4693,0.4591,0.3227,0.379,-0.0722,-0.0758
4,0.3573,0.5279,1.0,0.3573,0.5264,0.0,0.0
5,0.3573,0.5064,1.0,0.3573,0.5264,0.0,0.0
6,0.3573,0.4965,1.0,0.3573,0.5264,0.0,0.0
7,0.3589,0.5002,1.0,0.3589,0.5282,0.0,0.0
8,0.4535,0.4807,0.5864,0.3458,0.4351,-0.0299,-0.0339
9,0.5024,0.5204,0.5182,0.3642,0.4278,0.0108,0.0113


Fitting 10 folds for each of 10 candidates, totalling 100 fits


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6417,0.5276,0.0,0.0,0.0,0.0,0.0
1,0.6417,0.5131,0.0,0.0,0.0,0.0,0.0
2,0.6417,0.4951,0.0,0.0,0.0,0.0,0.0
3,0.6417,0.4941,0.0,0.0,0.0,0.0,0.0
4,0.6427,0.5213,0.0,0.0,0.0,0.0,0.0
5,0.6427,0.536,0.0,0.0,0.0,0.0,0.0
6,0.6427,0.5191,0.0,0.0,0.0,0.0,0.0
7,0.6411,0.5526,0.0,0.0,0.0,0.0,0.0
8,0.6411,0.4762,0.0,0.0,0.0,0.0,0.0
9,0.6411,0.5356,0.0,0.0,0.0,0.0,0.0


Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.3583,0.0,1.0,0.3583,0.5276,0.0,0.0
1,0.4186,0.0,0.7227,0.3495,0.4711,-0.0231,-0.0312
2,0.5554,0.0,0.3727,0.3779,0.3753,0.0302,0.0302
3,0.4365,0.0,0.6727,0.3507,0.4611,-0.0189,-0.0235
4,0.3573,0.0,1.0,0.3573,0.5264,0.0,0.0
5,0.3573,0.0,1.0,0.3573,0.5264,0.0,0.0
6,0.3573,0.0,1.0,0.3573,0.5264,0.0,0.0
7,0.5171,0.0,0.3591,0.3376,0.348,-0.0348,-0.0349
8,0.4209,0.0,0.7227,0.351,0.4725,-0.0206,-0.0277
9,0.4551,0.0,0.6909,0.3636,0.4765,0.0117,0.0145


CPU times: total: 24.1 s
Wall time: 3min 39s


In [11]:
final_blend_model = finalize_model(blend_model)
evaluate_model(final_blend_model)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

### Ce dernier modèle obtient une accuracy parfaite de 1 sur l'échantillon de test !

In [12]:
save_model(final_blend_model, 'Heart_attack_Model')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('numerical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['Age', 'Cholesterol', 'Heart Rate',
                                              'Diabetes', 'Family History',
                                              'Smoking', 'Obesity',
                                              'Alcohol Consumption',
                                              'Exercise Hours Per Week',
                                              'Previous Heart Problems',
                                              'Medication Use', 'Stress Level',
                                              'Sedentary Hours Per Day',
                                              'Income', 'BMI', 'Triglycerides',
                                              'Physical Act...
                                                                   class_weight='balanced',
                                         