In [25]:
import numpy as np
import pandas as pd

from imblearn.over_sampling import SMOTE

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

import xgboost as xgb
from sklearn.model_selection import GridSearchCV


df = pd.read_csv("heart_disease_health_indicators_BRFSS2015.csv")
df.head()

Unnamed: 0,HeartDiseaseorAttack,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,Diabetes,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0


In [26]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 253680 entries, 0 to 253679
Data columns (total 22 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   HeartDiseaseorAttack  253680 non-null  float64
 1   HighBP                253680 non-null  float64
 2   HighChol              253680 non-null  float64
 3   CholCheck             253680 non-null  float64
 4   BMI                   253680 non-null  float64
 5   Smoker                253680 non-null  float64
 6   Stroke                253680 non-null  float64
 7   Diabetes              253680 non-null  float64
 8   PhysActivity          253680 non-null  float64
 9   Fruits                253680 non-null  float64
 10  Veggies               253680 non-null  float64
 11  HvyAlcoholConsump     253680 non-null  float64
 12  AnyHealthcare         253680 non-null  float64
 13  NoDocbcCost           253680 non-null  float64
 14  GenHlth               253680 non-null  float64
 15  

In [27]:
df = df.drop(columns = ['GenHlth', 'MentHlth','PhysHlth','Income','Education'])
df.columns

Index(['HeartDiseaseorAttack', 'HighBP', 'HighChol', 'CholCheck', 'BMI',
       'Smoker', 'Stroke', 'Diabetes', 'PhysActivity', 'Fruits', 'Veggies',
       'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'DiffWalk', 'Sex',
       'Age'],
      dtype='object')

In [28]:

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 253680 entries, 0 to 253679
Data columns (total 17 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   HeartDiseaseorAttack  253680 non-null  float64
 1   HighBP                253680 non-null  float64
 2   HighChol              253680 non-null  float64
 3   CholCheck             253680 non-null  float64
 4   BMI                   253680 non-null  float64
 5   Smoker                253680 non-null  float64
 6   Stroke                253680 non-null  float64
 7   Diabetes              253680 non-null  float64
 8   PhysActivity          253680 non-null  float64
 9   Fruits                253680 non-null  float64
 10  Veggies               253680 non-null  float64
 11  HvyAlcoholConsump     253680 non-null  float64
 12  AnyHealthcare         253680 non-null  float64
 13  NoDocbcCost           253680 non-null  float64
 14  DiffWalk              253680 non-null  float64
 15  

In [29]:
df['HeartDiseaseorAttack'].value_counts()
# shows inbalance in data

HeartDiseaseorAttack
0.0    229787
1.0     23893
Name: count, dtype: int64

In [30]:
y = df['HeartDiseaseorAttack'].values
X = df.drop(columns='HeartDiseaseorAttack', axis=1).values

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#### SMOTE for imbalanced data

In [32]:
# creating a SMOTE object
smote = SMOTE(random_state=42)


In [33]:
# applying SMOTE to the training data
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

#### Creating and training  the XGBoost model

In [34]:
#creating the xgboost model
xgb_model = xgb.XGBClassifier( random_state=42)

In [35]:
#training the model
xgb_model.fit(X_train_resampled, y_train_resampled)

In [55]:
# Save the XGBoost model to a file
xgb_model.save_model('./Models/xgboost_model.model')



In [36]:
#Prediction of the model

y_pred_xgb = xgb_model.predict(X_test)

In [37]:
# Evaluate the model
print(confusion_matrix(y_test, y_pred_xgb))
print(classification_report(y_test, y_pred_xgb))
print("Accuracy:", accuracy_score(y_test, y_pred_xgb))

[[42868  3100]
 [ 3268  1500]]
              precision    recall  f1-score   support

         0.0       0.93      0.93      0.93     45968
         1.0       0.33      0.31      0.32      4768

    accuracy                           0.87     50736
   macro avg       0.63      0.62      0.63     50736
weighted avg       0.87      0.87      0.87     50736

Accuracy: 0.8744875433617155


### The model is performing better than the random forest model, will use this model for the web app but first will try to improve the model using grid search 

In [46]:
# Enabling GPU support
params = {
    'tree_method': 'gpu_hist', #use GPU for training
    'predictor': 'gpu_predictor' # use GPU for inference

}

In [47]:
# Define the grid of hyperparameters to search

param_grid = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [100, 200, 300],
    'subsample': [0.8, 0.9, 1],
    'colasample_bytree': [0.8, 0.9, 1]
}


In [40]:
# creating the GridSearchCV object

grid_search = GridSearchCV(xgb.XGBClassifier(**params , random_state=42),param_grid, cv=3, scoring='f1', verbose=1, n_jobs=10)

In [41]:
# Grid Search with resampled data
grid_search.fit(X_train_resampled, y_train_resampled)


Fitting 3 folds for each of 243 candidates, totalling 729 fits



    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"

Parameters: { "colasample_bytree", "predictor" } are not used.

Parameters: { "colasample_bytree", "predictor" } are not used.

Parameters: { "colasample_bytree", "predictor" } are not used.

Parameters: { "colasample_bytree", "predictor" } are not used.

Parameters: { "colasample_bytree", "predictor" } are not used.

Parameters: { "colasample_bytree", "predictor" } are not used.

Parameters: { "colasample_bytree", "predictor" } are not used.

Parameters: { "colasample_bytree", "predictor" } are not used.

In [42]:
print("Best hyperparameters:\n", grid_search.best_params_)

Best hyperparameters:
 {'colasample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 300, 'subsample': 1}


In [43]:
#obtaining best model from grid search
best_model = grid_search.best_estimator_

In [56]:
# Save the XGBoost model to a file
best_model.save_model('./Models/xgboost_best_model.model')



In [54]:
#prediction using best model
y_pred_tuned = best_model.predict(X_test)

In [45]:
# Evaluate the model
print(classification_report(y_test, y_pred_tuned))

              precision    recall  f1-score   support

         0.0       0.93      0.93      0.93     45968
         1.0       0.33      0.34      0.33      4768

    accuracy                           0.87     50736
   macro avg       0.63      0.63      0.63     50736
weighted avg       0.87      0.87      0.87     50736



#### Creating a Xgb model with L1 and L2 regularization


In [48]:
params = {
    'tree_method': 'gpu_hist', #use GPU for training
    'predictor': 'gpu_predictor', # use GPU for inference
    'reg_alpha': 0.1,               # L1 regularization parameter
    'reg_lambda': 0.1,              # L2 regularization parameter
}

In [49]:
# XGBoots model with regularization
xgb_model_reg = xgb.XGBClassifier(**params, random_state=42)

In [50]:
# training the model
xgb_model_reg.fit(X_train_resampled, y_train_resampled)


    E.g. tree_method = "hist", device = "cuda"

Parameters: { "predictor" } are not used.



In [51]:
#Prediction of the model

y_pred_xgb_reg = xgb_model_reg.predict(X_test)


    E.g. tree_method = "hist", device = "cuda"



In [52]:
# Evaluate the model
print(confusion_matrix(y_test, y_pred_xgb_reg))
print(classification_report(y_test, y_pred_xgb_reg))
print("Accuracy:", accuracy_score(y_test, y_pred_xgb_reg))

[[42900  3068]
 [ 3269  1499]]
              precision    recall  f1-score   support

         0.0       0.93      0.93      0.93     45968
         1.0       0.33      0.31      0.32      4768

    accuracy                           0.88     50736
   macro avg       0.63      0.62      0.63     50736
weighted avg       0.87      0.88      0.87     50736

Accuracy: 0.8750985493535163
