In [121]:
# Import Dependencies
import pandas as pd
import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform


# Loading the data

In [122]:
# Read the train and test CSV files into DataFrames
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('heart_2020_encoded_train.csv', 'heart_2020_encoded_test.csv')
)


In [123]:
# Separate the 'HeartDisease_Yes' target column from the remaining feature columns
X_train, y_train = train_data.drop(columns='HeartDisease_Yes'), train_data['HeartDisease_Yes']
X_test, y_test = test_data.drop(columns='HeartDisease_Yes'), test_data['HeartDisease_Yes']


# Initializing the XGBoost Model


In [124]:
# Baseline classifier: binary logistic objective with a fixed seed
xgb_clf = xgb.XGBClassifier(seed=42, objective='binary:logistic')

# Fit on the training features and labels
xgb_clf.fit(X=X_train, y=y_train)

# Evaluating the model

In [125]:
# Predict class labels and class-1 probabilities (the latter feed ROC-AUC)
y_pred = xgb_clf.predict(X_test)
y_pred_proba = xgb_clf.predict_proba(X_test)[:, 1]

# Label-based metrics share the same (y_true, y_pred) call shape
accuracy, precision, recall, f1 = (
    scorer(y_test, y_pred)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)
# ROC-AUC is computed from the predicted probabilities, not the hard labels
roc_auc = roc_auc_score(y_test, y_pred_proba)

# Report every metric, one per line
for metric_name, score in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
    ("ROC-AUC Score", roc_auc),
):
    print(f"{metric_name}: {score}")


Accuracy: 0.7661187214611872
Precision: 0.7453458582408198
Recall: 0.8030916451969083
F1 Score: 0.7731419966338914
ROC-AUC Score: 0.8374246145252024


# Feature engineering

## 1. Health Index
A composite health index might capture the overall health condition better than individual health metrics. We could create a simple index or a weighted average based on PhysicalHealth, MentalHealth, and possibly other health-related features like Diabetic_Yes or Stroke_Yes.

In [126]:
# HealthIndex = mean of PhysicalHealth and MentalHealth, added to both splits
for frame in (train_data, test_data):
    frame['HealthIndex'] = (frame['PhysicalHealth'] + frame['MentalHealth']) / 2


## 2. Interaction Between Health and Lifestyle Factors
Interactions between health metrics and lifestyle factors (e.g., Smoking_Yes, AlcoholDrinking_Yes) could provide insights into risk patterns. For instance, smoking might have a different impact on heart disease risk for individuals with poor physical health compared to those with good physical health.

In [127]:
# Interaction term: smoking indicator multiplied by PhysicalHealth, for both splits
train_data['Smoking_PhysicalHealth'] = train_data['Smoking_Yes'].mul(train_data['PhysicalHealth'])
test_data['Smoking_PhysicalHealth'] = test_data['Smoking_Yes'].mul(test_data['PhysicalHealth'])


## 3. Age and Health Interaction
The impact of health metrics might vary across different age groups. We consider creating interaction features that combine AgeCategoryOrdinal with key health indicators.

In [128]:
# Interaction term: ordinal age category multiplied by PhysicalHealth, for both splits
for frame in (train_data, test_data):
    frame['Age_PhysicalHealth'] = frame['AgeCategoryOrdinal'] * frame['PhysicalHealth']


## 4. Polynomial Features for Key Indicators
Creating polynomial features for variables like BMI might help capture non-linear effects.

In [129]:
# Quadratic BMI term, added to both splits
train_data['BMI_Squared'] = train_data['BMI'].pow(2)
test_data['BMI_Squared'] = test_data['BMI'].pow(2)


## 5. Binning Sleep Time
Sleep time could have a non-linear relationship with heart disease risk, where both too little and too much sleep are harmful. We will be binning SleepTime into categories.

In [130]:
# Bin edges for SleepTime; pd.cut's default right-closed intervals give
# (0, 6], (6, 8] and (8, 24], labelled in that order
bins = [0, 6, 8, 24]
labels = ['UnderSleep', 'NormalSleep', 'OverSleep']
for frame in (train_data, test_data):
    frame['SleepCategory'] = pd.cut(frame['SleepTime'], bins=bins, labels=labels)


In [131]:
# Expand 'SleepCategory' into indicator columns in both splits;
# drop_first=True omits the first category's column
train_data, test_data = (
    pd.get_dummies(frame, columns=['SleepCategory'], drop_first=True)
    for frame in (train_data, test_data)
)


## 6. Reviewing the newly engineered features

In [132]:
# Preview the top five rows of each split after feature engineering
for frame in (train_data, test_data):
    print(frame.head())

     BMI  PhysicalHealth  MentalHealth  SleepTime  AgeCategoryOrdinal  \
0  28.34             0.0           0.0        8.0                   9   
1  26.58             0.0           0.0        7.0                   1   
2  23.73             0.0           0.0        7.0                   4   
3  23.71             0.0           0.0        7.0                  10   
4  29.62             0.0           4.0        6.0                   7   

   HeartDisease_Yes  Smoking_Yes  AlcoholDrinking_Yes  Stroke_Yes  \
0                 0            1                    0           0   
1                 0            0                    0           0   
2                 0            1                    0           0   
3                 1            1                    0           0   
4                 1            1                    0           0   

   DiffWalking_Yes  ...  GenHealth_Very good  Asthma_Yes  KidneyDisease_Yes  \
0                0  ...                    0           0           

## 7. Retraining and evaluating the model with the newly engineered features

In [133]:
# Rebuild the feature matrices so they pick up the engineered columns;
# the targets are re-read from the same 'HeartDisease_Yes' column as before
target_column = 'HeartDisease_Yes'
y_train, y_test = train_data[target_column], test_data[target_column]
X_train = train_data.drop(columns=[target_column])
X_test = test_data.drop(columns=[target_column])

In [134]:
# Fit the same classifier again, this time on the redefined feature matrix
xgb_clf.fit(X=X_train, y=y_train)

In [135]:
# Class-label predictions plus the class-1 probability column for the test split
test_probabilities = xgb_clf.predict_proba(X_test)
y_pred = xgb_clf.predict(X_test)
y_pred_proba = test_probabilities[:, 1]

In [136]:
# Score the test-split predictions
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
precision = precision_score(y_true=y_test, y_pred=y_pred)
recall = recall_score(y_true=y_test, y_pred=y_pred)
f1 = f1_score(y_true=y_test, y_pred=y_pred)
# ROC-AUC is scored from the predicted probabilities rather than the labels
roc_auc = roc_auc_score(y_true=y_test, y_score=y_pred_proba)

report_lines = [
    f"Accuracy: {accuracy}",
    f"Precision: {precision}",
    f"Recall: {recall}",
    f"F1 Score: {f1}",
    f"ROC-AUC Score: {roc_auc}",
]
for report_line in report_lines:
    print(report_line)

Accuracy: 0.7666666666666667
Precision: 0.7472942793334478
Recall: 0.8005152741994848
F1 Score: 0.7729897823189693
ROC-AUC Score: 0.8362268909290016


In [137]:
# Importance scores reported by the fitted model
feature_importances = xgb_clf.feature_importances_
# Pair each X_train column name with its score (relies on X_train being a DataFrame)
feature_importance_dict = dict(zip(X_train.columns, feature_importances))
print(feature_importance_dict)

{'BMI': 0.010182248, 'PhysicalHealth': 0.008926817, 'MentalHealth': 0.009864681, 'SleepTime': 0.010012478, 'AgeCategoryOrdinal': 0.14849873, 'Smoking_Yes': 0.032695863, 'AlcoholDrinking_Yes': 0.009516227, 'Stroke_Yes': 0.114865, 'DiffWalking_Yes': 0.10277873, 'Sex_Male': 0.06796959, 'Race_Asian': 0.011981825, 'Race_Black': 0.009271009, 'Race_Hispanic': 0.010066551, 'Race_Other': 0.010551574, 'Race_White': 0.015181356, 'Diabetic_No, borderline diabetes': 0.007929114, 'Diabetic_Yes': 0.05898928, 'Diabetic_Yes (during pregnancy)': 0.008803597, 'PhysicalActivity_Yes': 0.009974437, 'GenHealth_Fair': 0.07220391, 'GenHealth_Good': 0.056786615, 'GenHealth_Poor': 0.06794226, 'GenHealth_Very good': 0.022484478, 'Asthma_Yes': 0.014489498, 'KidneyDisease_Yes': 0.030677753, 'SkinCancer_Yes': 0.01086127, 'HealthIndex': 0.011660773, 'Smoking_PhysicalHealth': 0.012917453, 'Age_PhysicalHealth': 0.029960167, 'BMI_Squared': 0.0, 'SleepCategory_NormalSleep': 0.011956778, 'SleepCategory_OverSleep': 0.0}


## Reviewing the results of feature engineering

### Model Performance Review

**Accuracy:** Slightly improved to 76.67%, indicating a marginal increase in overall correct predictions.

**Precision:** Slightly improved to 74.73%, indicating a better rate of true positive predictions out of all positive predictions.

**Recall:** Slightly decreased to 80.05%, showing a small decrease in the model's ability to detect actual positives.

**F1 Score:** Essentially unchanged at 77.30% (down marginally from 77.31%), so the balance between precision and recall did not improve.

**ROC-AUC Score:** Slightly decreased to 83.62%, showing a marginal decrease in the model's ability to distinguish between classes.

These changes are very small and mixed (accuracy and precision rose slightly; recall, F1, and ROC-AUC fell slightly), suggesting that the newly engineered features did not provide a meaningful improvement in model performance.

### Feature Importance Review

**Significant Features:** AgeCategoryOrdinal, Stroke_Yes, and DiffWalking_Yes remain highly influential, similar to before. This reaffirms their importance in predicting heart disease.

**Engineered Features:** Among the newly added features, Age_PhysicalHealth shows relatively higher importance than other engineered features, suggesting an interaction between age and physical health that's significant for predicting heart disease. Interestingly, **BMI_Squared and SleepCategory_OverSleep have zero importance**, indicating they might not contribute to the model's decision-making process.

**Potential Overfitting:** Features with zero importance could potentially be removed to simplify the model without loss of predictive performance. This can help in preventing overfitting and improving model generalization.

## 1. Removing Features with Zero Importance

In [138]:
# Drop the features with zero importance from both training and testing datasets.
# Reassigning the result (instead of inplace=True) and passing errors='ignore'
# keeps this cell safe to re-run: a second execution, when the columns are
# already gone, is a no-op rather than a KeyError.
zero_importance_features = ['BMI_Squared', 'SleepCategory_OverSleep']
X_train = X_train.drop(columns=zero_importance_features, errors='ignore')
X_test = X_test.drop(columns=zero_importance_features, errors='ignore')


## 2. Retraining the model


In [139]:
# Fit the classifier once more, now on the reduced feature matrix
xgb_clf.fit(X=X_train, y=y_train)


## 3. Evaluating the model again

In [140]:
# Predict on the test split; column 1 of predict_proba feeds ROC-AUC
y_pred = xgb_clf.predict(X_test)
test_probabilities = xgb_clf.predict_proba(X_test)
y_pred_proba = test_probabilities[:, 1]

# Compute the label-based metrics, then ROC-AUC from the probabilities
label_scorers = {
    "Accuracy": accuracy_score,
    "Precision": precision_score,
    "Recall": recall_score,
    "F1 Score": f1_score,
}
label_scores = {name: scorer(y_test, y_pred) for name, scorer in label_scorers.items()}
accuracy = label_scores["Accuracy"]
precision = label_scores["Precision"]
recall = label_scores["Recall"]
f1 = label_scores["F1 Score"]
roc_auc = roc_auc_score(y_test, y_pred_proba)

# Show the metrics in a fixed order, one per line
for metric_name, score in label_scores.items():
    print(f"{metric_name}: {score}")
print(f"ROC-AUC Score: {roc_auc}")


Accuracy: 0.7666666666666667
Precision: 0.7472942793334478
Recall: 0.8005152741994848
F1 Score: 0.7729897823189693
ROC-AUC Score: 0.8362268909290016


## 4. Feature importance review

In [141]:
# Map each remaining X_train column to the importance from the refitted model
feature_importances = xgb_clf.feature_importances_
feature_importance_dict = dict(zip(X_train.columns, feature_importances))
print(feature_importance_dict)


{'BMI': 0.010182248, 'PhysicalHealth': 0.008926817, 'MentalHealth': 0.009864681, 'SleepTime': 0.010012478, 'AgeCategoryOrdinal': 0.14849873, 'Smoking_Yes': 0.032695863, 'AlcoholDrinking_Yes': 0.009516227, 'Stroke_Yes': 0.114865, 'DiffWalking_Yes': 0.10277873, 'Sex_Male': 0.06796959, 'Race_Asian': 0.011981825, 'Race_Black': 0.009271009, 'Race_Hispanic': 0.010066551, 'Race_Other': 0.010551574, 'Race_White': 0.015181356, 'Diabetic_No, borderline diabetes': 0.007929114, 'Diabetic_Yes': 0.05898928, 'Diabetic_Yes (during pregnancy)': 0.008803597, 'PhysicalActivity_Yes': 0.009974437, 'GenHealth_Fair': 0.07220391, 'GenHealth_Good': 0.056786615, 'GenHealth_Poor': 0.06794226, 'GenHealth_Very good': 0.022484478, 'Asthma_Yes': 0.014489498, 'KidneyDisease_Yes': 0.030677753, 'SkinCancer_Yes': 0.01086127, 'HealthIndex': 0.011660773, 'Smoking_PhysicalHealth': 0.012917453, 'Age_PhysicalHealth': 0.029960167, 'SleepCategory_NormalSleep': 0.011956778}


# Evaluating for overfitting

In [142]:
# Score the model on the data it was fitted on
y_train_pred = xgb_clf.predict(X_train)
y_train_pred_proba = xgb_clf.predict_proba(X_train)[:, 1]  # class-1 probability, used for ROC-AUC

train_accuracy, train_precision, train_recall, train_f1 = (
    scorer(y_train, y_train_pred)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)
train_roc_auc = roc_auc_score(y_train, y_train_pred_proba)

metric_names = ("Accuracy", "Precision", "Recall", "F1 Score", "ROC-AUC Score")
train_scores = (train_accuracy, train_precision, train_recall, train_f1, train_roc_auc)
# Test-split values are reused from the variables computed in the earlier evaluation cell
test_scores = (accuracy, precision, recall, f1, roc_auc)

# Training block first, then the testing block after a blank line
print("Training Set Metrics:")
for metric_name, score in zip(metric_names, train_scores):
    print(f"{metric_name}: {score}")

print("\nTesting Set Metrics:")
for metric_name, score in zip(metric_names, test_scores):
    print(f"{metric_name}: {score}")


Training Set Metrics:
Accuracy: 0.8101653119006302
Precision: 0.7950794819595444
Recall: 0.8366835316103742
F1 Score: 0.8153511304579576
ROC-AUC Score: 0.8966105934070341

Testing Set Metrics:
Accuracy: 0.7666666666666667
Precision: 0.7472942793334478
Recall: 0.8005152741994848
F1 Score: 0.7729897823189693
ROC-AUC Score: 0.8362268909290016


### Performance Metrics Comparison

**Accuracy:** There's a noticeable difference between the training accuracy (81.02%) and the testing accuracy (76.67%). While a higher training accuracy is expected, the gap indicates that the model might be overfitting to the training data.

**Precision:** The precision on the training set (79.51%) is higher compared to the testing set (74.73%), which is consistent with the overall trend of the model performing better on the training data.

**Recall:** Similar to precision, recall is higher in the training set (83.67%) than in the testing set (80.05%), indicating the model's better capability to identify positive cases in the data it was trained on.

**F1 Score:** The F1 score, which balances precision and recall, also shows a higher value for the training set (81.54%) compared to the testing set (77.30%).

**ROC-AUC Score:** The ROC-AUC score shows a more pronounced difference, with the training set achieving a score of 89.66% and the testing set 83.62%. This metric, in particular, highlights the model's better discriminative ability on the training data.

### Interpretation
The discrepancies between the training and testing metrics, especially in terms of ROC-AUC score, suggest that our model might be overfitting the training data. 

## Using Hyperparameter Tuning to address overfitting 

In [143]:
# Sampling distributions for the randomized search (these are distributions, not a fixed grid).
# scipy conventions: randint(low, high) draws integers in [low, high);
# uniform(loc, scale) draws floats in [loc, loc + scale].
param_dist = dict([
    ('max_depth', randint(3, 10)),          # tree depth
    ('min_child_weight', randint(1, 6)),    # minimum summed instance weight (hessian) in a child
    ('gamma', uniform(0, 0.5)),             # minimum loss reduction needed to split further
    ('subsample', uniform(0.6, 0.4)),       # fraction of training rows sampled
    ('colsample_bytree', uniform(0.6, 0.4)),  # fraction of columns sampled per tree
    ('lambda', uniform(0.5, 1)),            # L2 regularization on weights
    ('alpha', uniform(0.5, 1)),             # L1 regularization on weights
])

In [144]:
# Base estimator handed to the hyperparameter search
base_model_settings = {'objective': 'binary:logistic', 'seed': 42}
xgb_model = xgb.XGBClassifier(**base_model_settings)

In [145]:
# Randomized search: 100 sampled candidates, 5-fold CV, ranked by ROC-AUC,
# using all available cores and a fixed random_state for the sampling
clf = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_dist,
    n_iter=100,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
    random_state=42,
)

In [146]:
# Run the search on the training features and labels
clf.fit(X=X_train, y=y_train)

In [147]:
# Report the selected hyperparameters and the score stored for them by the search
for caption, result in (
    ("Best Parameters:", clf.best_params_),
    ("Best Score:", clf.best_score_),
):
    print(caption, result)

Best Parameters: {'alpha': 1.4803315837160458, 'colsample_bytree': 0.6301385024024513, 'gamma': 0.15284850964359092, 'lambda': 0.690911031150346, 'max_depth': 3, 'min_child_weight': 3, 'subsample': 0.6677970986744369}
Best Score: 0.8400544223412678


# Retraining and evaluating the model using the hyperparameter tuning findings

In [148]:
# Retrain the model with the best parameters found by the randomized search.
# The parameters are unpacked straight from clf.best_params_ instead of being
# pasted in as literals, so this model stays in sync with the search whenever
# the notebook is re-run. The objective and seed match the base estimator
# that was passed to the search.
# The dict keys ('alpha', 'lambda', ...) are forwarded exactly as the search
# set them; 'lambda' is a Python keyword and can only be supplied via ** unpacking.
xgb_optimized = xgb.XGBClassifier(
    objective='binary:logistic',
    seed=42,
    **clf.best_params_,
)

xgb_optimized.fit(X_train, y_train)

In [149]:
# Score the tuned model on the test split
y_pred_opt = xgb_optimized.predict(X_test)
y_pred_proba_opt = xgb_optimized.predict_proba(X_test)[:, 1]  # class-1 probability

accuracy_opt, precision_opt, recall_opt, f1_opt = (
    scorer(y_test, y_pred_opt)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)
roc_auc_opt = roc_auc_score(y_test, y_pred_proba_opt)

print("Optimized Model Performance:")
for metric_name, score in (
    ("Accuracy", accuracy_opt),
    ("Precision", precision_opt),
    ("Recall", recall_opt),
    ("F1 Score", f1_opt),
    ("ROC-AUC Score", roc_auc_opt),
):
    print(f"{metric_name}: {score}")

Optimized Model Performance:
Accuracy: 0.7692237442922374
Precision: 0.749613601236476
Recall: 0.8032756716967243
F1 Score: 0.7755174558052768
ROC-AUC Score: 0.8458597740757773


In [151]:
# Map each X_train column to the importance assigned by the tuned model
feature_importances_opt = xgb_optimized.feature_importances_
feature_importance_dict_opt = dict(zip(X_train.columns, feature_importances_opt))
print(feature_importance_dict_opt)


{'BMI': 0.0049347975, 'PhysicalHealth': 0.005785028, 'MentalHealth': 0.004761408, 'SleepTime': 0.0064182603, 'AgeCategoryOrdinal': 0.18664345, 'Smoking_Yes': 0.066454485, 'AlcoholDrinking_Yes': 0.005936795, 'Stroke_Yes': 0.09191418, 'DiffWalking_Yes': 0.16270092, 'Sex_Male': 0.066628575, 'Race_Asian': 0.0044570873, 'Race_Black': 0.0044200337, 'Race_Hispanic': 0.0061313673, 'Race_Other': 0.0043786513, 'Race_White': 0.0071670716, 'Diabetic_No, borderline diabetes': 0.0057071163, 'Diabetic_Yes': 0.086654596, 'Diabetic_Yes (during pregnancy)': 0.003500193, 'PhysicalActivity_Yes': 0.005004589, 'GenHealth_Fair': 0.08508494, 'GenHealth_Good': 0.022749634, 'GenHealth_Poor': 0.044673815, 'GenHealth_Very good': 0.03117969, 'Asthma_Yes': 0.010505459, 'KidneyDisease_Yes': 0.025819719, 'SkinCancer_Yes': 0.006946837, 'HealthIndex': 0.0069507835, 'Smoking_PhysicalHealth': 0.0031862871, 'Age_PhysicalHealth': 0.025328657, 'SleepCategory_NormalSleep': 0.007975544}


In [153]:
# Predict with the tuned model on the data it was fitted on
train_probabilities = xgb_optimized.predict_proba(X_train)
y_train_pred = xgb_optimized.predict(X_train)
y_train_pred_proba = train_probabilities[:, 1]  # class-1 probability, used for ROC-AUC

# Training-split metrics (these overwrite the values from the untuned model)
train_accuracy = accuracy_score(y_true=y_train, y_pred=y_train_pred)
train_precision = precision_score(y_true=y_train, y_pred=y_train_pred)
train_recall = recall_score(y_true=y_train, y_pred=y_train_pred)
train_f1 = f1_score(y_true=y_train, y_pred=y_train_pred)
train_roc_auc = roc_auc_score(y_true=y_train, y_score=y_train_pred_proba)

# Print a header followed by one metric per line
print("Training Set Metrics:")
for metric_name, score in (
    ("Accuracy", train_accuracy),
    ("Precision", train_precision),
    ("Recall", train_recall),
    ("F1 Score", train_f1),
    ("ROC-AUC Score", train_roc_auc),
):
    print(f"{metric_name}: {score}")


Training Set Metrics:
Accuracy: 0.7709151520686821
Precision: 0.7569714236380903
Recall: 0.7993071698801222
F1 Score: 0.7775634630307061
ROC-AUC Score: 0.8491663445794396


### Updated Training Set Metrics with Optimized Model
- **Accuracy:** 77.09%
- **Precision:** 75.70%
- **Recall:** 79.93%
- **F1 Score:** 77.76%
- **ROC-AUC Score:** 84.92%

### Testing Set Metrics for Reference
- **Accuracy:** 76.92%
- **Precision:** 74.96%
- **Recall:** 80.33%
- **F1 Score:** 77.55%
- **ROC-AUC Score:** 84.59%

### Analysis of Overfitting
With the updated training metrics, the discrepancy between the training and testing performance metrics is significantly reduced. This closer alignment suggests that the optimized model has a good balance between learning from the training data and generalizing to unseen data. The slight differences between these metrics are normal and expected due to the inherent differences between seen (training) and unseen (testing) data.

- **Minimal Overfitting**: The relatively small gap between the training and testing metrics indicates minimal overfitting. This is a good sign that the model, with its optimized hyperparameters, is generalizing well.
  
- **Consistency Across Metrics**: Both sets of metrics are closely aligned across accuracy, precision, recall, F1 score, and ROC-AUC, further underscoring the model's robustness.
