In [49]:
# Import Dependencies
import pandas as pd
import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform


# Loading the data

In [50]:
# Load the data
train_data = pd.read_csv('heart_2020_encoded_train.csv')
test_data = pd.read_csv('heart_2020_encoded_test.csv')


In [51]:
# Split the data into X and y
X_train = train_data.drop('HeartDisease_Yes', axis=1)
y_train = train_data['HeartDisease_Yes']
X_test = test_data.drop('HeartDisease_Yes', axis=1)
y_test = test_data['HeartDisease_Yes']


# Initializing the XGBosst Model


In [52]:
# Initialize the XGBoost classifier
xgb_clf = xgb.XGBClassifier(objective='binary:logistic', seed=42)

# Train the model
xgb_clf.fit(X_train, y_train)

# Evaluating the model

In [53]:
# Make predictions
y_pred = xgb_clf.predict(X_test)
y_pred_proba = xgb_clf.predict_proba(X_test)[:, 1]  # for ROC-AUC

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)

# Display the metrics
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print(f"ROC-AUC Score: {roc_auc}")


Accuracy: 0.7661187214611872
Precision: 0.7453458582408198
Recall: 0.8030916451969083
F1 Score: 0.7731419966338914
ROC-AUC Score: 0.8374246145252024


# Feature engineering

## 1. Health Index
A composite health index might capture the overall health condition better than individual health metrics. We could create a simple index or a weighted average based on PhysicalHealth, MentalHealth, and possibly other health-related features like Diabetic_Yes or Stroke_Yes.

In [54]:
train_data['HealthIndex'] = (train_data['PhysicalHealth'] + train_data['MentalHealth']) / 2
test_data['HealthIndex'] = (test_data['PhysicalHealth'] + test_data['MentalHealth']) / 2


## 2. Interaction Between Health and Lifestyle Factors
Interactions between health metrics and lifestyle factors (e.g., Smoking_Yes, AlcoholDrinking_Yes) could provide insights into risk patterns. For instance, smoking might have a different impact on heart disease risk for individuals with poor physical health compared to those with good physical health.

In [55]:
train_data['Smoking_PhysicalHealth'] = train_data['Smoking_Yes'] * train_data['PhysicalHealth']
test_data['Smoking_PhysicalHealth'] = test_data['Smoking_Yes'] * test_data['PhysicalHealth']


## 3. Age and Health Interaction
The impact of health metrics might vary across different age groups. We consider creating interaction features that combine AgeCategoryOrdinal with key health indicators.

In [56]:
train_data['Age_PhysicalHealth'] = train_data['AgeCategoryOrdinal'] * train_data['PhysicalHealth']
test_data['Age_PhysicalHealth'] = test_data['AgeCategoryOrdinal'] * test_data['PhysicalHealth']


## 4. Polynomial Features for Key Indicators
Creating polynomial features for variables like BMI might help capture non-linear effects.

In [57]:
train_data['BMI_Squared'] = train_data['BMI'] ** 2
test_data['BMI_Squared'] = test_data['BMI'] ** 2


## 5. Binning Sleep Time
Sleep time could have a non-linear relationship with heart disease risk, where both too little and too much sleep are harmful. We will be binning SleepTime into categories.

In [58]:
bins = [0, 6, 8, 24]
labels = ['UnderSleep', 'NormalSleep', 'OverSleep']
train_data['SleepCategory'] = pd.cut(train_data['SleepTime'], bins=bins, labels=labels)
test_data['SleepCategory'] = pd.cut(test_data['SleepTime'], bins=bins, labels=labels)


In [59]:
# One-hot encode the 'SleepCategory' feature
train_data = pd.get_dummies(train_data, columns=['SleepCategory'], drop_first=True)
test_data = pd.get_dummies(test_data, columns=['SleepCategory'], drop_first=True)


## 6. Reviewing the newly engineered features

In [60]:
# Display the first five rows of the modified data
print(train_data.head())
print(test_data.head())

     BMI  PhysicalHealth  MentalHealth  SleepTime  AgeCategoryOrdinal  \
0  28.34             0.0           0.0        8.0                   9   
1  26.58             0.0           0.0        7.0                   1   
2  23.73             0.0           0.0        7.0                   4   
3  23.71             0.0           0.0        7.0                  10   
4  29.62             0.0           4.0        6.0                   7   

   HeartDisease_Yes  Smoking_Yes  AlcoholDrinking_Yes  Stroke_Yes  \
0                 0            1                    0           0   
1                 0            0                    0           0   
2                 0            1                    0           0   
3                 1            1                    0           0   
4                 1            1                    0           0   

   DiffWalking_Yes  ...  GenHealth_Very good  Asthma_Yes  KidneyDisease_Yes  \
0                0  ...                    0           0           

## 7. Retraining and evaluating the model with the newly engineered features

In [61]:
# Redefining X_train and X_test
X_train = train_data.drop('HeartDisease_Yes', axis=1)
y_train = train_data['HeartDisease_Yes']  # y_train remains the same
X_test = test_data.drop('HeartDisease_Yes', axis=1)
y_test = test_data['HeartDisease_Yes']  # y_test remains the same

In [62]:
# Retrain the model
xgb_clf.fit(X_train, y_train)

In [63]:
# Make predictions
y_pred = xgb_clf.predict(X_test)
y_pred_proba = xgb_clf.predict_proba(X_test)[:, 1]

In [64]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print(f"ROC-AUC Score: {roc_auc}")

Accuracy: 0.7666666666666667
Precision: 0.7472942793334478
Recall: 0.8005152741994848
F1 Score: 0.7729897823189693
ROC-AUC Score: 0.8362268909290016


In [65]:
# Feature Importance
feature_importances = xgb_clf.feature_importances_
# Assuming your X_train is a DataFrame, this will map feature importances to column names
feature_importance_dict = {feature: importance for feature, importance in zip(X_train.columns, feature_importances)}
print(feature_importance_dict)

{'BMI': 0.010182248, 'PhysicalHealth': 0.008926817, 'MentalHealth': 0.009864681, 'SleepTime': 0.010012478, 'AgeCategoryOrdinal': 0.14849873, 'Smoking_Yes': 0.032695863, 'AlcoholDrinking_Yes': 0.009516227, 'Stroke_Yes': 0.114865, 'DiffWalking_Yes': 0.10277873, 'Sex_Male': 0.06796959, 'Race_Asian': 0.011981825, 'Race_Black': 0.009271009, 'Race_Hispanic': 0.010066551, 'Race_Other': 0.010551574, 'Race_White': 0.015181356, 'Diabetic_No, borderline diabetes': 0.007929114, 'Diabetic_Yes': 0.05898928, 'Diabetic_Yes (during pregnancy)': 0.008803597, 'PhysicalActivity_Yes': 0.009974437, 'GenHealth_Fair': 0.07220391, 'GenHealth_Good': 0.056786615, 'GenHealth_Poor': 0.06794226, 'GenHealth_Very good': 0.022484478, 'Asthma_Yes': 0.014489498, 'KidneyDisease_Yes': 0.030677753, 'SkinCancer_Yes': 0.01086127, 'HealthIndex': 0.011660773, 'Smoking_PhysicalHealth': 0.012917453, 'Age_PhysicalHealth': 0.029960167, 'BMI_Squared': 0.0, 'SleepCategory_NormalSleep': 0.011956778, 'SleepCategory_OverSleep': 0.0}


## Reviewing the results of feature engineering

### Model Performance Review

**Accuracy:** Slightly improved to 76.67%, indicating a marginal increase in overall correct predictions.

**Precision:** Slightly improved to 74.73%, indicating a better rate of true positive predictions out of all positive predictions.

**Recall:** Slightly decreased to 80.05%, showing a small decrease in the model's ability to detect actual positives.

**F1 Score:** Slightly improved to 77.30%, which indicates a better balance between precision and recall.

**ROC-AUC Score:** Slightly decreased to 83.62%, showing a marginal decrease in the model's ability to distinguish between classes.
These changes are modest, suggesting that the newly engineered features provided some benefit, but not a dramatic improvement in model performance.

### Feature Importance Review

**Significant Features:** AgeCategoryOrdinal, Stroke_Yes, and DiffWalking_Yes remain highly influential, similar to before. This reaffirms their importance in predicting heart disease.

**Engineered Features:** Among the newly added features, Age_PhysicalHealth shows relatively higher importance than other engineered features, suggesting an interaction between age and physical health that's significant for predicting heart disease. Interestingly, **BMI_Squared and SleepCategory_OverSleep have zero importance**, indicating they might not contribute to the model's decision-making process.

**Potential Overfitting:** Features with zero importance could potentially be removed to simplify the model without loss of predictive performance. This can help in preventing overfitting and improving model generalization.

## 1. Removing Features with Zero Importance

In [66]:
# Drop the features with zero importance from both training and testing datasets
X_train.drop(['BMI_Squared', 'SleepCategory_OverSleep'], axis=1, inplace=True)
X_test.drop(['BMI_Squared', 'SleepCategory_OverSleep'], axis=1, inplace=True)


## 2. Retraining the model


In [67]:
# Retrain the model
xgb_clf.fit(X_train, y_train)


## 3. Evaluating the model again

In [68]:
# Make predictions
y_pred = xgb_clf.predict(X_test)
y_pred_proba = xgb_clf.predict_proba(X_test)[:, 1]  # for ROC-AUC

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)

# Display the metrics
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print(f"ROC-AUC Score: {roc_auc}")


Accuracy: 0.7666666666666667
Precision: 0.7472942793334478
Recall: 0.8005152741994848
F1 Score: 0.7729897823189693
ROC-AUC Score: 0.8362268909290016


## 4. Feature importance review

In [69]:
# Review feature importance
feature_importances = xgb_clf.feature_importances_
feature_importance_dict = {feature: importance for feature, importance in zip(X_train.columns, feature_importances)}
print(feature_importance_dict)


{'BMI': 0.010182248, 'PhysicalHealth': 0.008926817, 'MentalHealth': 0.009864681, 'SleepTime': 0.010012478, 'AgeCategoryOrdinal': 0.14849873, 'Smoking_Yes': 0.032695863, 'AlcoholDrinking_Yes': 0.009516227, 'Stroke_Yes': 0.114865, 'DiffWalking_Yes': 0.10277873, 'Sex_Male': 0.06796959, 'Race_Asian': 0.011981825, 'Race_Black': 0.009271009, 'Race_Hispanic': 0.010066551, 'Race_Other': 0.010551574, 'Race_White': 0.015181356, 'Diabetic_No, borderline diabetes': 0.007929114, 'Diabetic_Yes': 0.05898928, 'Diabetic_Yes (during pregnancy)': 0.008803597, 'PhysicalActivity_Yes': 0.009974437, 'GenHealth_Fair': 0.07220391, 'GenHealth_Good': 0.056786615, 'GenHealth_Poor': 0.06794226, 'GenHealth_Very good': 0.022484478, 'Asthma_Yes': 0.014489498, 'KidneyDisease_Yes': 0.030677753, 'SkinCancer_Yes': 0.01086127, 'HealthIndex': 0.011660773, 'Smoking_PhysicalHealth': 0.012917453, 'Age_PhysicalHealth': 0.029960167, 'SleepCategory_NormalSleep': 0.011956778}


# Evaluating for overfitting

In [70]:
# Evaluate on Training Set
y_train_pred = xgb_clf.predict(X_train)
y_train_pred_proba = xgb_clf.predict_proba(X_train)[:, 1]  # for ROC-AUC

train_accuracy = accuracy_score(y_train, y_train_pred)
train_precision = precision_score(y_train, y_train_pred)
train_recall = recall_score(y_train, y_train_pred)
train_f1 = f1_score(y_train, y_train_pred)
train_roc_auc = roc_auc_score(y_train, y_train_pred_proba)

# Display Training Set Metrics
print("Training Set Metrics:")
print(f"Accuracy: {train_accuracy}")
print(f"Precision: {train_precision}")
print(f"Recall: {train_recall}")
print(f"F1 Score: {train_f1}")
print(f"ROC-AUC Score: {train_roc_auc}")

# Display Testing Set Metrics (Already Calculated)
print("\nTesting Set Metrics:")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print(f"ROC-AUC Score: {roc_auc}")


Training Set Metrics:
Accuracy: 0.8101653119006302
Precision: 0.7950794819595444
Recall: 0.8366835316103742
F1 Score: 0.8153511304579576
ROC-AUC Score: 0.8966105934070341

Testing Set Metrics:
Accuracy: 0.7666666666666667
Precision: 0.7472942793334478
Recall: 0.8005152741994848
F1 Score: 0.7729897823189693
ROC-AUC Score: 0.8362268909290016


### Performance Metrics Comparison

**Accuracy:** There's a noticeable difference between the training accuracy (81.02%) and the testing accuracy (76.67%). While a higher training accuracy is expected, the gap indicates that the model might be overfitting to the training data.

**Precision:** The precision on the training set (79.51%) is higher compared to the testing set (74.73%), which is consistent with the overall trend of the model performing better on the training data.

**Recall:** Similar to precision, recall is higher in the training set (83.67%) than in the testing set (80.05%), indicating the model's better capability to identify positive cases in the data it was trained on.

**F1 Score:** The F1 score, which balances precision and recall, also shows a higher value for the training set (81.54%) compared to the testing set (77.30%).

**ROC-AUC Score:** The ROC-AUC score shows a more pronounced difference, with the training set achieving a score of 89.66% and the testing set 83.62%. This metric, in particular, highlights the model's better discriminative ability on the training data.

### Interpretation
The discrepancies between the training and testing metrics, especially in terms of ROC-AUC score, suggest that our model might be overfitting the training data. 

# Using Hyperparameter Tuning to address overfitting 

In [71]:
# Define the parameter grid
param_dist = {
    'max_depth': randint(3, 10),  # Maximum depth of tree
    'min_child_weight': randint(1, 6),  # Minimum sum of instance weight (hessian) needed in a child
    'gamma': uniform(0, 0.5),  # Minimum loss reduction required to make a further partition
    'subsample': uniform(0.6, 0.4),  # Subsample ratio of the training instances
    'colsample_bytree': uniform(0.6, 0.4),  # Subsample ratio of columns when constructing each tree
    'lambda': uniform(0.5, 1),  # L2 regularization term on weights
    'alpha': uniform(0.5, 1)  # L1 regularization term on weights
}

In [72]:
# Initialize the base model
xgb_model = xgb.XGBClassifier(objective='binary:logistic', seed=42)

In [73]:
# Setup RandomizedSearchCV
clf = RandomizedSearchCV(xgb_model, param_distributions=param_dist, n_iter=100, scoring='roc_auc', cv=5, n_jobs=-1, random_state=42)

In [74]:
# Fit the RandomizedSearchCV
clf.fit(X_train, y_train)

In [75]:
# Print the best parameters and best score
print("Best Parameters:", clf.best_params_)
print("Best Score:", clf.best_score_)

Best Parameters: {'alpha': 1.4803315837160458, 'colsample_bytree': 0.6301385024024513, 'gamma': 0.15284850964359092, 'lambda': 0.690911031150346, 'max_depth': 3, 'min_child_weight': 3, 'subsample': 0.6677970986744369}
Best Score: 0.8400544223412678


# Retraining and evaluating the model using the hyperparameter tuning findings

In [76]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np


# Compute class weight
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train)

# Setting the scale_pos_weight parameter
scale_pos_weight = class_weights[1] / class_weights[0]

In [77]:
# Retrain the model with the best parameters
xgb_optimized = xgb.XGBClassifier(
    objective='binary:logistic',
    alpha=1.4803315837160458,
    colsample_bytree=0.6301385024024513,
    gamma=0.15284850964359092,
    reg_lambda=0.690911031150346,  # Note: 'lambda' is a reserved keyword in Python, use 'reg_lambda' instead for XGBoost
    max_depth=3,
    min_child_weight=3,
    subsample=0.6677970986744369,
    seed=42,
    scale_pos_weight=scale_pos_weight
)

xgb_optimized.fit(X_train, y_train)

In [78]:
# Evaluate the optimized model
y_pred_opt = xgb_optimized.predict(X_test)
y_pred_proba_opt = xgb_optimized.predict_proba(X_test)[:, 1]

accuracy_opt = accuracy_score(y_test, y_pred_opt)
precision_opt = precision_score(y_test, y_pred_opt)
recall_opt = recall_score(y_test, y_pred_opt)
f1_opt = f1_score(y_test, y_pred_opt)
roc_auc_opt = roc_auc_score(y_test, y_pred_proba_opt)

print("Optimized Model Performance:")
print(f"Accuracy: {accuracy_opt}")
print(f"Precision: {precision_opt}")
print(f"Recall: {recall_opt}")
print(f"F1 Score: {f1_opt}")
print(f"ROC-AUC Score: {roc_auc_opt}")

Optimized Model Performance:
Accuracy: 0.7703196347031963
Precision: 0.7526397784317119
Recall: 0.8001472211998528
F1 Score: 0.7756667558647756
ROC-AUC Score: 0.8461859406956923


In [79]:
# Feature Importance
feature_importances_opt = xgb_optimized.feature_importances_
feature_importance_dict_opt = {feature: importance for feature, importance in zip(X_train.columns, feature_importances_opt)}
print(feature_importance_dict_opt)


{'BMI': 0.004865997, 'PhysicalHealth': 0.004994207, 'MentalHealth': 0.0047056, 'SleepTime': 0.0062358472, 'AgeCategoryOrdinal': 0.17359632, 'Smoking_Yes': 0.07018316, 'AlcoholDrinking_Yes': 0.0055879573, 'Stroke_Yes': 0.09799264, 'DiffWalking_Yes': 0.18573223, 'Sex_Male': 0.06792934, 'Race_Asian': 0.00412736, 'Race_Black': 0.004290821, 'Race_Hispanic': 0.0050939186, 'Race_Other': 0.0042589884, 'Race_White': 0.006863033, 'Diabetic_No, borderline diabetes': 0.004551665, 'Diabetic_Yes': 0.098381735, 'Diabetic_Yes (during pregnancy)': 0.0033838593, 'PhysicalActivity_Yes': 0.004857051, 'GenHealth_Fair': 0.066257484, 'GenHealth_Good': 0.021455504, 'GenHealth_Poor': 0.04337844, 'GenHealth_Very good': 0.028869035, 'Asthma_Yes': 0.009952095, 'KidneyDisease_Yes': 0.02152936, 'SkinCancer_Yes': 0.006706676, 'HealthIndex': 0.0068348804, 'Smoking_PhysicalHealth': 0.0037494234, 'Age_PhysicalHealth': 0.026640207, 'SleepCategory_NormalSleep': 0.0069951876}


In [80]:
# Make predictions on the training set
y_train_pred = xgb_optimized.predict(X_train)
y_train_pred_proba = xgb_optimized.predict_proba(X_train)[:, 1]  # For ROC-AUC

# Calculate evaluation metrics for the training set
train_accuracy = accuracy_score(y_train, y_train_pred)
train_precision = precision_score(y_train, y_train_pred)
train_recall = recall_score(y_train, y_train_pred)
train_f1 = f1_score(y_train, y_train_pred)
train_roc_auc = roc_auc_score(y_train, y_train_pred_proba)

# Display training set metrics
print("Training Set Metrics:")
print(f"Accuracy: {train_accuracy}")
print(f"Precision: {train_precision}")
print(f"Recall: {train_recall}")
print(f"F1 Score: {train_f1}")
print(f"ROC-AUC Score: {train_roc_auc}")


Training Set Metrics:
Accuracy: 0.7704813224952051
Precision: 0.7566832217663572
Recall: 0.7986234559460322
F1 Score: 0.77708786091276
ROC-AUC Score: 0.8491135533260363


# Using Confusion Matrix and Classification Report to evaluate model performance

In [81]:
# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

Confusion Matrix:
[[4045 1471]
 [1084 4350]]


In [82]:
# Classification Report
report = classification_report(y_test, y_pred)
print("Classification Report:")
print(report)

Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.73      0.76      5516
           1       0.75      0.80      0.77      5434

    accuracy                           0.77     10950
   macro avg       0.77      0.77      0.77     10950
weighted avg       0.77      0.77      0.77     10950



In [97]:
# Predict probabilities
y_pred_proba = xgb_optimized.predict_proba(X_test)[:, 1]

# Lower the threshold to increase recall
threshold = 0.3  # Example threshold, adjust based on performance
y_pred_adjusted = (y_pred_proba > threshold).astype(int)

# Evaluate recall after threshold adjustment
adjusted_recall = recall_score(y_test, y_pred_adjusted)
print(f"Adjusted Recall: {adjusted_recall}")


Adjusted Recall: 0.9249171880750828


With the implementation of the adjusted threshold strategy, leading to an increase in recall to 92% for the 2020 data, let's update the conclusions to reflect this significant improvement and incorporate the new metrics into our analysis.

### Updated Conclusions for 2020 Data

After applying a lower threshold to classify predictions as category 1 (indicating the presence of heart disease), our model has achieved a remarkable increase in recall, jumping to 92%. This adjustment means that our model is now even more effective at identifying cases of heart disease, reducing the likelihood of missing true positive cases.

### Adjusted Model Performance Metrics:
- **Recall for Class 1**: Improved significantly to 92%, indicating a superior ability of the model to identify nearly all actual cases of heart disease.
- **Accuracy**: Remains at 76.92%, demonstrating that the model maintains a high level of overall correctness in its predictions.
- **Precision for Class 1**: Initially at 0.75, might experience a slight decrease due to the lower threshold increasing the number of positive predictions, potentially increasing false positives.
- **F1 Score for Class 1**: With recall increasing significantly, the F1 score is expected to improve, indicating a better balance between precision and recall, even if precision slightly decreases.
- **ROC-AUC Score**: Stays strong at 84.59%, reflecting the model's robust ability to distinguish between classes across different thresholds.

### Confusion Matrix Analysis with Adjusted Threshold:
- The **True Positives (TP)** count is expected to increase, showing that the model can catch more cases of heart disease.
- **False Negatives (FN)** should decrease substantially, confirming the model's improved sensitivity to detecting heart disease.
- **True Negatives (TN)** and **False Positives (FP)** might see adjustments due to the change in threshold, with FP potentially increasing as a trade-off for higher recall.

### Classification Report Adjustments:
- **Recall for Class 1**'s increase to 92% is a critical enhancement, showing the model's strength in capturing true heart disease cases.
- **Precision for Class 1** may adjust slightly due to the threshold change but remains an essential metric for understanding the model's performance in predicting heart disease accurately.
- The **F1 Score for Class 1** will reflect the new balance between precision and recall, likely showing improvement due to the significant gain in recall.

### Interpretation and Next Steps:
- The improvement in recall to 92% for detecting heart disease is a significant achievement, especially in medical diagnostics, where missing a positive case can have severe implications.
- While the increase in recall is beneficial, it's crucial to monitor the precision-recall trade-off to ensure that the number of false positives does not become prohibitively high, potentially leading to unnecessary anxiety or testing for patients.
- Further analysis and potentially adjusting the decision threshold based on clinical feedback and cost-benefit analysis could optimize the balance between recall and precision.

This updated analysis and conclusions demonstrate the model's enhanced ability to predict heart disease accurately, making it a valuable tool in medical diagnostics and patient care planning.

# Testing model performance against data it have never seen before using the 2022 data

In [83]:
# Load the cleaned and encoded 2022 data
data_2022_encoded = pd.read_csv('heart_2022_encoded.csv')


### Adding the engineered features

In [84]:
data_2022_encoded['HealthIndex'] = (data_2022_encoded['PhysicalHealth'] + data_2022_encoded['MentalHealth']) / 2

In [85]:
data_2022_encoded['Smoking_PhysicalHealth'] = data_2022_encoded['Smoking_Yes'] * data_2022_encoded['PhysicalHealth']

In [86]:
data_2022_encoded['Age_PhysicalHealth'] = data_2022_encoded['AgeCategoryOrdinal'] * data_2022_encoded['PhysicalHealth']

In [87]:
bins = [0, 6, 8, 24]
labels = ['UnderSleep', 'NormalSleep', 'OverSleep']
data_2022_encoded['SleepCategory'] = pd.cut(data_2022_encoded['SleepTime'], bins=bins, labels=labels)

In [88]:
# One-hot encode the 'SleepCategory' feature
data_2022_encoded = pd.get_dummies(data_2022_encoded, columns=['SleepCategory'], drop_first=True)

In [89]:
data_2022_encoded.drop(['SleepCategory_OverSleep'], axis=1, inplace=True)

In [90]:
# Check if 'Race_Asian' and 'Race_Black' exist in the dataset, add them if they don't
if 'Race_Asian' not in data_2022_encoded.columns:
    data_2022_encoded['Race_Asian'] = 0

if 'Race_Black' not in data_2022_encoded.columns:
    data_2022_encoded['Race_Black'] = 0

# Ensure the column order matches the training dataset
columns_order = [
    'HeartDisease_Yes', 'BMI', 'PhysicalHealth', 'MentalHealth', 'SleepTime', 'AgeCategoryOrdinal',
    'Smoking_Yes', 'AlcoholDrinking_Yes', 'Stroke_Yes', 'DiffWalking_Yes',
    'Sex_Male', 'Race_Asian', 'Race_Black', 'Race_Hispanic', 'Race_Other',
    'Race_White', 'Diabetic_No, borderline diabetes', 'Diabetic_Yes',
    'Diabetic_Yes (during pregnancy)', 'PhysicalActivity_Yes', 'GenHealth_Fair',
    'GenHealth_Good', 'GenHealth_Poor', 'GenHealth_Very good', 'Asthma_Yes',
    'KidneyDisease_Yes', 'SkinCancer_Yes', 'HealthIndex', 'Smoking_PhysicalHealth',
    'Age_PhysicalHealth', 'SleepCategory_NormalSleep'
]

# Reorder the 2022 dataset to match the training dataset's column order
data_2022_encoded = data_2022_encoded[columns_order]

## Separating the target

In [91]:
# Prepare X and y
X_2022 = data_2022_encoded.drop('HeartDisease_Yes', axis=1)
y_2022 = data_2022_encoded['HeartDisease_Yes']

In [92]:
# Make predictions on the 2022 data
y_pred_2022 = xgb_optimized.predict(X_2022)
y_pred_proba_2022 = xgb_optimized.predict_proba(X_2022)[:, 1]

In [93]:
# Calculate evaluation metrics for the 2022 data
accuracy_2022 = accuracy_score(y_2022, y_pred_2022)
precision_2022 = precision_score(y_2022, y_pred_2022)
recall_2022 = recall_score(y_2022, y_pred_2022)
f1_2022 = f1_score(y_2022, y_pred_2022)
roc_auc_2022 = roc_auc_score(y_2022, y_pred_proba_2022)

In [94]:
# Display the metrics for the 2022 data
print(f"2022 Data Metrics:")
print(f"Accuracy: {accuracy_2022}")
print(f"Precision: {precision_2022}")
print(f"Recall: {recall_2022}")
print(f"F1 Score: {f1_2022}")
print(f"ROC-AUC Score: {roc_auc_2022}")

2022 Data Metrics:
Accuracy: 0.7140987391371504
Precision: 0.13740808237857954
Recall: 0.8025307033866765
F1 Score: 0.23464124828621793
ROC-AUC Score: 0.8327726710693172


In [95]:
# Confusion Matrix and Classification Report for the 2022 data
cm_2022 = confusion_matrix(y_2022, y_pred_2022)
report_2022 = classification_report(y_2022, y_pred_2022)
print("Confusion Matrix for 2022 Data:")
print(cm_2022)
print("Classification Report for 2022 Data:")
print(report_2022)


Confusion Matrix for 2022 Data:
[[164902  67685]
 [  2653  10782]]
Classification Report for 2022 Data:
              precision    recall  f1-score   support

           0       0.98      0.71      0.82    232587
           1       0.14      0.80      0.23     13435

    accuracy                           0.71    246022
   macro avg       0.56      0.76      0.53    246022
weighted avg       0.94      0.71      0.79    246022



In [105]:
# Assuming X_2022 is your features for the 2022 dataset
y_pred_proba_2022 = xgb_optimized.predict_proba(X_2022)[:, 1]

# Apply the adjusted threshold to determine class predictions for 2022 data
threshold = 0.35
y_pred_adjusted_2022 = (y_pred_proba_2022 > threshold).astype(int)

# Calculate the confusion matrix for the 2022 data with adjusted predictions
cm_2022 = confusion_matrix(y_2022, y_pred_adjusted_2022)

# Print the confusion matrix
print("Confusion Matrix for 2022 Data:")
print(cm_2022)

# Optionally, print a classification report for more detailed performance metrics
report_2022 = classification_report(y_2022, y_pred_adjusted_2022)
print("Classification Report for 2022 Data:")
print(report_2022)


Confusion Matrix for 2022 Data:
[[132663  99924]
 [  1259  12176]]
Classification Report for 2022 Data:
              precision    recall  f1-score   support

           0       0.99      0.57      0.72    232587
           1       0.11      0.91      0.19     13435

    accuracy                           0.59    246022
   macro avg       0.55      0.74      0.46    246022
weighted avg       0.94      0.59      0.69    246022



Applying the adjusted threshold strategy to the 2022 data has significantly altered the model's performance metrics, particularly impacting recall and precision for predicting heart disease. Let's update the conclusions to reflect these changes and analyze the implications of the adjusted threshold on the model's performance with the 2022 dataset.

### Updated Conclusions for 2022 Data with Adjusted Threshold

After applying a notably lower threshold (0.1) for classifying predictions as indicating the presence of heart disease, our analysis reveals profound changes in the model's performance metrics, particularly in its ability to identify true positive cases of heart disease.

### Adjusted Model Performance Metrics for 2022 Data:
- **Recall for Class 1**: Dramatically increased to 99%, indicating an exceptional capability of the model to identify almost all actual cases of heart disease.
- **Precision for Class 1**: Decreased to 7%, reflecting a significant increase in false positives, where many individuals without heart disease were incorrectly predicted to have it.
- **Accuracy**: Reduced to 28%, a substantial drop, largely due to the vast number of false positives affecting the overall correctness of predictions.
- **F1 Score for Class 1**: At 13%, the F1 score is quite low, highlighting the imbalance between precision and recall. This score reflects the model's tendency to prioritize sensitivity over specificity drastically.

### Confusion Matrix Analysis with Adjusted Threshold for 2022 Data:
- **True Positives (TP)**: Increased to 13,275, showcasing the model's heightened sensitivity in detecting heart disease.
- **False Negatives (FN)**: Reduced to 160, indicating that very few heart disease cases were missed.
- **True Negatives (TN)**: Decreased to 55,724, affected by the lower threshold pushing the model to classify more instances as positive.
- **False Positives (FP)**: Surge to 176,863, a direct consequence of the lower threshold, indicating a high rate of incorrect heart disease predictions among healthy individuals.

### Interpretation and Strategic Adjustments:
- The adjustment in the decision threshold to 0.1 has made the model highly sensitive to detecting heart disease, as evidenced by the recall rate for Class 1 jumping to 99%. While this sensitivity is crucial for ensuring minimal missed diagnoses, the trade-off has been a significant increase in false positives, as shown by the precision for Class 1 dropping to 7%.
- This shift underscores the critical balance between sensitivity (recall) and specificity (precision). In medical diagnostics, a high recall rate is often prioritized to minimize the risk of overlooking disease cases. However, the resultant high false-positive rate can lead to unnecessary anxiety, further testing, and potential overuse of healthcare resources.
- Moving forward, it's essential to consider the clinical context and the implications of false positives versus false negatives. Adjusting the decision threshold is a powerful tool but requires careful calibration based on the specific goals of medical screening or diagnostic processes.
- The stark contrast between the high recall and low precision calls for a nuanced approach to patient communication and follow-up testing strategies to confirm diagnoses.

This analysis highlights the complexities of applying machine learning models to healthcare, where decisions on thresholds and model performance metrics must align with clinical priorities and the real-world implications of model predictions.