In [14]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import xgboost as xgb

data = pd.read_csv('heart.csv')

In [15]:
# Separate features and target
X = data.drop('target', axis=1)
y = data['target']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create DMatrix for XGBoost
dtrain = xgb.DMatrix(X_train_scaled, label=y_train)
dtest = xgb.DMatrix(X_test_scaled, label=y_test)

In [17]:
# XGBoost Parameters
params = {
    'max_depth': 6,
    'eta': 0.1,
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'seed': 42
}

# Model train
num_rounds = 100
model = xgb.train(params, dtrain, num_rounds)

In [18]:
#Predict
y_pred_prob = model.predict(dtest)
y_pred = (y_pred_prob > 0.5).astype(int)

print("Classification Report:")
print(classification_report(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.99       102
           1       1.00      0.97      0.99       103

    accuracy                           0.99       205
   macro avg       0.99      0.99      0.99       205
weighted avg       0.99      0.99      0.99       205



In [6]:
# Confusion Matrix Visualization
plt.figure(figsize=(8, 6))
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix - XGBoost')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.tight_layout()
plt.savefig('xgboost_confusion_matrix.png')
plt.close()

# Feature Importance
# Get feature importance
feature_importance = model.get_score(importance_type='weight')

In [7]:
# Convert to DataFrame
feature_importance_df = pd.DataFrame.from_dict(feature_importance, orient='index', columns=['importance'])
feature_importance_df.index.name = 'feature'
feature_importance_df = feature_importance_df.reset_index()
feature_importance_df = feature_importance_df.sort_values('importance', ascending=False)

# Feature Importance graph
plt.figure(figsize=(10, 6))
plt.bar(feature_importance_df['feature'], feature_importance_df['importance'])
plt.title('Feature Importance (XGBoost)')
plt.xlabel('Features')
plt.ylabel('Importance')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.savefig('xgboost_feature_importance.png')
plt.close()

# Feature Importance
print("\nFeature Importance:")
print(feature_importance_df)


Feature Importance:
   feature  importance
0       f0       356.0
7       f7       267.0
4       f4       250.0
9       f9       205.0
3       f3       177.0
11     f11       165.0
2       f2       105.0
12     f12        89.0
1       f1        88.0
6       f6        84.0
10     f10        74.0
8       f8        51.0
5       f5        19.0


In [8]:
# Cross-Validation
cv_scores = cross_val_score(xgb.XGBClassifier(**params), X_train_scaled, y_train, cv=5)
print("\nCross-Validation Scores:", cv_scores)
print("Mean CV Score:", cv_scores.mean())


Cross-Validation Scores: [1.         0.96341463 1.         0.98780488 0.95731707]
Mean CV Score: 0.9817073170731707


In [12]:
# Preventive Care Strategy Recommendations
def generate_preventive_care_recommendations(importance_df):
    # Select top 5 most important features
    top_features = importance_df['feature'].head(5).tolist()
    
    recommendations_map = {
        'thalach': "Heart Rate Management: Maximum heart rate is crucial. Consider cardiovascular fitness assessments "
                   "and developing personalized exercise programs to improve heart health.",
        
        'ca': "Cardiovascular Assessment: Number of major vessels is a key indicator. Recommend comprehensive "
              "cardiovascular diagnostic procedures and targeted interventions.",
        
        'oldpeak': "Exercise-Induced Stress Monitoring: ST depression is important. Regular cardiac stress tests "
                   "and careful monitoring of exercise-induced cardiac stress are advised.",
        
        'trestbps': "Blood Pressure Control: Focus on regular blood pressure monitoring, stress reduction techniques, "
                    "and potential medical interventions to manage hypertension.",
        
        'chol': "Cholesterol Management: Implement dietary interventions, regular exercise, and potentially "
                "medication to control cholesterol levels."
    }
    
    recommendations = []
    for feature in top_features:
        if feature in recommendations_map:
            recommendations.append(recommendations_map[feature])
    
    return recommendations



In [13]:
# Generate and print preventive care recommendations
preventive_care_recommendations = generate_preventive_care_recommendations(feature_importance_df)
print("\nPreventive Care Recommendations:")
for rec in preventive_care_recommendations:
    print(f"- {rec}")


Preventive Care Recommendations:
