#  CardioRisk Baseline

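This notebook builds a simple baseline model for cardiovascular risk prediction on the Framingham dataset: it trains an XGBoost classifier to predict the `TenYearCHD` outcome, evaluates it with ROC AUC and a confusion matrix, and explains its predictions with SHAP.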

In [1]:

#  Import libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, confusion_matrix, ConfusionMatrixDisplay
from xgboost import XGBClassifier
import shap


ModuleNotFoundError: No module named 'pandas'

In [None]:

# Read the Framingham CSV (path is relative to the notebook's working
# directory), report its (rows, columns) shape, and preview the first rows.
df = pd.read_csv(filepath_or_buffer="../data/framingham.csv")
print(df.shape)
df.head(n=5)


In [None]:

# Basic cleanup: keep only complete records, i.e. discard every row that has
# a missing value in any column, then report the remaining shape.
# NOTE: `df` is rebound here, so every later cell sees the cleaned frame.
df = df.dropna(axis=0, how="any")
print(df.shape)


##  Exploratory Data Analysis

In [None]:

# Bar chart of how many records fall into each TenYearCHD class
fig, ax = plt.subplots()
sns.countplot(data=df, x='TenYearCHD', ax=ax)
ax.set_title("CHD Outcome Distribution (0 = No, 1 = Yes)")
plt.show()


In [None]:

# Compare the spread of `age` between the two TenYearCHD outcome groups
fig, ax = plt.subplots()
sns.boxplot(data=df, x='TenYearCHD', y='age', ax=ax)
ax.set_title("Age Distribution by CHD Outcome")
plt.show()


In [None]:

# Pairwise correlations between the columns of df, drawn as an annotated
# heatmap (each cell labelled with its coefficient to two decimals).
corr_matrix = df.corr()
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm", ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()


##  Model Training and Evaluation

In [None]:

# Separate the target column from the predictors: y is the TenYearCHD
# column, X is every other column of df.
y = df["TenYearCHD"]
X = df.drop(columns="TenYearCHD")

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both partitions; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


In [None]:

# Initialize and train the XGBoost classifier on the training partition.
#
# - `use_label_encoder` is deliberately NOT passed: the option is deprecated
#   in XGBoost and, on current releases, is no longer a recognised parameter,
#   so supplying it only produces a "parameters are not used" warning.
# - `eval_metric='auc'` selects AUC as the evaluation metric; note that no
#   `eval_set` is passed to `fit` in this cell.
# - `random_state=42` pins the model seed to the same value used for the
#   train/test split so a Restart & Run All stays reproducible even if
#   sampling options (e.g. subsample) are added later.
model = XGBClassifier(eval_metric='auc', random_state=42)
model.fit(X_train, y_train)


In [None]:

# Score the held-out rows: take the second column of predict_proba (index 1)
# as the score for each test row.
y_pred_proba = model.predict_proba(X_test)[:, 1]

# Area under the ROC curve of those scores against the true test labels
auc = roc_auc_score(y_true=y_test, y_score=y_pred_proba)
print(f"AUC Score: {auc:.4f}")


In [None]:

# Turn scores into hard 0/1 labels: strictly above 0.5 -> 1, otherwise 0
y_pred = np.where(y_pred_proba > 0.5, 1, 0)

# Draw the confusion matrix of true vs. predicted test labels on its own axes
fig, ax = plt.subplots()
ConfusionMatrixDisplay.from_predictions(y_test, y_pred, ax=ax)
ax.set_title("Confusion Matrix")
plt.show()


##  Model Explainability with SHAP

In [None]:

# Build a SHAP explainer for the fitted model, supplying the training
# features as the masker (background) data, then compute SHAP values for
# every row of the test set.
explainer = shap.Explainer(model, masker=X_train)
shap_values = explainer(X_test)


In [None]:

# Summary plot of the test-set SHAP values, with the matching test feature
# values supplied alongside them — used here as a feature-importance overview.
shap.summary_plot(shap_values, features=X_test)


In [None]:

# Waterfall plot for a single prediction: the first row of the test-set
# SHAP values (positional index 0).
first_explanation = shap_values[0]
shap.plots.waterfall(first_explanation)
