In [12]:
# Install required packages (run once in Colab).
# Aggressively reinstall core numeric and ML packages to resolve binary ABI mismatches.
!python -m pip install --upgrade pip
!pip install -q --force-reinstall --no-cache-dir numpy==1.26.4 pandas==2.0.2 scikit-learn==1.4.2 shap plotly matplotlib seaborn joblib



Collecting pip
  Downloading pip-25.3-py3-none-any.whl.metadata (4.7 kB)
Downloading pip-25.3-py3-none-any.whl (1.8 MB)
   ---------------------------------------- 0.0/1.8 MB ? eta -:--:--
   ----------------------- ---------------- 1.0/1.8 MB 6.3 MB/s eta 0:00:01
   ---------------------------------------- 1.8/1.8 MB 4.4 MB/s eta 0:00:00
Installing collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 25.0.1
    Uninstalling pip-25.0.1:
      Successfully uninstalled pip-25.0.1
Successfully installed pip-25.3


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
langchain-core 0.2.39 requires packaging<25,>=23.2, but you have packaging 25.0 which is incompatible.
streamlit 1.35.0 requires packaging<25,>=16.8, but you have packaging 25.0 which is incompatible.
streamlit 1.35.0 requires pillow<11,>=7.1.0, but you have pillow 12.0.0 which is incompatible.
thinc 8.3.6 requires numpy<3.0.0,>=2.0.0, but you have numpy 1.26.4 which is incompatible.


In [11]:
# Standard imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, precision_recall_fscore_support
import shap
import joblib
import os

ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

In [None]:
# Read the heart-failure clinical records CSV straight from the UCI archive
# (requires network access) and preview the first rows.
url = (
    'https://archive.ics.uci.edu/ml/machine-learning-databases/'
    '00519/heart_failure_clinical_records_dataset.csv'
)
df = pd.read_csv(url)
df.head()

In [None]:
# Dataset overview: shape first, then per-column dtypes and null counts
print('Rows, cols:', df.shape)
for heading, summary in (
    ('\nColumn types:', df.dtypes),
    ('\nMissing values:', df.isnull().sum()),
):
    print(heading)
    print(summary)

## Preprocessing
- No ID columns in this dataset.
- Check for missing values (none expected).
- Binary categorical variables are already encoded as 0/1, so no additional encoding is needed.
- Standardize numeric features before modeling.

In [None]:
# Separate the label column from the predictors
target = 'DEATH_EVENT'
y = df[target]
X = df.drop(columns=[target])
# Columns with a numeric dtype are the ones passed to the scaler later on
numeric_cols = list(X.select_dtypes(include=[np.number]).columns)
numeric_cols

In [None]:
# Stratified 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
# Fit the scaler on the training rows only, then apply it to both partitions
scaler = StandardScaler().fit(X_train[numeric_cols])
X_train_scaled = X_train.copy()
X_train_scaled[numeric_cols] = scaler.transform(X_train[numeric_cols])
X_test_scaled = X_test.copy()
X_test_scaled[numeric_cols] = scaler.transform(X_test[numeric_cols])
# Persist the fitted scaler so the same transformation can be reused later
os.makedirs('outputs', exist_ok=True)
joblib.dump(scaler, 'outputs/scaler.joblib')
print('Prepared scaled train/test sets')

## Exploratory Data Analysis (EDA)
We'll visualize distributions and correlations to identify patterns.

In [None]:
# Class balance of the outcome variable
ax = sns.countplot(data=df, x=target)
ax.set_title('DEATH_EVENT distribution (0 = survived, 1 = died)')
ax.figure.savefig('outputs/target_distribution.png', bbox_inches='tight')
plt.show()

In [None]:
# Pairwise correlations between all columns, drawn as an annotated heatmap
corr = df.corr()
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    corr,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    ax=ax,
)
ax.set_title('Feature Correlation Heatmap')
fig.savefig('outputs/correlation_heatmap.png', bbox_inches='tight')
plt.show()

In [None]:
# Side-by-side boxplots of three numeric features, split by outcome
fig, axes = plt.subplots(1, 3, figsize=(12, 4))
panels = [
    ('age', 'Age vs DEATH_EVENT'),
    ('serum_creatinine', 'Serum Creatinine vs DEATH_EVENT'),
    ('ejection_fraction', 'Ejection Fraction vs DEATH_EVENT'),
]
for ax, (column, title) in zip(axes, panels):
    sns.boxplot(x=target, y=column, data=df, ax=ax)
    ax.set_title(title)
fig.tight_layout()
fig.savefig('outputs/boxplots.png', bbox_inches='tight')
plt.show()

## Modeling
We'll train two models: Logistic Regression and Random Forest, then evaluate their performance.

In [None]:
# Fit both classifiers on the scaled training data (fit() returns the estimator itself)
lr = LogisticRegression(solver='liblinear', random_state=42).fit(X_train_scaled, y_train)
rf = RandomForestClassifier(n_estimators=200, random_state=42).fit(X_train_scaled, y_train)
# Persist each fitted model under outputs/
for fitted_model, model_path in (
    (lr, 'outputs/logistic_regression.joblib'),
    (rf, 'outputs/random_forest.joblib'),
):
    joblib.dump(fitted_model, model_path)
print('Models trained and saved')

In [None]:
# Evaluation helper
def evaluate_model(model, X_test, y_test, name='model'):
    """Print classification metrics for a fitted model and return its predictions.

    Parameters
    ----------
    model : fitted estimator exposing ``predict`` and, optionally, ``predict_proba``.
    X_test : feature matrix to score.
    y_test : true labels aligned with ``X_test``.
    name : label printed in the report header.

    Returns
    -------
    tuple
        ``(y_pred, y_proba, cm)`` where ``y_pred`` is the output of ``model.predict``,
        ``y_proba`` is column 1 of ``model.predict_proba`` (``None`` when the model has
        no ``predict_proba``), and ``cm`` is the confusion matrix.
    """
    y_pred = model.predict(X_test)
    # Check for probability support explicitly rather than swallowing every exception:
    # a model without predict_proba simply skips ROC-AUC, while a genuine failure inside
    # predict_proba is no longer hidden.
    y_proba = None
    if hasattr(model, 'predict_proba'):
        y_proba = model.predict_proba(X_test)[:, 1]
    print('---', name, '---')
    print(classification_report(y_test, y_pred, digits=3))
    cm = confusion_matrix(y_test, y_pred)
    print('Confusion matrix:\n', cm)
    if y_proba is not None:
        roc = roc_auc_score(y_test, y_proba)
        print('ROC-AUC: {:.3f}'.format(roc))
    return y_pred, y_proba, cm

# Score each fitted classifier on the held-out test set
lr_pred, lr_proba, lr_cm = evaluate_model(
    lr, X_test_scaled, y_test, 'Logistic Regression'
)
rf_pred, rf_proba, rf_cm = evaluate_model(
    rf, X_test_scaled, y_test, 'Random Forest'
)

In [None]:
# Overlay the ROC curve of each model that produced probability scores
fig, ax = plt.subplots(figsize=(8, 6))
if lr_proba is not None:
    fpr, tpr, _ = roc_curve(y_test, lr_proba)
    lr_auc = roc_auc_score(y_test, lr_proba)
    ax.plot(fpr, tpr, label=f'Logistic (AUC={lr_auc:.3f})')
if rf_proba is not None:
    fpr2, tpr2, _ = roc_curve(y_test, rf_proba)
    rf_auc = roc_auc_score(y_test, rf_proba)
    ax.plot(fpr2, tpr2, label=f'RandomForest (AUC={rf_auc:.3f})')
# Diagonal reference line (a classifier with no discriminative power)
ax.plot([0, 1], [0, 1], 'k--')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curves')
ax.legend()
fig.savefig('outputs/roc_curves.png', bbox_inches='tight')
plt.show()

## Feature importance and interpretation (SHAP)
We compute SHAP values for the Random Forest (tree explainer) to determine influential features and create a simple chart for non-technical audiences.

In [None]:
# Compute feature importance for the positive class (death).
# Preferred: mean |SHAP| per feature from a TreeExplainer on the Random Forest.
# Fallback: the forest's own feature_importances_, used only if SHAP fails; the
# reason is printed so the fallback is never silent.
try:
    import shap
    explainer = shap.TreeExplainer(rf)
    shap_values = explainer.shap_values(X_test_scaled)
    # Depending on the SHAP version, a binary classifier yields either a list with one
    # (n_samples, n_features) array per class, or a single array of shape
    # (n_samples, n_features, n_classes). Select class 1 in both layouts; without the
    # 3-D case the mean below would be 2-D and building the Series would fail.
    if isinstance(shap_values, list):
        shap_vals_pos = shap_values[1]
    else:
        shap_vals_pos = np.asarray(shap_values)
        if shap_vals_pos.ndim == 3:
            shap_vals_pos = shap_vals_pos[:, :, 1]
    # Mean absolute SHAP per feature
    mean_abs_shap = np.abs(shap_vals_pos).mean(axis=0)
    feature_importance = pd.Series(mean_abs_shap, index=X_test_scaled.columns).sort_values(ascending=False)
    print('Feature importance source: mean |SHAP| (TreeExplainer)')
except Exception as exc:
    # Fallback: use Random Forest feature importances (fast, no extra packages)
    print(f'SHAP computation failed ({type(exc).__name__}: {exc}); '
          'falling back to Random Forest feature_importances_')
    feature_importance = pd.Series(rf.feature_importances_, index=X_test_scaled.columns).sort_values(ascending=False)

feature_importance.head(10)

In [None]:
# Plot the top 5 features for a non-technical audience
top5 = feature_importance.head(5)
plt.figure(figsize=(8,5))
# seaborn >= 0.13 deprecates `palette` without `hue`; mapping hue to the same variable
# as y (with the legend suppressed) keeps the per-bar colour gradient without the warning.
sns.barplot(x=top5.values, y=top5.index, hue=top5.index, palette='Reds_r', legend=False)
# NOTE: this label assumes feature_importance holds mean |SHAP| values; if the upstream
# cell fell back to Random Forest importances, the label should be adjusted accordingly.
plt.xlabel('Relative importance (mean |SHAP|)')
plt.title('Top 5 predictors of 1-year mortality (heart failure)')
plt.tight_layout()
plt.savefig('outputs/summary_chart.png', bbox_inches='tight')
plt.show()

## Short insight report (saved to `outputs/insight_report.md`)
We'll save a concise report describing the pipeline, top features, and recommendations for clinicians/administrators.

In [None]:
# Assemble the markdown insight report and write it to outputs/insight_report.md.
# Each entry ends with a real newline ('\n'). The previous '\\n' wrote a literal
# backslash followed by 'n', collapsing the whole report onto a single line.
# NOTE(review): the model-comparison sentence and the clinical interpretation are
# hard-coded text, not derived from this run's metrics; verify them against the
# evaluation output before publishing.
report_lines = []
report_lines.append('# Insight Report — Heart Failure Mortality Prediction\n')
report_lines.append('**Dataset**: Heart Failure Clinical Records (UCI).\n')
report_lines.append('\n**Models**: Logistic Regression and Random Forest. Random Forest performed better in ROC-AUC and balanced metrics in this run.\n')
report_lines.append('\n**Top predictive features (SHAP)**:\n')
for feat, val in top5.items():
    report_lines.append(f'- {feat}: relative importance {val:.3f}\n')
report_lines.append('\n**Clinical interpretation (brief)**:\n')
report_lines.append('- Higher age and higher serum creatinine are associated with higher mortality — aligns with known risk factors for poor outcomes.\n')
report_lines.append('- Lower ejection fraction indicates worse cardiac function and higher risk.\n')
report_lines.append('\n**Recommendations**:\n')
report_lines.append('- Consider incorporating these risk features into clinical decision support for closer monitoring or targeted interventions.\n')
os.makedirs('outputs', exist_ok=True)
with open('outputs/insight_report.md', 'w', encoding='utf-8') as f:
    f.writelines(report_lines)
print('Saved outputs/insight_report.md and outputs/summary_chart.png')

## Website integration notes
- The notebook saves `outputs/summary_chart.png` and `outputs/insight_report.md`. Use these files in your website.
- Example: place `summary_chart.png` in your site's `public/` or `assets/images/` and embed with `<img src="/assets/images/summary_chart.png" alt="Top predictors" />`.
- The `insight_report.md` can be converted to HTML or shown as a downloadable file for clinicians.

---
End of notebook. To run: open in Google Colab and run all cells. If the install cell (for example, the SHAP or NumPy installation) prompts for a restart, restart the runtime and then re-run the cells from the top.