# Final Model - Random Forest Classifier

In [336]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin, OneToOneFeatureMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.metrics import (
    make_scorer,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
)
from sklearn.metrics import confusion_matrix
import shap

In [337]:
# Source CSV for the heart-failure dataset; exact duplicate rows are removed
# right after loading (the original row index is kept).
url = "https://raw.githubusercontent.com/CarsonShively/Heart-Failure/refs/heads/main/data/heart_failure.csv"
df = pd.read_csv(url).drop_duplicates()

# Target is the DEATH_EVENT column; every other column is a candidate feature.
y = df['DEATH_EVENT']
X = df.drop(columns='DEATH_EVENT')

# 80/20 split, stratified on the target so both parts keep the class ratio.
# The fixed seed makes the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=99,
    stratify=y,
)

# Feature engineering
1. Created domain-relevant features through systematic experimentation and evaluation, including interaction terms and ratios that captured meaningful relationships in the data.
2. Dropped low-value features based on performance impact analysis. This led to the removal of all binary features, which aligned with insights from baseline model performance where binary features showed minimal contribution to predictive power.

In [338]:
class FeatureEngineer(TransformerMixin, BaseEstimator):
    """Stateless transformer that adds ratio/interaction features.

    New columns are derived from ``ejection_fraction``, ``serum_creatinine``,
    ``time`` and ``age``; the binary columns ``diabetes``, ``anaemia``,
    ``smoking``, ``sex`` and ``high_blood_pressure`` are dropped when present.
    Infinite values (e.g. from a division by zero) are converted to NaN.

    The input must be a pandas DataFrame containing the four source columns;
    the output is always a new DataFrame and the input is never modified.
    """

    def __init__(self):
        # Not a hyperparameter: only remembers the last ``set_output`` request.
        self._transform_output = None

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with engineered columns added.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain ``ejection_fraction``, ``serum_creatinine``, ``time``
            and ``age``.

        Returns
        -------
        pandas.DataFrame
            ``X`` plus six engineered columns, minus the binary columns, with
            +/-inf replaced by NaN.

        Raises
        ------
        KeyError
            If one of the four source columns is missing.
        """
        ef = X['ejection_fraction']
        creatinine = X['serum_creatinine']
        time_col = X['time']

        # ``assign`` works on a copy, so the caller's frame stays untouched.
        # Keyword order fixes the order of the appended columns.
        engineered = X.assign(
            ef_to_creatinine=ef / creatinine,
            ef_drop_per_age=(100 - ef) / X['age'],
            ef_per_time=ef / time_col,
            time_x_ef=time_col * ef,
            creatinine_x_ef=creatinine * ef,
            time_x_creatinine=time_col * creatinine,
        )

        # errors='ignore' keeps this usable on frames that already lack
        # some (or all) of the binary columns.
        engineered = engineered.drop(
            columns=['diabetes', 'anaemia', 'smoking', 'sex', 'high_blood_pressure'],
            errors='ignore',
        )

        # Zero denominators yield +/-inf; expose them as NaN so a later
        # imputation step can handle them like any other missing value.
        # The result is already a DataFrame, so no re-wrapping is needed
        # regardless of the requested output container.
        return engineered.replace([np.inf, -np.inf], np.nan)

    def set_output(self, transform=None):
        """Record the requested output container and return ``self``.

        Kept so that ``set_output`` can be called on a pipeline containing
        this step. ``transform`` always returns a DataFrame, so the stored
        value does not change the result.
        """
        self._transform_output = transform
        return self

# Stand-alone pipeline holding only the feature-engineering step, configured
# for pandas output. ``set_output`` returns the pipeline itself, so the call
# can be chained onto the constructor.
FE_pipeline = Pipeline(
    steps=[('FE', FeatureEngineer())]
).set_output(transform="pandas")
_ = FE_pipeline

# Numeric features
1. Handled all missing values in the training data and ensured robust handling of unseen NaNs in future test or deployment scenarios.

2. Imputation strategy: Used the median, selected based on empirical testing for stability and performance across validation folds.

In [339]:
# Numeric columns expected in the raw dataset.
raw_numeric_features = [
    'age',
    'creatinine_phosphokinase',
    'ejection_fraction',
    'platelets',
    'serum_creatinine',
    'serum_sodium',
    'time',
]

# Columns created by the feature-engineering step.
engineered_numeric_features = [
    'ef_to_creatinine',
    'ef_drop_per_age',
    'ef_per_time',
    'time_x_ef',
    'creatinine_x_ef',
    'time_x_creatinine',
]

# Full ordered list of numeric columns fed to the numeric pipeline.
numeric_features = raw_numeric_features + engineered_numeric_features

class CoerceToFloat(OneToOneFeatureMixin, TransformerMixin, BaseEstimator):
    """Cast the configured DataFrame columns to ``float``.

    Parameters
    ----------
    columns : list of str
        Names of the columns to cast. Columns not listed are passed through
        unchanged.

    Attributes
    ----------
    n_features_in_ : int
        Number of columns seen during ``fit``.
    feature_names_in_ : ndarray of object
        Column names seen during ``fit`` (only set when ``X`` has ``columns``).
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Record the input layout; nothing else is learned.

        The fitted attributes are what the inherited
        ``get_feature_names_out`` relies on.
        """
        self.n_features_in_ = X.shape[1]
        if hasattr(X, 'columns'):
            self.feature_names_in_ = np.asarray(X.columns, dtype=object)
        return self

    def transform(self, X):
        """Return a new DataFrame with ``self.columns`` cast to float.

        Raises
        ------
        KeyError
            If a configured column is missing from ``X``.
        """
        # ``astype`` with a mapping returns a new frame, so the caller's
        # data is left untouched.
        return X.astype({col: float for col in self.columns})

class NumericImputer(OneToOneFeatureMixin, TransformerMixin, BaseEstimator):
    """Fill missing values in the configured columns with training medians.

    Parameters
    ----------
    columns : list of str
        Names of the columns to impute.

    Attributes
    ----------
    medians_ : dict
        Maps each configured column to the median computed during ``fit``
        (NaNs skipped).
    n_features_in_ : int
        Number of columns seen during ``fit``.
    feature_names_in_ : ndarray of object
        Column names seen during ``fit`` (only set when ``X`` has ``columns``).
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Learn one median per configured column from ``X``."""
        # Reading medians does not mutate X, so no defensive copy is needed.
        self.medians_ = {
            col: X[col].median(skipna=True)
            for col in self.columns
        }
        # Fitted layout used by the inherited ``get_feature_names_out``.
        self.n_features_in_ = X.shape[1]
        if hasattr(X, 'columns'):
            self.feature_names_in_ = np.asarray(X.columns, dtype=object)
        return self

    def transform(self, X):
        """Return a copy of ``X`` with NaNs replaced by the fitted medians.

        Raises
        ------
        KeyError
            If a configured column is missing from ``X``.
        """
        X = X.copy()
        for col in self.columns:
            X[col] = X[col].fillna(self.medians_[col])
        return X

# Numeric preprocessing: cast to float first, then impute with the medians
# learned during fit. ``set_output`` returns the pipeline itself, so it is
# chained onto the constructor to request pandas output.
numeric_pipeline = Pipeline(
    steps=[
        ('float', CoerceToFloat(columns=numeric_features)),
        ('imputer', NumericImputer(columns=numeric_features)),
    ]
).set_output(transform="pandas")
_ = numeric_pipeline

# Pipeline
1. Hyperparameters tuned using Optuna, optimized over 500 trials with 10-fold cross-validation.
2. The objective was to maximize recall, due to the critical importance of minimizing false negatives in the heart failure domain, where missed diagnoses pose the greatest risk.

In [340]:
# Send the numeric columns through the numeric pipeline; columns that are not
# listed are dropped (ColumnTransformer's default remainder behaviour).
preprocessor = ColumnTransformer(
    transformers=[('num', numeric_pipeline, numeric_features)]
)
preprocessor.set_output(transform='pandas')

# Random-forest hyperparameters (the notebook states they come from the
# Optuna search described above).
best_params = dict(
    n_estimators=586,
    max_depth=None,
    min_samples_split=4,
    min_samples_leaf=10,
    max_features=0.46769332346688064,
    bootstrap=False,
    criterion='gini',
    class_weight='balanced',
    random_state=42,
    n_jobs=-1,
)

# Without bootstrapping, make sure no ``max_samples`` entry is passed along.
bootstrap_enabled = best_params.get('bootstrap', True)
if not bootstrap_enabled:
    best_params.pop('max_samples', None)

best_rf = RandomForestClassifier(**best_params)

# End-to-end model: feature engineering -> numeric preprocessing -> classifier.
full_pipeline = Pipeline(
    steps=[
        ('feature_engineering', FeatureEngineer()),
        ('preprocessor', preprocessor),
        ('classifier', best_rf),
    ]
)
full_pipeline.set_output(transform='pandas')
_ = full_pipeline

# Metrics
1. Threshold optimized using Optuna, with the objective of maximizing recall.

2. This decision aligns with the domain-specific priority of reducing false negatives, ensuring high sensitivity in detecting heart failure cases.

In [341]:
# 10-fold stratified CV on the training split; shuffling is seeded so the
# folds are reproducible.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Decision threshold applied to the positive-class probability.
threshold = 0.1027

# Out-of-fold probabilities: every training row is scored by a model that did
# not see it during fitting. Column 1 is the positive class.
y_oof_proba = cross_val_predict(
    full_pipeline, X_train, y_train,
    cv=cv, method='predict_proba', n_jobs=-1
)[:, 1]

y_oof_pred = (y_oof_proba >= threshold).astype(int)

# The header is built from ``threshold`` so the printed value can never drift
# away from the value actually used above.
print(f"Cross-Validated Metrics with Threshold = {threshold}:")
print(f"  Accuracy : {accuracy_score(y_train, y_oof_pred):.4f}")
print(f"  Precision: {precision_score(y_train, y_oof_pred):.4f}")
print(f"  Recall   : {recall_score(y_train, y_oof_pred):.4f}")
print(f"  F1 Score : {f1_score(y_train, y_oof_pred):.4f}")
# ROC AUC is threshold-free, so it is computed on the raw probabilities.
print(f"  ROC AUC  : {roc_auc_score(y_train, y_oof_proba):.4f}")

Cross-Validated Metrics with Threshold = 0.1027:
  Accuracy : 0.7113
  Precision: 0.5286
  Recall   : 0.9610
  F1 Score : 0.6820
  ROC AUC  : 0.9178


# Predictions and hold-out test set
1. The final metrics reflect the target objective — achieving high recall while maintaining stable performance across other metrics such as precision, F1, and ROC AUC.
2. Only 1 false negative was observed, which is a strong result. Given the consistently high recall, this likely reflects a statistical outlier or an unlucky split in the hold-out set rather than a systemic issue.

In [342]:
# Refit on the full training split before scoring the hold-out rows.
full_pipeline.fit(X_train, y_train)

# Reuse the threshold defined for the cross-validated metrics so both
# evaluations are guaranteed to use the same decision rule.
optimal_threshold = threshold

# Column 1 of predict_proba is the positive class.
y_val_proba = full_pipeline.predict_proba(X_val)[:, 1]
y_val_pred = (y_val_proba >= optimal_threshold).astype(int)

accuracy = accuracy_score(y_val, y_val_pred)
precision = precision_score(y_val, y_val_pred)
recall = recall_score(y_val, y_val_pred)
f1 = f1_score(y_val, y_val_pred)
# ROC AUC is threshold-free, so it is computed on the raw probabilities.
roc_auc = roc_auc_score(y_val, y_val_proba)
# Explicit labels force a 2x2 matrix even if only one class shows up in the
# labels and predictions, so the 4-way unpacking below cannot fail.
cm = confusion_matrix(y_val, y_val_pred, labels=[0, 1])

print("Evaluation on Hold-out Validation Set:")
print(f"  Accuracy : {accuracy:.4f}")
print(f"  Precision: {precision:.4f}")
print(f"  Recall   : {recall:.4f}")
print(f"  F1 Score : {f1:.4f}")
print(f"  ROC AUC  : {roc_auc:.4f}")

# Row-major order of the 2x2 matrix: TN, FP, FN, TP.
tn, fp, fn, tp = cm.ravel()
print("Confusion Matrix:")
print(f"(TN/FP)  {tn:>5}  {fp:>5}")
print(f"(FN/TP)  {fn:>5}  {tp:>5}")

Evaluation on Hold-out Validation Set:
  Accuracy : 0.7333
  Precision: 0.5455
  Recall   : 0.9474
  F1 Score : 0.6923
  ROC AUC  : 0.8691
Confusion Matrix:
(TN/FP)     26     15
(FN/TP)      1     18


# Feature importance

In [343]:
# Pull the fitted steps out once so both splits go through exactly the same
# feature-engineering and preprocessing objects.
fe_step = full_pipeline.named_steps['feature_engineering']
prep_step = full_pipeline.named_steps['preprocessor']
model = full_pipeline.named_steps['classifier']

X_val_transformed = prep_step.transform(fe_step.transform(X_val))
X_train_transformed = prep_step.transform(fe_step.transform(X_train))

feature_names = X_val_transformed.columns

# The transformed training data is passed to the explainer alongside the
# model; the hold-out rows are the ones being explained.
explainer = shap.Explainer(model, X_train_transformed)
shap_values = explainer(X_val_transformed)

# Last axis indexes the class; keep the positive class (index 1).
shap_values_positive_class = shap_values.values[:, :, 1]

# Global importance = mean absolute SHAP value per feature.
shap_importance = np.abs(shap_values_positive_class).mean(axis=0)

print(f"Shape of feature_names: {feature_names.shape}")
print(f"Shape of shap_importance: {shap_importance.shape}")

# Most important feature first, with a clean 0..n-1 index.
shap_df = (
    pd.DataFrame({
        'feature': feature_names,
        'mean_abs_shap': shap_importance,
    })
    .sort_values(by='mean_abs_shap', ascending=False)
    .reset_index(drop=True)
)
display(shap_df)

Shape of feature_names: (13,)
Shape of shap_importance: (13,)


Unnamed: 0,feature,mean_abs_shap
0,num__time_x_ef,0.132141
1,num__ef_to_creatinine,0.106957
2,num__time,0.069762
3,num__serum_creatinine,0.040796
4,num__age,0.018772
5,num__ejection_fraction,0.015695
6,num__time_x_creatinine,0.013184
7,num__platelets,0.011094
8,num__ef_per_time,0.010754
9,num__serum_sodium,0.01017


# Summary
The model successfully meets its target objective of maximizing recall, thereby minimizing false negatives in a critical healthcare context.
Performance across cross-validation and the hold-out set demonstrates strong generalization and stability, making the model ready for deployment in scenarios where early detection of heart failure is essential.