# Final Model - Random Forest

In [217]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin, OneToOneFeatureMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate, StratifiedKFold, cross_val_score, cross_val_predict
from sklearn.metrics import (
    make_scorer,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    ConfusionMatrixDisplay
)
from sklearn.metrics import confusion_matrix
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
from sklearn.utils.validation import check_is_fitted
import shap

In [218]:
# Heart-failure records are read straight from the project repository.
url = "https://raw.githubusercontent.com/CarsonShively/Heart-Failure/refs/heads/main/data/heart_failure.csv"

# Exact duplicate rows are dropped before anything else touches the data.
df = pd.read_csv(url).drop_duplicates()

# DEATH_EVENT is the prediction target; every other column is a feature.
y = df['DEATH_EVENT']
X = df.drop(columns='DEATH_EVENT')

# 80/20 split, stratified on the target so both parts keep the class balance.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=99,
    stratify=y,
)

# Feature engineering
I explored several engineered features based on clinical relevance and interaction potential. After systematic testing, only the features that consistently improved cross-validation metrics were retained.

In [219]:
class FeatureEngineer(TransformerMixin, BaseEstimator):
    """Append ratio and interaction features built from the raw clinical columns.

    The transformer is stateless: ``fit`` learns nothing. ``transform`` expects
    a pandas DataFrame holding ``ejection_fraction``, ``serum_creatinine``,
    ``age`` and ``time`` and returns a new DataFrame with six extra columns;
    the input frame is not modified.

    The mixin is listed before ``BaseEstimator`` so that mixin behaviour takes
    precedence in the MRO, as scikit-learn's estimator guidelines recommend.
    """

    # Last value passed to ``set_output``. Kept as a class-level default rather
    # than assigned in ``__init__`` because scikit-learn estimators are expected
    # to store only constructor parameters there.
    _transform_output = None

    def fit(self, X, y=None):
        """Do nothing and return ``self`` (kept for the estimator API)."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the engineered columns added.

        Any infinite value in the resulting frame (for example from a division
        by zero) is replaced with NaN.
        """
        ef = X['ejection_fraction']
        creatinine = X['serum_creatinine']
        time = X['time']

        engineered = X.assign(
            ef_to_creatinine=ef / creatinine,
            ef_drop_per_age=(100 - ef) / X['age'],
            ef_per_time=ef / time,
            time_x_ef=time * ef,
            creatinine_x_ef=creatinine * ef,
            time_x_creatinine=time * creatinine,
        )

        # The result is already a DataFrame, so the same object is returned
        # whichever output container was requested through ``set_output``.
        return engineered.replace([np.inf, -np.inf], np.nan)

    def set_output(self, transform=None):
        """Record the requested output container and return ``self``.

        Present so that a pipeline holding this step can forward its own
        ``set_output`` call to it.
        """
        self._transform_output = transform
        return self

# Single-step pipeline around the feature engineer, configured for pandas output.
FE_pipeline = Pipeline(steps=[('FE', FeatureEngineer())]).set_output(transform="pandas")
_ = FE_pipeline

# Binary Features
I evaluated multiple strategies for handling missing values, and imputing with the mode consistently yielded the best performance. After imputation, I filtered out any constant columns to prevent them from introducing noise or reducing model generalizability.

In [220]:
# Columns holding 0/1 flags; these are routed through the binary pipeline.
binary_cols = [
    'anaemia',
    'diabetes',
    'high_blood_pressure',
    'sex',
    'smoking',
]

class BinaryInt64Cleaner(OneToOneFeatureMixin, TransformerMixin, BaseEstimator):
    """Normalise flag columns to nullable ``Int64`` holding only 0, 1 or ``pd.NA``.

    Parameters
    ----------
    binary_cols : list of str
        Names of the columns to clean. Other columns pass through unchanged.

    Notes
    -----
    Mixins come before ``BaseEstimator`` so they take precedence in the MRO,
    following scikit-learn's estimator guidelines.
    """

    def __init__(self, binary_cols):
        self.binary_cols = binary_cols

    def fit(self, X, y=None):
        """Stateless; returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` whose flag columns contain only 0, 1 or NA."""
        X = X.copy()
        for col in self.binary_cols:
            # Unparseable entries become NaN here.
            numeric = pd.to_numeric(X[col], errors='coerce')
            # Blank out everything that is not exactly 0 or 1 *before* the cast:
            # casting first raises on non-integer values such as 0.5 or inf
            # instead of turning them into missing values.
            numeric = numeric.where(numeric.isin([0, 1]))
            X[col] = numeric.astype('Int64')
        return X


class BinaryImputer(OneToOneFeatureMixin, TransformerMixin, BaseEstimator):
    """Fill missing values in flag columns with the mode seen during ``fit``.

    Parameters
    ----------
    binary_cols : list of str
        Names of the columns to impute. Other columns pass through unchanged.

    Attributes
    ----------
    modes_ : dict
        Maps each column name to the fill value learned in ``fit``.

    Notes
    -----
    Mixins come before ``BaseEstimator`` so they take precedence in the MRO,
    following scikit-learn's estimator guidelines.
    """

    def __init__(self, binary_cols):
        self.binary_cols = binary_cols

    def fit(self, X, y=None):
        """Learn the most frequent non-missing value of every flag column.

        When several values tie, the first one returned by ``Series.mode`` is
        used. A column with no observed values has no mode; it falls back to 0
        with a warning instead of failing on an empty result.
        """
        self.modes_ = {}
        for col in self.binary_cols:
            modes = X[col].mode(dropna=True)
            if modes.empty:
                warnings.warn(
                    f"Column {col!r} has no observed values; imputing with 0.",
                    UserWarning,
                )
                self.modes_[col] = 0
            else:
                self.modes_[col] = modes.iloc[0]
        return self

    def transform(self, X):
        """Return a copy of ``X`` with missing flags replaced by the learned modes."""
        X = X.copy()
        for col in self.binary_cols:
            X[col] = X[col].fillna(self.modes_[col])
        return X

class ConvertToInt(OneToOneFeatureMixin, TransformerMixin, BaseEstimator):
    """Cast the selected columns to the built-in ``int`` dtype.

    Parameters
    ----------
    columns : list of str
        Names of the columns to cast. Other columns pass through unchanged.

    Notes
    -----
    Mixins come before ``BaseEstimator`` so they take precedence in the MRO,
    following scikit-learn's estimator guidelines.
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Stateless; returns ``self``."""
        return self

    def transform(self, X):
        """Return a new frame with the selected columns cast to ``int``.

        The cast is delegated to ``DataFrame.astype``, which returns a new
        object, so the input frame is left as it was.
        """
        return X.astype({col: int for col in self.columns})

# Flag columns: clean to 0/1/NA -> fill gaps with the mode -> cast to plain int.
binary_pipeline = Pipeline(
    steps=[
        ('cleaner', BinaryInt64Cleaner(binary_cols=binary_cols)),
        ('imputer', BinaryImputer(binary_cols=binary_cols)),
        ('to_int', ConvertToInt(columns=binary_cols)),
    ]
).set_output(transform="pandas")
_ = binary_pipeline

# Numeric features

I first checked for low-variance features to reduce noise and eliminate uninformative inputs. I then checked for highly correlated feature pairs to simplify the dataset and minimize redundancy.

In [221]:
# Continuous inputs handled by the numeric pipeline.
numeric_features = [
    # columns present in the raw data
    'age', 'creatinine_phosphokinase', 'ejection_fraction', 'platelets',
    'serum_creatinine', 'serum_sodium', 'time',
    # columns added by FeatureEngineer
    'ef_to_creatinine', 'ef_drop_per_age', 'ef_per_time',
    'time_x_ef', 'creatinine_x_ef', 'time_x_creatinine',
]


class CoerceToFloat(OneToOneFeatureMixin, TransformerMixin, BaseEstimator):
    """Cast the selected columns to ``float``.

    Parameters
    ----------
    columns : list of str
        Names of the columns to cast. Other columns pass through unchanged.

    Notes
    -----
    Mixins come before ``BaseEstimator`` so they take precedence in the MRO,
    following scikit-learn's estimator guidelines.
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Stateless; returns ``self``."""
        return self

    def transform(self, X):
        """Return a new frame with the selected columns cast to ``float``.

        The cast is delegated to ``DataFrame.astype``, which returns a new
        object, so the input frame is left as it was.
        """
        return X.astype({col: float for col in self.columns})

class NumericImputer(OneToOneFeatureMixin, TransformerMixin, BaseEstimator):
    """Fill missing values in numeric columns with the median seen during ``fit``.

    Parameters
    ----------
    columns : list of str
        Names of the columns to impute. Other columns pass through unchanged.

    Attributes
    ----------
    medians_ : dict
        Maps each column name to the median learned in ``fit``.

    Notes
    -----
    Mixins come before ``BaseEstimator`` so they take precedence in the MRO,
    following scikit-learn's estimator guidelines.
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Learn the per-column median, ignoring missing values.

        ``X`` is only read here, so it is not copied.
        """
        self.medians_ = {
            col: X[col].median(skipna=True)
            for col in self.columns
        }
        return self

    def transform(self, X):
        """Return a copy of ``X`` with missing values replaced by the learned medians."""
        X = X.copy()
        for col in self.columns:
            X[col] = X[col].fillna(self.medians_[col])
        return X

# Numeric columns: cast to float -> fill gaps with the training median.
numeric_pipeline = Pipeline(
    steps=[
        ('float', CoerceToFloat(columns=numeric_features)),
        ('imputer', NumericImputer(columns=numeric_features)),
    ]
).set_output(transform="pandas")
_ = numeric_pipeline

# Pipeline
I ran Optuna with 30 trials using 5-fold cross-validation, optimizing specifically for recall, which was the primary objective. The best hyperparameters found from this study were hard-coded into the final model. Further optimization will be explored in future iterations to maximize performance further.

In [222]:
# Route flag columns and numeric columns through their own sub-pipelines.
# Columns named in neither list are not passed on by this transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('bin', binary_pipeline, binary_cols),
        ('num', numeric_pipeline, numeric_features),
    ]
).set_output(transform='pandas')

# Hyperparameters hard-coded for the final random forest.
best_params = dict(
    n_estimators=586,
    max_depth=None,
    min_samples_split=4,
    min_samples_leaf=10,
    max_features=0.46769332346688064,
    bootstrap=False,
    criterion='gini',
    class_weight='balanced',
    random_state=42,
    n_jobs=-1,
)

# When bootstrapping is switched off, make sure no `max_samples` entry is
# forwarded to the forest.
if not best_params.get('bootstrap', True):
    best_params.pop('max_samples', None)

best_rf = RandomForestClassifier(**best_params)

# End-to-end model: engineered features -> column-wise preprocessing -> forest.
full_pipeline = Pipeline(
    steps=[
        ('feature_engineering', FeatureEngineer()),
        ('preprocessor', preprocessor),
        ('classifier', best_rf),
    ]
).set_output(transform='pandas')
_ = full_pipeline

# Metrics

In [223]:
# 10-fold stratified CV on the training split; every row is scored by a model
# that did not see it during fitting (out-of-fold predictions).
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Probability cut-off above which a row is assigned to the positive class.
threshold = 0.1027

y_oof_proba = cross_val_predict(
    full_pipeline, X_train, y_train,
    cv=cv, method='predict_proba', n_jobs=-1
)[:, 1]

y_oof_pred = (y_oof_proba >= threshold).astype(int)

# The header is built from `threshold` so the printed value cannot drift from
# the one actually applied above.
print(f"Cross-Validated Metrics with Threshold = {threshold}:")
print(f"  Accuracy : {accuracy_score(y_train, y_oof_pred):.4f}")
print(f"  Precision: {precision_score(y_train, y_oof_pred):.4f}")
print(f"  Recall   : {recall_score(y_train, y_oof_pred):.4f}")
print(f"  F1 Score : {f1_score(y_train, y_oof_pred):.4f}")
# ROC AUC is threshold-free, so it is computed from the probabilities.
print(f"  ROC AUC  : {roc_auc_score(y_train, y_oof_proba):.4f}")

Cross-Validated Metrics with Threshold = 0.1027:
  Accuracy : 0.7113
  Precision: 0.5286
  Recall   : 0.9610
  F1 Score : 0.6820
  ROC AUC  : 0.9163


# Predictions and hold-out test set

In [224]:
# Fit on the whole training split, then score the hold-out split once.
full_pipeline.fit(X_train, y_train)

# Probability cut-off for the positive class; same value as the `threshold`
# used for the cross-validated metrics.
optimal_threshold = 0.1027

y_val_proba = full_pipeline.predict_proba(X_val)[:, 1]
y_val_pred = (y_val_proba >= optimal_threshold).astype(int)

accuracy = accuracy_score(y_val, y_val_pred)
precision = precision_score(y_val, y_val_pred)
recall = recall_score(y_val, y_val_pred)
f1 = f1_score(y_val, y_val_pred)
roc_auc = roc_auc_score(y_val, y_val_proba)
# `labels` pins the matrix to 2x2 in a fixed order even if one class is absent
# from the truth or the predictions.
cm = confusion_matrix(y_val, y_val_pred, labels=[0, 1])

print("📊 Evaluation on Hold-out Validation Set:")
print(f"  Accuracy : {accuracy:.4f}")
print(f"  Precision: {precision:.4f}")
print(f"  Recall   : {recall:.4f}")
print(f"  F1 Score : {f1:.4f}")
print(f"  ROC AUC  : {roc_auc:.4f}")

# The target column is DEATH_EVENT, so the two classes are labelled after it
# (assumes 1 marks a death event, matching the positive class scored above).
cm_table = pd.DataFrame(
    cm,
    index=["Actual no death event", "Actual death event"],
    columns=["Predicted no death event", "Predicted death event"],
)
print("🧮 Confusion Matrix (Actual vs Predicted):")
print(cm_table.to_string())

📊 Evaluation on Hold-out Validation Set:
  Accuracy : 0.7167
  Precision: 0.5294
  Recall   : 0.9474
  F1 Score : 0.6792
  ROC AUC  : 0.8691
🧮 Confusion Matrix (Actual vs Predicted):
            Predicted No HF    Predicted HF
Actual No HF          25                  16
Actual HF              1                  18


# Feature importance

In [232]:
# Push both splits through the fitted feature-engineering and preprocessing
# steps so they match what the classifier receives inside the pipeline.
X_val_transformed = full_pipeline.named_steps['preprocessor'].transform(
    full_pipeline.named_steps['feature_engineering'].transform(X_val)
)
X_train_transformed = full_pipeline.named_steps['preprocessor'].transform(
    full_pipeline.named_steps['feature_engineering'].transform(X_train)
)

model = full_pipeline.named_steps['classifier']
feature_names = X_val_transformed.columns

# The transformed training data is handed to the explainer; the hold-out rows
# are the ones being explained.
explainer = shap.Explainer(model, X_train_transformed)
shap_values = explainer(X_val_transformed)

# Keep index 1 of the last axis (the positive class), then average the
# absolute contributions over rows.
shap_values_positive_class = shap_values.values[:, :, 1]
shap_importance = np.mean(np.abs(shap_values_positive_class), axis=0)

print(f"Shape of feature_names: {feature_names.shape}")
print(f"Shape of shap_importance: {shap_importance.shape}")

# One row per feature, largest mean |SHAP| first.
shap_df = (
    pd.DataFrame({
        'feature': feature_names,
        'mean_abs_shap': shap_importance
    })
    .sort_values(by='mean_abs_shap', ascending=False)
    .reset_index(drop=True)
)
display(shap_df)

Shape of feature_names: (18,)
Shape of shap_importance: (18,)


Unnamed: 0,feature,mean_abs_shap
0,num__time_x_ef,0.128368
1,num__ef_to_creatinine,0.098217
2,num__time,0.071405
3,num__serum_creatinine,0.045572
4,num__ejection_fraction,0.018103
5,num__age,0.017776
6,num__time_x_creatinine,0.015663
7,num__ef_per_time,0.012794
8,num__serum_sodium,0.011112
9,num__platelets,0.00926


# Summary after tuning and feature engineering
1. No constant features were found among the binary features.

2. No low-variance features were identified among the continuous numeric features.

3. No feature correlations exceeded the 0.9 threshold, indicating low multicollinearity.

After tuning and feature engineering, cross-validation performance improved significantly across all key metrics. Most notably, recall increased from 0.70 to 0.85, with a more stable (less variable) score across folds — aligning well with the primary objective of improving sensitivity.