# Final Model - Random Forest

In [35]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin, OneToOneFeatureMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.metrics import (
    make_scorer,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score
)
from sklearn.metrics import confusion_matrix
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [36]:
# Read the heart-failure CSV straight from GitHub and drop exact duplicate rows.
url = "https://raw.githubusercontent.com/CarsonShively/Heart-Failure/refs/heads/main/data/heart_failure.csv"
df = pd.read_csv(url).drop_duplicates()

# DEATH_EVENT is the label; every other column is a predictor.
y = df['DEATH_EVENT']
X = df.drop(columns=['DEATH_EVENT'])

# Hold out 20% of the rows, stratified on the label, with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=99,
)

# Binary Features
Simple NaN handling for the initial build of the final model.

In [37]:
# Columns that go through the binary (0/1) cleaning pipeline.
binary_cols = ['anaemia', 'diabetes', 'high_blood_pressure', 'sex', 'smoking']

# Binary columns whose missing values are filled with 0: all of them except
# 'sex', which is imputed separately.
zero_impute = [col for col in binary_cols if col != 'sex']

class BinaryInt64Cleaner(BaseEstimator, TransformerMixin, OneToOneFeatureMixin):
    """Normalise binary columns to nullable ``Int64`` holding only 0, 1 or ``<NA>``.

    Parameters
    ----------
    binary_cols : list of str
        Names of the columns to clean.
    """

    def __init__(self, binary_cols):
        self.binary_cols = binary_cols

    def fit(self, X, y=None):
        """Learn nothing (the transformer is stateless) and return ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with each configured column cleaned.

        Values that cannot be parsed as numbers, and numbers other than 0 or 1,
        are replaced by ``<NA>``; the column is then stored as ``Int64``.
        """
        X = X.copy()
        for col in self.binary_cols:
            numeric = pd.to_numeric(X[col], errors='coerce')
            # Filter to {0, 1} *before* casting: a float -> Int64 cast raises
            # TypeError on non-integer values such as 0.5, whereas after the
            # filter only 0, 1 and missing values remain.
            valid = numeric.where(numeric.isin([0, 1]))
            X[col] = valid.astype('Int64')
        return X


class BinaryImputer(BaseEstimator, TransformerMixin, OneToOneFeatureMixin):
    """Fill missing values in binary columns.

    Parameters
    ----------
    binary_cols : list of str
        Columns whose missing entries are replaced by 0.
    sex_col : str, default='sex'
        Column whose missing entries are replaced by the mode seen in ``fit``.
    """

    def __init__(self, binary_cols, sex_col='sex'):
        self.binary_cols = binary_cols
        self.sex_col = sex_col

    def fit(self, X, y=None):
        """Store the first mode of ``sex_col`` (missing values ignored) as ``sex_mode_``."""
        observed_modes = X[self.sex_col].mode(dropna=True)
        self.sex_mode_ = observed_modes[0]
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the configured columns imputed."""
        filled = X.copy()
        # Zero-filled columns first, then the mode-filled column (same order
        # as the fills are meant to be applied).
        fill_plan = [(name, 0) for name in self.binary_cols]
        fill_plan.append((self.sex_col, self.sex_mode_))
        for name, fill_value in fill_plan:
            filled[name] = filled[name].fillna(fill_value)
        return filled

class ConvertToInt(BaseEstimator, TransformerMixin, OneToOneFeatureMixin):
    """Cast the given columns to the built-in ``int`` dtype.

    Parameters
    ----------
    columns : list of str
        Names of the columns to cast.
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Learn nothing (the transformer is stateless) and return ``self``."""
        return self

    def transform(self, X):
        """Return a new frame in which every configured column is cast to ``int``."""
        # One astype call with a per-column mapping; untouched columns are
        # carried over unchanged and the input frame is not modified.
        target_dtypes = {name: int for name in self.columns}
        return X.astype(target_dtypes)


# Binary branch: validate 0/1 values -> fill missing entries -> cast to int.
# set_output returns the pipeline itself, so it is chained onto the constructor;
# the cell therefore ends in an assignment (nothing is echoed) without having
# to rebind `_`, which IPython uses for the last cell output.
binary_pipeline = Pipeline(steps=[
    ('cleaner', BinaryInt64Cleaner(binary_cols=binary_cols)),
    ('imputer', BinaryImputer(binary_cols=zero_impute, sex_col='sex')),
    ('to_int', ConvertToInt(columns=binary_cols)),
]).set_output(transform="pandas")

# Numeric features

Simple NaN and outlier handling for the initial build of the final model.

In [38]:
# Columns that go through the numeric pipeline (float cast, imputation, clipping).
numeric_features = [
    'age', 'creatinine_phosphokinase', 'ejection_fraction', 'platelets',
    'serum_creatinine', 'serum_sodium', 'time',
]

class CoerceToFloat(BaseEstimator, TransformerMixin, OneToOneFeatureMixin):
    """Convert the given columns to ``float``, turning unparseable values into NaN.

    Parameters
    ----------
    columns : list of str
        Names of the columns to convert.
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Learn nothing (the transformer is stateless) and return ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with each configured column stored as ``float``."""
        X = X.copy()
        for col in self.columns:
            # A bare astype(float) raises on entries that are not numbers.
            # Coercing first maps them to NaN instead, mirroring how
            # BinaryInt64Cleaner treats bad input, so a later imputation step
            # can fill them.
            X[col] = pd.to_numeric(X[col], errors='coerce').astype(float)
        return X

class NumericImputer(BaseEstimator, TransformerMixin, OneToOneFeatureMixin):
    """Replace missing values with the per-column median learned in ``fit``.

    Parameters
    ----------
    columns : list of str
        Names of the columns to impute.
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Store each configured column's median (missing values skipped) in ``medians_``."""
        # X is only read here, so no defensive copy is needed.
        learned = {}
        for name in self.columns:
            learned[name] = X[name].median(skipna=True)
        self.medians_ = learned
        return self

    def transform(self, X):
        """Return a copy of ``X`` with missing entries filled by the stored medians."""
        imputed = X.copy()
        for name in self.columns:
            fill_value = self.medians_[name]
            imputed[name] = imputed[name].fillna(fill_value)
        return imputed

class OutlierClipper(BaseEstimator, TransformerMixin, OneToOneFeatureMixin):
    """Clip columns to quantile-based bounds learned in ``fit``.

    Parameters
    ----------
    columns : list of str
        Names of the columns to clip.
    lower_quantile : float, default=0.01
        Quantile used as the lower bound.
    upper_quantile : float, default=0.99
        Quantile used as the upper bound.
    """

    def __init__(self, columns, lower_quantile=0.01, upper_quantile=0.99):
        self.columns = columns
        self.lower_quantile = lower_quantile
        self.upper_quantile = upper_quantile

    def fit(self, X, y=None):
        """Compute per-column bounds into ``lower_bounds_`` and ``upper_bounds_``."""
        # X is only read here, so no defensive copy is needed.
        floors, ceilings = {}, {}
        for name in self.columns:
            series = X[name]
            floors[name] = series.quantile(self.lower_quantile)
            ceilings[name] = series.quantile(self.upper_quantile)
        self.lower_bounds_ = floors
        self.upper_bounds_ = ceilings
        return self

    def transform(self, X):
        """Return a copy of ``X`` with each configured column limited to its stored bounds."""
        clipped = X.copy()
        for name in self.columns:
            floor = self.lower_bounds_[name]
            ceiling = self.upper_bounds_[name]
            clipped[name] = clipped[name].clip(floor, ceiling)
        return clipped

# Numeric branch: cast to float -> median-impute -> clip to quantile bounds.
# set_output returns the pipeline itself, so it is chained onto the constructor;
# the cell therefore ends in an assignment (nothing is echoed) without having
# to rebind `_`, which IPython uses for the last cell output.
numeric_pipeline = Pipeline([
    ('float', CoerceToFloat(columns=numeric_features)),
    ('imputer', NumericImputer(columns=numeric_features)),
    ('clip', OutlierClipper(columns=numeric_features)),
]).set_output(transform="pandas")

# Pipeline

In [39]:
# Route each column group through its own sub-pipeline and keep DataFrame output.
# set_output returns the estimator it was called on, so it is chained directly.
preprocessor = ColumnTransformer([
    ('bin', binary_pipeline, binary_cols),
    ('num', numeric_pipeline, numeric_features),
]).set_output(transform='pandas')

# Preprocessing followed by a class-weighted random forest with a fixed seed.
# Ending the cell on this assignment keeps the output empty without rebinding
# `_`, which IPython uses for the last cell output.
full_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(
        n_estimators=100,
        class_weight='balanced',
        random_state=42,
    )),
]).set_output(transform="pandas")

# Initial metrics

In [42]:
# --- Cross-validation on the training split --------------------------------
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Report label -> scikit-learn scorer name (the two coincide for every metric).
scoring = {
    name: name
    for name in ('accuracy', 'precision', 'recall', 'f1', 'roc_auc')
}

scores = cross_validate(
    full_pipeline,
    X_train,
    y_train,
    scoring=scoring,
    cv=cv,
    return_train_score=False,
)

cv_results = pd.DataFrame(scores)

print("Cross-Validation Performance:")
for metric in scoring:
    fold_scores = cv_results[f'test_{metric}']
    mean_score, std_score = fold_scores.mean(), fold_scores.std()
    print(f"{metric:>10}: {mean_score:.4f} ± {std_score:.4f}")

# --- Refit on the whole training split and score the held-out split --------
full_pipeline.fit(X_train, y_train)

y_pred = full_pipeline.predict(X_val)
# Column 1 of predict_proba is the second class in classes_
# (presumably DEATH_EVENT == 1 -- confirm against the label encoding).
y_proba = full_pipeline.predict_proba(X_val)[:, 1]

print("Test Set Evaluation:")
# Labels are pre-padded so the colons line up; ROC AUC is the only metric
# computed from probabilities rather than hard predictions.
for label, metric_fn, y_hat in (
    ("Accuracy ", accuracy_score, y_pred),
    ("Precision", precision_score, y_pred),
    ("Recall   ", recall_score, y_pred),
    ("F1 Score ", f1_score, y_pred),
    ("ROC AUC  ", roc_auc_score, y_proba),
):
    print(f"  {label}: {metric_fn(y_val, y_hat):.4f}")

conf_mat = confusion_matrix(y_val, y_pred)
print("\nConfusion Matrix:")
print(conf_mat)

Cross-Validation Performance:
  accuracy: 0.8449 ± 0.0366
 precision: 0.8008 ± 0.0444
    recall: 0.6858 ± 0.1152
        f1: 0.7360 ± 0.0810
   roc_auc: 0.9106 ± 0.0563
Test Set Evaluation:
  Accuracy : 0.8167
  Precision: 0.7857
  Recall   : 0.5789
  F1 Score : 0.6667
  ROC AUC  : 0.9101

Confusion Matrix:
[[38  3]
 [ 8 11]]
