# Model selection

The data strongly prioritized numeric features, many of which exhibited non-linear relationships. SHAP feature importance confirmed that the model valued the squared features, suggesting that a tree-based model would be well-suited to capturing these patterns without requiring explicit feature engineering.

Models to test:
1. Random Forest
2. XGBoost
3. LightGBM

Given the small size of the dataset, I expect Random Forest to perform the best, as it typically generalizes better on limited data compared to boosting methods, which may overfit more easily under such constraints.

In [65]:
import pandas as pd
import warnings
from sklearn.base import BaseEstimator, TransformerMixin, OneToOneFeatureMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.metrics import (
    make_scorer,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score
)
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight

In [66]:
# Location of the heart-failure CSV; it is downloaded each time this cell runs.
url = "https://raw.githubusercontent.com/CarsonShively/Heart-Failure/refs/heads/main/data/heart_failure.csv"

# Load the data and discard exact duplicate rows (index labels are left as-is).
df = pd.read_csv(url).drop_duplicates()

# Target column vs. everything else.
y = df['DEATH_EVENT']
X = df.drop(columns='DEATH_EVENT')

# Hold out 20% of the rows for validation, stratified on the target so both
# parts keep the same class proportions; the seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42,
)

# Binary features for tree-based models
Binary features were already mapped to 0 and 1, so the only preprocessing step required was:

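NaN Handling: Values that are not a valid 0 or 1 are treated as missing, and missing values are imputed with safe defaults (0 for anaemia, diabetes, high_blood_pressure, and smoking; the training-data mode for sex).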

This minimal preprocessing ensures clarity and consistency across tree-based model comparisons.

In [67]:
# Columns that are cleaned and cast as 0/1 flags.
binary_cols = ['anaemia', 'diabetes', 'high_blood_pressure', 'sex', 'smoking']

# Binary columns whose missing values are filled with 0: every binary column
# except 'sex'.
zero_impute = [col for col in binary_cols if col != 'sex']

class BinaryInt64Cleaner(BaseEstimator, TransformerMixin, OneToOneFeatureMixin):
    """Coerce columns to nullable ``Int64`` holding only 0, 1 or ``<NA>``.

    Entries that cannot be parsed as numbers, and numeric entries other than
    0 and 1, are replaced with ``<NA>``.

    Parameters
    ----------
    binary_cols : list
        Labels of the columns to clean.
    """

    def __init__(self, binary_cols):
        self.binary_cols = binary_cols

    def fit(self, X, y=None):
        """Learn nothing (the transformer is stateless) and return ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the binary columns cleaned."""
        X = X.copy()
        for col in self.binary_cols:
            # Unparseable entries become NaN here.
            numeric = pd.to_numeric(X[col], errors='coerce')
            # Blank out everything that is not 0 or 1 *before* the Int64 cast:
            # casting a float column that contains a non-integral value such
            # as 0.5 to Int64 raises TypeError instead of yielding <NA>.
            numeric = numeric.where(numeric.isin([0, 1]))
            X[col] = numeric.astype('Int64')
        return X


class BinaryImputer(BaseEstimator, TransformerMixin, OneToOneFeatureMixin):
    """Fill missing values in binary columns.

    Columns in ``binary_cols`` are filled with 0. ``sex_col`` is filled with
    the most frequent value seen in that column during ``fit``.

    Parameters
    ----------
    binary_cols : list
        Labels of the columns whose missing values are replaced with 0.
    sex_col : str, default='sex'
        Label of the column whose missing values are replaced with its mode.

    Attributes
    ----------
    sex_mode_ : scalar
        Fill value for ``sex_col`` learned in ``fit``.
    """

    def __init__(self, binary_cols, sex_col='sex'):
        self.binary_cols = binary_cols
        self.sex_col = sex_col

    def fit(self, X, y=None):
        """Learn the mode of ``sex_col`` and return ``self``."""
        modes = X[self.sex_col].mode(dropna=True)
        # mode() returns an empty Series when the column has no observed
        # values; indexing it would raise. Fall back to 0, the same default
        # used for the other binary columns.
        self.sex_mode_ = modes.iloc[0] if len(modes) else 0
        return self

    def transform(self, X):
        """Return a copy of ``X`` with missing binary values filled."""
        X = X.copy()
        for col in self.binary_cols:
            X[col] = X[col].fillna(0)
        # Runs after the zero fill, so if sex_col is also listed in
        # binary_cols it has already been filled with 0 at this point.
        X[self.sex_col] = X[self.sex_col].fillna(self.sex_mode_)
        return X

class ConvertToInt(BaseEstimator, TransformerMixin, OneToOneFeatureMixin):
    """Cast the selected columns to the built-in ``int`` dtype.

    ``columns`` holds the labels of the columns to cast; every other column
    is passed through unchanged.
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, X):
        """Return a new frame in which each selected column is cast to int."""
        target_dtypes = {name: int for name in self.columns}
        return X.astype(target_dtypes)


# Binary-column preprocessing, applied in order: clean to 0/1/<NA>, impute the
# missing values, then cast to plain int. set_output returns the pipeline
# itself, so the configured object is what gets bound to the name.
binary_pipeline = Pipeline(
    steps=[
        ('cleaner', BinaryInt64Cleaner(binary_cols=binary_cols)),
        ('imputer', BinaryImputer(binary_cols=zero_impute, sex_col='sex')),
        ('to_int', ConvertToInt(columns=binary_cols)),
    ]
).set_output(transform="pandas")
_ = binary_pipeline

# Numeric features for tree-based models
To ensure a simple and consistent preprocessing pipeline for model comparison, I limited transformations to:

1. NaN Handling: Missing values were imputed using the median.

2. Outlier Treatment: Extreme values were clipped at the 1st and 99th percentiles.

This avoids overfitting through complex transformations and keeps the pipeline fair across all models.

In [68]:
# Columns sent through the numeric pipeline (float cast, median imputation,
# quantile clipping).
numeric_features = [
    'age', 'creatinine_phosphokinase', 'ejection_fraction', 'platelets',
    'serum_creatinine', 'serum_sodium', 'time',
]

class CoerceToFloat(BaseEstimator, TransformerMixin, OneToOneFeatureMixin):
    """Cast the selected columns to ``float``.

    ``columns`` holds the labels of the columns to cast; every other column
    is passed through unchanged.
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, X):
        """Return a new frame in which each selected column is cast to float."""
        return X.astype(dict.fromkeys(self.columns, float))

class NumericImputer(BaseEstimator, TransformerMixin, OneToOneFeatureMixin):
    """Replace missing values in the selected columns with per-column medians.

    The medians are computed from the data passed to ``fit`` and stored in
    ``medians_`` (a dict keyed by column label).
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Record the median of every selected column and return ``self``."""
        learned = {}
        for name in self.columns:
            learned[name] = X[name].median(skipna=True)
        self.medians_ = learned
        return self

    def transform(self, X):
        """Return a copy of ``X`` with missing values filled by the medians."""
        filled = X.copy()
        for name in self.columns:
            fill_value = self.medians_[name]
            filled[name] = filled[name].fillna(fill_value)
        return filled

class OutlierClipper(BaseEstimator, TransformerMixin, OneToOneFeatureMixin):
    """Clip the selected columns to quantile bounds learned during ``fit``.

    ``lower_quantile`` and ``upper_quantile`` (defaults 0.01 and 0.99) pick
    the quantiles used as the lower and upper clipping limits. The learned
    limits are stored per column in ``lower_bounds_`` and ``upper_bounds_``.
    """

    def __init__(self, columns, lower_quantile=0.01, upper_quantile=0.99):
        self.columns = columns
        self.lower_quantile = lower_quantile
        self.upper_quantile = upper_quantile

    def fit(self, X, y=None):
        """Compute both clipping limits for every selected column."""
        lows = {}
        highs = {}
        for name in self.columns:
            lows[name] = X[name].quantile(self.lower_quantile)
        for name in self.columns:
            highs[name] = X[name].quantile(self.upper_quantile)
        self.lower_bounds_ = lows
        self.upper_bounds_ = highs
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the selected columns clipped."""
        clipped = X.copy()
        for name in self.columns:
            floor = self.lower_bounds_[name]
            ceiling = self.upper_bounds_[name]
            clipped[name] = clipped[name].clip(floor, ceiling)
        return clipped

# Numeric-column preprocessing, applied in order: cast to float, fill missing
# values with the median, then clip to the learned quantile bounds.
# set_output returns the pipeline itself, so the call can be chained.
numeric_pipeline = Pipeline(
    [
        ('float', CoerceToFloat(columns=numeric_features)),
        ('imputer', NumericImputer(columns=numeric_features)),
        ('clip', OutlierClipper(columns=numeric_features)),
    ]
).set_output(transform="pandas")
_ = numeric_pipeline

# Pipeline

In [69]:
# Send each column group through its own preprocessing pipeline.
preprocessor = ColumnTransformer([
    ('bin', binary_pipeline, binary_cols),
    ('num', numeric_pipeline, numeric_features)
])

preprocessor.set_output(transform='pandas')

# The same shuffled, stratified 5-fold split is reused for every model so the
# scores are comparable.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Balanced class weights for the training labels. They are defined here so the
# cell runs on a fresh kernel instead of relying on a `weights` variable left
# over from an earlier session. The classes are sorted, so weights[0] belongs
# to the smaller label and weights[1] to the larger one.
weights = compute_class_weight(
    class_weight='balanced',
    classes=y_train.drop_duplicates().sort_values().to_numpy(),
    y=y_train,
)

# With balanced weights, weights[1] / weights[0] equals
# (rows with the smaller label) / (rows with the larger label), which for 0/1
# labels is the negative-to-positive ratio passed to XGBoost below.
# NOTE(review): use_label_encoder looks deprecated in recent XGBoost
# releases - confirm against the installed version before removing it.
tree_models = {
    "Random Forest": RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', scale_pos_weight=weights[1] / weights[0], verbosity=0, random_state=42),
    "LightGBM": LGBMClassifier(class_weight='balanced', random_state=42, verbose=-1)
}

# Metric label -> scikit-learn scorer name.
scoring = {
    'accuracy': 'accuracy',
    'precision': 'precision',
    'recall': 'recall',
    'f1': 'f1',
    'roc_auc': 'roc_auc'
}

results = []

for name, model in tree_models.items():
    # Preprocessing sits inside the pipeline, so it is re-fitted on the
    # training part of every fold.
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', model)
    ])
    pipeline.set_output(transform='pandas')

    scores = cross_validate(pipeline, X_train, y_train, cv=cv, scoring=scoring)

    # One row per model: mean and standard deviation of each metric over folds.
    summary = {
        'Model': name,
        **{f"{metric} Mean": scores[f"test_{metric}"].mean() for metric in scoring},
        **{f"{metric} Std": scores[f"test_{metric}"].std() for metric in scoring}
    }
    results.append(summary)

results_df = pd.DataFrame(results)
print(results_df)

           Model  accuracy Mean  precision Mean  recall Mean   f1 Mean  \
0  Random Forest       0.853901        0.833846     0.701667  0.758867   
1        XGBoost       0.841312        0.781822     0.715000  0.745710   
2       LightGBM       0.832890        0.785761     0.701667  0.732190   

   roc_auc Mean  accuracy Std  precision Std  recall Std    f1 Std  \
0      0.906395      0.053920       0.139600    0.050042  0.081171   
1      0.905821      0.046204       0.096192    0.047770  0.066913   
2      0.902520      0.052184       0.146104    0.088530  0.073878   

   roc_auc Std  
0     0.032229  
1     0.027031  
2     0.027179  


# Summary

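As I expected, Random Forest came out slightly ahead of both XGBoost and LightGBM on this small dataset, with the highest mean accuracy, precision, F1, and ROC AUC. While LightGBM typically excels with larger data, it underperformed here, ranking last on accuracy, F1, and ROC AUC. XGBoost was competitive: it achieved the highest mean recall (0.715 vs. 0.702) but lower precision and F1 than Random Forest. The gaps between the models are all within one cross-validation standard deviation, so the ranking should be read as tentative. Given the strong balance Random Forest showed between precision and recall, it is the choice for the final model.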